diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml index dce44222ea7..3905c86cd0c 100644 --- a/.github/workflows/bisect-dispatch.yaml +++ b/.github/workflows/bisect-dispatch.yaml @@ -50,12 +50,12 @@ on: required: true type: string description: "Timeout (eg: 5m, 1h)" - description: - type: string - default: "Git bisect dispatch" + patch: required: false + type: string + description: "Commit-ish to cherry-pick for each step" -run-name: ${{ inputs.description }} +run-name: "Bisect on ${{ inputs.runner-label }}" jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml @@ -88,8 +88,11 @@ jobs: - uses: ./.github/actions/install-python-deps - name: Run Git Bisect shell: bash + env: + GIT_COMMITTER_NAME: "GitHub Actions" + GIT_COMMITTER_EMAIL: "actions@github.com" run: | source ${{ github.workspace }}/python_env/bin/activate cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME - ./tests/scripts/tt_bisect.sh -t ${{ inputs.timeout }} -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} + ./tests/scripts/tt_bisect.sh -t ${{ inputs.timeout }} -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} -p "${{ inputs.patch }}" diff --git a/.github/workflows/tgg-unit-tests-impl.yaml b/.github/workflows/tgg-unit-tests-impl.yaml index 140230c82b2..22a56b63189 100644 --- a/.github/workflows/tgg-unit-tests-impl.yaml +++ b/.github/workflows/tgg-unit-tests-impl.yaml @@ -2,6 +2,13 @@ name: "[internal] TGG unit tests impl" on: workflow_call: + inputs: + docker-image: + required: true + type: string + wheel-artifact-name: + required: true + type: string jobs: TGG-tests: @@ -17,26 +24,60 @@ jobs: }, ] name: ${{ matrix.test-group.name }} - env: - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib runs-on: ${{ matrix.test-group.runs-on }} + container: + image: ${{ inputs.docker-image }} + env: + TT_METAL_HOME: /work + 
PYTHONPATH: /work + LD_LIBRARY_PATH: /work/build/lib + LOGURU_LEVEL: INFO + ARCH_NAME: ${{ matrix.test-group.arch }} + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /dev/hugepages-1G:/dev/hugepages-1G + - /mnt/MLPerf:/mnt/MLPerf + options: "--device /dev/tenstorrent" + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end - uses: actions/download-artifact@v4 with: name: TTMetal_build_any + path: /work - name: Extract files run: tar -xvf ttm_any.tar - - uses: ./.github/actions/install-python-deps + - name: ⬇️ Download Wheel + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.wheel-artifact-name }} + path: /work + - name: Install Wheel + run: | + WHEEL_FILENAME=$(ls -1 *.whl) + pip3 install $WHEEL_FILENAME - name: Run unit regression tests timeout-minutes: 60 run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME + set -x + pwd + echo $PYTHONPATH + ls -al ${{ matrix.test-group.cmd }} + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. 
+ echo "pre rm" + ls -al /__w/tt-metal/tt-metal + rm -rf /__w/tt-metal/tt-metal/docker-job + echo "post rm" + ls -al /__w/tt-metal/tt-metal diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml index 6c42ff61f4f..9d1bba42a64 100644 --- a/.github/workflows/tgg-unit-tests.yaml +++ b/.github/workflows/tgg-unit-tests.yaml @@ -9,7 +9,12 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true TGG-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/tgg-unit-tests-impl.yaml + with: + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 0f4d4480a11..a048cd440c5 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -431,7 +431,13 @@ set_up_chdir() { return fi done - echo "Could not find the 'tt-metal' directory in your PYTHONPATH." 1>&2 + for ENTRY in "${ENTRIES[@]}"; do + if [[ -d "$ENTRY/tt_metal" ]]; then + cd "$ENTRY" + return + fi + done + echo "Could not find the 'tt-metal' directory in your PYTHONPATH." 
1>&2 exit 1 } diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index e4e54a510b1..7f709db3316 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -26,6 +26,7 @@ run_t3000_ttmetal_tests() { ./build/programming_examples/distributed/distributed_program_dispatch ./build/programming_examples/distributed/distributed_buffer_rw ./build/programming_examples/distributed/distributed_eltwise_add + ./build/programming_examples/distributed/distributed_trace_and_events # Record the end time end_time=$(date +%s) diff --git a/tests/scripts/tt_bisect.sh b/tests/scripts/tt_bisect.sh index 5304803d18b..077656456d8 100755 --- a/tests/scripts/tt_bisect.sh +++ b/tests/scripts/tt_bisect.sh @@ -7,6 +7,8 @@ Flags: -f | --file : test file to run, also the test that broke -g | --good : good commit to start bisect -b | --bad : bad commit to start bisect + -p | --patch : commit-ish to cherry-pick onto each commit before building + -t | --timeout : timeout duration for the test Example: ./tests/scripts/tt_bisect.sh -f ./build/test/tt_metal/test_add_two_ints -b HEAD -g 1eb7930 If the test involves multiple words you have to do "test_file": @@ -19,7 +21,8 @@ source python_env/bin/activate export PYTHONPATH=$TT_METAL_HOME timeout_duration=2m -while getopts "f:g:b:t:" opt; do +patch="" +while getopts "f:g:b:t:p:" opt; do case $opt in f | file) test=$OPTARG @@ -33,6 +36,9 @@ while getopts "f:g:b:t:" opt; do t | timeout) timeout_duration=$OPTARG ;; + p | patch) + patch=$OPTARG + ;; \?) echo "Invalid option: -$OPTARG" >&2 exit 1 @@ -48,14 +54,20 @@ fi echo "Time to find who broke it :)" echo "Good commit:" $good_commit echo "Bad commit:" $bad_commit +if ([ ! 
-z "$patch" ]); then + echo "Cherry-pick commit:" $patch +fi found=false git bisect start $bad_commit $good_commit -- while [[ "$found" = "false" ]]; do - git submodule update --recursive echo "::group::Building `git rev-parse HEAD`" + if ([ ! -z "$patch" ]); then + git cherry-pick $patch + fi + git submodule update --recursive build_rc=0 ./build_metal.sh --build-tests > /dev/null || build_rc=$? echo "::endgroup::" @@ -70,6 +82,11 @@ while [[ "$found" = "false" ]]; do timeout_rc=0 timeout "$timeout_duration" bash -c "$test" || timeout_rc=$? echo "Exit code: $timeout_rc" + + if ([ ! -z "$patch" ]); then + # Must reset HEAD or git bisect good/bad will retry the merge base and we'll be stuck in a loop + git reset --hard HEAD^ + fi echo "::endgroup::" if [ $timeout_rc -eq 0 ]; then diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt index 922e19ef993..88890c7eded 100644 --- a/tests/tt_metal/distributed/CMakeLists.txt +++ b/tests/tt_metal/distributed/CMakeLists.txt @@ -1,7 +1,8 @@ set(UNIT_TESTS_DISTRIBUTED_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_distributed.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_coord.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_device_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_workload.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_sub_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp diff --git a/tests/tt_metal/distributed/test_distributed.cpp b/tests/tt_metal/distributed/test_distributed.cpp deleted file mode 100644 index bf8877879e3..00000000000 --- a/tests/tt_metal/distributed/test_distributed.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" - -namespace tt::tt_metal::distributed::test { -namespace { - -TEST_F(T3000MeshDeviceFixture, SimpleMeshDeviceTest) { - EXPECT_EQ(mesh_device_->num_devices(), 8); - EXPECT_EQ(mesh_device_->num_rows(), 2); - EXPECT_EQ(mesh_device_->num_cols(), 4); -} - -TEST(MeshDeviceSuite, Test1x1SystemMeshInitialize) { - auto& sys = tt::tt_metal::distributed::SystemMesh::instance(); - - auto config = tt::tt_metal::distributed::MeshDeviceConfig{.mesh_shape = MeshShape(1, 1)}; - - EXPECT_NO_THROW({ - auto mesh = tt::tt_metal::distributed::MeshDevice::create( - config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - mesh->close(); - }); -} - -} // namespace -} // namespace tt::tt_metal::distributed::test diff --git a/tests/tt_metal/distributed/test_mesh_device.cpp b/tests/tt_metal/distributed/test_mesh_device.cpp new file mode 100644 index 00000000000..c87c87cae35 --- /dev/null +++ b/tests/tt_metal/distributed/test_mesh_device.cpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "mesh_device.hpp" +#include "system_mesh.hpp" + +#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" + +namespace tt::tt_metal::distributed { +namespace { + +using ::testing::IsEmpty; +using ::testing::SizeIs; +using ::tt::tt_metal::distributed::MeshContainer; + +TEST(MeshDeviceInitTest, Init1x1Mesh) { + auto& sys = SystemMesh::instance(); + + auto config = tt::tt_metal::distributed::MeshDeviceConfig{.mesh_shape = MeshShape(1, 1)}; + + EXPECT_NO_THROW({ + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); + mesh->close(); + }); +} + +using MeshDeviceTest = T3000MeshDeviceFixture; + +TEST_F(MeshDeviceTest, SystemMeshTearDownWithoutClose) { + auto& sys = SystemMesh::instance(); + + const auto system_shape = sys.get_shape(); + ASSERT_EQ(system_shape.dims(), 2); + EXPECT_EQ(system_shape[0], 2); + EXPECT_EQ(system_shape[1], 4); +} + +TEST_F(MeshDeviceTest, MemoryAllocationStatistics) { + auto stats = mesh_device_->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); + for (auto* device : mesh_device_->get_devices()) { + auto device_stats = device->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); + EXPECT_EQ(stats.total_allocatable_size_bytes, device_stats.total_allocatable_size_bytes); + } +} + +TEST_F(MeshDeviceTest, NumDramChannels) { + EXPECT_EQ(mesh_device_->num_dram_channels(), 96); // 8 devices * 12 channels +} + +TEST_F(MeshDeviceTest, ViewIs2D) { + std::vector devices = mesh_device_->get_devices(); + + MeshContainer container_1d(SimpleMeshShape(8), devices); + MeshDeviceView view_1d(container_1d); + EXPECT_FALSE(view_1d.is_mesh_2d()); + + MeshContainer container_2d(SimpleMeshShape(2, 4), devices); + MeshDeviceView view_2d(container_2d); + EXPECT_TRUE(view_2d.is_mesh_2d()); + + MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); + 
MeshDeviceView view_3d(container_3d); + EXPECT_FALSE(view_3d.is_mesh_2d()); +} + +TEST_F(MeshDeviceTest, Submesh) { + EXPECT_EQ(mesh_device_->shape().num_rows, 2); + EXPECT_EQ(mesh_device_->shape().num_cols, 4); + EXPECT_THAT(mesh_device_->get_devices(), SizeIs(8)); + EXPECT_TRUE(mesh_device_->is_parent_mesh()); + EXPECT_THAT(mesh_device_->get_submeshes(), IsEmpty()); + + auto submesh = mesh_device_->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); + EXPECT_THAT(mesh_device_->get_submeshes(), SizeIs(1)); + EXPECT_EQ(submesh->shape().num_rows, 1); + EXPECT_EQ(submesh->shape().num_cols, 2); + EXPECT_THAT(submesh->get_devices(), SizeIs(2)); + EXPECT_FALSE(submesh->is_parent_mesh()); + EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); + + // Verify coordinates are correct. + EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); + EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); + EXPECT_EQ(submesh->get_device(1, 1), nullptr); +} + +} // namespace +} // namespace tt::tt_metal::distributed diff --git a/tests/ttnn/distributed/test_distributed_reshape.cpp b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp similarity index 62% rename from tests/ttnn/distributed/test_distributed_reshape.cpp rename to tests/tt_metal/distributed/test_mesh_device_reshape.cpp index f3a085d0700..893ad9aca1a 100644 --- a/tests/ttnn/distributed/test_distributed_reshape.cpp +++ b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp @@ -6,29 +6,20 @@ #include #include #include -#include -#include + +#include "host_api.hpp" +#include "mesh_config.hpp" +#include "mesh_device.hpp" #include "mesh_coord.hpp" + +#include "system_mesh.hpp" #include "tests/tt_metal/test_utils/env_vars.hpp" -namespace ttnn::distributed::test { +namespace tt::tt_metal::distributed { namespace { using ::testing::SizeIs; -// Helper function to check test environment -void 
check_t3k_test_environment() { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (slow_dispatch) { - GTEST_SKIP() << "Skipping Multi-Device test suite, since it can only be run in Fast Dispatch Mode."; - } - if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { - GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; - } -} - std::vector get_physical_device_ids(const MeshDevice& mesh) { std::vector device_ids; for (auto* device : mesh.get_devices()) { @@ -37,46 +28,56 @@ std::vector get_physical_device_ids(const MeshDevice& mesh) { return device_ids; } -static constexpr std::array kMeshShapes{ - {{1, 1}, {1, 2}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, {2, 1}, {2, 2}, {2, 3}, {2, 4}, - {3, 1}, {3, 2}, {4, 1}, {4, 2}, {8, 1}, {7, 1}, {6, 1}, {5, 1}, {4, 1}, {3, 1}, {2, 1}, {1, 1}}}; - -class MeshConfigurationTest : public ::testing::TestWithParam { -protected: - void SetUp() override { check_t3k_test_environment(); } +class T3KTestFixture : public ::testing::Test { +public: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (slow_dispatch) { + GTEST_SKIP() << "Skipping Multi-Device test suite, since it can only be run in Fast Dispatch Mode."; + } + if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { + GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; + } + } }; +constexpr std::array kMeshShapes{{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, + {2, 1}, {2, 2}, {2, 3}, {2, 4}, {3, 1}, {3, 2}, {4, 1}, {4, 2}, + {8, 1}, {7, 1}, {6, 1}, {5, 1}, {4, 1}, {3, 1}, {2, 1}, {1, 1}}}; + +class MeshConfigurationTest : public T3KTestFixture, public 
::testing::WithParamInterface {}; + TEST_P(MeshConfigurationTest, MeshConfigurations) { const auto& shape = GetParam(); - auto mesh = ttnn::distributed::open_mesh_device( - {shape.num_rows, shape.num_cols}, + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape.num_rows, shape.num_cols)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); EXPECT_EQ(mesh->num_rows(), shape.num_rows); EXPECT_EQ(mesh->num_cols(), shape.num_cols); - ttnn::distributed::close_mesh_device(mesh); + mesh->close(); } TEST_P(MeshConfigurationTest, GetPhysicalDeviceIds) { const auto& shape = GetParam(); - auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); + auto& system_mesh = SystemMesh::instance(); EXPECT_THAT( system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape)}), SizeIs(shape.num_cols * shape.num_rows)); } // Test all possible mesh configurations on T3000 -INSTANTIATE_TEST_SUITE_P(MeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); +INSTANTIATE_TEST_SUITE_P(AllMeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); -class MeshReshapeTest : public ::testing::TestWithParam> { -protected: - void SetUp() override { check_t3k_test_environment(); } -}; +class MeshDeviceReshapeRoundtripTest : public T3KTestFixture, + public ::testing::WithParamInterface> {}; -TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { +TEST_P(MeshDeviceReshapeRoundtripTest, ReshapeBetweenConfigurations) { const auto& [old_shape, new_shape] = GetParam(); if ((old_shape.num_rows * old_shape.num_cols) != (new_shape.num_rows * new_shape.num_cols)) { @@ -86,8 +87,8 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { GTEST_SKIP() << "Old shape is 1xN or Nx1; we test this in From1x4To2x2Invalid"; } - auto mesh = ttnn::distributed::open_mesh_device( - {old_shape.num_rows, old_shape.num_cols}, + auto mesh = 
tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(old_shape.num_rows, old_shape.num_cols)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, @@ -112,17 +113,14 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { // Generate all possible combinations of shapes from kMeshShapes INSTANTIATE_TEST_SUITE_P( - ReshapeConfigurations, - MeshReshapeTest, + AllMeshShapes, + MeshDeviceReshapeRoundtripTest, ::testing::Combine(::testing::ValuesIn(kMeshShapes), ::testing::ValuesIn(kMeshShapes))); // Base class for non-parameterized tests -class T3000ReshapeTest : public ::testing::Test { -protected: - void SetUp() override { check_t3k_test_environment(); } -}; +using MeshDeviceReshapeTest = T3KTestFixture; -TEST_F(T3000ReshapeTest, InvalidRequestedShape) { +TEST_F(MeshDeviceReshapeTest, InvalidRequestedShape) { auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); // Shape too big. @@ -144,9 +142,13 @@ TEST_F(T3000ReshapeTest, InvalidRequestedShape) { MeshDeviceConfig{.mesh_shape = SimpleMeshShape(8), .offset = MeshCoordinate(1)})); } -TEST_F(T3000ReshapeTest, InvalidReshapeDimensions) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, InvalidReshapeDimensions) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 @@ -157,9 +159,13 @@ TEST_F(T3000ReshapeTest, InvalidReshapeDimensions) { EXPECT_EQ(mesh->num_cols(), 8); } -TEST_F(T3000ReshapeTest, From1x8To2x4ThenBackTo1x8) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, 
DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From1x8To2x4ThenBackTo1x8) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); EXPECT_EQ(mesh->num_rows(), 1); EXPECT_EQ(mesh->num_cols(), 8); @@ -187,9 +193,13 @@ TEST_F(T3000ReshapeTest, From1x8To2x4ThenBackTo1x8) { EXPECT_EQ(mesh->get_device_ids(), original_order); } -TEST_F(T3000ReshapeTest, InvalidTotalDeviceCount) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, InvalidTotalDeviceCount) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 @@ -200,15 +210,19 @@ TEST_F(T3000ReshapeTest, InvalidTotalDeviceCount) { EXPECT_EQ(mesh->num_cols(), 8); } -TEST_F(T3000ReshapeTest, From1x4To2x2Invalid) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From1x4To2x2Invalid) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // This is an invalid reshape because the 1x4 mesh does not fully cover the 2x2 mesh EXPECT_THROW(mesh->reshape({2, 2}), std::runtime_error); } -TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { +TEST_F(MeshDeviceReshapeTest, From1x4To2x2Valid) 
{ auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); // Fetch the device ids for a physically connected 2x2 mesh. @@ -218,14 +232,12 @@ TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { // Supply the physical device ids to the mesh constructor that we know we know is 2x2 physically connected. // We will create a 1x4 mesh and then reshape it to 2x2. - auto mesh = ttnn::distributed::open_mesh_device( - {1, 4}, + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4), .physical_device_ids = physical_device_ids}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, - tt::tt_metal::DispatchCoreType::WORKER, - MeshOffset{0, 0}, - physical_device_ids); + tt::tt_metal::DispatchCoreType::WORKER); mesh->reshape({2, 2}); EXPECT_EQ(mesh->num_rows(), 2); @@ -236,9 +248,13 @@ TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { } } -TEST_F(T3000ReshapeTest, From2x2To1x4) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 2}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From2x2To1x4) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 2)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); auto mesh_2x2_device_ids = mesh->get_device_ids(); @@ -258,4 +274,4 @@ TEST_F(T3000ReshapeTest, From2x2To1x4) { } } // namespace -} // namespace ttnn::distributed::test +} // namespace tt::tt_metal::distributed diff --git a/tt_metal/api/tt-metalium/test_common.hpp b/tests/tt_metal/test_utils/test_common.hpp similarity index 99% rename from tt_metal/api/tt-metalium/test_common.hpp rename to tests/tt_metal/test_utils/test_common.hpp index 7a81c7a0732..dbcf2c50e25 100644 --- a/tt_metal/api/tt-metalium/test_common.hpp +++ b/tests/tt_metal/test_utils/test_common.hpp @@ -23,7 +23,7 @@ template constexpr std::false_type always_false{}; 
template -T parse(std::string const& s) { +T parse(const std::string& s) { if constexpr (std::is_same_v) { return std::stoul(s, 0, 0); } else if constexpr (std::is_same_v) { @@ -39,7 +39,7 @@ T parse(std::string const& s) { } } -inline std::string strip(std::string const& s) { +inline std::string strip(const std::string& s) { std::string whitespace = " \t\n"; std::size_t start = s.find_first_not_of(whitespace); std::size_t end = s.find_last_not_of(whitespace); diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index bafab7885dd..9065d45acd5 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -48,6 +48,7 @@ foreach(TEST_SRC ${TT_METAL_TESTS_SRCS}) PRIVATE "$" ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/test_utils ${CMAKE_CURRENT_SOURCE_DIR} ) set_target_properties( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp index ff359239b1e..4ab8453a76d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp @@ -28,6 +28,8 @@ #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include +#include "test_common.hpp" + using std::vector; using namespace tt; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp index 16ceb8092cd..784f8814af0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp @@ -27,6 +27,8 @@ #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" +#include "test_common.hpp" + using std::vector; using namespace tt; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index 38b82e910e7..24382b4ff73 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -28,6 +28,8 @@ #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include +#include "test_common.hpp" + using std::vector; using namespace tt; //////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp index 11944860693..a877ef09d0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp @@ -13,6 +13,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp index 661d0018769..27cb5adcff2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp @@ -14,6 +14,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" 
+#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp index 8d83a1b175b..9e7ff0e7f05 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp @@ -16,6 +16,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp index 306c3463bd3..6ce45cc0efe 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp @@ -17,6 +17,8 @@ #include "tt_cluster.hpp" #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp index df8fe9407aa..3a9589bc218 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp @@ -19,6 +19,8 @@ #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp index 9889aa430b9..2bc2d18553f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp @@ -13,6 +13,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index d40e9384635..554c85e559c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -21,6 +21,8 @@ #include #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp index 301ceea8c21..9340465fe2c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp @@ -24,6 +24,8 @@ #include #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt index e4178cba02b..598e4125424 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt @@ -79,6 +79,7 @@ foreach(arch ${ARCHITECTURES}) "$" ${PROJECT_SOURCE_DIR}/ttnn/cpp/ttnn/deprecated # this all should go away and be replaced with link to ttnn ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/test_utils ${CMAKE_CURRENT_SOURCE_DIR} ) set_target_properties( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 3053fd4c7ed..31f7c2296ed 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -12,7 +12,7 @@ #include "logger.hpp" #include #include -#include +#include "test_common.hpp" #include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index e751187a2ab..d0f0fea005b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -14,6 +14,8 @@ #include #include "common.h" +#include "test_common.hpp" + constexpr uint32_t DEFAULT_ITERATIONS = 10000; constexpr uint32_t DEFAULT_WARMUP_ITERATIONS = 100; constexpr uint32_t DEFAULT_DISPATCH_BUFFER_LOG_PAGE_SIZE = 12; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index b9e3aaaf083..0d9c0eefd8f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include #include diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 0b1dc88bec3..0c6b581e7c3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -21,6 +21,8 @@ #include #include "llrt.hpp" +#include "test_common.hpp" + #define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) constexpr uint32_t DEFAULT_TEST_TYPE = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index 13eb1015602..73c0fb19225 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include #include "dprint_server.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp index b15d222a21d..acef9bfcd07 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index 24580476130..20ce9327a65 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -11,7 
+11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp index a08ec04c278..9ae53cb1e28 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp index caa962ab89e..da12baa481f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include "test_common.hpp" #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp index 714e0b2af26..c1f5b1426f9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp index 4ab4568663b..89dedffba0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp index 04ae58dc362..844d2e4bb9e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp similarity index 80% rename from tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp rename to tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp index ad6c6eff13b..1dcd801b127 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp @@ -17,7 +17,8 @@ static inline std::string to_string(pkt_dest_size_choices_t choice) { } } -static inline void log_phys_coord_to_json(nlohmann::json& config, const std::vector& phys_cores, const std::string& name) { +static inline void log_phys_coord_to_json( + nlohmann::json& config, const std::vector& phys_cores, const std::string& name) { for (int i = 0; i < phys_cores.size(); ++i) { config[fmt::format("{}_{}", name, i)] = fmt::format("({}, {})", phys_cores[i].x, phys_cores[i].y); } @@ -28,9 +29,9 @@ static inline void log_phys_coord_to_json(nlohmann::json& config, const CoreCoor } inline uint64_t get_64b_result(uint32_t* buf, uint32_t index) { - return (((uint64_t)buf[index]) << 32) | buf[index+1]; + return (((uint64_t)buf[index]) << 32) | buf[index + 1]; } inline uint64_t get_64b_result(const 
std::vector& vec, uint32_t index) { - return (((uint64_t)vec[index]) << 32) | vec[index+1]; + return (((uint64_t)vec[index]) << 32) | vec[index + 1]; } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index f267a746382..eda89407079 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -9,6 +9,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index dc4a8f132fd..2834227a93e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -8,6 +8,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index bacca186d10..00761a5843a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -11,6 +11,7 @@ // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index f495c0b5e7b..c6d48b3f670 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -13,6 +13,7 @@ //#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index b6a5e0182c8..198246ce0da 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -11,6 +11,7 @@ // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index a645b972fa6..d8a5c7263bd 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -8,6 +8,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "utils.hpp" #include "llrt.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp index 99d271f3ce0..bfaaadb2a0c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp index 8c70290d9c3..23a4e9db4f7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp index 0b9cf4ae5b4..c34eea39242 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index 805ea48ca01..28a89013e54 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -9,6 +9,7 @@ 
#include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index 32d69fb8586..b4c37a1ff14 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -9,6 +9,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index f6ffce0e797..9cb9cf85c0c 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -15,6 +15,8 @@ #include "dprint_server.hpp" +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 2b11027b701..2affd969e68 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -13,6 +13,8 @@ #include #include +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp 
b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 2d457de3e58..b50fdd0f708 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -12,6 +12,8 @@ #include "tt_metal/test_utils/deprecated/tensor.hpp" #include +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp index 2ab7e642602..d69f71d3588 100644 --- a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp @@ -18,7 +18,7 @@ #include "logger.hpp" #include #include -#include +#include "test_common.hpp" #include #include #include diff --git a/tests/ttnn/CMakeLists.txt b/tests/ttnn/CMakeLists.txt index 3117e6b8920..7e3c43ea023 100644 --- a/tests/ttnn/CMakeLists.txt +++ b/tests/ttnn/CMakeLists.txt @@ -25,5 +25,4 @@ function(setup_ttnn_test_target target_name) ) endfunction() -add_subdirectory(distributed) add_subdirectory(unit_tests/gtests) diff --git a/tests/ttnn/distributed/CMakeLists.txt b/tests/ttnn/distributed/CMakeLists.txt deleted file mode 100644 index 5823925eec3..00000000000 --- a/tests/ttnn/distributed/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -add_executable( - test_distributed - test_distributed.cpp - test_distributed_reshape.cpp -) -add_executable(test_distributed_atexit test_distributed_atexit.cpp) - -# Set up properties for the target -setup_ttnn_test_target(test_distributed) -setup_ttnn_test_target(test_distributed_atexit) -# Add test to CTest -add_test(NAME test_distributed COMMAND test_distributed) -add_test(NAME test_distributed_atexit COMMAND test_distributed_atexit) diff --git a/tests/ttnn/distributed/test_distributed.cpp b/tests/ttnn/distributed/test_distributed.cpp 
deleted file mode 100644 index ee9d2f83fb4..00000000000 --- a/tests/ttnn/distributed/test_distributed.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include - -#include -#include -#include "ttnn/distributed/types.hpp" - -namespace ttnn::distributed::test { - -using ::testing::IsEmpty; -using ::testing::SizeIs; -using ::tt::tt_metal::distributed::MeshContainer; - -class DistributedTest : public ::testing::Test { -protected: - void SetUp() override {} - void TearDown() override {} -}; - -TEST_F(DistributedTest, TestSystemMeshTearDownWithoutClose) { - auto& sys = SystemMesh::instance(); - auto mesh = ttnn::distributed::open_mesh_device( - /*mesh_shape=*/{2, 4}, - DEFAULT_L1_SMALL_SIZE, - DEFAULT_TRACE_REGION_SIZE, - 1, - tt::tt_metal::DispatchCoreType::WORKER); - - const auto system_shape = sys.get_shape(); - ASSERT_EQ(system_shape.dims(), 2); - EXPECT_EQ(system_shape[0], 2); - EXPECT_EQ(system_shape[1], 4); -} - -TEST_F(DistributedTest, TestMemoryAllocationStatistics) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - auto stats = mesh->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); - for (auto* device : mesh->get_devices()) { - auto device_stats = device->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); - EXPECT_EQ(stats.total_allocatable_size_bytes, device_stats.total_allocatable_size_bytes); - } -} - -TEST_F(DistributedTest, TestNumDramChannels) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - EXPECT_EQ(mesh->num_dram_channels(), 96); // 8 devices * 12 channels -} - -TEST_F(DistributedTest, ViewIs2D) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, 
tt::tt_metal::DispatchCoreType::WORKER); - std::vector devices = mesh->get_devices(); - - MeshContainer container_1d(SimpleMeshShape(8), devices); - MeshDeviceView view_1d(container_1d); - EXPECT_FALSE(view_1d.is_mesh_2d()); - - MeshContainer container_2d(SimpleMeshShape(2, 4), devices); - MeshDeviceView view_2d(container_2d); - EXPECT_TRUE(view_2d.is_mesh_2d()); - - MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); - MeshDeviceView view_3d(container_3d); - EXPECT_FALSE(view_3d.is_mesh_2d()); -} - -TEST_F(DistributedTest, Submesh) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - EXPECT_EQ(mesh->shape().num_rows, 2); - EXPECT_EQ(mesh->shape().num_cols, 4); - EXPECT_THAT(mesh->get_devices(), SizeIs(8)); - EXPECT_TRUE(mesh->is_parent_mesh()); - EXPECT_THAT(mesh->get_submeshes(), IsEmpty()); - - auto submesh = mesh->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); - EXPECT_THAT(mesh->get_submeshes(), SizeIs(1)); - EXPECT_EQ(submesh->shape().num_rows, 1); - EXPECT_EQ(submesh->shape().num_cols, 2); - EXPECT_THAT(submesh->get_devices(), SizeIs(2)); - EXPECT_FALSE(submesh->is_parent_mesh()); - EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); - - // Verify coordinates are correct. - EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); - EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); - EXPECT_EQ(submesh->get_device(1, 1), nullptr); - -} // namespace ttnn::distributed::test -} // namespace ttnn::distributed::test diff --git a/tests/ttnn/distributed/test_distributed_atexit.cpp b/tests/ttnn/distributed/test_distributed_atexit.cpp deleted file mode 100644 index 6d4461f7386..00000000000 --- a/tests/ttnn/distributed/test_distributed_atexit.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include -#include - -namespace ttnn::distributed::test { - -// Simplified test without fixture, and mesh variable moved inside test -TEST(DistributedTestStandalone, TestSystemMeshTearDownWithoutClose) { - static std::shared_ptr mesh; - auto& sys = tt::tt_metal::distributed::SystemMesh::instance(); - mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - const auto system_shape = sys.get_shape(); - ASSERT_EQ(system_shape.dims(), 2); - EXPECT_EQ(system_shape[0], 2); - EXPECT_EQ(system_shape[1], 4); -} - -} // namespace ttnn::distributed::test diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index 91fe40d181e..c22ae1d57f3 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -18,8 +18,8 @@ static constexpr bool enable_any_synchronization = enable_start_synchronization FORCE_INLINE void line_sync( FabricConnectionManager& fabric_connection, - volatile tt::fabric::PacketHeader* mcast_fwd_packet_header, - volatile tt::fabric::PacketHeader* mcast_bwd_packet_header, + volatile PACKET_HEADER_TYPE* mcast_fwd_packet_header, + volatile PACKET_HEADER_TYPE* mcast_bwd_packet_header, size_t sync_bank_addr, size_t sync_noc_x, size_t sync_noc_y, @@ -33,7 +33,7 @@ FORCE_INLINE void line_sync( fabric_connection.get_forward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_fwd_packet_header); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_fwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_fwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -41,7 +41,7 @@ FORCE_INLINE void line_sync( 
fabric_connection.get_backward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_bwd_packet_header); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_bwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_bwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } noc_semaphore_inc(get_noc_addr(sync_noc_x, sync_noc_y, sync_bank_addr), 1); if (sync_noc_x == my_x[0] && sync_noc_y == my_y[0]) { @@ -98,11 +98,11 @@ void kernel_main() { const auto source_l1_buffer_address = get_write_ptr(source_l1_cb_index); const auto packet_header_buffer_address = get_write_ptr(packet_header_cb); - auto* mcast_fwd_packet_header = reinterpret_cast(packet_header_buffer_address); + auto* mcast_fwd_packet_header = reinterpret_cast(packet_header_buffer_address); auto* mcast_bwd_packet_header = - reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader)); + reinterpret_cast(packet_header_buffer_address + sizeof(PACKET_HEADER_TYPE)); auto* unicast_packet_header = - reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader) * 2); + reinterpret_cast(packet_header_buffer_address + sizeof(PACKET_HEADER_TYPE) * 2); mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); @@ -146,7 +146,7 @@ void kernel_main() { fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_fwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_fwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -157,7 +157,7 @@ void kernel_main() { 
fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_bwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_bwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } { noc_async_writes_flushed(); @@ -174,8 +174,7 @@ void kernel_main() { fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); - fabric_conn.send_payload_blocking_from_address( - (uint32_t)unicast_packet_header, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_blocking_from_address((uint32_t)unicast_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (enable_finish_synchronization) { diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp index 976f579ab4d..46c421049f0 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp @@ -30,7 +30,7 @@ void kernel_main() { uint32_t pages_to_read = std::min(pages_per_edm_buffer, num_pages_to_read_total - num_pages_read); cb_reserve_back(cb_id_in0, pages_to_read); uint32_t local_l1_read_addr = get_write_ptr(cb_id_in0); - local_l1_read_addr += sizeof(tt::fabric::PacketHeader); + local_l1_read_addr += sizeof(PACKET_HEADER_TYPE); for (uint32_t p = 0; p < pages_to_read; ++p) { uint64_t src_noc_addr = get_noc_addr(num_pages_read + p, source_address_generator); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index 
b210f32efb5..7bc4ad00b90 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -122,9 +122,9 @@ void kernel_main() { // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size + sizeof(PACKET_HEADER_TYPE); auto packet_addr = get_read_ptr(cb_id_in0); - auto* packet_header = reinterpret_cast(packet_addr); + auto* packet_header = reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header ->to_chip_multicast( @@ -145,7 +145,7 @@ void kernel_main() { if constexpr (!mcast_mode) { sender.wait_for_empty_write_slot(); - auto& packet_header = *reinterpret_cast(a_packet_header_addr); + auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); uint64_t last_message_semaphore_noc0_addr = safe_get_noc_addr(my_x[0], my_y[0], (uint32_t)last_message_semaphore_address, 0); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index eaa14a0e40f..23b9789b998 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -52,10 +52,10 @@ auto forward_to_fabric_from_cb( // bit of a hack to extract X/Y const auto noc0_dest_address = get_noc_addr(current_page, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size + sizeof(PACKET_HEADER_TYPE); auto packet_addr = get_read_ptr(cb_id); - auto &packet_header = *reinterpret_cast(packet_addr); + auto& packet_header = 
*reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) @@ -182,7 +182,7 @@ void kernel_main() { sender.wait_for_empty_write_slot(); constexpr size_t kLoopbackNumHopsToMyChip = 2; - auto &packet_header = *reinterpret_cast(a_packet_header_addr); + auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); packet_header.reserved = 0xE; packet_header.reserved2 = 0xFFFF; diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index ae5e9135a2b..8f5287ee0d7 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -29,13 +29,14 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( get_noc_addr(edm_noc_x, edm_noc_y, termination_addr), tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); } else { - auto &packet_header = *reinterpret_cast(a_packet_header_addr); - reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; + auto& packet_header = *reinterpret_cast(a_packet_header_addr); + reinterpret_cast(a_packet_header_addr)[sizeof(PACKET_HEADER_TYPE) >> 2] = + tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); packet_header.to_chip_unicast(static_cast(distance)) .to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr}, - sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)); + sizeof(PACKET_HEADER_TYPE) + sizeof(uint32_t)); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); noc_async_writes_flushed(); } diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 7d96a44a239..9dce6002708 100644 --- 
a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -142,7 +142,7 @@ if(BUILD_PROGRAMMING_EXAMPLES) endif() # Allow internal files to access the public API "by default" and without the -# scoping that external consumers must use. Scaoping may still be used if desired. +# scoping that external consumers must use. Scoping may still be used if desired. include_directories( api api/tt-metalium diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 21d017789c0..878569038d2 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -270,6 +270,8 @@ class Device : public IDevice { program_cache::detail::ProgramCache program_cache_; uint32_t trace_buffers_size_ = 0; + bool uninitialized_error_fired_ = + false; // To avoid spam with warnings about calling Device methods when it's not initialized. }; } // namespace v0 diff --git a/tt_metal/api/tt-metalium/system_mesh.hpp b/tt_metal/api/tt-metalium/system_mesh.hpp index 1ee91588dcc..f904de46044 100644 --- a/tt_metal/api/tt-metalium/system_mesh.hpp +++ b/tt_metal/api/tt-metalium/system_mesh.hpp @@ -9,7 +9,7 @@ #include "mesh_config.hpp" #include "mesh_coord.hpp" - +#include "indestructible.hpp" namespace tt::tt_metal::distributed { // SystemMesh creates a virtualization over the physical devices in the system. 
@@ -21,6 +21,8 @@ class SystemMesh { std::unique_ptr pimpl_; SystemMesh(); + friend class tt::stl::Indestructible; + public: static SystemMesh& instance(); SystemMesh(const SystemMesh&) = delete; @@ -28,12 +30,13 @@ class SystemMesh { SystemMesh(SystemMesh&&) = delete; SystemMesh& operator=(SystemMesh&&) = delete; + // Returns the shape of the system mesh const SimpleMeshShape& get_shape() const; - // Gets the physical device ID for a given logical row and column index + // Returns the physical device ID for a given logical row and column index chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; - // Get the physical device IDs mapped to a MeshDevice + // Returns the physical device IDs mapped to a MeshDevice std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; }; diff --git a/tt_metal/common/mesh_coord.cpp b/tt_metal/common/mesh_coord.cpp index 19dab608c35..88f4309cd90 100644 --- a/tt_metal/common/mesh_coord.cpp +++ b/tt_metal/common/mesh_coord.cpp @@ -82,9 +82,12 @@ bool operator==(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { bool operator!=(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { return !(lhs == rhs); } std::ostream& operator<<(std::ostream& os, const MeshCoordinate& coord) { - os << "MeshCoordinate(" << coord.dims() << ", ["; - for (size_t dim : coord.coords()) { - os << dim << ", "; + os << "MeshCoordinate(["; + for (size_t i = 0; i < coord.dims(); ++i) { + if (i > 0) { + os << ", "; + } + os << coord[i]; } os << "])"; return os; diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 7190e8e3806..80535e32674 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -69,9 +69,7 @@ MeshDevice::ScopedDevices::ScopedDevices( size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, const MeshDeviceConfig& config) { - auto& 
system_mesh = SystemMesh::instance(); - auto physical_device_ids = system_mesh.request_available_devices(config); - + auto physical_device_ids = SystemMesh::instance().request_available_devices(config); opened_devices_ = tt::tt_metal::detail::CreateDevices( physical_device_ids, num_command_queues, l1_small_size, trace_region_size, dispatch_core_config); diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index b2eff3b89d2..10a20b6e433 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -8,6 +8,7 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" +#include "indestructible.hpp" #include "mesh_coord.hpp" #include "tt_cluster.hpp" @@ -29,8 +30,6 @@ class SystemMesh::Impl { const SimpleMeshShape& get_shape() const; std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; - - IDevice* get_device(const chip_id_t physical_device_id) const; chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; }; @@ -128,7 +127,7 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me auto line_length = config.mesh_shape.mesh_size(); for (const auto& logical_coordinate : MeshDeviceView::get_line_coordinates(line_length, shape_2d)) { - auto physical_device_id = logical_to_device_id_.at(logical_coordinate); + auto physical_device_id = get_physical_device_id(logical_coordinate); physical_device_ids.push_back(physical_device_id); log_debug( @@ -176,14 +175,9 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me MeshCoordinateRange system_range(system_offset, MeshCoordinate(end_coord)); for (const auto& system_coord : system_range) { - auto physical_device_id = logical_to_device_id_.find(system_coord); - TT_FATAL( - physical_device_id != logical_to_device_id_.end(), - "Logical coordinate: {} not found 
in SystemMesh of shape {}", - system_coord, - logical_mesh_shape_); - physical_device_ids.push_back(physical_device_id->second); - log_debug(LogMetal, "Logical coordinate: {}, Physical device ID: {}", system_coord, physical_device_id->second); + auto physical_device_id = get_physical_device_id(system_coord); + physical_device_ids.push_back(physical_device_id); + log_debug(LogMetal, "Logical coordinate: {}, Physical device ID: {}", system_coord, physical_device_id); } return physical_device_ids; } @@ -201,11 +195,11 @@ std::vector SystemMesh::Impl::request_available_devices(const MeshDev SystemMesh::SystemMesh() : pimpl_(std::make_unique()) {} SystemMesh& SystemMesh::instance() { - static SystemMesh instance; - if (!instance.pimpl_->is_system_mesh_initialized()) { - instance.pimpl_->initialize(); + static tt::stl::Indestructible instance; + if (!instance.get().pimpl_->is_system_mesh_initialized()) { + instance.get().pimpl_->initialize(); } - return instance; + return instance.get(); } chip_id_t SystemMesh::get_physical_device_id(const MeshCoordinate& coord) const { diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 8df3eb90854..4afa1b342a7 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -961,7 +961,10 @@ bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t t void Device::push_work(std::function work, bool blocking) { if (not this->initialized_) { - log_warning("Attempting to push work to Device {} which is not initialized. Ignoring...", this->id_); + if (!uninitialized_error_fired_) { + log_fatal("Attempting to push work to Device {} which is not initialized. 
Ignoring...", this->id_); + uninitialized_error_fired_ = true; + } return; } this->work_executor_.push_work(std::move(work), blocking); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index afa0a600254..b7cecc47732 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -23,7 +23,6 @@ #include "fmt/base.h" #include #include -#include #include #include "umd/device/types/arch.h" #include "umd/device/tt_cluster_descriptor.h" diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 1b54e3a1213..6f91b01300e 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -8,7 +8,6 @@ #include #include "metal_soc_descriptor.h" -#include "test_common.hpp" #include "tt_backend_api_types.hpp" #include "umd/device/device_api_metal.h" #include "umd/device/tt_cluster_descriptor.h" diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt new file mode 100644 index 00000000000..736e5ddcf76 --- /dev/null +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt @@ -0,0 +1,18 @@ +set(DISTRIBUTED_TRCE_AND_EVENTS ${CMAKE_CURRENT_SOURCE_DIR}/distributed_trace_and_events.cpp) +add_executable(distributed_trace_and_events ${DISTRIBUTED_TRCE_AND_EVENTS}) + +target_link_libraries( + distributed_trace_and_events + PUBLIC + tt_metal + pthread +) + +target_include_directories(distributed_trace_and_events PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + +set_target_properties( + distributed_trace_and_events + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/programming_examples/distributed +) diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp new file mode 100644 
index 00000000000..f64154f3c74 --- /dev/null +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp @@ -0,0 +1,285 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::tt_metal::distributed; + +// The following is an advanced programming example that demonstrates: +// +// 1. Initializing a MeshDevice with 2 MeshCommandQueues and a dedicated memory region to store MeshWorkload Traces +// 2. Loading a SubDevice configuration on a Virtual Mesh, and how this configuration gets replicated across all +// physical devices +// 3. Allocating MeshBuffers in the distributed memory space exposed by the Virtual Mesh, to shard data across physical +// devices +// 4. Constructing programs targeting different SubDevices +// 5. Constructing homogenous (same program dispatched to all physical devices) and heterogenous (different programs +// dispatched +// to physical different devices) MeshWorkloads from programs +// 6. Capturing the execution of MeshWorkloads inside a MeshTrace that gets loaded onto the Virtual Mesh +// 7. Performing IO and MeshTrace execution on different MeshCommandQueues and using MeshEvents for MeshCQ <--> MeshCQ +// synchronization + +std::shared_ptr EltwiseBinaryProgramGenerator( + const std::shared_ptr& src0_buf, + const std::shared_ptr& src1_buf, + const std::shared_ptr& output_buf, + const SubDevice& sub_device_for_program, + uint32_t num_tiles, + uint32_t single_tile_size, + uint32_t eltwise_op_index) { + // Program Generation helper function: Can be used to run addition, multiplication and subtraction + // on a SubDevice. + // Requires: + // 1. The src (input) and output buffers + // 2. The SubDevice being targeted + // 3. The number of tiles that must be processed by the op + // 4. 
The size of the tile in bytes + // The op specifier: Addition (0), Multiplication (1), Subtraction (2) + const std::vector op_id_to_op_define = {"add_tiles", "mul_tiles", "sub_tiles"}; + const std::vector op_id_to_op_type_define = { + "EltwiseBinaryType::ELWADD", "EltwiseBinaryType::ELWMUL", "EltwiseBinaryType::ELWSUB"}; + + const auto cores_for_program = sub_device_for_program.cores(HalProgrammableCoreType::TENSIX); + + std::shared_ptr program = std::make_shared(); + + uint32_t src0_cb_index = tt::CBIndex::c_0; + uint32_t num_input_tiles = 2; + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_src0_config); + + uint32_t src1_cb_index = tt::CBIndex::c_1; + tt_metal::CircularBufferConfig cb_src1_config = + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_src1_config); + + uint32_t output_cb_index = tt::CBIndex::c_16; + uint32_t num_output_tiles = 2; + tt_metal::CircularBufferConfig cb_output_config = + tt_metal::CircularBufferConfig( + num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(output_cb_index, single_tile_size); + auto cb_output = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_output_config); + + auto binary_reader_kernel = tt_metal::CreateKernel( + *program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp", + cores_for_program, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); + + auto unary_writer_kernel = tt_metal::CreateKernel( + *program, + 
"tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp", + cores_for_program, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); + + std::vector compute_kernel_args = {}; + + bool fp32_dest_acc_en = false; + bool math_approx_mode = false; + std::map binary_defines = { + {"ELTWISE_OP", op_id_to_op_define[eltwise_op_index]}, + {"ELTWISE_OP_TYPE", op_id_to_op_type_define[eltwise_op_index]}}; + auto eltwise_binary_kernel = tt_metal::CreateKernel( + *program, + "tt_metal/kernels/compute/eltwise_binary.cpp", + cores_for_program, + tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}); + + SetRuntimeArgs(*program, eltwise_binary_kernel, cores_for_program, {num_tiles, 1}); + + const std::array reader_args = { + src0_buf->address(), 0, num_tiles, src1_buf->address(), 0, num_tiles, 0}; + + const std::array writer_args = {output_buf->address(), 0, num_tiles}; + + SetRuntimeArgs(*program, unary_writer_kernel, cores_for_program, writer_args); + SetRuntimeArgs(*program, binary_reader_kernel, cores_for_program, reader_args); + + return program; +} + +int main(int argc, char** argv) { + using tt::constants::TILE_HEIGHT; + using tt::constants::TILE_WIDTH; + // Initialize constants used to define the workload + constexpr uint32_t ADD_OP_ID = 0; + constexpr uint32_t MULTIPLY_OP_ID = 1; + constexpr uint32_t SUBTRACT_OP_ID = 2; + // Create a 2x4 MeshDevice with 2 MeshCQs, 16MB allocated to the trace region and Ethernet Dispatch enabled + auto mesh_device = MeshDevice::create( + MeshDeviceConfig{.mesh_shape = MeshShape(2, 4)}, // Shape of MeshDevice + 0, // l1 small size + 16 << 20, // trace region size + 2, // num MeshCQs + DispatchCoreType::ETH /* Dispatch Configuration: 8 Chip Wormhole systems can only support 2 MeshCQs when Ethernet Dispatch is enabled */); + + // Initialize command queue ids used for data movement and workload dispatch + constexpr uint8_t 
data_movement_cq_id = 1; + constexpr uint8_t workload_cq_id = 0; + auto data_movement_cq = mesh_device->mesh_command_queue(data_movement_cq_id); + auto workload_cq = mesh_device->mesh_command_queue(workload_cq_id); + + // =========== Step 1: Initialize and load two SubDevices =========== + // Each SubDevice contains a single core. This SubDevice configuration is loaded on each physical device + // in the Virtual Mesh + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {0, 0}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(CoreRange({1, 1}, {1, 1}))}); + auto sub_device_manager = mesh_device->create_sub_device_manager( + {sub_device_1, sub_device_2}, 3200 /* size of L1 region allocated for the SubDevices */); + mesh_device->load_sub_device_manager(sub_device_manager); + + // =========== Step 2: Initialize IO Buffers and Workload parameters =========== + uint32_t single_tile_size = sizeof(bfloat16) * TILE_HEIGHT * TILE_WIDTH; // Using bfloat16 in this example + uint32_t num_tiles_per_device = 2048; // Number of tiles sent to each physical device + uint32_t num_tiles_in_mesh = + num_tiles_per_device * mesh_device->num_devices(); // The total number of tiles in the distributed memory space + + // Specify data layout in distributed memory space - Data will be sharded in row major order across the Virtual Mesh + tt::tt_metal::distributed::ShardedBufferConfig global_buffer_config{ + .global_size = single_tile_size * num_tiles_in_mesh, // Total size of the sharded buffer + .global_buffer_shape = + {num_tiles_in_mesh * TILE_WIDTH, TILE_HEIGHT}, // Data represents horizontally concatenated tiles + .shard_shape = {num_tiles_per_device * TILE_WIDTH, TILE_HEIGHT}, // Row major sharding + .shard_orientation = ShardOrientation::ROW_MAJOR // Row major sharding + }; + // Specify data layout on a single physical device + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size, + .buffer_type = tt_metal::BufferType::DRAM, + 
.buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = true}; + // Allocate buffers in distributed memory space for first MeshWorkload + auto add_src0_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto add_src1_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto add_output_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + // Allocate buffers in distributed memory space for second MeshWorkload + auto mul_sub_src0_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto mul_sub_src1_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto mul_sub_output_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + + // =========== Step 3: Create Workloads to run on the Virtual Mesh =========== + // Specify Device Ranges on which the Workloads will run + LogicalDeviceRange all_devices({0, 0}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + LogicalDeviceRange top_row({0, 0}, {mesh_device->num_cols() - 1, 0}); + LogicalDeviceRange bottom_row( + {0, mesh_device->num_rows() - 1}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + // Create three eltwise binary ops using a simple program generation function + auto add_program = EltwiseBinaryProgramGenerator( + add_src0_buf, + add_src1_buf, + add_output_buf, + sub_device_1, // Addition runs on the first SubDevice + num_tiles_per_device, + single_tile_size, + ADD_OP_ID); + auto multiply_program = EltwiseBinaryProgramGenerator( + mul_sub_src0_buf, + mul_sub_src1_buf, + mul_sub_output_buf, + sub_device_2, // Multiplication runs on the second SubDevice + num_tiles_per_device, + single_tile_size, + MULTIPLY_OP_ID); + auto subtract_program = EltwiseBinaryProgramGenerator( + mul_sub_src0_buf, + mul_sub_src1_buf, + mul_sub_output_buf, + 
sub_device_2, // Subtraction runs on the second SubDevice + num_tiles_per_device, + single_tile_size, + SUBTRACT_OP_ID); + // Create MeshWorkloads and add programs to them. A MeshWorkload allows a program to target + // multiple Physical Devices in the Virtual Mesh. + auto add_mesh_workload = CreateMeshWorkload(); + auto multiply_and_subtract_mesh_workload = CreateMeshWorkload(); + AddProgramToMeshWorkload( + add_mesh_workload, *add_program, all_devices); // Addition runs on the full grid (sub_device 1) + AddProgramToMeshWorkload( + multiply_and_subtract_mesh_workload, + *multiply_program, + top_row); // Multiplication runs on the top row (sub_device 2) + AddProgramToMeshWorkload( + multiply_and_subtract_mesh_workload, + *subtract_program, + bottom_row); // Subtraction runs on the bottom row (sub device 2) + + // =========== Step 4: Compile and Load Workloads on the Mesh =========== + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), add_mesh_workload, true); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), multiply_and_subtract_mesh_workload, true); + // =========== Step 5: Trace the MeshWorkloads using the Workload Dispatch CQ =========== + auto trace_id = BeginTraceCapture(mesh_device.get(), workload_cq_id); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), add_mesh_workload, false); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), multiply_and_subtract_mesh_workload, false); + EndTraceCapture(mesh_device.get(), workload_cq_id, trace_id); + + // =========== Step 6: Populate inputs =========== + uint32_t workload_0_src0_val = 2; + uint32_t workload_0_src1_val = 3; + uint32_t workload_1_src0_val = 7; + uint32_t workload_1_src1_val = 5; + // Uniform values passed to the add operation + std::vector add_src0_vec = create_constant_vector_of_bfloat16(add_src0_buf->size(), workload_0_src0_val); + std::vector add_src1_vec = create_constant_vector_of_bfloat16(add_src1_buf->size(), workload_0_src1_val); + // Uniform values passed to the multiply 
and subtract operations (the top row runs multiplication with subtraction + // on the bottom row of the Virtual Mesh) + std::vector mul_sub_src0_vec = + create_constant_vector_of_bfloat16(mul_sub_src0_buf->size(), workload_1_src0_val); + std::vector mul_sub_src1_vec = + create_constant_vector_of_bfloat16(mul_sub_src1_buf->size(), workload_1_src1_val); + + // =========== Step 7: Write inputs on MeshCQ1 =========== + // IO is done through MeshCQ1 and Workload dispatch is done through MeshCQ0. Use MeshEvents to synchronize the + // independent MeshCQs. + std::shared_ptr write_event = std::make_shared(); + std::shared_ptr trace_event = std::make_shared(); + + EnqueueWriteMeshBuffer(data_movement_cq, add_src0_buf, add_src0_vec); + EnqueueWriteMeshBuffer(data_movement_cq, add_src1_buf, add_src1_vec); + EnqueueWriteMeshBuffer(data_movement_cq, mul_sub_src0_buf, mul_sub_src0_vec); + EnqueueWriteMeshBuffer(data_movement_cq, mul_sub_src1_buf, mul_sub_src1_vec); + // Synchronize + EnqueueRecordEvent(data_movement_cq, write_event); + EnqueueWaitForEvent(workload_cq, write_event); + // =========== Step 8: Run MeshTrace on MeshCQ0 =========== + ReplayTrace(mesh_device.get(), workload_cq_id, trace_id, false); + // Synchronize + EnqueueRecordEvent(workload_cq, trace_event); + EnqueueWaitForEvent(data_movement_cq, trace_event); + // =========== Step 9: Read Outputs on MeshCQ1 =========== + std::vector add_dst_vec = {}; + std::vector mul_sub_dst_vec = {}; + EnqueueReadMeshBuffer(data_movement_cq, add_dst_vec, add_output_buf); + EnqueueReadMeshBuffer(data_movement_cq, mul_sub_dst_vec, mul_sub_output_buf); + + // =========== Step 10: Verify Outputs =========== + bool pass = true; + for (int i = 0; i < add_dst_vec.size(); i++) { + pass &= (add_dst_vec[i].to_float() == workload_0_src0_val + workload_0_src1_val); + } + for (int i = 0; i < mul_sub_dst_vec.size(); i++) { + if (i < mul_sub_dst_vec.size() / 2) { + pass &= (mul_sub_dst_vec[i].to_float() == workload_1_src0_val * 
workload_1_src1_val); + } else { + pass &= (mul_sub_dst_vec[i].to_float() == workload_1_src0_val - workload_1_src1_val); + } + } + ReleaseTrace(mesh_device.get(), trace_id); + if (pass) { + std::cout << "Running EltwiseBinary MeshTraces on 2 MeshCQs Passed!" << std::endl; + return 0; + } else { + std::cout << "Running EltwiseBinary MeshTraces on 2 MeshCQs Failed with Incorrect Outputs!" << std::endl; + return 1; + } +} diff --git a/tt_metal/programming_examples/distributed/CMakeLists.txt b/tt_metal/programming_examples/distributed/CMakeLists.txt index e887109662d..7dcd7fc8583 100644 --- a/tt_metal/programming_examples/distributed/CMakeLists.txt +++ b/tt_metal/programming_examples/distributed/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(1_distributed_program_dispatch) add_subdirectory(2_distributed_buffer_rw) add_subdirectory(3_distributed_eltwise_add) +add_subdirectory(4_distributed_trace_and_events) diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index fd6bae7f5ee..aa8fd3f04f0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -28,7 +28,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( uint32_t payload_size_bytes) { const size_t payload_l1_address = l1_read_addr; - auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); + auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); #ifdef DEBUG_PRINT_ENABLED pkt_hdr->reserved2 = my_chip_id; #endif @@ -44,7 +44,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); - 
fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { noc_async_write( @@ -57,7 +57,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -67,7 +67,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } } break; default: { @@ -87,8 +87,8 @@ FORCE_INLINE void write_payload_then_advance_read_address( size_t& l1_read_addr, size_t payload_size_bytes) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: [[fallthrough]]; diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp index bb62676afbf..172222d7abf 100644 --- 
a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp @@ -162,7 +162,7 @@ void kernel_main() { for (uint32_t p = 0; p < command_tensor.worker_pages_per_slice; p += packet_size_in_pages) { cb_reserve_back(cb_id, packet_size_in_pages); const uint32_t local_l1_scratch_buffer_address = - get_write_ptr(cb_id) + sizeof(tt::fabric::PacketHeader); + get_write_ptr(cb_id) + sizeof(PACKET_HEADER_TYPE); uint32_t n_pages = std::min(packet_size_in_pages, command_tensor.worker_pages_per_slice - p); ASSERT(command_tensor.worker_start_offset_in_slice.w == 0); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 731ed70359e..8107d2d992e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -437,7 +437,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) ASSERT(cmd_ctx.core_desc_type == ttnn::ccl::cmd::CclCommandCoreDescriptorType::NOC_XY); ASSERT(cmd_ctx.packet_header_buffer_addr != 0); - auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); + auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0); if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { @@ -457,7 +457,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) : cmd_ctx.fabric_connection.get_backward_connection(); fabric_connection.wait_for_empty_write_slot(); fabric_connection.send_payload_flush_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } break; case 
ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { write_local = true; @@ -467,7 +467,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) 1, static_cast(mcast_args.num_targets_forward_direction)}); cmd_ctx.fabric_connection.get_forward_connection().wait_for_empty_write_slot(); cmd_ctx.fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) @@ -476,7 +476,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) 1, static_cast(mcast_args.num_targets_backward_direction)}); cmd_ctx.fabric_connection.get_backward_connection().wait_for_empty_write_slot(); cmd_ctx.fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } } break; @@ -559,7 +559,7 @@ void write_and_advance_local_read_address_for_fabric_write( uint32_t payload_size_bytes) { const size_t payload_l1_address = l1_read_addr; - auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); + auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); @@ -573,7 +573,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); - fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { const auto [dest_noc_xy, dest_addr] = 
get_noc_address_components(noc0_dest_noc_addr); @@ -588,7 +588,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -598,7 +598,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } } break; default: { @@ -618,8 +618,8 @@ FORCE_INLINE void write_payload_then_advance_read_address( size_t& l1_read_addr, size_t payload_size_bytes) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(PACKET_HEADER_TYPE) is not a power of two which violates the below assertion"); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: [[fallthrough]]; @@ -933,7 +933,7 @@ void kernel_main() { cb_reserve_back(reserved_packet_header_cb_id, num_packet_headers_storable); auto packet_header_buffer_addr0 = get_write_ptr(reserved_packet_header_cb_id); auto packet_header_buffer_addr1 = - packet_header_buffer_addr0 + (num_packet_headers_storable >> 2) * sizeof(tt::fabric::PacketHeader); + packet_header_buffer_addr0 + (num_packet_headers_storable >> 2) * sizeof(PACKET_HEADER_TYPE); auto operand_0_cmd_ctx = command_context_t( 
fabric_connection, diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp index 904cd775a9a..decb79c8070 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp @@ -96,7 +96,7 @@ void mcast_contig_pages_to_noc_address( size_t backward_direction_num_hops) { const size_t payload_size_bytes = contig_pages_advanced * payload_page_size; const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_addr); - const size_t payload_l1_address = l1_read_addr + sizeof(tt::fabric::PacketHeader); + const size_t payload_l1_address = l1_read_addr + sizeof(PACKET_HEADER_TYPE); // Local chip write noc_async_write( @@ -106,15 +106,15 @@ void mcast_contig_pages_to_noc_address( // coords it is necessary get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr, noc_index), payload_size_bytes); - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); + size_t packet_send_size_bytes = payload_size_bytes + sizeof(PACKET_HEADER_TYPE); // Forward fabric connection if (has_forward_fabric_connection) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); - auto& pkt_hdr = *reinterpret_cast(l1_read_addr); + auto& pkt_hdr = *reinterpret_cast(l1_read_addr); pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(forward_direction_num_hops)}) @@ -125,7 +125,7 @@ void mcast_contig_pages_to_noc_address( // Backward fabric connection if (has_backward_fabric_connection) { - auto& pkt_hdr = *reinterpret_cast(l1_read_addr); + auto& pkt_hdr = *reinterpret_cast(l1_read_addr); 
pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(backward_direction_num_hops)}) @@ -286,11 +286,11 @@ void mcast_sync_signal_to_addr( size_t remote_sem_l1_addr, size_t directional_num_hops) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); - ASSERT((pkt_addr & (sizeof(tt::fabric::PacketHeader) - 1)) == 0); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); + ASSERT((pkt_addr & (sizeof(PACKET_HEADER_TYPE) - 1)) == 0); - auto& pkt_hdr = *reinterpret_cast(pkt_addr); + auto& pkt_hdr = *reinterpret_cast(pkt_addr); pkt_hdr .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{1, static_cast(directional_num_hops)}) .to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp index 71865c224e5..766cdd0b688 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp @@ -125,7 +125,7 @@ void kernel_main() { // out when we start enabling other modes const size_t packet_size_in_pages = get_arg_val(arg_idx++); const size_t payload_page_size = get_arg_val(arg_idx++); - const size_t l1_scratch_page_size = payload_page_size + sizeof(tt::fabric::PacketHeader); + const size_t l1_scratch_page_size = payload_page_size + sizeof(PACKET_HEADER_TYPE); const size_t forward_direction_num_hops = get_arg_val(arg_idx++); const size_t backward_direction_num_hops = get_arg_val(arg_idx++); const bool has_forward_fabric_connection = get_arg_val(arg_idx++) != 0; @@ -248,7 +248,7 @@ void kernel_main() { DPRINT << "ccl_send_writer Sending payload completion sync signals\n"; 
ASSERT(some_buffering_addr != 0); some_buffering_addr = - (some_buffering_addr + (sizeof(tt::fabric::PacketHeader))) & ~(sizeof(tt::fabric::PacketHeader) - 1); + (some_buffering_addr + (sizeof(PACKET_HEADER_TYPE))) & ~(sizeof(PACKET_HEADER_TYPE) - 1); mcast_sync_signal_to_addr( some_buffering_addr, diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp index 564ed163999..87ba5ea5fba 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp @@ -326,7 +326,7 @@ struct WorkerToFabricEdmSenderImpl { FORCE_INLINE void send_packet_header_and_notify_fabric(uint32_t source_address) { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); - send_chunk_from_address(source_address, 1, sizeof(tt::fabric::PacketHeader), buffer_address); + send_chunk_from_address(source_address, 1, sizeof(PACKET_HEADER_TYPE), buffer_address); post_send_payload_increment_pointers(); } @@ -335,23 +335,23 @@ struct WorkerToFabricEdmSenderImpl { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); // skip past the first part of the buffer which will be occupied by the packet header - send_chunk_from_address(source_address, 1, size_bytes, buffer_address + sizeof(tt::fabric::PacketHeader)); + send_chunk_from_address(source_address, 1, size_bytes, buffer_address + sizeof(PACKET_HEADER_TYPE)); } template FORCE_INLINE void send_payload_from_address_impl(uint32_t source_address, size_t size_bytes) { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); ASSERT(size_bytes <= this->buffer_size_bytes); - ASSERT(tt::fabric::is_valid(*const_cast( - reinterpret_cast(source_address)))); + ASSERT(tt::fabric::is_valid(*const_cast( + reinterpret_cast(source_address)))); send_chunk_from_address(source_address, 1, size_bytes, 
buffer_address); post_send_payload_increment_pointers(); } template FORCE_INLINE void send_payload_from_address_with_trid_impl(uint32_t source_address, size_t size_bytes, uint8_t trid) { ASSERT(size_bytes <= this->buffer_size_bytes); - ASSERT(tt::fabric::is_valid(*const_cast( - reinterpret_cast(source_address)))); + ASSERT(tt::fabric::is_valid(*const_cast( + reinterpret_cast(source_address)))); send_chunk_from_address_with_trid(source_address, 1, size_bytes, this->edm_buffer_addr, trid, this->edm_noc_cmd_buf); post_send_payload_increment_pointers(); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index af3c53f27b5..468777220e8 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -89,11 +89,11 @@ struct NocMulticastAtomicIncCommandHeader { uint8_t size_x; uint8_t size_y; }; -static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 8 bytes"); +static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 8 bytes"); +static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 16 bytes"); +static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 16 
bytes"); +static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 12 bytes"); union NocCommandFields{ NocUnicastCommandHeader unicast_write; NocUnicastInlineWriteCommandHeader unicast_inline_write; @@ -104,116 +104,107 @@ union NocCommandFields{ static_assert(sizeof(NocCommandFields) <= 16, "CommandFields size is not 16 bytes"); // TODO: wrap this in a debug version that holds type info so we can assert for field/command/ -struct PacketHeader { +template +struct PacketHeaderBase { + NocCommandFields command_fields; // size = 16B due to uint64_t alignment + uint16_t payload_size_bytes; // TODO: trim this down noc_send_type 2 bits (4 values): // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc // For now, kept it separate so I could do reads which would be handled differently // but for our purposes we shouldn't need read so we should be able to omit the support NocSendType noc_send_type : 3; + // ChipSendType only used by PacketHeader, but keep here for now for bit-fields ChipSendType chip_send_type : 1; - // Used only by the EDM sender and receiver channels. Populated by EDM sender channel to // indicate to the receiver channel what channel was the source of this packet. Reserved // otherwise. uint8_t src_ch_id : 4; - RoutingFields routing_fields; - uint16_t payload_size_bytes; // excludes header size - NocCommandFields command_fields; // size = 16B due to uint64_t alignment - - // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned - // To simplify worker kernel code, we for now decide to pad up the packet header - // to 32B so the user can simplify shift into their CB chunk by sizeof(tt::fabric::PacketHeader) - // and automatically work around the DRAM read alignment bug. - // - // Future changes will remove this padding and require the worker kernel to be aware of this bug - // and pad their own CBs conditionally when reading from DRAM. 
It'll be up to the users to - // manage this complexity. - uint32_t padding0; - uint32_t padding1; - - inline void set_chip_send_type(ChipSendType &type) { this->chip_send_type = type; } - inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } - inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } - inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } - // Returns size of payload in bytes - TODO: convert to words (4B) size_t get_payload_size_excluding_header() volatile const { return this->payload_size_bytes; } + inline size_t get_payload_size_including_header() volatile const { - return get_payload_size_excluding_header() + sizeof(PacketHeader); + return get_payload_size_excluding_header() + sizeof(Derived); } - inline PacketHeader &to_chip_unicast(uint8_t distance_in_hops) { - this->chip_send_type = CHIP_UNICAST; - this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; - return *this; + // Setters for noc_send_type, routing_fields, and command_fields + inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } + inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + + inline Derived &to_chip_unicast(uint8_t distance_in_hops) { + static_cast(this)->to_chip_unicast_impl(distance_in_hops); + return *static_cast(this); } - inline PacketHeader &to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) { - this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.value = ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); - return *this; + + inline Derived &to_chip_multicast(MulticastRoutingCommandHeader const &mcast_routing_command_header) { + static_cast(this)->to_chip_multicast_impl(mcast_routing_command_header); + return 
*static_cast(this); } - inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { + inline Derived &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write = noc_unicast_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { + + inline Derived &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { this->noc_send_type = NOC_UNICAST_INLINE_WRITE; this->command_fields.unicast_inline_write = noc_unicast_command_header; this->payload_size_bytes = 0; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { + + inline Derived &to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write = noc_multicast_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { + + inline Derived &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; this->payload_size_bytes = 0; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { 
- #if defined(KERNEL_BUILD) || defined(FW_BUILD) - ASSERT(false); - while (1) {}; - #endif + + inline Derived &to_noc_multicast_atomic_inc( + NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) { + this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; + this->command_fields.mcast_seminc = noc_multicast_atomic_inc_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline volatile PacketHeader *to_chip_unicast(uint8_t distance_in_hops) volatile { - this->chip_send_type = CHIP_UNICAST; - this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; - return this; + inline volatile Derived* to_chip_unicast(uint8_t distance_in_hops) volatile { + static_cast(this)->to_chip_unicast_impl(distance_in_hops); + return static_cast(this); } - inline volatile PacketHeader *to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile { - this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.value = (static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH) | chip_multicast_command_header.start_distance_in_hops; - return this; + + inline volatile Derived* to_chip_multicast(MulticastRoutingCommandHeader const &mcast_routing_command_header) volatile { + static_cast(this)->to_chip_multicast_impl(mcast_routing_command_header); + return static_cast(this); } - inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { + + inline volatile Derived* to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; this->payload_size_bytes = payload_size_bytes; - - return this; + return 
static_cast(this); } - inline volatile PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { + + inline volatile Derived* to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { this->noc_send_type = NOC_UNICAST_INLINE_WRITE; this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; this->payload_size_bytes = 0; - return *this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { + + inline volatile Derived* to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; @@ -221,20 +212,19 @@ struct PacketHeader { this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; this->payload_size_bytes = payload_size_bytes; this->command_fields.mcast_write.address = noc_multicast_command_header.address; - - return this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_unicast_atomic_inc( - NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { + + inline volatile Derived* to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; 
this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; this->payload_size_bytes = 0; - - return this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_multicast_atomic_inc( + + inline volatile Derived *to_noc_multicast_atomic_inc( NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; @@ -245,17 +235,145 @@ struct PacketHeader { this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; this->payload_size_bytes = payload_size_bytes; + return static_cast(this); + } - return this; + inline void set_src_ch_id(uint8_t ch_id) volatile { + this->src_ch_id = ch_id; } - inline void set_src_ch_id(uint8_t ch_id) volatile { this->src_ch_id = ch_id; } }; +struct PacketHeader : public PacketHeaderBase { + RoutingFields routing_fields; + // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned + // To simplify worker kernel code, we for now decide to pad up the packet header + // to 32B so the user can simplify shift into their CB chunk by sizeof(tt::fabric::PacketHeader) + // and automatically work around the DRAM read alignment bug. + // + // Future changes will remove this padding and require the worker kernel to be aware of this bug + // and pad their own CBs conditionally when reading from DRAM. It'll be up to the users to + // manage this complexity. 
+ uint32_t padding0; + uint32_t padding1; + + private: + + inline static uint32_t calculate_chip_unicast_routing_fields_value(uint8_t distance_in_hops) { + return RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; + } + inline static uint32_t calculate_chip_multicast_routing_fields_value( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + return ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); + } + + public: + + // Setters for PacketHeader-specific fields + inline void set_chip_send_type(ChipSendType &type) { this->chip_send_type = type; } + + inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) { + this->chip_send_type = CHIP_UNICAST; + this->routing_fields.value = PacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl(MulticastRoutingCommandHeader const &chip_multicast_command_header) { + this->chip_send_type = CHIP_MULTICAST; + this->routing_fields.value = PacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) volatile { + this->chip_send_type = CHIP_UNICAST; + this->routing_fields.value = PacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile{ + this->chip_send_type = CHIP_MULTICAST; + this->routing_fields.value = PacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } +}; + +struct LowLatencyRoutingFields { + static constexpr uint32_t FIELD_WIDTH = 2; + static constexpr uint32_t FIELD_MASK = 0b11; + static constexpr uint32_t NOOP = 0b00; + static constexpr uint32_t 
WRITE_ONLY = 0b01; + static constexpr uint32_t FORWARD_ONLY = 0b10; + static constexpr uint32_t WRITE_AND_FORWARD = 0b11; + static constexpr uint32_t FWD_ONLY_FIELD = 0xAAAAAAAA; + static constexpr uint32_t WR_AND_FWD_FIELD = 0xFFFFFFFF; + uint32_t value; +}; + +struct LowLatencyPacketHeader : public PacketHeaderBase { + uint8_t padding0; + LowLatencyRoutingFields routing_fields; + uint32_t padding1; + + private: + + inline static uint32_t calculate_chip_unicast_routing_fields_value(uint8_t distance_in_hops) { + // Example of unicast 3 hops away + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Last line will do 0b01 << 4 = 0b010000. This means that on the 3rd chip, we will write only + // Together this means the final encoding is 0b011010 + return + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WRITE_ONLY << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH); + } + inline static uint32_t calculate_chip_multicast_routing_fields_value( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + // Example of starting 3 hops away mcasting to 2 chips + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Second line will do 0xFFFFFFFF & 0b11 = 0b11. 0b11 << 4 = 0b110000. This means starting from the 3rd chip, we will write and forward once + // Last line will do 0b01 << 6 = 0b01000000. 
This means that on the 5th chip, we will write only + // Together this means the final encoding is 0b01111010 + return + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WR_AND_FWD_FIELD & ((1 << (chip_multicast_command_header.range_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1) << + ((chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH)) | + (LowLatencyRoutingFields::WRITE_ONLY << (chip_multicast_command_header.start_distance_in_hops + chip_multicast_command_header.range_hops - 2) * LowLatencyRoutingFields::FIELD_WIDTH); + } + + public: + + // Specialized implementations for LowLatencyPacketHeader + inline void set_routing_fields(LowLatencyRoutingFields &fields) { + this->routing_fields = fields; + } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) volatile { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl( + const MulticastRoutingCommandHeader& chip_multicast_command_header) volatile { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } +}; // TODO: When we remove the 32B padding requirement, reduce to 16B size check static_assert(sizeof(PacketHeader) == 32, "sizeof(PacketHeader) is not equal to 32B"); +// Host code still hardcoded to 
sizeof(PacketHeader) so we need to keep this check +static_assert(sizeof(LowLatencyPacketHeader) == sizeof(PacketHeader), "sizeof(LowLatencyPacketHeader) is not equal to 32B"); static constexpr size_t header_size_bytes = sizeof(PacketHeader); +#define FABRIC_LOW_LATENCY_MODE 1 + +#if defined FABRIC_LOW_LATENCY_MODE and FABRIC_LOW_LATENCY_MODE == 1 +#define PACKET_HEADER_TYPE tt::fabric::LowLatencyPacketHeader +#define ROUTING_FIELDS_TYPE tt::fabric::LowLatencyRoutingFields +#else +#define PACKET_HEADER_TYPE tt::fabric::PacketHeader +#define ROUTING_FIELDS_TYPE tt::fabric::RoutingFields +#endif + } // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp index 2589c8f526a..a284320d4d1 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp @@ -16,4 +16,9 @@ FORCE_INLINE bool is_valid(PacketHeader const& packet_header) { return (packet_header.chip_send_type <= CHIP_SEND_TYPE_LAST) && (packet_header.noc_send_type <= NOC_SEND_TYPE_LAST); } +FORCE_INLINE void validate(const LowLatencyPacketHeader& packet_header) {} +FORCE_INLINE bool is_valid(const LowLatencyPacketHeader& packet_header) { + return (packet_header.noc_send_type <= NOC_SEND_TYPE_LAST); +} + } // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 85553bf6dab..5e8f59954c2 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -32,7 +32,13 @@ FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader #endif } 
-FORCE_INLINE void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { +FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::LowLatencyPacketHeader *const packet_start) { + #ifdef DEBUG_PRINT_ENABLED + DPRINT << "ROUTE:" << packet_start->routing_fields.value << "\n"; + #endif +} + +FORCE_INLINE void print_pkt_header_noc_fields(volatile PACKET_HEADER_TYPE *const packet_start) { #ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { @@ -62,12 +68,23 @@ FORCE_INLINE void print_pkt_header(volatile tt::fabric::PacketHeader *const pack #endif } +FORCE_INLINE void print_pkt_header(volatile tt::fabric::LowLatencyPacketHeader *const packet_start) { + #ifdef DEBUG_PRINT_ENABLED + auto const& header = *packet_start; + DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << + ", src_chip:" << (uint32_t) packet_start->src_ch_id << + ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; + print_pkt_hdr_routing_fields(packet_start); + print_pkt_header_noc_fields(packet_start); + #endif + } + // Since we unicast to local, we must omit the packet header FORCE_INLINE void execute_chip_unicast_to_local_chip( - volatile tt::fabric::PacketHeader *const packet_start, uint16_t payload_size_bytes, uint32_t transaction_id) { + volatile PACKET_HEADER_TYPE *const packet_start, uint16_t payload_size_bytes, uint32_t transaction_id) { auto const& header = *packet_start; - uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); + uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(PACKET_HEADER_TYPE); tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; switch (noc_send_type) { @@ -116,6 +133,10 @@ FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketH packet_header->routing_fields.value = cached_routing_fields.value - decrement_val; } 
+FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::LowLatencyPacketHeader * packet_header, tt::fabric::LowLatencyRoutingFields cached_routing_fields) { + packet_header->routing_fields.value >>= tt::fabric::LowLatencyRoutingFields::FIELD_WIDTH; +} + // This function forwards a packet to the downstream EDM channel for eventual sending // to the next chip in the line/ring // @@ -127,9 +148,9 @@ FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketH // !!!WARNING!!! template FORCE_INLINE void forward_payload_to_downstream_edm( - volatile tt::fabric::PacketHeader *packet_header, + volatile PACKET_HEADER_TYPE *packet_header, uint16_t payload_size_bytes, - tt::fabric::RoutingFields cached_routing_fields, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id ) { @@ -141,6 +162,6 @@ FORCE_INLINE void forward_payload_to_downstream_edm( update_packet_header_for_next_hop(packet_header, cached_routing_fields); downstream_edm_interface.send_payload_non_blocking_from_address_with_trid( reinterpret_cast(packet_header), - payload_size_bytes + sizeof(tt::fabric::PacketHeader), + payload_size_bytes + sizeof(PACKET_HEADER_TYPE), transaction_id); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index be1ec45d50d..f80505d936d 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -433,18 +433,18 @@ struct ReceiverChannelPointers { }; struct PacketHeaderRecorder { - volatile tt::fabric::PacketHeader *buffer_ptr; + volatile uint32_t *buffer_ptr; size_t buffer_n_headers; size_t buffer_index; - PacketHeaderRecorder(volatile tt::fabric::PacketHeader *buffer_ptr, size_t buffer_n_headers) : buffer_ptr(buffer_ptr), 
buffer_n_headers(buffer_n_headers), buffer_index(0) {} + PacketHeaderRecorder(volatile uint32_t *buffer_ptr, size_t buffer_n_headers) : buffer_ptr(buffer_ptr), buffer_n_headers(buffer_n_headers), buffer_index(0) {} - void record_packet_header(volatile tt::fabric::PacketHeader *packet_header_ptr) { - uint32_t dest_l1_addr = (uint32_t)buffer_ptr + buffer_index * sizeof(tt::fabric::PacketHeader); + void record_packet_header(volatile uint32_t *packet_header_ptr) { + uint32_t dest_l1_addr = (uint32_t)buffer_ptr + buffer_index * sizeof(PACKET_HEADER_TYPE); noc_async_write( (uint32_t)packet_header_ptr, get_noc_addr(my_x[0], my_y[0], dest_l1_addr), - sizeof(tt::fabric::PacketHeader), + sizeof(PACKET_HEADER_TYPE), 1 - noc_index // avoid the contention on main noc ); buffer_index++; @@ -541,8 +541,8 @@ FORCE_INLINE void send_next_data( // NOTE: if we always send full packet, then we don't need the second branch below dedicated for // channel sync auto volatile *pkt_header = - reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); - ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); + reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); + ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); size_t payload_size_bytes = pkt_header->get_payload_size_including_header(); pkt_header->src_ch_id = sender_channel_index; @@ -582,7 +582,7 @@ FORCE_INLINE void receiver_send_received_ack( // Set the acknowledgement bits. 
We have a different location than the auto receiver_buffer_index = receiver_channel_ptr.get_buffer_index(); - auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); + auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); const auto src_id = pkt_header->src_ch_id; remote_update_ptr_val(to_sender_packets_acked_streams[src_id], 1); } @@ -597,7 +597,7 @@ FORCE_INLINE void receiver_send_completion_ack( auto receiver_buffer_index = receiver_channel_ptr.get_buffer_index(); - auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); + auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); const auto src_id = pkt_header->src_ch_id; remote_update_ptr_val(to_sender_packets_completed_streams[src_id], 1); receiver_channel_ptr.increment(); @@ -607,11 +607,16 @@ FORCE_INLINE void receiver_send_completion_ack( template FORCE_INLINE bool can_forward_packet_completely( - tt::fabric::RoutingFields cached_routing_fields, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender& downstream_edm_interface) { // We always check if it is the terminal mcast packet value. We can do this because all unicast packets have the // mcast terminal value masked in to the routing field. This simplifies the check here to a single compare. 
- bool deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + bool deliver_locally_only; + if constexpr (std::is_same_v) { + deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + } else if constexpr (std::is_same_v) { + deliver_locally_only = (cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK) == tt::fabric::LowLatencyRoutingFields::WRITE_ONLY; + } return deliver_locally_only || downstream_edm_interface.edm_has_space_for_packet(); } @@ -619,19 +624,39 @@ FORCE_INLINE bool can_forward_packet_completely( template FORCE_INLINE void receiver_forward_packet( // TODO: have a separate cached copy of the packet header to save some additional L1 loads - volatile tt::fabric::PacketHeader *packet_start, - tt::fabric::RoutingFields cached_routing_fields, + volatile PACKET_HEADER_TYPE *packet_start, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id) { - bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; - uint16_t payload_size_bytes = packet_start->payload_size_bytes; - if (start_distance_is_terminal_value) { - execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); - } - bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; - if (not_last_destination_device) { - forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + if constexpr (std::is_same_v) { + // If the packet is a terminal packet, then we can just deliver it locally + bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + uint16_t 
payload_size_bytes = packet_start->payload_size_bytes; + if (start_distance_is_terminal_value) { + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + } + bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; + if (not_last_destination_device) { + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + } + } else if constexpr (std::is_same_v) { + uint32_t routing = cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK; + uint16_t payload_size_bytes = packet_start->payload_size_bytes; + switch (routing) { + case tt::fabric::LowLatencyRoutingFields::WRITE_ONLY: + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + break; + case tt::fabric::LowLatencyRoutingFields::FORWARD_ONLY: + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + break; + case tt::fabric::LowLatencyRoutingFields::WRITE_AND_FORWARD: + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + break; + default: + ASSERT(false); + } } } @@ -663,10 +688,10 @@ FORCE_INLINE bool run_sender_channel_step( bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); if (!sender_backpressured_from_sender_side) { did_something = true; - auto packet_header = reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); + auto packet_header = 
reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); if constexpr (enable_packet_header_recording) { tt::fabric::validate(*packet_header); - packet_header_recorder.record_packet_header(packet_header); + packet_header_recorder.record_packet_header(reinterpret_cast(packet_header)); } send_next_data( local_sender_channel, @@ -780,9 +805,9 @@ FORCE_INLINE void run_receiver_channel_step( bool unwritten_packets = !wr_sent_ptr.is_caught_up_to(ack_ptr); if (unwritten_packets) { auto receiver_buffer_index = wr_sent_ptr.get_buffer_index(); - volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); + volatile auto packet_header = local_receiver_channel.template get_packet_header(receiver_buffer_index); - tt::fabric::RoutingFields cached_routing_fields = const_cast(packet_header)->routing_fields; + ROUTING_FIELDS_TYPE cached_routing_fields = const_cast(packet_header)->routing_fields; bool can_send_to_all_local_chip_receivers = can_forward_packet_completely( cached_routing_fields, downstream_edm_interface); @@ -1054,14 +1079,14 @@ void kernel_main() { std::array sender_channel_packet_recorders{ PacketHeaderRecorder( - reinterpret_cast(sender_0_completed_packet_header_cb_address), + reinterpret_cast(sender_0_completed_packet_header_cb_address), sender_0_completed_packet_header_cb_size_headers), PacketHeaderRecorder( - reinterpret_cast(sender_1_completed_packet_header_cb_address), + reinterpret_cast(sender_1_completed_packet_header_cb_address), sender_1_completed_packet_header_cb_size_headers) }; PacketHeaderRecorder receiver_channel_packet_recorder( - reinterpret_cast(receiver_completed_packet_header_cb_address), + reinterpret_cast(receiver_completed_packet_header_cb_address), receiver_completed_packet_header_cb_size_headers); static_assert(SENDER_NUM_BUFFERS > 0, "compile time argument [1]: SENDER_NUM_BUFFERS must be > 0"); @@ -1178,14 +1203,14 @@ void kernel_main() 
{ auto local_receiver_channel = tt::fabric::EthChannelBuffer( local_receiver_channel_buffer_address, channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), eth_transaction_ack_word_addr, // Assume for receiver channel, this address points to a chunk of memory that // can fit 2 eth_channel_syncs cfor ack receiver_channel_id); auto remote_receiver_channel = tt::fabric::EthChannelBuffer( remote_receiver_channel_buffer_address, channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), eth_transaction_ack_word_addr, // Assume for receiver channel, this address points to a chunk of memory that // can fit 2 eth_channel_syncs cfor ack receiver_channel_id); @@ -1196,13 +1221,13 @@ void kernel_main() { new (&local_sender_channels[i]) tt::fabric::EthChannelBuffer( local_sender_buffer_addresses[i], channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), 0, // For sender channels there is no eth_transaction_ack_word_addr because they don't send acks i); new (&remote_sender_channels[i]) tt::fabric::EthChannelBuffer( remote_sender_buffer_addresses[i], channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), 0, // For sender channels there is no eth_transaction_ack_word_addr because they don't send acks i); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index 369c4f57f33..4bf3cad530e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -64,12 +64,14 @@ class EthChannelBuffer final { return this->buffer_addresses[buffer_index]; } - [[nodiscard]] FORCE_INLINE volatile PacketHeader *get_packet_header(BufferIndex const& buffer_index) const { - return reinterpret_cast(this->buffer_addresses[buffer_index]); + 
template + [[nodiscard]] FORCE_INLINE volatile T *get_packet_header(BufferIndex const& buffer_index) const { + return reinterpret_cast(this->buffer_addresses[buffer_index]); } + template [[nodiscard]] FORCE_INLINE size_t get_payload_size(BufferIndex const& buffer_index) const { - return get_packet_header(buffer_index)->get_payload_size_including_header(); + return get_packet_header(buffer_index)->get_payload_size_including_header(); } [[nodiscard]] FORCE_INLINE size_t get_channel_buffer_max_size_in_bytes(BufferIndex const& buffer_index) const { return this->buffer_size_in_bytes; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp index a8dbeb8ade7..487df3be943 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp @@ -94,10 +94,10 @@ void kernel_main() { DPRINT << "packet_header_buffer_seminc: " << (uint32_t)packet_header_buffer_seminc << "\n"; // pre-populate packet headers - volatile tt::fabric::PacketHeader* pkt_hdr_forward = - reinterpret_cast(packet_header_buffer_addr_forward); - volatile tt::fabric::PacketHeader* pkt_hdr_backward = - reinterpret_cast(packet_header_buffer_addr_backward); + volatile PACKET_HEADER_TYPE* pkt_hdr_forward = + reinterpret_cast(packet_header_buffer_addr_forward); + volatile PACKET_HEADER_TYPE* pkt_hdr_backward = + reinterpret_cast(packet_header_buffer_addr_backward); pkt_hdr_forward->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); pkt_hdr_backward->to_chip_multicast( @@ -152,7 +152,7 @@ void kernel_main() { // 2. 
mcast output ready semaphore uint64_t out_ready_sem_noc_addr_in_pkt = safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); - auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); + auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ out_ready_sem_noc_addr_in_pkt, static_cast(1), // increment 1 @@ -163,7 +163,7 @@ void kernel_main() { pkt_hdr->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) if (fabric_connection.has_backward_connection()) { @@ -171,7 +171,7 @@ void kernel_main() { tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_backward_direction)}); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // increment locally uint64_t out_ready_sem_noc_addr = diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp index b9f306cc42b..aad1e889c68 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp @@ -103,10 +103,10 @@ void kernel_main() { DPRINT << "packet_header_buffer_seminc: " << 
(uint32_t)packet_header_buffer_seminc << "\n"; // pre-populate packet headers - volatile tt::fabric::PacketHeader* pkt_hdr_forward = - reinterpret_cast(packet_header_buffer_addr_forward); - volatile tt::fabric::PacketHeader* pkt_hdr_backward = - reinterpret_cast(packet_header_buffer_addr_backward); + volatile PACKET_HEADER_TYPE* pkt_hdr_forward = + reinterpret_cast(packet_header_buffer_addr_forward); + volatile PACKET_HEADER_TYPE* pkt_hdr_backward = + reinterpret_cast(packet_header_buffer_addr_backward); pkt_hdr_forward->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); pkt_hdr_backward->to_chip_multicast( @@ -158,7 +158,7 @@ void kernel_main() { } // 2. mcast output ready semaphore - auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); + auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); uint64_t out_ready_sem_noc_addr_in_pkt = safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ @@ -171,7 +171,7 @@ void kernel_main() { pkt_hdr->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) if (fabric_connection.has_backward_connection()) { @@ -179,7 +179,7 @@ void kernel_main() { tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_backward_direction)}); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // increment locally 
uint64_t out_ready_sem_noc_addr = diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp index 641e6cee244..55e2668d5d1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp @@ -12,8 +12,8 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( uint64_t noc0_dest_noc_addr, - volatile tt::fabric::PacketHeader* pkt_hdr_forward, - volatile tt::fabric::PacketHeader* pkt_hdr_backward, + volatile PACKET_HEADER_TYPE* pkt_hdr_forward, + volatile PACKET_HEADER_TYPE* pkt_hdr_backward, FabricConnectionManager& fabric_connection, size_t& l1_read_addr, uint32_t payload_size_bytes) { @@ -29,7 +29,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr_forward, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr_forward, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -37,7 +37,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr_backward, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr_backward, sizeof(PACKET_HEADER_TYPE)); } noc_async_writes_flushed();