diff --git a/models/experimental/functional_unet/tests/test_unet_trace.py b/models/experimental/functional_unet/tests/test_unet_trace.py index 22a0acd2fa1..cea62f6a12a 100644 --- a/models/experimental/functional_unet/tests/test_unet_trace.py +++ b/models/experimental/functional_unet/tests/test_unet_trace.py @@ -27,7 +27,7 @@ @skip_for_grayskull("UNet not currently supported on GS") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 444416}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 458752}], indirect=True) @pytest.mark.parametrize( "batch, groups, iterations", ((1, 2, 32),), @@ -108,7 +108,7 @@ def test_unet_trace( @skip_for_grayskull("UNet not currently supported on GS") @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 68864, "trace_region_size": 917504, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch, groups, iterations", @@ -343,7 +343,7 @@ def test_unet_trace_2cq_multi_device( @skip_for_grayskull("UNet not currently supported on GS") @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 68864, "trace_region_size": 1376256, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch, groups, iterations", diff --git a/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md b/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md index 899bbbf81d3..d9d21c7be9a 100644 --- a/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md +++ b/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md @@ -71,12 +71,12 @@ uint32_t shard_size = shard_height * shard_width; uint32_t input_unit_size = sizeof(uint32_t); uint32_t shard_width_bytes = shard_width * data_size; uint32_t num_units_per_row = shard_width * input_unit_size; -uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment()); +uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::L1)); ``` In order to shard the correct data segments to the respective cores, we indicate the shard height, width, size, and other data for the kernel function. For this situation, 16 units of data will be sharded across 4 cores; each core will have 4 units of data in its corresponding circular buffer. -The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory. +The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory by explicitly passing `BufferType::L1`. This example demonstrates height sharding; the shard height is therefore set to evenly distribute the number of vector values across the cores. If the sharding strategy were different (e.g. width sharding or block sharding), the appropriate values for both the shard height and width would need to be set.
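To make the tech report's point concrete: with per-buffer-type alignment, an L1-resident unit no longer has to be padded to the coarser DRAM alignment. The sketch below is a minimal standalone illustration, not tt-metal's `align` helper itself; the `align_up` name is hypothetical, and the 16-byte L1 / 64-byte DRAM values are taken from the Blackhole `core_config.h` change later in this diff (`1 << LOG_BASE_2_OF_L1_ALIGNMENT` and `1 << LOG_BASE_2_OF_DRAM_ALIGNMENT`).

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for tt-metal's align(): round `size` up to the next
// multiple of `alignment`. Requires `alignment` to be a power of two.
constexpr uint32_t align_up(uint32_t size, uint32_t alignment) {
    return (size + alignment - 1) & ~(alignment - 1);
}

int main() {
    // Blackhole values from this diff: L1 = 1 << 4, DRAM = 1 << 6.
    constexpr uint32_t l1_alignment = 1u << 4;
    constexpr uint32_t dram_alignment = 1u << 6;

    // A 4-byte input unit (sizeof(uint32_t)) padded per buffer type. Under the
    // old single ALLOCATOR_ALIGNMENT (64 on Blackhole), the L1 case would also
    // have been padded to 64 bytes.
    assert(align_up(sizeof(uint32_t), l1_alignment) == 16);
    assert(align_up(sizeof(uint32_t), dram_alignment) == 64);
    return 0;
}
```

This is why the shard example above queries the alignment for `BufferType::L1` rather than a global constant: the padding per row-major unit shrinks to what L1 actually requires.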
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py index addaf847e2b..802c4ac1a4e 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py @@ -164,7 +164,7 @@ def test_sharded_rm( ), ) - yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation) + yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation, keep_l1_aligned=True) zt = ttnn.sharded_to_interleaved( yt, @@ -172,6 +172,7 @@ def test_sharded_rm( memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED, buffer_type=ttnn.BufferType.L1, ), + is_l1_aligned=True, ) tt_og = xt.cpu().to_torch() diff --git a/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp index 210e1de804a..0ded948fdf5 100644 --- a/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp +++ b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp @@ -19,7 +19,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) { auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id()); auto storage_core_bank_size = tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config); - const uint32_t allocator_alignment = device->get_allocator_alignment(); + const uint32_t allocator_alignment = device->get_allocator_alignment(BufferType::L1); const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value() ? storage_core_bank_size.value() : (soc_desc.worker_l1_size - l1_unreserved_base); diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp index c88557298f8..9111335b29d 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp @@ -84,7 +84,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) { device->load_sub_device_manager(sub_device_manager_1); auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0}); - EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size()); + EXPECT_TRUE(buffer_1->address() <= max_addr - buffer_1->aligned_page_size()); EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false); std::vector output_1; EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true); @@ -105,7 +105,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) { device->load_sub_device_manager(sub_device_manager_2); auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1}); - EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size()); + EXPECT_TRUE(buffer_3->address() <= max_addr - buffer_3->aligned_page_size()); EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false); std::vector output_2; EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true); @@ -118,7 +118,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) { } auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0}); - EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size()); + EXPECT_TRUE(buffer_4->address() <= max_addr - buffer_4->aligned_page_size()); EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), 
std::exception); } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index e5fbdf05dde..80d3b9f8ecb 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -651,7 +651,7 @@ inline void generate_random_paged_payload( words_per_page); // Note: the dst address marches in unison regardless of whether or not a core is written to - uint32_t page_size_alignment_bytes = device->get_allocator_alignment(); + uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buf_type); for (uint32_t page_id = start_page; page_id < start_page + cmd.write_paged.pages; page_id++) { CoreCoord bank_core; uint32_t bank_id = page_id % num_banks; @@ -931,8 +931,9 @@ inline void gen_dispatcher_paged_write_cmd( uint32_t start_page, uint32_t page_size, uint32_t pages) { - uint32_t page_size_alignment_bytes = device->get_allocator_alignment(); - uint32_t num_banks = device->num_banks(is_dram ? BufferType::DRAM : BufferType::L1); + BufferType buffer_type = is_dram ? BufferType::DRAM : BufferType::L1; + uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buffer_type); + uint32_t num_banks = device->num_banks(buffer_type); CoreType core_type = is_dram ? CoreType::DRAM : CoreType::WORKER; // Not safe to mix paged L1 and paged DRAM writes currently in this test since same book-keeping. diff --git a/tests/ttnn/unit_tests/operations/test_core.py b/tests/ttnn/unit_tests/operations/test_core.py index c39154379df..57709827f07 100644 --- a/tests/ttnn/unit_tests/operations/test_core.py +++ b/tests/ttnn/unit_tests/operations/test_core.py @@ -527,8 +527,9 @@ def test_bh_alignment_i2s( memory_config=input_buffer_type, dtype=ttnn.bfloat16, ) - x_t_sharded = ttnn.to_memory_config(x_t, shard_config) - x_t = ttnn.to_memory_config(x_t_sharded, output_buffer_type) + # So far the sharded tensor alignment is controlled by the keep_l1_aligned flag; it will be removed after launch + x_t_sharded = ttnn.interleaved_to_sharded(x_t, shard_config, keep_l1_aligned=True) + x_t = ttnn.sharded_to_interleaved(x_t_sharded, output_buffer_type, is_l1_aligned=True) output_data = ttnn.from_device(x_t) output_data = ttnn.to_torch(output_data) passing = torch.equal(input_data, output_data) diff --git a/tests/ttnn/unit_tests/operations/test_group_norm.py b/tests/ttnn/unit_tests/operations/test_group_norm.py index 57441a6f047..2ae8848ee2f 100644 --- a/tests/ttnn/unit_tests/operations/test_group_norm.py +++ b/tests/ttnn/unit_tests/operations/test_group_norm.py @@ -292,7 +292,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group sharded_mem_config = ttnn.MemoryConfig( ttnn.types.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec ) - input_tensor = ttnn.to_memory_config(input_tensor, sharded_mem_config) + input_tensor = ttnn.interleaved_to_sharded(input_tensor, sharded_mem_config, keep_l1_aligned=True) # groupnorm output_tensor = ttnn.group_norm( @@ -306,7 +306,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group ) # output tensor - output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG) + output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG, is_l1_aligned=True) output_tensor = ttnn.from_device(output_tensor) output_tensor = ttnn.to_torch(output_tensor) diff --git a/tests/ttnn/unit_tests/operations/test_pad.py
b/tests/ttnn/unit_tests/operations/test_pad.py index 6f6ef9b4274..adea6bb33e7 100644 --- a/tests/ttnn/unit_tests/operations/test_pad.py +++ b/tests/ttnn/unit_tests/operations/test_pad.py @@ -226,8 +226,10 @@ def test_pad_rm_sharded_stickwise( ttnn_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=ttnn.float32, layout=ttnn.ROW_MAJOR_LAYOUT, device=device ) - ttnn_sharded_input_tensor = ttnn.to_memory_config(ttnn_input_tensor, input_shard_memory_config) - + # Still rely on keep_l1_aligned = True to make this work with the current implementation + ttnn_sharded_input_tensor = ttnn.interleaved_to_sharded( + ttnn_input_tensor, input_shard_memory_config, keep_l1_aligned=True + ) padded_tensor = ttnn.pad(ttnn_sharded_input_tensor, pad_to_shape, input_tensor_start, pad_value) tt_output_tensor = ttnn.to_memory_config(padded_tensor, ttnn.L1_MEMORY_CONFIG) diff --git a/tt_metal/api/tt-metalium/allocator.hpp b/tt_metal/api/tt-metalium/allocator.hpp index acbe2fed8bc..89e9d87f35d 100644 --- a/tt_metal/api/tt-metalium/allocator.hpp +++ b/tt_metal/api/tt-metalium/allocator.hpp @@ -120,6 +120,8 @@ const std::vector& bank_ids_from_dram_channel(const Allocator& allocat const std::vector& bank_ids_from_logical_core( const Allocator& allocator, BufferType buffer_type, const CoreCoord& logical_core); +uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type); + Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type); void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out); diff --git a/tt_metal/api/tt-metalium/allocator_types.hpp b/tt_metal/api/tt-metalium/allocator_types.hpp index 19497b83d4e..f6f2c277904 100644 --- a/tt_metal/api/tt-metalium/allocator_types.hpp +++ b/tt_metal/api/tt-metalium/allocator_types.hpp @@ -43,6 +43,7 @@ struct AllocatorConfig { size_t dram_bank_size = 0; std::vector dram_bank_offsets = {}; uint32_t dram_unreserved_base = 0; + uint32_t dram_alignment = 0; //!
worker specific configuration uint32_t l1_unreserved_base = 0; CoreRangeSet worker_grid = {}; @@ -56,7 +57,7 @@ struct AllocatorConfig { BankMapping l1_bank_remap = {}; // for remapping which l1 bank points to which bank if we assume normal row-major assignment CoreRangeSet compute_grid = {}; - uint32_t alignment = 0; + uint32_t l1_alignment = 0; bool disable_interleaved = false; void reset(); ~AllocatorConfig() { reset(); } diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index 8339787ef25..32031f685c9 100644 --- a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -55,11 +55,11 @@ class IDevice { IDevice() = default; virtual ~IDevice() = default; - IDevice(const IDevice &other) = delete; - IDevice& operator=(const IDevice &other) = delete; + IDevice(const IDevice& other) = delete; + IDevice& operator=(const IDevice& other) = delete; - IDevice(IDevice &&other) = default; - IDevice& operator=(IDevice &&other) = default; + IDevice(IDevice&& other) = default; + IDevice& operator=(IDevice&& other) = default; virtual tt::ARCH arch() const = 0; @@ -84,19 +84,22 @@ class IDevice { // Given a coordinate in Virtual NOC0 Space, get the equivalent coordinate in Virtual noc_index space virtual CoreCoord virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const = 0; - virtual std::vector worker_cores_from_logical_cores(const std::vector &logical_cores) const = 0; - virtual std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const = 0; + virtual std::vector worker_cores_from_logical_cores( + const std::vector& logical_cores) const = 0; + virtual std::vector ethernet_cores_from_logical_cores( + const std::vector& logical_cores) const = 0; virtual std::vector get_optimal_dram_bank_to_logical_worker_assignment() = 0; - virtual CoreCoord virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const = 0; - virtual CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const = 0; + virtual CoreCoord virtual_core_from_logical_core( + const CoreCoord& logical_coord, const CoreType& core_type) const = 0; + virtual CoreCoord worker_core_from_logical_core(const CoreCoord& logical_core) const = 0; // Ethernet API - virtual CoreCoord ethernet_core_from_logical_core(const CoreCoord &logical_core) const = 0; - virtual CoreCoord logical_core_from_ethernet_core(const CoreCoord ðernet_core) const = 0; - virtual std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores=false) const = 0; + virtual CoreCoord ethernet_core_from_logical_core(const CoreCoord& logical_core) const = 0; + virtual CoreCoord logical_core_from_ethernet_core(const CoreCoord& ethernet_core) const = 0; + virtual std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores = false) const = 0; virtual std::unordered_set get_inactive_ethernet_cores() const = 0; - virtual bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const = 0; + virtual bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores = false) const = 0; virtual std::tuple get_connected_ethernet_core(CoreCoord eth_core) const = 0; virtual std::vector get_ethernet_sockets(chip_id_t connected_chip_id) const = 0; virtual bool is_inactive_ethernet_core(CoreCoord logical_core) const = 0; @@ -106,16 +109,16 @@ class IDevice { virtual CoreRangeSet worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const = 0; virtual 
uint32_t num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const = 0; - virtual const std::unique_ptr &get_initialized_allocator() const = 0; - virtual const std::unique_ptr &get_initialized_allocator(SubDeviceId sub_device_id) const = 0; + virtual const std::unique_ptr& get_initialized_allocator() const = 0; + virtual const std::unique_ptr& get_initialized_allocator(SubDeviceId sub_device_id) const = 0; - virtual DeviceAddr get_base_allocator_addr(const HalMemType &mem_type) const = 0; - virtual DeviceAddr get_base_allocator_addr(const HalMemType &mem_type, SubDeviceId sub_device_id) const = 0; + virtual DeviceAddr get_base_allocator_addr(const HalMemType& mem_type) const = 0; + virtual DeviceAddr get_base_allocator_addr(const HalMemType& mem_type, SubDeviceId sub_device_id) const = 0; - virtual uint32_t num_banks(const BufferType &buffer_type) const = 0; - virtual uint32_t num_banks(const BufferType &buffer_type, SubDeviceId sub_device_id) const = 0; - virtual uint32_t bank_size(const BufferType &buffer_type) const = 0; - virtual uint32_t bank_size(const BufferType &buffer_type, SubDeviceId sub_device_id) const = 0; + virtual uint32_t num_banks(const BufferType& buffer_type) const = 0; + virtual uint32_t num_banks(const BufferType& buffer_type, SubDeviceId sub_device_id) const = 0; + virtual uint32_t bank_size(const BufferType& buffer_type) const = 0; + virtual uint32_t bank_size(const BufferType& buffer_type, SubDeviceId sub_device_id) const = 0; virtual uint32_t dram_channel_from_bank_id(uint32_t bank_id) const = 0; virtual uint32_t dram_channel_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const = 0; @@ -129,50 +132,61 @@ class IDevice { virtual CoreCoord logical_core_from_bank_id(uint32_t bank_id) const = 0; virtual CoreCoord logical_core_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const = 0; - virtual const std::vector &bank_ids_from_dram_channel(uint32_t dram_channel) const = 0; - virtual const std::vector &bank_ids_from_dram_channel(uint32_t dram_channel, SubDeviceId sub_device_id) const = 0; + virtual const std::vector& bank_ids_from_dram_channel(uint32_t dram_channel) const = 0; + virtual const std::vector& bank_ids_from_dram_channel( + uint32_t dram_channel, SubDeviceId sub_device_id) const = 0; - virtual const std::vector &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core) const = 0; - virtual const std::vector &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core, SubDeviceId sub_device_id) const = 0; + virtual const std::vector& bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core) const = 0; + virtual const std::vector& bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core, SubDeviceId sub_device_id) const = 0; - virtual allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type) const = 0; - virtual allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type, SubDeviceId sub_device_id) const = 0; + virtual allocator::Statistics get_memory_allocation_statistics(const BufferType& buffer_type) const = 0; + virtual allocator::Statistics get_memory_allocation_statistics( + const BufferType& buffer_type, SubDeviceId sub_device_id) const = 0; - virtual uint32_t get_allocator_alignment() const = 0; - virtual uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const = 0; + virtual uint32_t get_allocator_alignment(const BufferType& buffer_type) const = 
0; + virtual uint32_t get_allocator_alignment(const BufferType& buffer_type, SubDeviceId sub_device_id) const = 0; virtual std::optional lowest_occupied_compute_l1_address() const = 0; - virtual std::optional lowest_occupied_compute_l1_address(tt::stl::Span sub_device_ids) const = 0; + virtual std::optional lowest_occupied_compute_l1_address( + tt::stl::Span sub_device_ids) const = 0; virtual size_t get_l1_small_size() const = 0; virtual size_t get_l1_small_size(SubDeviceId sub_device_id) const = 0; - virtual const std::unordered_set &get_allocated_buffers() const = 0; - virtual const std::unordered_set &get_allocated_buffers(SubDeviceId sub_device_id) const = 0; + virtual const std::unordered_set& get_allocated_buffers() const = 0; + virtual const std::unordered_set& get_allocated_buffers(SubDeviceId sub_device_id) const = 0; virtual void deallocate_buffers() = 0; virtual void deallocate_buffers(SubDeviceId sub_device_id) = 0; - virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const = 0; - virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const = 0; + virtual void dump_memory_blocks(const BufferType& buffer_type, std::ofstream& out) const = 0; + virtual void dump_memory_blocks( + const BufferType& buffer_type, std::ofstream& out, SubDeviceId sub_device_id) const = 0; virtual MemoryBlockTable get_memory_block_table(const BufferType& buffer_type) const = 0; // Set of logical ethernet core coordinates // core.x represents connectivity to one other chip, i.e. cores with all connect to same chip // core.y represents different channels along one - virtual const std::set ðernet_cores() const = 0; - virtual const std::set &storage_only_cores() const = 0; + virtual const std::set& ethernet_cores() const = 0; + virtual const std::set& storage_only_cores() const = 0; virtual uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const = 0; virtual uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const = 0; virtual const JitBuildEnv& build_env() const = 0; - virtual const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const = 0; - virtual const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const = 0; - virtual const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const = 0; - virtual const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const = 0; - virtual const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const = 0; + virtual const string build_firmware_target_path( + uint32_t programmable_core, uint32_t processor_class, int i) const = 0; + virtual const string build_kernel_target_path( + uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const = 0; + virtual const JitBuildState& build_firmware_state( + uint32_t programmable_core, uint32_t processor_class, int i) const = 0; + virtual const JitBuildState& build_kernel_state( + uint32_t programmable_core, uint32_t processor_class, int i) const = 0; + virtual const JitBuildStateSubset build_kernel_states( + uint32_t programmable_core, uint32_t processor_class) const = 0; virtual SystemMemoryManager& sysmem_manager() = 0; virtual HWCommandQueue& hw_command_queue(size_t cq_id = 0) = 0; @@ -193,7 +207,12 @@ class 
IDevice { // Checks that the given arch is on the given pci_slot and that it's responding // Puts device into reset - virtual bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}, bool minimal = false) = 0; + virtual bool initialize( + const uint8_t num_hw_cqs, + size_t l1_small_size, + size_t trace_region_size, + tt::stl::Span l1_bank_remap = {}, + bool minimal = false) = 0; virtual void build_firmware() = 0; virtual void reset_cores() = 0; virtual void initialize_and_launch_firmware() = 0; @@ -227,7 +246,8 @@ class IDevice { template T get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const; - virtual std::vector> extract_dst_noc_multicast_info(const std::vector& ranges, const CoreType core_type) = 0; + virtual std::vector> extract_dst_noc_multicast_info( + const std::vector& ranges, const CoreType core_type) = 0; virtual bool dispatch_s_enabled() const = 0; virtual bool distributed_dispatcher() const = 0; @@ -236,17 +256,19 @@ class IDevice { virtual uint8_t num_noc_mcast_txns(SubDeviceId sub_device_id) const = 0; virtual uint8_t num_noc_unicast_txns(SubDeviceId sub_device_id) const = 0; - virtual uint8_t noc_data_start_index(SubDeviceId sub_device_id, bool mcast_data=true, bool unicast_data=true) const = 0; + virtual uint8_t noc_data_start_index( + SubDeviceId sub_device_id, bool mcast_data = true, bool unicast_data = true) const = 0; virtual SubDeviceManagerId get_active_sub_device_manager_id() const = 0; virtual SubDeviceManagerId get_default_sub_device_manager_id() const = 0; - virtual SubDeviceManagerId create_sub_device_manager(tt::stl::Span sub_devices, DeviceAddr local_l1_size) = 0; + virtual SubDeviceManagerId create_sub_device_manager( + tt::stl::Span sub_devices, DeviceAddr local_l1_size) = 0; virtual void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id) = 0; virtual void load_sub_device_manager(SubDeviceManagerId sub_device_manager_id) = 0; virtual void clear_loaded_sub_device_manager() = 0; virtual CoreCoord virtual_program_dispatch_core(uint8_t cq_id) const = 0; - virtual const std::vector &get_sub_device_ids() const = 0; - virtual const std::vector &get_sub_device_stall_group() const = 0; + virtual const std::vector& get_sub_device_ids() const = 0; + virtual const std::vector& get_sub_device_stall_group() const = 0; virtual void set_sub_device_stall_group(tt::stl::Span sub_device_ids) = 0; virtual void reset_sub_device_stall_group() = 0; virtual uint32_t num_sub_devices() const = 0; diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index ea5a41c1482..a816c5c3331 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -33,7 +33,7 @@ inline namespace v0 { class Device : public IDevice { public: // friend void tt_gdb(IDevice* device, int chip_id, const vector cores, vector ops); - Device () = delete; + Device() = delete; Device( chip_id_t device_id, const uint8_t num_hw_cqs, @@ -47,11 +47,11 @@ class Device : public IDevice { ~Device() override; // TODO: Add copy/move semantics - Device(const Device &other) = delete; - Device& operator=(const Device &other) = delete; + Device(const Device& other) = delete; + Device& operator=(const Device& other) = delete; - Device(Device &&other) = default; - Device& operator=(Device &&other) = default; + Device(Device&& other) = default; + Device& operator=(Device&& other) = default; tt::ARCH arch() const override; @@ -76,19 +76,20 @@ class 
Device : public IDevice { // Given a coordinate in Virtual NOC0 Space, get the equivalent coordinate in Virtual noc_index space CoreCoord virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const override; - std::vector worker_cores_from_logical_cores(const std::vector &logical_cores) const override; - std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const override; + std::vector worker_cores_from_logical_cores(const std::vector& logical_cores) const override; + std::vector ethernet_cores_from_logical_cores( + const std::vector& logical_cores) const override; std::vector get_optimal_dram_bank_to_logical_worker_assignment() override; - CoreCoord virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const override; - CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const override; + CoreCoord virtual_core_from_logical_core(const CoreCoord& logical_coord, const CoreType& core_type) const override; + CoreCoord worker_core_from_logical_core(const CoreCoord& logical_core) const override; // Ethernet API - CoreCoord ethernet_core_from_logical_core(const CoreCoord &logical_core) const override; - CoreCoord logical_core_from_ethernet_core(const CoreCoord ðernet_core) const override; - std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores=false) const override; + CoreCoord ethernet_core_from_logical_core(const CoreCoord& logical_core) const override; + CoreCoord logical_core_from_ethernet_core(const CoreCoord& ethernet_core) const override; + std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores = false) const override; std::unordered_set get_inactive_ethernet_cores() const override; - bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const override; + bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores = false) const override; std::tuple get_connected_ethernet_core(CoreCoord eth_core) const override; std::vector get_ethernet_sockets(chip_id_t connected_chip_id) const override; bool is_inactive_ethernet_core(CoreCoord logical_core) const override; @@ -98,16 +99,16 @@ class Device : public IDevice { CoreRangeSet worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const override; uint32_t num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const override; - const std::unique_ptr &get_initialized_allocator() const override; - const std::unique_ptr &get_initialized_allocator(SubDeviceId sub_device_id) const override; + const std::unique_ptr& get_initialized_allocator() const override; + const std::unique_ptr& get_initialized_allocator(SubDeviceId sub_device_id) const override; - DeviceAddr get_base_allocator_addr(const HalMemType &mem_type) const override; - DeviceAddr get_base_allocator_addr(const HalMemType &mem_type, SubDeviceId sub_device_id) const override; + DeviceAddr get_base_allocator_addr(const HalMemType& mem_type) const override; + DeviceAddr get_base_allocator_addr(const HalMemType& mem_type, SubDeviceId sub_device_id) const override; - uint32_t num_banks(const BufferType &buffer_type) const override; - uint32_t num_banks(const BufferType &buffer_type, SubDeviceId sub_device_id) const override; - uint32_t bank_size(const BufferType &buffer_type) const override; - uint32_t bank_size(const BufferType &buffer_type, SubDeviceId sub_device_id) const override; + uint32_t num_banks(const BufferType& buffer_type) const override; + 
uint32_t num_banks(const BufferType& buffer_type, SubDeviceId sub_device_id) const override; + uint32_t bank_size(const BufferType& buffer_type) const override; + uint32_t bank_size(const BufferType& buffer_type, SubDeviceId sub_device_id) const override; uint32_t dram_channel_from_bank_id(uint32_t bank_id) const override; uint32_t dram_channel_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const override; @@ -121,49 +122,57 @@ class Device : public IDevice { CoreCoord logical_core_from_bank_id(uint32_t bank_id) const override; CoreCoord logical_core_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const override; - const std::vector &bank_ids_from_dram_channel(uint32_t dram_channel) const override; - const std::vector &bank_ids_from_dram_channel(uint32_t dram_channel, SubDeviceId sub_device_id) const override; + const std::vector& bank_ids_from_dram_channel(uint32_t dram_channel) const override; + const std::vector& bank_ids_from_dram_channel( + uint32_t dram_channel, SubDeviceId sub_device_id) const override; - const std::vector &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core) const override; - const std::vector &bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord &logical_core, SubDeviceId sub_device_id) const override; + const std::vector& bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core) const override; + const std::vector& bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core, SubDeviceId sub_device_id) const override; - allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type) const override; - allocator::Statistics get_memory_allocation_statistics(const BufferType &buffer_type, SubDeviceId sub_device_id) const override; + allocator::Statistics get_memory_allocation_statistics(const BufferType& buffer_type) const override; + allocator::Statistics get_memory_allocation_statistics( + const BufferType& buffer_type, SubDeviceId sub_device_id) const override; - uint32_t get_allocator_alignment() const override; - uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const override; + uint32_t get_allocator_alignment(const BufferType& buffer_type) const override; + uint32_t get_allocator_alignment(const BufferType& buffer_type, SubDeviceId sub_device_id) const override; std::optional lowest_occupied_compute_l1_address() const override; - std::optional lowest_occupied_compute_l1_address(tt::stl::Span sub_device_ids) const override; + std::optional lowest_occupied_compute_l1_address( + tt::stl::Span sub_device_ids) const override; size_t get_l1_small_size() const override; size_t get_l1_small_size(SubDeviceId sub_device_id) const override; - const std::unordered_set &get_allocated_buffers() const override; - const std::unordered_set &get_allocated_buffers(SubDeviceId sub_device_id) const override; + const std::unordered_set& get_allocated_buffers() const override; + const std::unordered_set& get_allocated_buffers(SubDeviceId sub_device_id) const override; void deallocate_buffers() override; void deallocate_buffers(SubDeviceId sub_device_id) override; - void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const override; - void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const override; + void dump_memory_blocks(const BufferType& buffer_type, std::ofstream& out) const override; + void dump_memory_blocks( + const BufferType& buffer_type, std::ofstream& 
out, SubDeviceId sub_device_id) const override; - MemoryBlockTable get_memory_block_table(const BufferType &buffer_type) const override; + MemoryBlockTable get_memory_block_table(const BufferType& buffer_type) const override; // Set of logical ethernet core coordinates // core.x represents connectivity to one other chip, i.e. cores with all connect to same chip // core.y represents different channels along one - const std::set ðernet_cores() const override { return this->ethernet_cores_; } + const std::set& ethernet_cores() const override { return this->ethernet_cores_; } - const std::set &storage_only_cores() const override { return this->storage_only_cores_; } + const std::set& storage_only_cores() const override { return this->storage_only_cores_; } uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const override; uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const override; const JitBuildEnv& build_env() const override { return this->build_env_; } const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const override; - const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; + const string build_kernel_target_path( + uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const override; + const JitBuildState& build_firmware_state( + uint32_t programmable_core, uint32_t processor_class, int i) const override; const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const override; @@ -185,7 +194,12 @@ class Device : public IDevice { // Checks that the given arch is on the given pci_slot and that it's responding // Puts device into reset - bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}, bool minimal = false) override; + bool initialize( + const uint8_t num_hw_cqs, + size_t l1_small_size, + size_t trace_region_size, + tt::stl::Span l1_bank_remap = {}, + bool minimal = false) override; void build_firmware() override; void reset_cores() override; void initialize_and_launch_firmware() override; @@ -200,7 +214,9 @@ class Device : public IDevice { void enable_async(bool enable) override; void synchronize() override; WorkExecutorMode get_worker_mode() override { return work_executor_.get_worker_mode(); } - void set_worker_queue_mode(const WorkerQueueMode& mode) override { this->work_executor_.set_worker_queue_mode(mode); } + void set_worker_queue_mode(const WorkerQueueMode& mode) override { + this->work_executor_.set_worker_queue_mode(mode); + } WorkerQueueMode get_worker_queue_mode() override { return this->work_executor_.get_worker_queue_mode(); } bool is_worker_queue_empty() const override { return work_executor_.worker_queue.empty(); } bool can_use_passthrough_scheduling() const override; @@ -216,7 +232,8 @@ class Device : public IDevice { HalProgrammableCoreType get_programmable_core_type(CoreCoord virtual_core) const override; - std::vector> extract_dst_noc_multicast_info(const std::vector& ranges, const CoreType core_type) override; + std::vector> extract_dst_noc_multicast_info( + const std::vector& ranges, const 
CoreType core_type) override; bool dispatch_s_enabled() const override; bool distributed_dispatcher() const override; @@ -225,17 +242,19 @@ class Device : public IDevice { uint8_t num_noc_mcast_txns(SubDeviceId sub_device_id) const override; uint8_t num_noc_unicast_txns(SubDeviceId sub_device_id) const override; - uint8_t noc_data_start_index(SubDeviceId sub_device_id, bool mcast_data=true, bool unicast_data=true) const override; + uint8_t noc_data_start_index( + SubDeviceId sub_device_id, bool mcast_data = true, bool unicast_data = true) const override; SubDeviceManagerId get_active_sub_device_manager_id() const override; SubDeviceManagerId get_default_sub_device_manager_id() const override; - SubDeviceManagerId create_sub_device_manager(tt::stl::Span sub_devices, DeviceAddr local_l1_size) override; + SubDeviceManagerId create_sub_device_manager( + tt::stl::Span sub_devices, DeviceAddr local_l1_size) override; void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id) override; void load_sub_device_manager(SubDeviceManagerId sub_device_manager_id) override; void clear_loaded_sub_device_manager() override; CoreCoord virtual_program_dispatch_core(uint8_t cq_id) const override; - const std::vector &get_sub_device_ids() const override; - const std::vector &get_sub_device_stall_group() const override; + const std::vector& get_sub_device_ids() const override; + const std::vector& get_sub_device_stall_group() const override; void set_sub_device_stall_group(tt::stl::Span sub_device_ids) override; void reset_sub_device_stall_group() override; uint32_t num_sub_devices() const override; @@ -257,17 +276,19 @@ class Device : public IDevice { size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}); void initialize_build(); void initialize_device_kernel_defines(); - void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core); - void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg); + void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType& core_type, CoreCoord virtual_core); + void initialize_firmware( + const HalProgrammableCoreType& core_type, CoreCoord virtual_core, launch_msg_t* launch_msg, go_msg_t* go_msg); - void initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap); + void initialize_default_sub_device_state( + size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap); void compile_command_queue_programs(); void configure_command_queue_programs(); void clear_l1_state(); void get_associated_dispatch_virtual_cores( - std::unordered_map> &my_dispatch_cores, - std::unordered_map> &other_dispatch_cores); + std::unordered_map>& my_dispatch_cores, + std::unordered_map>& other_dispatch_cores); std::pair build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const; void set_worker_mode(const WorkExecutorMode& mode); @@ -277,10 +298,10 @@ class Device : public IDevice { void mark_allocations_unsafe(); void mark_allocations_safe(); - CoreCoord physical_worker_core_from_logical_core(const CoreCoord &logical_core) const; + CoreCoord physical_worker_core_from_logical_core(const CoreCoord& logical_core) const; CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const; - CoreType core_type_from_physical_core(const CoreCoord &physical_core) const; - CoreCoord virtual_core_from_physical_core(const CoreCoord 
&physical_coord, const CoreType& core_type) const; + CoreType core_type_from_physical_core(const CoreCoord& physical_core) const; + CoreCoord virtual_core_from_physical_core(const CoreCoord& physical_coord, const CoreType& core_type) const; chip_id_t id_; uint32_t build_key_ = 0; diff --git a/tt_metal/api/tt-metalium/memcpy.hpp b/tt_metal/api/tt-metalium/memcpy.hpp index 0905032697e..02b2253f519 100644 --- a/tt_metal/api/tt-metalium/memcpy.hpp +++ b/tt_metal/api/tt-metalium/memcpy.hpp @@ -32,6 +32,14 @@ static inline void memcpy_to_device(void* __restrict dst, const void* __restrict uint8_t* dst8 = (uint8_t*)dst; if (size_t num_lines = n / inner_blk_size) { + if ((uintptr_t)dst8 % sizeof(__m256i) != 0) { + __m128i blk = _mm_loadu_si128((const __m128i*)src8); + _mm_stream_si128((__m128i*)dst8, blk); + src8 += sizeof(__m128i); + dst8 += sizeof(__m128i); + n -= sizeof(__m128i); + num_lines = n / inner_blk_size; + } for (size_t i = 0; i < num_lines; ++i) { for (size_t j = 0; j < inner_loop; ++j) { __m256i blk = _mm256_loadu_si256((const __m256i*)src8); @@ -45,6 +53,14 @@ static inline void memcpy_to_device(void* __restrict dst, const void* __restrict if (n > 0) { if (size_t num_lines = n / sizeof(__m256i)) { + if ((uintptr_t)dst8 % sizeof(__m256i) != 0) { + __m128i blk = _mm_loadu_si128((const __m128i*)src8); + _mm_stream_si128((__m128i*)dst8, blk); + src8 += sizeof(__m128i); + dst8 += sizeof(__m128i); + n -= sizeof(__m128i); + num_lines = n / sizeof(__m256i); + } for (size_t i = 0; i < num_lines; ++i) { __m256i blk = _mm256_loadu_si256((const __m256i*)src8); _mm256_stream_si256((__m256i*)dst8, blk); diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index ee682f3b5f9..fb72695d568 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -99,18 +99,18 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this worker_cores_from_logical_cores(const std::vector&logical_cores) const override; - std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const override; + std::vector worker_cores_from_logical_cores(const std::vector& logical_cores) const override; + std::vector ethernet_cores_from_logical_cores( + const std::vector& logical_cores) const override; std::vector get_optimal_dram_bank_to_logical_worker_assignment() override; - CoreCoord virtual_core_from_logical_core(const CoreCoord& logical_coord, const CoreType& core_type) const override; CoreCoord worker_core_from_logical_core(const CoreCoord& logical_core) const override; CoreCoord ethernet_core_from_logical_core(const CoreCoord& logical_core) const override; CoreCoord logical_core_from_ethernet_core(const CoreCoord& ethernet_core) const override; - std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores=false) const override; + std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores = false) const override; std::unordered_set get_inactive_ethernet_cores() const override; - bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const override; + bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores = false) const override; std::tuple get_connected_ethernet_core(CoreCoord eth_core) const override; std::vector get_ethernet_sockets(chip_id_t connected_chip_id) const override; bool is_inactive_ethernet_core(CoreCoord logical_core) const override; @@ -134,15 +134,20 @@ class MeshDevice : public IDevice, 
public std::enable_shared_from_this& bank_ids_from_dram_channel(uint32_t dram_channel) const override; - const std::vector& bank_ids_from_dram_channel(uint32_t dram_channel, SubDeviceId sub_device_id) const override; - const std::vector& bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord& logical_core) const override; - const std::vector& bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord& logical_core, SubDeviceId sub_device_id) const override; + const std::vector& bank_ids_from_dram_channel( + uint32_t dram_channel, SubDeviceId sub_device_id) const override; + const std::vector& bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core) const override; + const std::vector& bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core, SubDeviceId sub_device_id) const override; allocator::Statistics get_memory_allocation_statistics(const BufferType& buffer_type) const override; - allocator::Statistics get_memory_allocation_statistics(const BufferType& buffer_type, SubDeviceId sub_device_id) const override; - uint32_t get_allocator_alignment() const override; - uint32_t get_allocator_alignment(SubDeviceId sub_device_id) const override; + allocator::Statistics get_memory_allocation_statistics( + const BufferType& buffer_type, SubDeviceId sub_device_id) const override; + uint32_t get_allocator_alignment(const BufferType& buffer_type) const override; + uint32_t get_allocator_alignment(const BufferType& buffer_type, SubDeviceId sub_device_id) const override; std::optional lowest_occupied_compute_l1_address() const override; - std::optional lowest_occupied_compute_l1_address(tt::stl::Span sub_device_ids) const override; + std::optional lowest_occupied_compute_l1_address( + tt::stl::Span sub_device_ids) const override; size_t get_l1_small_size() const override; size_t get_l1_small_size(SubDeviceId sub_device_id) const override; const std::unordered_set& get_allocated_buffers() const override; @@ -150,15 +155,18 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this& ethernet_cores() const override; const std::set& storage_only_cores() const override; uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const override; uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const override; const JitBuildEnv& build_env() const override; const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const override; - const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; + const string build_kernel_target_path( + uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const override; + const JitBuildState& build_firmware_state( + uint32_t programmable_core, uint32_t processor_class, int i) const override; const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const override; SystemMemoryManager& sysmem_manager() override; @@ -178,7 +186,12 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this l1_bank_remap = {}, bool minimal = false) override; + bool initialize( + const uint8_t num_hw_cqs, + size_t 
l1_small_size, + size_t trace_region_size, + tt::stl::Span l1_bank_remap = {}, + bool minimal = false) override; void build_firmware() override; void reset_cores() override; void initialize_and_launch_firmware() override; @@ -200,17 +213,20 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this> extract_dst_noc_multicast_info(const std::vector& ranges, const CoreType core_type) override; + std::vector> extract_dst_noc_multicast_info( + const std::vector& ranges, const CoreType core_type) override; bool dispatch_s_enabled() const override; bool distributed_dispatcher() const override; NOC dispatch_go_signal_noc() const override; size_t get_device_kernel_defines_hash() override; uint8_t num_noc_mcast_txns(SubDeviceId sub_device_id) const override; uint8_t num_noc_unicast_txns(SubDeviceId sub_device_id) const override; - uint8_t noc_data_start_index(SubDeviceId sub_device_id, bool mcast_data=true, bool unicast_data=true) const override; + uint8_t noc_data_start_index( + SubDeviceId sub_device_id, bool mcast_data = true, bool unicast_data = true) const override; SubDeviceManagerId get_active_sub_device_manager_id() const override; SubDeviceManagerId get_default_sub_device_manager_id() const override; - SubDeviceManagerId create_sub_device_manager(tt::stl::Span sub_devices, DeviceAddr local_l1_size) override; + SubDeviceManagerId create_sub_device_manager( + tt::stl::Span sub_devices, DeviceAddr local_l1_size) override; void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id) override; void load_sub_device_manager(SubDeviceManagerId sub_device_manager_id) override; void clear_loaded_sub_device_manager() override; diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 1ce83e3c727..a6ff052756d 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -800,13 +800,11 @@ std::vector> MeshDevice::get_tunnels_from_mmio() const { // Allocator methods // Memory statistics and buffer management -uint32_t MeshDevice::get_allocator_alignment() const { - const auto& allocator = this->get_initialized_allocator(); - return allocator->config.alignment; +uint32_t MeshDevice::get_allocator_alignment(const BufferType& buffer_type) const { + return reference_device()->get_allocator_alignment(buffer_type); } -uint32_t MeshDevice::get_allocator_alignment(SubDeviceId sub_device_id) const { - const auto& allocator = this->get_initialized_allocator(sub_device_id); - return allocator->config.alignment; +uint32_t MeshDevice::get_allocator_alignment(const BufferType& buffer_type, SubDeviceId sub_device_id) const { + return reference_device()->get_allocator_alignment(buffer_type, sub_device_id); } std::optional MeshDevice::lowest_occupied_compute_l1_address() const { diff --git a/tt_metal/hw/inc/blackhole/core_config.h b/tt_metal/hw/inc/blackhole/core_config.h index 9e4ba749e7b..beab0ab565c 100644 --- a/tt_metal/hw/inc/blackhole/core_config.h +++ b/tt_metal/hw/inc/blackhole/core_config.h @@ -25,5 +25,5 @@ constexpr uint8_t NumEthDispatchClasses = 2; constexpr uint8_t NumDramDispatchClasses = 1; constexpr uint8_t noc_size_x = 17; constexpr uint8_t noc_size_y = 12; -#define ALLOCATOR_ALIGNMENT 64 -#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 6 +#define LOG_BASE_2_OF_DRAM_ALIGNMENT 6 +#define LOG_BASE_2_OF_L1_ALIGNMENT 4 diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 5fe53ab5eb0..0cc2cb7c99d 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -119,6 
+119,25 @@ FORCE_INLINE uint32_t get_bank_offset(uint32_t bank_index) { } } +template <bool DRAM> +FORCE_INLINE +constexpr uint32_t get_allocator_alignment() { + if constexpr (DRAM) { + return DRAM_ALIGNMENT; + } else { + return L1_ALIGNMENT; + } +} + +template <bool DRAM> +FORCE_INLINE +constexpr uint32_t get_log_base2_of_allocator_alignment() { + if constexpr (DRAM) { + return LOG_BASE_2_OF_DRAM_ALIGNMENT; + } else { + return LOG_BASE_2_OF_L1_ALIGNMENT; + } +} } // namespace interleaved_addr_gen /** @@ -620,8 +639,8 @@ uint64_t get_dram_noc_addr( uint8_t noc = noc_index) { uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index<true>(id); uint32_t bank_index = interleaved_addr_gen::get_bank_index<true>(id, bank_offset_index); - uint32_t addr = (bank_offset_index * align_power_of_2(page_size, ALLOCATOR_ALIGNMENT)) + bank_base_address + - offset + bank_to_dram_offset[bank_index]; + uint32_t addr = (bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment<true>())) + bank_base_address + offset + + bank_to_dram_offset[bank_index]; uint32_t noc_xy = interleaved_addr_gen::get_noc_xy<true>(bank_index, noc); uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); return noc_addr; @@ -635,8 +654,8 @@ uint64_t get_l1_noc_addr( uint8_t noc = noc_index) { uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index<false>(id); uint32_t bank_index = interleaved_addr_gen::get_bank_index<false>(id, bank_offset_index); - uint32_t addr = (bank_offset_index * align_power_of_2(page_size, ALLOCATOR_ALIGNMENT)) + bank_base_address + - offset + bank_to_dram_offset[bank_index]; + uint32_t addr = (bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment<false>())) + bank_base_address + offset + + bank_to_dram_offset[bank_index]; uint32_t noc_xy = interleaved_addr_gen::get_noc_xy<false>(bank_index, noc); uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); return noc_addr; @@ -1006,7 +1025,7 @@ template <bool DRAM> struct InterleavedAddrGen { uint32_t bank_base_address; // Base address for the whole tensor. const uint32_t page_size; // Num bytes in page. - const uint32_t aligned_page_size = align_power_of_2(page_size, ALLOCATOR_ALIGNMENT); + const uint32_t aligned_page_size = align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment<DRAM>()); FORCE_INLINE uint32_t get_addr( @@ -1041,9 +1060,11 @@ struct InterleavedPow2AddrGen { const uint32_t bank_base_address; const uint32_t log_base_2_of_page_size; // WARNING: This struct is used for optimized get_noc_addr in which case // you know that bank_unit_size is a power of 2 - const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT + static constexpr uint32_t log_base_2_of_allocator_alignment = + interleaved_addr_gen::get_log_base2_of_allocator_alignment<DRAM>(); + const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment ? this->log_base_2_of_page_size - : LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT; + : log_base_2_of_allocator_alignment; FORCE_INLINE uint32_t get_addr( @@ -1156,9 +1177,11 @@ template <bool DRAM> struct InterleavedPow2AddrGenFast { uint32_t bank_base_address; // Base address for the whole tensor. const uint32_t log_base_2_of_page_size; // Num bytes in bank unit.
- const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT + static constexpr uint32_t log_base_2_of_allocator_alignment = + interleaved_addr_gen::get_log_base2_of_allocator_alignment<DRAM>(); + const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment ? this->log_base_2_of_page_size - : LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT; + : log_base_2_of_allocator_alignment; FORCE_INLINE uint32_t get_addr( diff --git a/tt_metal/hw/inc/grayskull/core_config.h b/tt_metal/hw/inc/grayskull/core_config.h index 5f73abc2364..066d86376c0 100644 --- a/tt_metal/hw/inc/grayskull/core_config.h +++ b/tt_metal/hw/inc/grayskull/core_config.h @@ -17,5 +17,5 @@ constexpr uint8_t MaxProcessorsPerCoreType = 5; constexpr uint8_t NumTensixDispatchClasses = 3; constexpr uint8_t noc_size_x = 13; constexpr uint8_t noc_size_y = 12; -#define ALLOCATOR_ALIGNMENT 32 -#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 5 +#define LOG_BASE_2_OF_DRAM_ALIGNMENT 5 +#define LOG_BASE_2_OF_L1_ALIGNMENT 4 diff --git a/tt_metal/hw/inc/wormhole/core_config.h b/tt_metal/hw/inc/wormhole/core_config.h index 491ab6bb54a..e1d0c168036 100644 --- a/tt_metal/hw/inc/wormhole/core_config.h +++ b/tt_metal/hw/inc/wormhole/core_config.h @@ -22,5 +22,5 @@ constexpr uint8_t NumTensixDispatchClasses = 3; constexpr uint8_t NumEthDispatchClasses = 1; constexpr uint8_t noc_size_x = 10; constexpr uint8_t noc_size_y = 12; -#define ALLOCATOR_ALIGNMENT 32 -#define LOG_BASE_2_OF_ALLOCATOR_ALIGNMENT 5 +#define LOG_BASE_2_OF_DRAM_ALIGNMENT 5 +#define LOG_BASE_2_OF_L1_ALIGNMENT 4 diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp index c626741533b..1b5b0d106f6 100644 --- a/tt_metal/impl/allocator/allocator.cpp +++ b/tt_metal/impl/allocator/allocator.cpp @@ -64,7 +64,7 @@ BankManager::BankManager( } this->interleaved_address_limit_ = 0; validate_num_banks(this->bank_id_to_bank_offset_.size(), this->buffer_type_, disable_interleaved); - this->init_allocator(size_bytes, alignment_bytes, alloc_offset); + this->init_allocator(size_bytes, hal.get_alignment(HalMemType::DRAM), alloc_offset); } BankManager::BankManager( @@ -80,7 +80,7 @@ BankManager::BankManager( interleaved_address_limit_(interleaved_address_limit), alignment_bytes_(alignment_bytes) { validate_num_banks(this->bank_id_to_bank_offset_.size(), this->buffer_type_, disable_interleaved); - this->init_allocator(size_bytes, alignment_bytes, alloc_offset); + this->init_allocator(size_bytes, hal.get_alignment(HalMemType::DRAM), alloc_offset); } uint32_t BankManager::num_banks() const { return this->bank_id_to_bank_offset_.size(); } @@ -245,7 +245,7 @@ void init_one_bank_per_channel(Allocator& allocator, const AllocatorConfig& allo BufferType::DRAM, bank_offsets, dram_bank_size, - alloc_config.alignment, + alloc_config.dram_alignment, alloc_config.dram_unreserved_base, alloc_config.disable_interleaved); for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) { @@ -260,7 +260,7 @@ void init_one_bank_per_channel(Allocator& allocator, const AllocatorConfig& allo BufferType::TRACE, bank_offsets, alloc_config.trace_region_size, - alloc_config.alignment, + alloc_config.dram_alignment, dram_bank_size + alloc_config.dram_unreserved_base, alloc_config.disable_interleaved); for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) { @@ -281,7 +281,7 @@ void init_one_bank_per_l1(Allocator& allocator, const AllocatorConfig& alloc_con BufferType::L1,
diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp
index c626741533b..1b5b0d106f6 100644
--- a/tt_metal/impl/allocator/allocator.cpp
+++ b/tt_metal/impl/allocator/allocator.cpp
@@ -64,7 +64,7 @@ BankManager::BankManager(
     }
     this->interleaved_address_limit_ = 0;
     validate_num_banks(this->bank_id_to_bank_offset_.size(), this->buffer_type_, disable_interleaved);
-    this->init_allocator(size_bytes, alignment_bytes, alloc_offset);
+    this->init_allocator(size_bytes, hal.get_alignment(HalMemType::DRAM), alloc_offset);
 }
 
 BankManager::BankManager(
@@ -80,7 +80,7 @@ BankManager::BankManager(
     interleaved_address_limit_(interleaved_address_limit),
     alignment_bytes_(alignment_bytes) {
     validate_num_banks(this->bank_id_to_bank_offset_.size(), this->buffer_type_, disable_interleaved);
-    this->init_allocator(size_bytes, alignment_bytes, alloc_offset);
+    this->init_allocator(size_bytes, hal.get_alignment(HalMemType::DRAM), alloc_offset);
 }
 
 uint32_t BankManager::num_banks() const { return this->bank_id_to_bank_offset_.size(); }
@@ -245,7 +245,7 @@ void init_one_bank_per_channel(Allocator& allocator, const AllocatorConfig& allo
         BufferType::DRAM,
         bank_offsets,
         dram_bank_size,
-        alloc_config.alignment,
+        alloc_config.dram_alignment,
         alloc_config.dram_unreserved_base,
         alloc_config.disable_interleaved);
     for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) {
@@ -260,7 +260,7 @@ void init_one_bank_per_channel(Allocator& allocator, const AllocatorConfig& allo
             BufferType::TRACE,
             bank_offsets,
             alloc_config.trace_region_size,
-            alloc_config.alignment,
+            alloc_config.dram_alignment,
             dram_bank_size + alloc_config.dram_unreserved_base,
             alloc_config.disable_interleaved);
         for (uint32_t bank_id = 0; bank_id < alloc_config.num_dram_channels; bank_id++) {
@@ -281,7 +281,7 @@ void init_one_bank_per_l1(Allocator& allocator, const AllocatorConfig& alloc_con
         BufferType::L1,
         bank_offsets,
         l1_bank_size,
-        alloc_config.alignment,
+        alloc_config.l1_alignment,
         alloc_config.l1_unreserved_base,
         alloc_config.disable_interleaved);
 
@@ -358,6 +358,18 @@ const std::vector<uint32_t>& bank_ids_from_logical_core(
     return allocator.logical_core_to_bank_ids.at(buffer_type).at(logical_core);
 }
 
+uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type) {
+    switch (buffer_type) {
+        case BufferType::DRAM:
+        case BufferType::TRACE: return allocator.config.dram_alignment;
+        case BufferType::L1:
+        case BufferType::L1_SMALL: return allocator.config.l1_alignment;
+        default: {
+            TT_THROW("Allocator does not support buffer type");
+        }
+    }
+}
+
 Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type) {
     Statistics stats;
     switch (buffer_type) {
diff --git a/tt_metal/impl/allocator/l1_banking_allocator.cpp b/tt_metal/impl/allocator/l1_banking_allocator.cpp
index d969d7ab4fa..820acde04ad 100644
--- a/tt_metal/impl/allocator/l1_banking_allocator.cpp
+++ b/tt_metal/impl/allocator/l1_banking_allocator.cpp
@@ -189,7 +189,7 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca
     // Storage only cores only need to reserve mailbox space to hold barriers
     uint32_t mem_mailbox_base = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::MAILBOX);
     uint32_t storage_core_unreserved_base =
-        ((mem_mailbox_base + alloc_config.alignment - 1) / alloc_config.alignment) * alloc_config.alignment;
+        ((mem_mailbox_base + alloc_config.l1_alignment - 1) / alloc_config.l1_alignment) * alloc_config.l1_alignment;
 
     // There is only l1_bank_size bytes available for L1 buffers to be allocated in
     uint64_t l1_bank_size = alloc_config.storage_core_bank_size.has_value()
@@ -205,7 +205,7 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca
         bank_id_to_bank_offset,
         allocatable_l1_size,
         interleaved_address_limit,
-        alloc_config.alignment,
+        alloc_config.l1_alignment,
         alloc_config.l1_unreserved_base,
         alloc_config.disable_interleaved);
 
@@ -219,7 +219,7 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca
         small_bank_id_to_bank_offset,
         alloc_config.l1_small_size,
         small_interleaved_address_limit,
-        alloc_config.alignment,
+        alloc_config.l1_alignment,
         small_alloc_offset,
         alloc_config.disable_interleaved);
 }
diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp
index 21e5787e85e..a7f5b53e359 100644
--- a/tt_metal/impl/buffers/buffer.cpp
+++ b/tt_metal/impl/buffers/buffer.cpp
@@ -454,7 +454,7 @@ DeviceAddr Buffer::bank_local_page_address(uint32_t bank_id, uint32_t page_index
 }
 
 uint32_t Buffer::alignment() const {
-    return this->allocator_->config.alignment;
+    return this->device_->get_allocator_alignment(this->buffer_type_);
 }
 
 DeviceAddr Buffer::aligned_page_size() const {
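Host code must now name the memory space when it asks the allocator for an alignment; TRACE buffers share the DRAM value because the trace region lives in DRAM banks. A compilable sketch of the same dispatch with mirrored types (illustrative only, not the real headers):

```cpp
#include <cstdint>
#include <stdexcept>

enum class BufferType { DRAM, L1, L1_SMALL, TRACE, SYSTEM_MEMORY };

struct AllocatorConfig {
    uint32_t dram_alignment = 32;  // assumed Wormhole values
    uint32_t l1_alignment = 16;
};

uint32_t get_alignment(const AllocatorConfig& config, BufferType buffer_type) {
    switch (buffer_type) {
        case BufferType::DRAM:
        case BufferType::TRACE: return config.dram_alignment;  // trace region sits in DRAM
        case BufferType::L1:
        case BufferType::L1_SMALL: return config.l1_alignment;
        default: throw std::runtime_error("Allocator does not support buffer type");
    }
}
```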
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index e360bf508ef..e56a549a47b 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -262,25 +262,28 @@ std::unique_ptr<Allocator> Device::initialize_allocator(size_t l1_small_size, si
         .dram_bank_offsets = {},
         .dram_unreserved_base =
             hal.get_dev_addr(HalDramMemAddrType::DRAM_BARRIER) + hal.get_dev_size(HalDramMemAddrType::DRAM_BARRIER),
-        .l1_unreserved_base = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED),
+        .dram_alignment = hal.get_alignment(HalMemType::DRAM),
+        .l1_unreserved_base = align(
+            hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED),
+            hal.get_alignment(HalMemType::DRAM)),
         .worker_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(logical_size.x - 1, logical_size.y - 1))),
         .worker_l1_size = static_cast<size_t>(soc_desc.worker_l1_size),
         .storage_core_bank_size = get_storage_core_bank_size(id_, num_hw_cqs_, dispatch_core_config),
-        .l1_small_size = tt::align(l1_small_size, hal.get_alignment(HalMemType::L1)),
+        .l1_small_size = tt::align(l1_small_size, hal.get_alignment(HalMemType::DRAM)),
         .trace_region_size = tt::align(trace_region_size, hal.get_alignment(HalMemType::DRAM)),
         .core_type_from_noc_coord_table = {},  // Populated later
         .worker_log_to_virtual_routing_x = tt::Cluster::instance().get_worker_logical_to_virtual_x(this->id()),
         .worker_log_to_virtual_routing_y = tt::Cluster::instance().get_worker_logical_to_virtual_y(this->id()),
         .l1_bank_remap = {l1_bank_remap.begin(), l1_bank_remap.end()},
         .compute_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(compute_size.x - 1, compute_size.y - 1))),
-        .alignment = std::max(hal.get_alignment(HalMemType::DRAM), hal.get_alignment(HalMemType::L1)),
+        .l1_alignment = hal.get_alignment(HalMemType::L1),
         .disable_interleaved = false});
     TT_FATAL(config.l1_small_size < (config.storage_core_bank_size.has_value()
                                          ? config.storage_core_bank_size.value()
                                          : config.worker_l1_size - config.l1_unreserved_base),
         "Reserved size must be less than bank size");
     TT_FATAL(
-        config.l1_small_size % config.alignment == 0,
-        "Reserved size must be aligned to allocator alignment {}",
-        config.alignment);
+        config.l1_small_size % config.l1_alignment == 0,
+        "Reserved size must be aligned to L1 allocator alignment {}",
+        config.l1_alignment);
     // Initialize dram_offsets from soc_descriptor
     for (auto channel = 0; channel < soc_desc.get_num_dram_channels(); channel++) {
         config.dram_bank_offsets.push_back(soc_desc.get_address_offset(channel));
@@ -1402,14 +1405,14 @@ allocator::Statistics Device::get_memory_allocation_statistics(const BufferType
     return allocator::get_statistics(*allocator, buffer_type);
 }
 
-uint32_t Device::get_allocator_alignment() const {
+uint32_t Device::get_allocator_alignment(const BufferType &buffer_type) const {
     const auto& allocator = this->get_initialized_allocator();
-    return allocator->config.alignment;
+    return allocator::get_alignment(*allocator, buffer_type);
 }
 
-uint32_t Device::get_allocator_alignment(SubDeviceId sub_device_id) const {
+uint32_t Device::get_allocator_alignment(const BufferType &buffer_type, SubDeviceId sub_device_id) const {
     const auto& allocator = this->get_initialized_allocator(sub_device_id);
-    return allocator->config.alignment;
+    return allocator::get_alignment(*allocator, buffer_type);
 }
 
 size_t Device::get_l1_small_size() const {
@@ -1887,9 +1890,7 @@ bool v1::CloseDevice(IDevice* device) { return v0::CloseDevice(device); }
 
 void v1::DeallocateBuffers(IDevice* device) { device->deallocate_buffers(); }
 
-void v1::DumpDeviceProfileResults(IDevice* device) {
-    detail::DumpDeviceProfileResults(device);
-}
+void v1::DumpDeviceProfileResults(IDevice* device) { detail::DumpDeviceProfileResults(device); }
 
 ARCH v1::GetArch(IDevice* device) { return device->arch(); }
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index e53c4572513..33bdda21a06 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -855,7 +855,7 @@ void detail::Program_::allocate_circular_buffers(const IDevice* device) {
             }
         }
     }
-    computed_addr = tt::align(computed_addr, device->get_allocator_alignment());
+    computed_addr = tt::align(computed_addr, device->get_allocator_alignment(BufferType::DRAM));
     for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) {
         for (CircularBufferAllocator &cb_allocator : this->cb_allocators_) {
             if (cb_allocator.core_range.intersects(core_range)) {
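Note that the L1 unreserved base (and `l1_small_size`) is rounded up to the DRAM alignment, not the L1 one: 32 B is a multiple of 16 B, so a DRAM-aligned base satisfies both memory spaces, presumably keeping NOC transfers between DRAM and the base of an L1 bank aligned on both ends. A worked example with hypothetical addresses:

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t align_up(uint32_t v, uint32_t alignment) {
    return ((v + alignment - 1) / alignment) * alignment;
}

int main() {
    constexpr uint32_t dram_alignment = 32, l1_alignment = 16;
    uint32_t unreserved_base = 0x1A970;  // hypothetical raw base: 16 B-aligned only
    assert(unreserved_base % l1_alignment == 0);
    assert(unreserved_base % dram_alignment != 0);
    uint32_t aligned_base = align_up(unreserved_base, dram_alignment);
    assert(aligned_base == 0x1A980);  // valid under both DRAM and L1 alignment rules
    return 0;
}
```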
diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp b/tt_metal/impl/sub_device/sub_device_manager.cpp
index 69e7d4ea257..e61320f57d7 100644
--- a/tt_metal/impl/sub_device/sub_device_manager.cpp
+++ b/tt_metal/impl/sub_device/sub_device_manager.cpp
@@ -261,6 +261,7 @@ void SubDeviceManager::populate_sub_allocators() {
             .dram_bank_size = 0,
             .dram_bank_offsets = global_allocator_config.dram_bank_offsets,
             .dram_unreserved_base = global_allocator_config.dram_unreserved_base,
+            .dram_alignment = global_allocator_config.dram_alignment,
             .l1_unreserved_base = global_allocator_config.l1_unreserved_base,
             .worker_grid = compute_cores,
             .worker_l1_size = global_allocator_config.l1_unreserved_base + local_l1_size_,
@@ -272,7 +273,7 @@ void SubDeviceManager::populate_sub_allocators() {
             .worker_log_to_virtual_routing_y = global_allocator_config.worker_log_to_virtual_routing_y,
             .l1_bank_remap = std::move(l1_bank_remap),
             .compute_grid = compute_cores,
-            .alignment = global_allocator_config.alignment,
+            .l1_alignment = global_allocator_config.l1_alignment,
             .disable_interleaved = true});
         TT_FATAL(
             config.l1_small_size < (config.storage_core_bank_size.has_value()
@@ -280,9 +281,9 @@ void SubDeviceManager::populate_sub_allocators() {
                                         : config.worker_l1_size - config.l1_unreserved_base),
             "Reserved size must be less than bank size");
         TT_FATAL(
-            config.l1_small_size % config.alignment == 0,
-            "Reserved size must be aligned to allocator alignment {}",
-            config.alignment);
+            config.l1_small_size % config.l1_alignment == 0,
+            "Reserved size must be aligned to allocator L1 alignment {}",
+            config.l1_alignment);
 
         // sub_devices only have compute cores for allocation
         for (const CoreCoord& core : corerange_to_cores(compute_cores)) {
diff --git a/tt_metal/programming_examples/sharding/shard_data_rm.cpp b/tt_metal/programming_examples/sharding/shard_data_rm.cpp
index 6bcf9cf9385..d255077389a 100644
--- a/tt_metal/programming_examples/sharding/shard_data_rm.cpp
+++ b/tt_metal/programming_examples/sharding/shard_data_rm.cpp
@@ -45,7 +45,7 @@ int main(int argc, char** argv) {
     uint32_t input_unit_size = sizeof(uint32_t);
     uint32_t shard_width_bytes = shard_width * data_size;
     uint32_t num_units_per_row = shard_width * input_unit_size;
-    uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
+    uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::DRAM));
 
     // configure and create interleaved DRAM buffer to insert source data into
     uint32_t src_buffer_size = input_unit_size * num_values / data_size;
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
index 1df0a085bd7..79498305904 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -1059,7 +1059,7 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
         } else if (output_dtype == DataType::FLOAT32) {
             per_core_out_width_aligned *= 4;
         }
-        output_size = round_up(per_core_out_width_aligned, 32) * pconfig.per_core_out_matrix_height;
+        output_size = round_up(per_core_out_width_aligned, 16) * pconfig.per_core_out_matrix_height;
     } else {
         output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;
     }
@@ -1162,7 +1162,7 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
         } else if (output_dtype == DataType::FLOAT32) {
             per_core_out_width_aligned *= 4;
         }
-        output_size = round_up(per_core_out_width_aligned, 32) * pconfig.per_core_out_matrix_height;
+        output_size = round_up(per_core_out_width_aligned, 16) * pconfig.per_core_out_matrix_height;
     } else {
         output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;
     }
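The conv2d L1-usage model follows suit: a row-major per-core output row is now padded to 16 B rather than 32 B, so the estimate shrinks for widths that are not multiples of 32 B. With assumed numbers:

```cpp
#include <cstdint>

constexpr uint32_t round_up(uint32_t v, uint32_t m) { return ((v + m - 1) / m) * m; }

// A bfloat16 output row of 200 elements is 400 bytes wide:
static_assert(round_up(200 * 2, 32) == 416);  // old per-row estimate (DRAM-style padding)
static_assert(round_up(200 * 2, 16) == 400);  // new per-row estimate (L1 padding)
```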
diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
index f3872ff1581..472d25a5f14 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
@@ -525,10 +525,11 @@ tt_metal::operation::ProgramWithCallbacks concat_multi_core(
 
     uint32_t num_output_pages;
     uint32_t single_page_size;
+    uint32_t common_align_len = std::max(input_tensors[0].buffer()->alignment(), output.buffer()->alignment());
     if (rm_layout) {
         num_output_pages = output.volume() / output.get_padded_shape()[-1];
         single_page_size =
-            tt::align(output.element_size() * output.get_padded_shape()[-1], output.buffer()->alignment());
+            tt::align(output.element_size() * output.get_padded_shape()[-1], common_align_len);
     } else {
         num_output_pages = output.volume() / TILE_HW;
         single_page_size = tt_metal::detail::TileSize(cb_data_format);
diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp
index 3ab6978d611..99b4766631b 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp
@@ -39,7 +39,7 @@ Fold::MultiCore::cached_program_t fold_multi_core(
 
     // input CB
     uint32_t cb_src0_index = tt::CBIndex::c_0;
-    uint32_t aligned_pixel_size = round_up_to_mul32(pixel_size);
+    uint32_t aligned_pixel_size = round_up_to_mul16(pixel_size);
     auto src_cb_config = CircularBufferConfig(num_pixels * aligned_pixel_size, {{cb_src0_index, cb_data_format}})
                              .set_page_size(cb_src0_index, aligned_pixel_size)
                              .set_globally_allocated_address(*input.buffer());
@@ -47,7 +47,7 @@ Fold::MultiCore::cached_program_t fold_multi_core(
 
     // output CB
     uint32_t cb_dst0_index = tt::CBIndex::c_16;
-    uint32_t aligned_dst_pixel_size = round_up_to_mul32(dst_pixel_size);
+    uint32_t aligned_dst_pixel_size = round_up_to_mul16(dst_pixel_size);
     auto dst_cb_config =
         CircularBufferConfig(num_dst_pixels * aligned_dst_pixel_size, {{cb_dst0_index, cb_data_format}})
             .set_page_size(cb_dst0_index, aligned_dst_pixel_size)
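concat now pads its pages to the stricter of the input and output buffer alignments, while fold packs circular-buffer pages to 16 B. The fold change, sketched for an assumed 8-channel bfloat16 pixel (16 bytes):

```cpp
#include <cstdint>

constexpr uint32_t round_up_to_mul16(uint32_t v) { return (v + 15) & ~15u; }
constexpr uint32_t round_up_to_mul32(uint32_t v) { return (v + 31) & ~31u; }

static_assert(round_up_to_mul32(16) == 32);  // old CB page: 16 wasted bytes per pixel
static_assert(round_up_to_mul16(16) == 16);  // new CB page: tightly packed
```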
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
index 9fb9a819007..7f4d30c28ab 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
@@ -1431,14 +1431,12 @@ operation::ProgramWithCallbacks pad_rm_sharded_width_only(
         TT_THROW("ttnn.pad: unsupported data type for pad_rm_sharded_stickwise");
     }
 
-    // FIXME: assumes that this was sharded using DRAM alignment so that gaps are left in the tensor.
-    // if this changes, we should change the stick step to be 16B (L1 alignment).
-    auto dram_alignment_bytes = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::DRAM);
+    auto l1_alignment_bytes = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1);
     uint32_t padded_stick_step = tt::round_up(
-        padded_stick_bytes, dram_alignment_bytes);  // round padded_stick bytes to a multiple of dram_alignment_bytes
+        padded_stick_bytes, l1_alignment_bytes);  // round padded_stick bytes to a multiple of l1_alignment_bytes
     uint32_t unpadded_stick_step = tt::round_up(
         unpadded_stick_bytes,
-        dram_alignment_bytes);  // round unpadded_stick bytes to a multiple of dram_alignment_bytes
+        l1_alignment_bytes);  // round unpadded_stick bytes to a multiple of l1_alignment_bytes
 
     std::vector<uint32_t> reader_ct_args = {
         unpadded_stick_bytes,
diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp
index 835773e8a0a..26eb50cb479 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp
@@ -6,9 +6,9 @@
 #include "dataflow_api.h"
 
 void kernel_main() {
-
-    constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0);
+    constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0);
     constexpr bool read_from_dram = get_compile_time_arg_val(1);
+    constexpr uint32_t scratch_cb_id = get_compile_time_arg_val(2);
 
     uint32_t src_addr = get_arg_val<uint32_t>(0);
     uint32_t write_offset = get_arg_val<uint32_t>(1);
@@ -17,12 +17,25 @@ void kernel_main() {
     uint32_t args_idx = 0;
     uint32_t l1_write_addr = get_write_ptr(shard_cb_id) + write_offset;
 
+    cb_reserve_back(scratch_cb_id, 1);
+    uint32_t scratch_l1_write_addr = get_write_ptr(scratch_cb_id);
+    uint64_t scratch_l1_noc_read_addr = get_noc_addr(scratch_l1_write_addr);
+
     for (uint32_t i = 0; i < num_reads; ++i) {
         uint32_t bank_id = args[args_idx++];
         uint32_t addr = src_addr + args[args_idx++];
-        uint32_t read_size = args[args_idx++];
-        noc_async_read(get_noc_addr_from_bank_id<read_from_dram>(bank_id, addr), l1_write_addr, read_size);
-        l1_write_addr += read_size;
+        uint32_t units_to_transfer = args[args_idx++];
+        uint32_t unit_size = args[args_idx++];
+        uint32_t read_stride_bytes = args[args_idx++];
+        uint32_t write_stride_bytes = args[args_idx++];
+        uint64_t read_addr = get_noc_addr_from_bank_id<read_from_dram>(bank_id, addr);
+        for (uint32_t unit_idx = 0; unit_idx < units_to_transfer; ++unit_idx) {
+            noc_async_read(read_addr, scratch_l1_write_addr, unit_size);
+            noc_async_read_barrier();
+            noc_async_read(scratch_l1_noc_read_addr, l1_write_addr, unit_size);
+            read_addr += read_stride_bytes;
+            l1_write_addr += write_stride_bytes;
+            noc_async_read_barrier();
+        }
     }
-    noc_async_read_barrier();
 }
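Because the packed L1 destination may only be 16 B-aligned while a DRAM source is laid out at 32 B strides, the reader bounces each unit through the scratch CB (per the factory's "scratch pad for unaligned DRAM access" comment below): hop one lands the unit in scratch, hop two copies it from scratch to its packed destination, with a barrier after each hop since the second read consumes the first. A host-side model of the stride walk, assuming unit_size = 40, so read_stride_bytes = tt::align(40, 32) = 64 for a DRAM source and write_stride_bytes = tt::align(40, 16) = 48:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t unit_size = 40;           // assumed payload bytes per unit
    const uint32_t read_stride_bytes = 64;   // DRAM-aligned source layout
    const uint32_t write_stride_bytes = 48;  // packed L1 destination layout
    uint64_t read_addr = 0x1000;             // hypothetical source offset
    uint32_t l1_write_addr = 0x100;          // hypothetical destination offset
    for (uint32_t unit_idx = 0; unit_idx < 4; ++unit_idx) {
        // hop 1: source -> scratch; hop 2: scratch -> destination (modeled as one step)
        printf("unit %u: read 0x%llx -> write 0x%x (%u bytes)\n",
               unit_idx, (unsigned long long)read_addr, (unsigned)l1_write_addr, (unsigned)unit_size);
        read_addr += read_stride_bytes;
        l1_write_addr += write_stride_bytes;
    }
    return 0;
}
```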
diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp
index 4c1f4f9d59f..3dcb2c796e0 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp
@@ -20,7 +20,10 @@ void kernel_main() {
     for (uint32_t i = 0; i < num_writes; ++i) {
         uint32_t bank_id = args[args_idx++];
         uint32_t addr = dst_addr + args[args_idx++];
-        uint32_t write_size = args[args_idx++];
+        uint32_t units_to_transfer = args[args_idx++];
+        uint32_t unit_size = args[args_idx++];
+        args_idx += 2;  // Skip read_stride_bytes, write_stride_bytes
+        uint32_t write_size = units_to_transfer * unit_size;
         noc_async_write(l1_read_addr, get_noc_addr_from_bank_id(bank_id, addr), write_size);
         l1_read_addr += write_size;
     }
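Both kernels now walk one six-field runtime-arg record per transfer; the writer only needs the total byte count and skips the strides, since its L1 source is already packed. A sketch of the shared record (a hypothetical struct; the kernels themselves read raw uint32_t args):

```cpp
#include <cstdint>

struct TransferArgs {
    uint32_t bank_id;
    uint32_t bank_offset_bytes;   // offset of the transfer within the remote bank
    uint32_t units_to_transfer;
    uint32_t unit_size;
    uint32_t read_stride_bytes;   // used by the reader only
    uint32_t write_stride_bytes;  // used by the reader only
};

constexpr uint32_t write_size_bytes(const TransferArgs& a) {
    return a.units_to_transfer * a.unit_size;  // writer sends one contiguous span
}
```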
"ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp" @@ -341,13 +348,13 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu program, kernel_name, all_cores, - tt::tt_metal::ReaderDataMovementConfig({cb_index, interface_with_dram})); + tt::tt_metal::ReaderDataMovementConfig({cb_index, interface_with_dram, scratch_cb_index_k0})); tt::tt_metal::KernelHandle kernel_id_1 = tt::tt_metal::CreateKernel( program, kernel_name, all_cores, - tt::tt_metal::WriterDataMovementConfig({cb_index, interface_with_dram})); + tt::tt_metal::WriterDataMovementConfig({cb_index, interface_with_dram, scratch_cb_index_k1})); tt::tt_metal::CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(total_size, {{cb_index, data_format}}) @@ -388,9 +395,14 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[remote_core_idx])[0]; kernel_args.insert( kernel_args.end(), - {bank_id, - (remote_units_per_shard - remote_core_units_rem) * unit_size, - units_to_transfer * unit_size}); + { + bank_id, + (remote_units_per_shard - remote_core_units_rem) * unit_size, + units_to_transfer, + unit_size, + read_stride_bytes, + write_stride_bytes, + }); local_units_per_core -= units_to_transfer; local_units_to_transfer -= units_to_transfer; remote_core_units_rem -= units_to_transfer; @@ -401,6 +413,17 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu SetRuntimeArgs(program, kernel_id, core, kernel_args); } } + // Set up scratch pad for unaligned DRAM access + uint32_t aligned_cb_bytes = tt::align(unit_size, hal.get_alignment(HalMemType::DRAM)); + + tt::tt_metal::CircularBufferConfig scratch_cb_k0_config = + tt::tt_metal::CircularBufferConfig(unit_size, {{scratch_cb_index_k0, data_format}}) + .set_page_size(scratch_cb_index_k0, 1); + auto cb_scratch_k0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, scratch_cb_k0_config); + tt::tt_metal::CircularBufferConfig scratch_cb_k1_config = + tt::tt_metal::CircularBufferConfig(unit_size, {{scratch_cb_index_k1, data_format}}) + .set_page_size(scratch_cb_index_k1, 1); + auto cb_scratch_k1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, scratch_cb_k1_config); auto override_runtime_arguments_callback = [kernel_id_0, kernel_id_1, cb_0, local_cores]( const void* operation, diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp index 37af5cf65c0..384eb94677f 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp @@ -171,7 +171,8 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded( uint32_t per_core_N = a.shard_spec().value().shape[1]; uint32_t per_core_Mt = per_core_M / TILE_HEIGHT; uint32_t per_core_Nt = (per_core_N + TILE_WIDTH - 1) / TILE_WIDTH; - uint32_t per_core_N_bytes_padded = round_up_to_mul32(per_core_N * datum_size_bytes); + uint32_t l1_alignment = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1); + uint32_t per_core_N_bytes_padded = tt::round_up(per_core_N * datum_size_bytes, l1_alignment); bool reader_repack_output = (per_core_N % TILE_WIDTH) != 0; bool tilize_in = a.get_layout() == Layout::ROW_MAJOR; bool untilize_out = 
diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
index 37af5cf65c0..384eb94677f 100644
--- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
@@ -171,7 +171,8 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
     uint32_t per_core_N = a.shard_spec().value().shape[1];
     uint32_t per_core_Mt = per_core_M / TILE_HEIGHT;
     uint32_t per_core_Nt = (per_core_N + TILE_WIDTH - 1) / TILE_WIDTH;
-    uint32_t per_core_N_bytes_padded = round_up_to_mul32(per_core_N * datum_size_bytes);
+    uint32_t l1_alignment = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1);
+    uint32_t per_core_N_bytes_padded = tt::round_up(per_core_N * datum_size_bytes, l1_alignment);
     bool reader_repack_output = (per_core_N % TILE_WIDTH) != 0;
     bool tilize_in = a.get_layout() == Layout::ROW_MAJOR;
     bool untilize_out = output.get_layout() == Layout::ROW_MAJOR;
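groupnorm closes out the same pattern: the padded per-core row stride queries the HAL's L1 alignment instead of hard-coding a multiple of 32. For an assumed bfloat16 shard with per_core_N = 72 (144 bytes per row):

```cpp
#include <cstdint>

constexpr uint32_t round_up(uint32_t v, uint32_t m) { return ((v + m - 1) / m) * m; }

static_assert(round_up(72 * 2, 32) == 160);  // old round_up_to_mul32: 16 B padding per row
static_assert(round_up(72 * 2, 16) == 144);  // new tt::round_up to L1 alignment: no padding
```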