From 386992af59623daa49a1f1ef1a22a9852b8cca70 Mon Sep 17 00:00:00 2001
From: Sankar Manoj <smanoj@tenstorrent.com>
Date: Sun, 12 Jan 2025 06:30:20 +0000
Subject: [PATCH 1/3] #0: First commit for loading weights on device

#0: WIP Conv device weights

#0: WIP Conv device weights

#0: Conv device weights

#0: 80% pass for loading weights on device

#0: Shallow conv support

#0: rebase fix

#0: Fix pad by using multicore

#0: Fix pad by using multicore

#0: Fix OOM for pad

#0: Fix device weights

#0: Re-enable tests

#0: Re-enable tests

#0: Re-enable tests

#0: Fix OOM for pad

#0: Build fix

#0: Build fix

#0: Re-enable transpose shards for Conv2D Unit Tests

#0: Tests fix

#0: Tests fix

#0: Rebase fi

#0: Tests fix

#0: Skip weights bfloat8 on grayskull

#0: Reverted types

#0: Add flag for always preprocessing weights

#0: Preprocess bias on device

#0: Fix conv bias

#0: Rebase fix

#0: Rebase fix

#0: Bug fix

#0: Skip test on N300

#18185: Change order of pad & permute

#0: Fix sweep

#0: Changed default for preprocess weights on device to false
---
 .../sweep_utils/conv2d_common.py              |   5 +-
 .../unit_tests/operations/test_new_conv2d.py  |  42 ++-
 .../operations/test_prepare_conv_weights.py   | 130 --------
 .../ttnn/operations/conv/conv2d/conv2d.cpp    |  49 ++-
 .../operations/conv/conv2d/conv2d_pybind.cpp  |   6 +
 .../operations/conv/conv2d/conv2d_utils.cpp   |   7 +-
 .../conv/conv2d/device/conv2d_op.hpp          |  11 +
 .../conv2d_op_sharded_program_factory.cpp     | 151 +++++++--
 .../conv/conv2d/prepare_conv2d_weights.cpp    | 313 +++++++++++++++++-
 .../conv/conv2d/prepare_conv2d_weights.hpp    |  16 +
 .../pad/device/pad_program_factory.cpp        |  15 +-
 .../ttnn/operations/data_movement/pad/pad.cpp |  12 +-
 12 files changed, 566 insertions(+), 191 deletions(-)

diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py
index 1c18de54308..cabe82f80d3 100644
--- a/tests/sweep_framework/sweep_utils/conv2d_common.py
+++ b/tests/sweep_framework/sweep_utils/conv2d_common.py
@@ -275,6 +275,7 @@ def run_conv2d_short_sweep(
             dtype=output_dtype,
             weights_dtype=weights_dtype,
             output_layout=output_layout,
+            preprocess_weights_on_device=True,
         )
     else:
         tt_weight_tensor = ttnn.from_torch(torch_weight_tensor, ttnn.bfloat16)
@@ -282,7 +283,9 @@ def run_conv2d_short_sweep(
             tt_bias_tensor = ttnn.from_torch(torch_bias_tensor, ttnn.bfloat16)
 
         tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16, device=device)
-        conv_config = ttnn.Conv2dConfig()
+        conv_config = ttnn.Conv2dConfig(
+            preprocess_weights_on_device=True,
+        )
 
     start_time = start_measuring_time()
     [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d(
diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index dbc28079e16..ec130ab4c45 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -58,6 +58,7 @@ def run_conv(
     config_override,
     dilation=1,
     use_shallow_conv_variant=False,
+    transpose_shards=True,  # https://github.com/tenstorrent/tt-metal/issues/17897
     fp32_accum=False,
     packer_l1_acc=False,
     output_layout=ttnn.TILE_LAYOUT,
@@ -72,7 +73,11 @@ def run_conv(
     weight_mesh_mapper=None,
     output_mesh_composer=None,
     enable_split_reader=False,
+<<<<<<< HEAD
     activation="",
+=======
+    preprocess_weights_on_device=True,
+>>>>>>> 55b6f9b444 (#0: First commit for loading weights on device)
 ):
     if isinstance(device, ttnn.MeshDevice):
         assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh"
@@ -92,7 +97,7 @@ def run_conv(
     torch_input_tensor = torch.permute(torch_input_tensor_nchw, (0, 2, 3, 1))
 
     torch_weight_tensor = randomize_torch_tensor(torch_tensor_map, conv_weight_shape)
-    torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) if has_bias else None
+    torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) * 10 if has_bias else None
 
     torch_out_golden_tensor = torch.nn.functional.conv2d(
         torch_input_tensor_nchw,
@@ -138,6 +143,9 @@ def run_conv(
         enable_subblock_padding=False,
         output_layout=output_layout,
         activation=activation,
+        transpose_shards=transpose_shards,
+        preprocess_weights_on_device=preprocess_weights_on_device,
+        always_preprocess_weights=True,
     )
     compute_config = ttnn.init_device_compute_kernel_config(
         device.arch(),
@@ -157,7 +165,7 @@ def run_conv(
             conv_config.override_sharding_config = True
             print("Setting num_cores_nhw to 98")
 
-    [tt_output_tensor_on_device, [out_height, out_width]] = ttnn.conv2d(
+    [tt_output_tensor_on_device, [out_height, out_width], [d_w, d_b]] = ttnn.conv2d(
         input_tensor=tt_input_tensor,
         weight_tensor=tt_weight_tensor,
         in_channels=input_channels,
@@ -178,8 +186,8 @@ def run_conv(
         groups=groups,
         memory_config=memory_config,
         return_output_dim=True,
+        return_weights_and_bias=True,
     )
-
     tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device)
     torch_output_tensor = ttnn.to_torch(tt_output_tensor, mesh_composer=output_mesh_composer)
 
@@ -195,6 +203,8 @@ def run_conv(
 
     if not fp32_accum:
         pcc = 0.985
+        if input_channels * filter_height * filter_width > 10000:
+            pcc = 0.97
     elif math_fidelity == ttnn.MathFidelity.LoFi and activations_dtype == ttnn.bfloat8_b:
         pcc = 0.996
     else:
@@ -388,6 +398,9 @@ def test_conv_features(
     if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat8_b:
         pytest.skip("Row major layout not compatible with bfloat8_b")
 
+    if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat16 and packer_l1_acc and fp32_accum:
+        pytest.skip("skipping due to pack_untilize_dst issue!")
+
     run_conv(
         device,
         torch_tensor_map,
@@ -411,6 +424,7 @@ def test_conv_features(
         has_bias=True,
         fp32_accum=fp32_accum,
         packer_l1_acc=packer_l1_acc,
+        preprocess_weights_on_device=True,
     )
 
 
@@ -782,7 +796,7 @@ def test_conv_for_segformer_512x512(
 )
 @pytest.mark.parametrize(
     "weights_dtype",
-    [ttnn.bfloat16, ttnn.bfloat8_b],
+    [ttnn.bfloat16],
 )
 @pytest.mark.parametrize(
     "activations_dtype",
@@ -965,6 +979,7 @@ def test_resnet50_conv_wh(
         pad_w,
         config_override=config_override,
         use_shallow_conv_variant=use_shallow_conv_variant,
+        transpose_shards=True,  ## use RM (transpose_mcast=False) with 2D on WH
         packer_l1_acc=packer_l1_acc,
         fp32_accum=False,
         has_bias=has_bias,
@@ -1026,6 +1041,7 @@ def test_conv_mem_config_wh(
         shard_layout=shard_layout,
         config_override=config_override,
         use_shallow_conv_variant=use_shallow_conv_variant,
+        transpose_shards=True,  ## use RM (transpose_mcast=False) with 2D on WH
         packer_l1_acc=True,
         fp32_accum=False,
         has_bias=True,
@@ -1211,7 +1227,7 @@ def test_resnet50_conv_wh_fp32(
 )
 @pytest.mark.parametrize(
     "weights_dtype",
-    [ttnn.bfloat8_b],
+    [ttnn.bfloat16],
 )
 @pytest.mark.parametrize(
     "activations_dtype",
@@ -1353,7 +1369,7 @@ def test_sd_conv(
 )
 @pytest.mark.parametrize(
     "activations_dtype",
-    [ttnn.bfloat16, ttnn.bfloat8_b],
+    [ttnn.bfloat16],
 )
 @pytest.mark.parametrize(
     "fp32_accum",
@@ -1494,7 +1510,7 @@ def test_sd_conv_wh(
 )
 @pytest.mark.parametrize(
     "weights_dtype",
-    [ttnn.bfloat8_b],
+    [ttnn.bfloat16],
 )
 @pytest.mark.parametrize(
     "activations_dtype",
@@ -1646,6 +1662,7 @@ def test_unet_conv_wh(
         config_override,
         shard_layout=shard_layout,
         use_shallow_conv_variant=use_shallow_conv_variant,
+        transpose_shards=True,  ## use RM (transpose_mcast=False) with 2D on WH
         output_layout=output_layout,
         auto_shard=auto_shard,
     )
@@ -1744,6 +1761,7 @@ def test_unet_conv_groups_2_wh(
         config_override,
         shard_layout=shard_layout,
         use_shallow_conv_variant=use_shallow_conv_variant,
+        transpose_shards=True,  ## use RM (transpose_mcast=False) with 2D on WH
         output_layout=output_layout,
         auto_shard=auto_shard,
         groups=groups,
@@ -1841,6 +1859,7 @@ def test_unet_conv_groups_4_6_wh(
         config_override,
         shard_layout=shard_layout,
         use_shallow_conv_variant=use_shallow_conv_variant,
+        transpose_shards=True,  ## use RM (transpose_mcast=False) with 2D on WH
         output_layout=output_layout,
         groups=groups,
     )
@@ -1939,12 +1958,14 @@ def test_unet_conv_groups_8_wh(
         config_override,
         shard_layout=shard_layout,
         use_shallow_conv_variant=use_shallow_conv_variant,
+        transpose_shards=True,  ## use RM (transpose_mcast=False) with 2D on WH
         output_layout=output_layout,
         auto_shard=auto_shard,
         groups=groups,
     )
 
 
+@skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 @pytest.mark.parametrize(
     "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, config_override",
@@ -2006,6 +2027,7 @@ def test_halo_reshard_conv(
     )
 
 
+@skip_for_grayskull()
 @pytest.mark.skip("New API needs to be tested")
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 @pytest.mark.parametrize(
@@ -2247,6 +2269,7 @@ def test_conv_groups(
     )
 
 
+@skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 @pytest.mark.parametrize(
     "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, groups",
@@ -2367,6 +2390,7 @@ def test_yolov4_conv_groups_larger_than_one(
     )
 
 
+@skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 @pytest.mark.parametrize(
     " output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, groups",
@@ -2655,6 +2679,7 @@ def test_shallow_conv_with_tiled_input(device):
 
 # Tests running conv2d which maps to matmul w/o sharding the input tensor.
 # Output tensor is in DRAM.
+@skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 @pytest.mark.parametrize("tiled_input", [True, False])
 @pytest.mark.parametrize("input_on_device", [True, False])
@@ -2780,6 +2805,9 @@ def test_small_in_large_out_channels_auto_shard(device, torch_tensor_map):
     padding = (0, 0)
     height = 128
     width = 128
+    if device.core_grid.y != 8 and is_wormhole_b0():
+        pytest.skip("Needs 8x8 grid for wormhole_b0")
+
     run_conv(
         device,
         torch_tensor_map,
diff --git a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py
index c71c5cfbd26..1543913a051 100644
--- a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py
+++ b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py
@@ -196,133 +196,3 @@ def test_prepare_conv_weights(
     passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)
     logger.info(f"PCC = {pcc_msg}. Threshold = {pcc}")
     assert passing
-
-
-@skip_for_grayskull()
-@skip_for_blackhole()
-# @skip_for_wormhole_b0()
-@pytest.mark.parametrize(
-    "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
-    (
-        # rn50 layer1
-        (8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
-        (16, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
-        (20, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
-    ),
-)
-@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"])
-@pytest.mark.parametrize("has_bias", [True, False], ids=["has_bias", "no_bias"])
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 2**15}], indirect=True)
-def test_prepare_bias(
-    batch_size,
-    output_channels,
-    input_channels,
-    input_height,
-    input_width,
-    filter_height,
-    filter_width,
-    stride_h,
-    stride_w,
-    pad_h,
-    pad_w,
-    use_1d_systolic_array,
-    packer_l1_acc,
-    config_override,
-    has_bias,
-    device,
-):
-    if device.core_grid.y == 7:
-        pytest.skip("Issue #6992: Statically allocated circular buffers in program clash with L1 buffers on core range")
-
-    if batch_size == 20 and (
-        output_channels == 64 or (stride_h == 2 and (output_channels == 256 or output_channels == 128))
-    ):
-        pytest.skip("Skipping test because it won't fit in L1!")
-
-    inp_shape = (batch_size, input_channels, input_height, input_width)
-    conv_weight_shape = (output_channels, input_channels, filter_height, filter_width)
-    torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16)
-    torch_input_tensor = torch.randn(inp_shape, dtype=torch.bfloat16)
-    torch_bias_tensor = torch.randn((1, 1, 1, output_channels), dtype=torch.bfloat16) if has_bias else None
-
-    torch_out_golden_tensor = torch.nn.functional.conv2d(
-        torch_input_tensor,
-        torch_weight_tensor,
-        bias=torch_bias_tensor.reshape(-1) if has_bias else None,
-        stride=(stride_h, stride_w),
-        padding=(pad_h, pad_w),
-        dilation=(1, 1),
-        groups=1,
-    ).permute(0, 2, 3, 1)
-
-    tt_input_tensor = ttnn.from_torch(torch_input_tensor.transpose(-3, -2).transpose(-2, -1), ttnn.bfloat16)
-    tt_weight_tensor = ttnn.from_torch(torch_weight_tensor, ttnn.bfloat16)
-    tt_bias_tensor = ttnn.from_torch(torch_bias_tensor, ttnn.bfloat16) if has_bias else None
-
-    conv_config = ttnn.Conv2dConfig(
-        dtype=ttnn.bfloat16,
-        weights_dtype=ttnn.bfloat16,
-        input_channels_alignment=(16 if input_channels == 16 and input_height == 115 else 32),
-        enable_act_double_buffer=False,
-        enable_split_reader=False,
-        enable_subblock_padding=False,
-    )
-    compute_config = ttnn.init_device_compute_kernel_config(device.arch(), packer_l1_acc=packer_l1_acc)
-    if config_override and "act_block_h" in config_override:
-        conv_config.act_block_h_override = config_override["act_block_h"]
-
-    if config_override and "act_block_w_div" in config_override:
-        conv_config.act_block_w_div = config_override["act_block_w_div"]
-
-    if config_override and "num_cores_nhw" in config_override:
-        if config_override["num_cores_nhw"] == 98:
-            conv_config.core_grid = ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (11, 7)), ttnn.CoreRange((0, 8), (1, 8))})
-            conv_config.override_sharding_config = True
-            print("Setting num_cores_nhw to 98")
-
-    conv_kwargs = {
-        "input_layout": ttnn.ROW_MAJOR_LAYOUT,
-        "in_channels": input_channels,
-        "out_channels": output_channels,
-        "batch_size": batch_size,
-        "input_height": input_height,
-        "input_width": input_width,
-        "kernel_size": (filter_height, filter_width),
-        "stride": (stride_h, stride_w),
-        "padding": (pad_h, pad_w),
-        "dilation": (1, 1),
-        "groups": 1,
-        "device": device,
-        "conv_config": conv_config,
-    }
-
-    tt_input_tensor = ttnn.to_device(tt_input_tensor, device)
-
-    tt_bias_tensor_formatted = (
-        ttnn.prepare_conv_bias(
-            bias_tensor=tt_bias_tensor, input_memory_config=tt_input_tensor.memory_config(), **conv_kwargs
-        )
-        if has_bias
-        else None
-    )
-
-    tt_bias_tensor_formatted = ttnn.to_device(tt_bias_tensor_formatted, device) if has_bias else None
-    (k := next(iter(conv_kwargs)), conv_kwargs.pop(k))  ##removing 1st element from dict
-    tt_output_tensor_on_device = ttnn.conv2d(
-        input_tensor=tt_input_tensor,
-        weight_tensor=tt_weight_tensor,
-        bias_tensor=tt_bias_tensor_formatted,
-        **conv_kwargs,
-        compute_config=compute_config,
-    )
-
-    tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device)
-    torch_output_tensor = ttnn.to_torch(tt_output_tensor)
-
-    torch_output_tensor = torch_output_tensor[:, :, :, :output_channels]
-    torch_output_tensor = torch_output_tensor.reshape(torch_out_golden_tensor.shape)
-
-    pcc = 0.99
-    passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)
-    logger.info(f"PCC = {pcc_msg}. Threshold = {pcc}")
-    assert passing
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
index a3928a36629..3f856572366 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -119,22 +119,41 @@ Result conv2d(
     bool weight_is_on_device = ttnn::is_tensor_on_device_or_multidevice(weight_tensor);
     ttnn::Tensor weight_tensor_on_device = weight_tensor;
     std::optional<ttnn::Tensor> bias_tensor_on_device = bias_tensor;
-    if (!weight_is_on_device) {
+    if (!weight_is_on_device || conv_config.always_preprocess_weights) {
         // prepare weights in desired layout and move to device
-        tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_and_move_to_device(
-            weight_tensor,
-            bias_tensor,
-            conv_config.input_channels_alignment,
-            conv_config.weights_dtype,
-            opt_conv_op_block_config.act_block_w_ntiles,
-            opt_conv_op_block_config.out_subblock_w_ntiles,
-            parallel_config,
-            output_parallel_config,
-            device,
-            groups,
-            opt_conv_op_block_config.act_block_h_ntiles,
-            input_width,
-            true);
+
+        // TODO: Implement heuristic to decide if weights should be preprocessed on device.
+        if (conv_config.preprocess_weights_on_device == false) {
+            tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_and_move_to_device(
+                weight_tensor,
+                bias_tensor,
+                conv_config.input_channels_alignment,
+                conv_config.weights_dtype,
+                opt_conv_op_block_config.act_block_w_ntiles,
+                opt_conv_op_block_config.out_subblock_w_ntiles,
+                parallel_config,
+                output_parallel_config,
+                device,
+                groups,
+                opt_conv_op_block_config.act_block_h_ntiles,
+                input_width,
+                true);
+        } else {
+            tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_on_device(
+                weight_tensor,
+                bias_tensor,
+                conv_config.input_channels_alignment,
+                conv_config.weights_dtype,
+                opt_conv_op_block_config.act_block_w_ntiles,
+                opt_conv_op_block_config.out_subblock_w_ntiles,
+                parallel_config,
+                output_parallel_config,
+                device,
+                groups,
+                opt_conv_op_block_config.act_block_h_ntiles,
+                input_width,
+                true);
+        }
     }
     // if 1x1 conv w/ stride 1, convert input tensor to tile layout if required
     if (mm_conv) {
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
index 0591ed02d0c..af3c683b6db 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -335,6 +335,8 @@ void py_bind_conv2d(py::module& module) {
             bool,
             bool,
             bool,
+            bool,
+            bool,
             bool>(),
         py::kw_only(),
         py::arg("dtype") = DataType::BFLOAT16,
@@ -351,6 +353,8 @@ void py_bind_conv2d(py::module& module) {
         py::arg("core_grid") = std::nullopt,
         py::arg("transpose_shards") = true,
         py::arg("output_layout") = Layout::TILE,
+        py::arg("preprocess_weights_on_device") = false,
+        py::arg("always_preprocess_weights") = false,
         py::arg("enable_act_double_buffer") = false,
         py::arg("enable_weights_double_buffer") = false,
         py::arg("enable_split_reader") = false,
@@ -369,6 +373,8 @@ void py_bind_conv2d(py::module& module) {
     py_conv_config.def_readwrite("core_grid", &Conv2dConfig::core_grid);
     py_conv_config.def_readwrite("transpose_shards", &Conv2dConfig::transpose_shards);
     py_conv_config.def_readwrite("output_layout", &Conv2dConfig::output_layout);
+    py_conv_config.def_readwrite("preprocess_weights_on_device", &Conv2dConfig::preprocess_weights_on_device);
+    py_conv_config.def_readwrite("always_preprocess_weights", &Conv2dConfig::always_preprocess_weights);
     py_conv_config.def_readwrite("enable_act_double_buffer", &Conv2dConfig::enable_act_double_buffer);
     py_conv_config.def_readwrite("enable_weights_double_buffer", &Conv2dConfig::enable_weights_double_buffer);
     py_conv_config.def_readwrite("enable_split_reader", &Conv2dConfig::enable_split_reader);
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
index 6f67fb238a6..7bdc858a526 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -869,9 +869,12 @@ std::tuple<OptimizedConvParallelizationConfig, OptimizedConvBlockConfig, MemoryC
             get_num_cores_nhw_from_parallel_config(largest_parallel_config),
             get_num_cores_channels_from_parallel_config(largest_parallel_config));
 
+    uint32_t input_channels_alignment =
+        (input_parallel_config.shard_scheme == tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED)
+            ? 32
+            : conv_config.input_channels_alignment;
     uint32_t in_channels_padded = tt::round_up(
-        in_channels,
-        get_num_cores_channels_from_parallel_config(input_parallel_config) * conv_config.input_channels_alignment);
+        in_channels, get_num_cores_channels_from_parallel_config(input_parallel_config) * input_channels_alignment);
 
     uint32_t nhw_out_padded_ntile_per_core =
         conv_out_memory_config.shard_spec.value().shape[0] / tt::constants::TILE_HEIGHT;
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
index 04557524b76..254614cd532 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <optional>
 #include "ttnn/operations/sliding_window/sliding_window.hpp"
 #include "ttnn/tensor/tensor.hpp"
 #include "ttnn/run_operation.hpp"
@@ -61,6 +62,13 @@ struct Conv2dConfig {
     // BFLOAT8 is always Tile layout.
     Layout output_layout = Layout::TILE;
 
+    // Select between preprocessing weights on device or on host.
+    bool preprocess_weights_on_device = false;
+
+    // If false, only preprocess weights if they are originally located on host.
+    // If true, preprocess weights regarding of original location.
+    bool always_preprocess_weights = false;
+
     // Doubles the size of the CBs for activation.
     // Increased perf, but increased L1 usage.
     bool enable_act_double_buffer = false;
@@ -73,6 +81,7 @@ struct Conv2dConfig {
     bool enable_split_reader = false;
 
     bool enable_subblock_padding = false;
+
     static constexpr auto attribute_names = std::make_tuple(
         "dtype",
         "weights_dtype",
@@ -88,6 +97,7 @@ struct Conv2dConfig {
         "core_grid",
         "transpose_shards",
         "output_layout",
+        "preprocess_weights_on_device",
         "enable_act_double_buffer",
         "enable_weights_double_buffer",
         "enable_split_reader",
@@ -108,6 +118,7 @@ struct Conv2dConfig {
             std::cref(this->core_grid),
             std::cref(this->transpose_shards),
             std::cref(this->output_layout),
+            std::cref(this->preprocess_weights_on_device),
             std::cref(this->enable_act_double_buffer),
             std::cref(this->enable_weights_double_buffer),
             std::cref(this->enable_split_reader),
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp
index 32fd24971e8..ce2999e4ca8 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp
@@ -474,7 +474,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
         }
     }
 
-    // assert(out_block_h_ntiles == act_block_h_ntiles); // TODO: fix output block sizing
+    // TT_FATAL(out_block_h_ntiles == act_block_h_ntiles); // TODO: fix output block sizing
     TT_FATAL(
         out_block_h_ntiles >= act_block_h_ntiles,
         "Output block height (in # of tiles) ({}) should be greater than or equal to activation block height (in # of "
@@ -578,8 +578,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
             sliding_window_config,
             parallelization_config.num_cores_nhw,
             out_block_h_ntiles);
-    assert(act_matrix_shape.size() == 3);
-    assert(act_matrix_shape[0] == 1);
+    TT_FATAL(act_matrix_shape.size() == 3, "act_matrix_shape should have be of size 3");
+    TT_FATAL(act_matrix_shape[0] == 1, "act_matrix_shape should have 1 as the first dimension");
     uint32_t act_matrix_height = (uint32_t)act_matrix_shape[1];
     uint32_t act_matrix_width = (uint32_t)act_matrix_shape[2];
     if (block_sharded) {
@@ -589,7 +589,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
     uint32_t act_matrix_height_unpadded = (uint32_t)act_matrix_shape_unpadded[1];
     uint32_t act_matrix_width_unpadded = (uint32_t)act_matrix_shape_unpadded[2];
 
-    // TODO: Move all these asserts/checks to validate?
+    // TODO: Move all these TT_FATALs/checks to validate?
 
     uint32_t input_width = ashape[2];
     uint32_t input_channels = ashape[3];
@@ -611,7 +611,10 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
     // matrix multiplication shape check valid for all convs except depthwise conv1d
     if (!is_conv_1d_depthwise_conv) {
         TT_FATAL(
-            act_matrix_width == weight_matrix_height, "The width of tensor a needs to match the height of tensor b");
+            act_matrix_width == weight_matrix_height,
+            "The width of tensor a {} needs to match the height of tensor b {}",
+            act_matrix_width,
+            weight_matrix_height);
     }
     // Tile size divisibility checks
     TT_FATAL(act_matrix_height % TILE_HEIGHT == 0, "Height of activation matrix needs to be divisible by 32");
@@ -635,10 +638,26 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
     uint32_t act_matrix_height_ntiles = act_matrix_height / TILE_HEIGHT;
     uint32_t act_matrix_width_ntiles = act_matrix_width / TILE_WIDTH;
 
-    assert(act_matrix_height_ntiles % act_block_h_ntiles == 0);
-    assert(act_matrix_width_ntiles % act_block_w_ntiles == 0);
-    assert(weight_matrix_width_ntiles % weight_block_w_ntiles == 0);
-    assert(act_matrix_height_ntiles % out_block_h_ntiles == 0);
+    TT_FATAL(
+        act_matrix_height_ntiles % act_block_h_ntiles == 0,
+        "act_matrix_height_ntiles {} should be divisible by act_block_h_ntiles {}",
+        act_matrix_height_ntiles,
+        act_block_h_ntiles);
+    TT_FATAL(
+        act_matrix_width_ntiles % act_block_w_ntiles == 0,
+        "act_matrix_width_ntiles {} should be divisible by act_block_w_ntiles {}",
+        act_matrix_width_ntiles,
+        act_block_w_ntiles);
+    TT_FATAL(
+        weight_matrix_width_ntiles % weight_block_w_ntiles == 0,
+        "weight_matrix_width_ntiles {} should be divisible by weight_block_w_ntiles {}",
+        weight_matrix_width_ntiles,
+        weight_block_w_ntiles);
+    TT_FATAL(
+        act_matrix_height_ntiles % out_block_h_ntiles == 0,
+        "act_matrix_height_ntiles {} should be divisible by out_block_h_ntiles {}",
+        act_matrix_height_ntiles,
+        out_block_h_ntiles);
 
     uint32_t num_blocks_act_h = act_matrix_height_ntiles / act_block_h_ntiles;
     uint32_t num_blocks_out_h = act_matrix_height_ntiles / out_block_h_ntiles;
@@ -672,7 +691,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 
     // weight block info
     uint32_t weight_block_w_datums = weight_matrix_width / num_blocks_weight_w;
-    assert(weight_block_w_ntiles % out_subblock_w_ntiles == 0);
+    TT_FATAL(
+        weight_block_w_ntiles % out_subblock_w_ntiles == 0,
+        "weight_block_w_ntiles {} should be divisible by weight_block_w_ntiles {}",
+        weight_block_w_ntiles,
+        out_subblock_w_ntiles);
     uint32_t weight_num_subblocks = weight_block_w_ntiles / out_subblock_w_ntiles;
     uint32_t weight_block_h_ntiles = is_conv_1d_depthwise_conv ? act_block_h_ntiles : act_block_w_ntiles;
     uint32_t weight_block_num_tiles = weight_block_w_ntiles * weight_block_h_ntiles;
@@ -681,14 +704,21 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
     // writer of conv op partially removes padding on the width
     // it removes the padding done for block width but it doesn't remove padding done for tiled width
     uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH);
-    assert(output_channels_padded_to_tile_width <= weight_matrix_width);
+    TT_FATAL(
+        output_channels_padded_to_tile_width <= weight_matrix_width,
+        "output_channels_padded_to_tile_width {} should be less than or equal to weight_matrix_width {}",
+        output_channels_padded_to_tile_width,
+        weight_matrix_width);
     uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH;
     uint32_t num_blocks_output_w =
         (uint32_t)std::ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums);
     uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0)
                                            ? weight_block_w_datums
                                            : (output_channels_padded_to_tile_width % weight_block_w_datums);
-    assert(last_block_width_datums % TILE_WIDTH == 0);
+    TT_FATAL(
+        last_block_width_datums % TILE_WIDTH == 0,
+        "last_block_width_datums {} should be divisible by TILE_WIDTH",
+        last_block_width_datums);
 
     uint32_t out_block_h_datums = out_block_h_ntiles * TILE_HEIGHT;
 
@@ -706,9 +736,12 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
     // act
     uint32_t act_dram_addr = src0_dram_buffer->address();
 
-    assert(act_matrix_width_ntiles % act_block_w_ntiles == 0);
-    assert(act_block_h_ntiles % out_subblock_h_ntiles == 0);
-    // assert(out_block_h_ntiles % out_subblock_h_ntiles == 0);
+    TT_FATAL(
+        act_block_h_ntiles % out_subblock_h_ntiles == 0,
+        "act_block_h_ntiles {} should be divisible by out_subblock_h_ntiles {}",
+        act_block_h_ntiles,
+        out_subblock_h_ntiles);
+    // TT_FATAL(out_block_h_ntiles % out_subblock_h_ntiles == 0);
     uint32_t act_num_subblocks = act_block_h_ntiles / out_subblock_h_ntiles;
     uint32_t act_block_num_tiles = act_block_h_ntiles * act_block_w_ntiles;
     uint32_t act_subblock_h_ntiles = out_subblock_h_ntiles;
@@ -743,7 +776,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 
     uint32_t output_height_padded_to_tile_height = round_up(act_matrix_height_unpadded, TILE_HEIGHT);
     uint32_t output_height_num_tiles = output_height_padded_to_tile_height / TILE_HEIGHT;
-    assert(output_height_num_tiles <= act_matrix_height_ntiles);
+    TT_FATAL(
+        output_height_num_tiles <= act_matrix_height_ntiles,
+        "output_height_num_tiles {} should be less than or equal to act_matrix_height_ntiles {}",
+        output_height_num_tiles,
+        act_matrix_height_ntiles);
 
     uint32_t src_dram_act_buffer_size_bytes = src0_dram_buffer->size();
     uint32_t src_dram_weight_buffer_size_bytes = src1_dram_buffer->size();
@@ -840,46 +877,94 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
     reader_defines["WINDOW_INNER"] = std::to_string(window_inner);
     log_debug(LogOp, "window_outer: {}, window_inner: {}", window_outer, window_inner);
 
-    assert(weight_matrix_width_ntiles % per_core_out_matrix_width_ntiles == 0);
-    assert(per_core_out_matrix_width_ntiles % weight_block_w_ntiles == 0);
+    TT_FATAL(
+        weight_matrix_width_ntiles % per_core_out_matrix_width_ntiles == 0,
+        "weight_matrix_width_ntiles {} should be divisible by per_core_out_matrix_width_ntiles {}",
+        weight_matrix_width_ntiles,
+        per_core_out_matrix_width_ntiles);
+    TT_FATAL(
+        per_core_out_matrix_width_ntiles % weight_block_w_ntiles == 0,
+        "per_core_out_matrix_width_ntiles {} should be divisible by weight_block_w_ntiles {}",
+        per_core_out_matrix_width_ntiles,
+        weight_block_w_ntiles);
     uint32_t num_blocks_weight_w_per_core = per_core_out_matrix_width_ntiles / weight_block_w_ntiles;
     if (not weight_width_sliced) {
-        assert(num_blocks_weight_w_per_core == num_blocks_weight_w);
+        TT_FATAL(
+            num_blocks_weight_w_per_core == num_blocks_weight_w,
+            "num_blocks_weight_w_per_core {} should be equal to num_blocks_weight_w {}",
+            num_blocks_weight_w_per_core,
+            num_blocks_weight_w);
     }
     uint32_t num_weight_slices_width = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles;
     uint32_t total_num_cores_per_weight_slice = 0;
     uint32_t total_num_cores_per_act_slice = 0;  // only used when (BLOCK_SHARDING && !transpose_mcast)
     if (weight_width_sliced) {
         if (transpose_mcast) {
-            assert(num_cores_y % num_weight_slices_width == 0);
+            TT_FATAL(
+                num_cores_y % num_weight_slices_width == 0,
+                "num_cores_y {} should be divisible by num_weight_slices_width {}",
+                num_cores_y,
+                num_weight_slices_width);
             uint32_t num_cores_y_per_weight_slice_width = num_cores_y / num_weight_slices_width;
             total_num_cores_per_weight_slice = num_cores_y_per_weight_slice_width * num_cores_x;
         } else {
-            assert(num_cores_x % num_weight_slices_width == 0);
+            TT_FATAL(
+                num_cores_x % num_weight_slices_width == 0,
+                "num_cores_x {} should be divisible by num_weight_slices_width {}",
+                num_cores_x,
+                num_weight_slices_width);
             uint32_t num_cores_x_per_weight_slice_width = num_cores_x / num_weight_slices_width;
             uint32_t num_act_slices_height = act_matrix_height_ntiles / per_core_out_matrix_height_ntiles;
             total_num_cores_per_act_slice = num_cores_x * num_cores_y / num_act_slices_height;
             log_debug(LogOp, "total_num_cores_per_act_slice: {}", total_num_cores_per_act_slice);
             total_num_cores_per_weight_slice = num_cores_x_per_weight_slice_width * num_cores_y;
         }
-        assert(total_num_cores_per_weight_slice * per_core_out_matrix_height_ntiles == act_matrix_height_ntiles);
+        TT_FATAL(
+            total_num_cores_per_weight_slice * per_core_out_matrix_height_ntiles == act_matrix_height_ntiles,
+            "total_num_cores_per_weight_slice {} * per_core_out_matrix_height_ntiles {} should be equal to "
+            "act_matrix_height_ntiles {}",
+            total_num_cores_per_weight_slice,
+            per_core_out_matrix_height_ntiles,
+            act_matrix_height_ntiles);
     } else {
-        assert(num_cores_y % num_weight_slices_width == 0);
+        TT_FATAL(
+            num_cores_y % num_weight_slices_width == 0,
+            "num_cores_y {} should be divisible by num_weight_slices_width {}",
+            num_cores_y,
+            num_weight_slices_width);
         uint32_t num_cores_y_per_weight_slice_width = num_cores_y / num_weight_slices_width;
         total_num_cores_per_weight_slice = num_cores_y_per_weight_slice_width * num_cores_x;
-        assert(total_num_cores * per_core_out_matrix_height_ntiles >= act_matrix_height_ntiles);
+        TT_FATAL(
+            total_num_cores * per_core_out_matrix_height_ntiles >= act_matrix_height_ntiles,
+            "total_num_cores {} * per_core_out_matrix_height_ntiles {} should be greater than or equal to "
+            "act_matrix_height_ntiles {}",
+            total_num_cores,
+            per_core_out_matrix_height_ntiles,
+            act_matrix_height_ntiles);
     }
-    assert(per_core_out_matrix_height_ntiles % act_block_h_ntiles == 0);
+    TT_FATAL(
+        per_core_out_matrix_height_ntiles % act_block_h_ntiles == 0,
+        "per_core_out_matrix_height_ntiles {} should be divisible by act_block_h_ntiles {}",
+        per_core_out_matrix_height_ntiles,
+        act_block_h_ntiles);
     uint32_t num_blocks_act_h_per_core = per_core_out_matrix_height_ntiles / act_block_h_ntiles;
-    // assert(per_core_out_matrix_height_ntiles % out_block_h_ntiles == 0);
+    // TT_FATAL(per_core_out_matrix_height_ntiles % out_block_h_ntiles == 0);
     // uint32_t num_blocks_out_h_per_core = per_core_out_matrix_height_ntiles / out_block_h_ntiles;
     uint32_t num_blocks_out_h_per_core =
         (per_core_out_matrix_height_ntiles + out_block_h_ntiles - 1) / out_block_h_ntiles;
     bool act_height_sliced = per_core_out_matrix_height_ntiles < act_matrix_height_ntiles;
     if (not act_height_sliced) {
-        TT_FATAL(num_blocks_act_h_per_core == num_blocks_act_h, "Error");
-        TT_FATAL(num_blocks_out_h_per_core == num_blocks_out_h, "Error");
-        TT_FATAL(num_cores_x == 1, "Error");
+        TT_FATAL(
+            num_blocks_act_h_per_core == num_blocks_act_h,
+            "num_blocks_act_h_per_core {} should be equal to num_blocks_act_h {}",
+            num_blocks_act_h_per_core,
+            num_blocks_act_h);
+        TT_FATAL(
+            num_blocks_out_h_per_core == num_blocks_out_h,
+            "num_blocks_out_h_per_core {} should be equal to num_blocks_out_h {}",
+            num_blocks_out_h_per_core,
+            num_blocks_out_h);
+        TT_FATAL(num_cores_x == 1, "num_cores_x {} should be equal to 1", num_cores_x);
     }
     uint32_t act_block_h_datums_last_block =
         (per_core_out_matrix_height_ntiles - (num_blocks_act_h_per_core - 1) * act_block_h_ntiles) * TILE_HEIGHT;
@@ -1135,7 +1220,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
     if (filter_h >= 1 and filter_w >= 1) {
         if (!is_conv1d and weight_width_sliced) {
             // 2D conv
-            assert(read_window_in_inner_loop == true);
+            TT_FATAL(read_window_in_inner_loop == true, "read_window_in_inner_loop should be true for this conv");
             reader_kernel =
                 "ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/"
                 "reader_conv_activations_2d_mcast_padded_with_halo_3x3_weights_v2.cpp";
@@ -1447,7 +1532,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
         uint32_t out_start_tile_id_w = weight_slice_i * per_core_out_matrix_width_ntiles;
         uint32_t bias_tile_offset = weight_slice_i * per_core_out_matrix_width_ntiles;
         if (has_bias) {
-            assert(bias_tile_offset < bias_ntiles);
+            TT_FATAL(
+                bias_tile_offset < bias_ntiles,
+                "bias_tile_offset {} should be less than bias_ntiles {}",
+                bias_tile_offset,
+                bias_ntiles);
         }
 
         if (weight_width_sliced) {
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp
index 2f7b82a170e..6964612fb39 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp
@@ -9,6 +9,10 @@
 #include "ttnn/operations/core/core.hpp"
 #include "ttnn/operations/data_movement/pad/pad.hpp"
 #include "ttnn/tensor/types.hpp"
+#include "ttnn/operations/data_movement/permute/permute.hpp"
+#include "ttnn/operations/data_movement/reshape_view/reshape.hpp"
+#include "ttnn/operations/data_movement/tilize/tilize.hpp"
+#include "ttnn/operations/sliding_window/sliding_window.hpp"
 using namespace tt;
 namespace ttnn {
 namespace operations::conv {
@@ -475,8 +479,6 @@ Tensor convert_conv_weight_tensor_to_depthwise_layout(
 }
 
 void validate_weight_tensor(const ttnn::Tensor& weight_tensor) {
-    TT_FATAL(
-        !ttnn::has_storage_type_of(weight_tensor, ttnn::DEVICE_STORAGE_TYPE), "conv weight should be placed on host");
     TT_FATAL(weight_tensor.get_layout() == Layout::ROW_MAJOR, "conv weight layout should be in row_major layout");
     TT_FATAL(weight_tensor.get_logical_shape().rank() == 4, "conv weight should be 4D tensor");
 }
@@ -631,6 +633,282 @@ static OptimizedConvBlockConfig get_opt_block_config(
         conv_config.enable_split_reader);
 }
 
+template <typename T>
+std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases_on_device(
+    const ttnn::Tensor& weight_tensor,
+    std::optional<const ttnn::Tensor>& bias_tensor,
+    uint32_t input_channels_alignment,
+    DataType weights_bias_dtype,
+    uint32_t weight_block_h_ntiles,
+    uint32_t weight_block_w_ntiles,
+    const sliding_window::ParallelConfig& input_parallel_config,
+    const sliding_window::ParallelConfig& output_parallel_config,
+    T* device,
+    uint32_t groups,
+    uint32_t act_block_h_ntiles,
+    uint32_t input_width,
+    const bool parameters_on_device) {
+    validate_weight_tensor(weight_tensor);
+    ttnn::Tensor weight_tensor_;  // tensor to return
+    ttnn::Tensor bias_tensor_;
+
+    auto original_weights_shape = weight_tensor.get_logical_shape();
+    uint32_t original_weights_out_channels = original_weights_shape[0];
+    uint32_t original_weights_in_channels = original_weights_shape[1];
+    uint32_t original_weights_window_h = original_weights_shape[2];
+    uint32_t original_weights_window_w = original_weights_shape[3];
+
+    bool is_conv1d = original_weights_window_w == 1 && input_width == 1;
+    bool is_depthwise_conv = groups == original_weights_out_channels && original_weights_in_channels == 1;
+
+    weight_tensor_ = weight_tensor;
+    // Convert weight tensor to 0 padded shape if groups > 1
+    if (groups > 1 and is_tensor_on_device_or_multidevice(weight_tensor_)) {
+        TT_THROW(
+            "Grouped Convolution not supported when weights are on device. Please move the weights tensor to host");
+    }
+    if (!is_conv1d and groups > 1) {
+        weight_tensor_ = convert_conv_weight_tensor_to_grouped_layout(weight_tensor_, groups, weights_bias_dtype);
+    } else if (is_conv1d and groups > 1) {
+        if (is_depthwise_conv) {
+            weight_tensor_ =
+                convert_conv_weight_tensor_to_depthwise_layout(weight_tensor_, act_block_h_ntiles, weights_bias_dtype);
+            weight_block_h_ntiles = act_block_h_ntiles;
+        } else {
+            weight_tensor_ = convert_conv_weight_tensor_to_grouped_layout(weight_tensor_, groups, weights_bias_dtype);
+        }
+    }
+
+    weight_tensor_ = ttnn::operations::core::to_device(weight_tensor_, device, std::nullopt);
+
+    auto weights_shape = weight_tensor_.get_logical_shape();
+    uint32_t out_channels = weights_shape[0];
+    uint32_t in_channels = weights_shape[1];
+    uint32_t window_h = weights_shape[2];
+    uint32_t window_w = weights_shape[3];
+
+    uint32_t input_num_cores_channels = get_num_cores_channels_from_parallel_config(input_parallel_config);
+    uint32_t output_num_cores_channels = get_num_cores_channels_from_parallel_config(output_parallel_config);
+
+    uint32_t out_channels_padded = tt::round_up(out_channels, output_num_cores_channels * tt::constants::TILE_WIDTH);
+    uint32_t in_channels_padded = tt::round_up(in_channels, input_num_cores_channels * input_channels_alignment);
+    uint32_t out_channel_padding = out_channels_padded - out_channels;
+
+    if (weights_bias_dtype == DataType::BFLOAT8_B) {
+        TT_ASSERT(weight_tensor_.get_dtype() == DataType::FLOAT32);
+        if (bias_tensor.has_value()) {
+            TT_ASSERT(bias_tensor.value().get_dtype() == DataType::FLOAT32);
+        }
+    } else {
+        // TODO: fix the need to check this. We should be able to accept any datatype and convert
+        TT_ASSERT(weight_tensor_.get_dtype() == weights_bias_dtype);
+        if (bias_tensor.has_value()) {
+            TT_ASSERT(bias_tensor.value().get_dtype() == weights_bias_dtype);
+        }
+    }
+
+    // Block sharding re-orders the weights by dividing the input_channels along number of in_channel_cores.
+    if (input_parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) {
+        weight_tensor_ = ttnn::permute(weight_tensor_, ttnn::SmallVector<int64_t>({2, 3, 1, 0}));
+
+        ttnn::Shape weights_channels_padded_shape(
+            std::array<uint32_t, 4>({window_h, window_w, out_channels_padded, in_channels_padded}));
+
+        weight_tensor_ = ttnn::pad(
+            weight_tensor_,
+            tt::tt_metal::Array4D({window_h, window_w, in_channels_padded, out_channels_padded}),
+            tt::tt_metal::Array4D({0, 0, 0, 0}),
+            0.0f,
+            true,
+            std::nullopt);
+
+        TT_FATAL(
+            input_num_cores_channels == output_num_cores_channels,
+            "Input and output cores must be the same for Block Sharded Conv2d");
+        TT_FATAL(
+            in_channels_padded % input_num_cores_channels == 0,
+            "Input channels {} must be divisble by num cores {}",
+            in_channels_padded,
+            input_num_cores_channels);
+        auto in_channels_per_core = in_channels_padded / input_num_cores_channels;
+
+        TT_FATAL(
+            out_channels_padded % output_num_cores_channels == 0,
+            "output channels {} must be divisble by num cores {}",
+            out_channels_padded,
+            output_num_cores_channels);
+        auto out_channels_per_core = out_channels_padded / output_num_cores_channels;
+        auto rounded_weight_block_height =
+            tt::round_up(window_h * window_w * in_channels_per_core, constants::TILE_HEIGHT);
+        auto rounded_weight_block_width = tt::round_up(out_channels_per_core, constants::TILE_WIDTH);
+
+        auto final_out_channels_padded = rounded_weight_block_width * output_num_cores_channels;
+
+        if (final_out_channels_padded != out_channels_padded) {
+            weight_tensor_ = ttnn::reshape(
+                weight_tensor_,
+                ttnn::Shape(
+                    {in_channels_padded * window_h, window_w, output_num_cores_channels, out_channels_per_core}));
+
+            weight_tensor_ = ttnn::pad(
+                weight_tensor_,
+                tt::tt_metal::Array4D(
+                    {in_channels_padded * window_h, window_w, output_num_cores_channels, rounded_weight_block_width}),
+                tt::tt_metal::Array4D({0, 0, 0, 0}),
+                0,
+                true,
+                std::nullopt);
+        }
+        weight_tensor_ = ttnn::reshape(
+            weight_tensor_,
+            ttnn::Shape(
+                {window_h, window_w, input_num_cores_channels, in_channels_per_core, final_out_channels_padded}));
+
+        weight_tensor_ = ttnn::permute(weight_tensor_, ttnn::SmallVector<int64_t>({2, 0, 1, 3, 4}));
+        weight_tensor_ = ttnn::reshape(
+            weight_tensor_,
+            ttnn::Shape(
+                {1, input_num_cores_channels, window_h * window_w * in_channels_per_core, final_out_channels_padded}));
+        weight_tensor_ = ttnn::pad(
+            weight_tensor_,
+            tt::tt_metal::Array4D(
+                {1, input_num_cores_channels, rounded_weight_block_height, final_out_channels_padded}),
+            tt::tt_metal::Array4D({0, 0, 0, 0}),
+            0,
+            true,
+            std::nullopt);
+
+        weight_tensor_ = ttnn::reshape(
+            weight_tensor_,
+            ttnn::Shape({1, 1, rounded_weight_block_height * input_num_cores_channels, final_out_channels_padded}));
+    } else {
+        weight_tensor_ = ttnn::permute(weight_tensor_, ttnn::SmallVector<int64_t>({2, 3, 1, 0}));
+
+        ttnn::Shape weights_channels_padded_shape(
+            std::array<uint32_t, 4>({window_h, window_w, out_channels_padded, in_channels_padded}));
+
+        weight_tensor_ = ttnn::pad(
+            weight_tensor_,
+            tt::tt_metal::Array4D({window_h, window_w, in_channels_padded, out_channels_padded}),
+            tt::tt_metal::Array4D({0, 0, 0, 0}),
+            0.0f,
+            true,
+            std::nullopt);
+
+        auto weight_block_h_datums = weight_block_h_ntiles * constants::TILE_HEIGHT;
+        if ((weight_block_h_datums > (window_w * in_channels_padded)) &&
+            (input_parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED)) {
+            weight_tensor_ = ttnn::reshape(
+                weight_tensor_, ttnn::Shape({1, window_h, window_w * in_channels_padded, out_channels_padded}));
+            weight_tensor_ = ttnn::pad(
+                weight_tensor_,
+                tt::tt_metal::Array4D({1, window_h, weight_block_h_datums, out_channels_padded}),
+                tt::tt_metal::Array4D({0, 0, 0, 0}),
+                0.0f,
+                true,
+                std::nullopt);
+            weight_tensor_ = ttnn::reshape(
+                weight_tensor_, ttnn::Shape({1, 1, window_h * weight_block_h_datums, out_channels_padded}));
+        } else {
+            weight_tensor_ = ttnn::reshape(
+                weight_tensor_, ttnn::Shape({1, 1, window_h * window_w * in_channels_padded, out_channels_padded}));
+        }
+    }
+    weight_tensor_ = ttnn::tilize(
+        weight_tensor_,
+        ttnn::MemoryConfig(
+            {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED,
+             .buffer_type = tt::tt_metal::BufferType::DRAM}),
+        weights_bias_dtype,
+        true);
+
+    uint32_t weight_matrix_height = in_channels * window_h * window_w;
+    int32_t weight_matrix_height_padding = weight_tensor_.get_logical_shape()[2] - weight_matrix_height;
+    TT_FATAL(weight_matrix_height_padding >= 0, " Matrix Height Padding can't be negative");
+
+    ttnn::Shape target_shape(std::array<uint32_t, 4>{1, 1, weight_matrix_height, out_channels});
+
+    weight_tensor_ = ttnn::reshape(weight_tensor_, target_shape, weight_tensor_.get_padded_shape());
+
+    if (bias_tensor.has_value()) {
+        bias_tensor_ = bias_tensor.value();
+        bool is_bias_tensor_is_on_device = ttnn::is_tensor_on_device_or_multidevice(bias_tensor_);
+        if (!is_bias_tensor_is_on_device) {
+            bias_tensor_ = ttnn::operations::core::to_device(bias_tensor_, device, std::nullopt);
+        }
+        if (input_parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) {
+            auto bias_out_channels = bias_tensor_.get_logical_shape()[3];
+            ttnn::Shape bias_channels_padded_shape({1, 1, 1, out_channels_padded});
+            bias_tensor_ = ttnn::pad(
+                bias_tensor_,
+                bias_channels_padded_shape.to_array_4D(),
+                tt::tt_metal::Array4D{0, 0, 0, 0},
+                0,
+                true,
+                std::nullopt);
+            auto out_channels_per_core = out_channels_padded / output_num_cores_channels;
+            auto rounded_weight_block_width = tt::round_up(out_channels_per_core, constants::TILE_WIDTH);
+
+            auto final_out_channels_padded = rounded_weight_block_width * output_num_cores_channels;
+
+            if (final_out_channels_padded != out_channels_padded) {
+                bias_tensor_ =
+                    ttnn::reshape(bias_tensor_, ttnn::Shape({1, 1, output_num_cores_channels, out_channels_per_core}));
+
+                bias_tensor_ = ttnn::pad(
+                    bias_tensor_,
+                    tt::tt_metal::Array4D({1, 1, output_num_cores_channels, rounded_weight_block_width}),
+                    tt::tt_metal::Array4D({0, 0, 0, 0}),
+                    0,
+                    true,
+                    std::nullopt);
+            }
+            bias_tensor_ = ttnn::reshape(bias_tensor_, ttnn::Shape({1, 1, 1, final_out_channels_padded}));
+            bias_tensor_ = ttnn::pad(
+                bias_tensor_,
+                tt::tt_metal::Array4D({1, 1, 32, final_out_channels_padded}),
+                tt::tt_metal::Array4D{0, 0, 0, 0},
+                0,
+                true,
+                std::nullopt);
+        } else {
+            ttnn::Shape bias_channels_padded_shape({1, 1, 32, round_up(out_channels, weight_block_w_ntiles * 32)});
+            bias_tensor_ = ttnn::pad(
+                bias_tensor_,
+                bias_channels_padded_shape.to_array_4D(),
+                tt::tt_metal::Array4D{0, 0, 0, 0},
+                0,
+                true,
+                std::nullopt);
+        }
+        bias_tensor_ = ttnn::tilize(
+            bias_tensor_,
+            ttnn::MemoryConfig(
+                {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED,
+                 .buffer_type = tt::tt_metal::BufferType::DRAM}),
+            weights_bias_dtype,
+            true);
+
+        ttnn::Shape bias_target_shape(std::array<uint32_t, 4>{1, 1, 1, out_channels});
+        bias_tensor_ = ttnn::reshape(bias_tensor_, bias_target_shape, bias_tensor_.get_padded_shape());
+
+        // TT_FATAL(
+        //     bias_tensor_.get_logical_shape()[3] == out_channels,
+        //     "Bias must have the same length as output channels");
+        // bias_tensor_ = conv_bias_layout_convert(
+        //     bias_tensor_,
+        //     weights_bias_dtype,
+        //     weight_block_h_ntiles,
+        //     weight_block_w_ntiles,
+        //     output_parallel_config,
+        //     device,
+        //     out_channels_padded,
+        //     is_non_tile_mul_width);
+    }
+
+    return {weight_tensor_, bias_tensor.has_value() ? bias_tensor_ : std::optional<ttnn::Tensor>()};
+}
+
 template <typename T>
 std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases_and_move_to_device(
     const ttnn::Tensor& weight_tensor,
@@ -703,7 +981,6 @@ std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases
     }
     weight_tensor_ =
         ttnn::pad(weight_tensor_, weights_channels_padded_shape.to_array_4D(), tt::tt_metal::Array4D({0, 0, 0, 0}), 0);
-
     // for conv op, pad the weights to block shape
     if (input_parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED) {
         weight_tensor_ = convert_conv_weight_tensor_to_special_padding_tiled_layout(
@@ -985,6 +1262,36 @@ template ttnn::Tensor prepare_conv_weights<MeshDevice>(
     const std::optional<const Conv2dConfig>& conv_config_,
     const std::optional<const DeviceComputeKernelConfig>& compute_config_);
 
+template std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases_on_device<IDevice>(
+    const ttnn::Tensor& weight_tensor,
+    std::optional<const ttnn::Tensor>& bias_tensor,
+    uint32_t input_channels_alignment,
+    DataType weights_bias_dtype,
+    uint32_t weight_block_h_ntiles,
+    uint32_t weight_block_w_ntiles,
+    const sliding_window::ParallelConfig& input_parallel_config,
+    const sliding_window::ParallelConfig& output_parallel_config,
+    IDevice* device,
+    uint32_t groups,
+    uint32_t act_block_h_ntiles,
+    uint32_t input_width,
+    const bool parameters_on_device);
+
+template std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases_on_device<MeshDevice>(
+    const ttnn::Tensor& weight_tensor,
+    std::optional<const ttnn::Tensor>& bias_tensor,
+    uint32_t input_channels_alignment,
+    DataType weights_bias_dtype,
+    uint32_t weight_block_h_ntiles,
+    uint32_t weight_block_w_ntiles,
+    const sliding_window::ParallelConfig& input_parallel_config,
+    const sliding_window::ParallelConfig& output_parallel_config,
+    MeshDevice* device,
+    uint32_t groups,
+    uint32_t act_block_h_ntiles,
+    uint32_t input_width,
+    const bool parameters_on_device);
+
 template std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases_and_move_to_device<IDevice>(
     const ttnn::Tensor& weight_tensor,
     std::optional<const ttnn::Tensor>& bias_tensor,
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp
index 5377a62a345..2824a9cd4fe 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp
@@ -103,6 +103,22 @@ ttnn::Tensor prepare_conv_bias(
     const std::optional<const Conv2dConfig>& conv_config_,
     const std::optional<const DeviceComputeKernelConfig>& compute_config_);
 
+template <typename T>
+std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases_on_device(
+    const ttnn::Tensor& weight_tensor,
+    std::optional<const ttnn::Tensor>& bias_tensor,
+    uint32_t input_channels_alignment,
+    DataType weights_bias_dtype,
+    uint32_t weight_block_h_ntiles,
+    uint32_t weight_block_w_ntiles,
+    const sliding_window::ParallelConfig& input_parallel_config,
+    const sliding_window::ParallelConfig& output_parallel_config,
+    T* device,
+    uint32_t groups,
+    uint32_t act_block_h_ntiles,
+    uint32_t input_width,
+    const bool parameters_on_device);
+
 template <typename T>
 std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases_and_move_to_device(
     const ttnn::Tensor& weight_tensor,
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
index 350dc0c2b88..4d71acb859b 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
@@ -792,6 +792,13 @@ std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_runtime
     return ret_val;
 }
 
+uint32_t get_num_max_sticks(uint32_t num_sticks_to_read, uint32_t stick_size, uint32_t max_read_size) {
+    uint32_t num_sticks = tt::round_up(max_read_size, stick_size) / stick_size;
+    while (num_sticks * stick_size > max_read_size || num_sticks_to_read % num_sticks != 0) {
+        num_sticks--;
+    }
+    return num_sticks;
+}
 operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2(
     const Tensor& a,
     Tensor& output,
@@ -841,8 +848,14 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2(
                           ? num_sticks_padded_per_core_group_1
                           : num_sticks_padded_per_core_group_2;
 
+    uint32_t max_read_size = 256 * 1024;
+    uint32_t W_bytes = a.get_padded_shape()[3] * a.element_size();
+    auto num_sticks_per_core_read = get_num_max_sticks(num_sticks, W_bytes, max_read_size);
+    auto input_cb_pages = std::min(num_sticks_per_core_read, num_sticks);
+
     tt::tt_metal::CircularBufferConfig cb_src0_config =
-        tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}})
+        tt::tt_metal::CircularBufferConfig(
+            input_cb_pages * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}})
             .set_page_size(src0_cb_index, stick_size_padded_aligned);
     auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config);
 
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp
index 9e4382f3d73..d8c78a70cdd 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp
@@ -51,7 +51,17 @@ static ttnn::Tensor pad_impl(
         const auto rank = input_tensor_shape.rank();
 
         TT_FATAL(rank == 4, "ttnn.pad: input tensor passed to pad_impl must have rank == 4, but got rank {}.", rank);
-
+        bool input_output_same = true;
+        for (size_t i = 0; i < rank; i++) {
+            if (input_tensor_shape[i] != output_padded_shape[i]) {
+                input_output_same = false;
+                break;
+            }
+        }
+        if (input_output_same) {
+            tt::log_debug("Pad Input and Output Shapes are the same. Skipping pad and returning input tensor.");
+            return input_tensor;
+        }
         using ShardStrategy = ttnn::operations::data_movement::ShardStrategy;
         using ShardOrientation = tt::tt_metal::ShardOrientation;
         using Layout = tt::tt_metal::Layout;

From e80531278a18d4fbfd335e2bbb00ad28229cb887 Mon Sep 17 00:00:00 2001
From: Sankar Manoj <smanoj@tenstorrent.com>
Date: Sun, 2 Mar 2025 13:07:29 +0000
Subject: [PATCH 2/3] #0: Fix tests

---
 tests/ttnn/unit_tests/operations/test_new_conv2d.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index ec130ab4c45..c00ab81e8a7 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -73,11 +73,8 @@ def run_conv(
     weight_mesh_mapper=None,
     output_mesh_composer=None,
     enable_split_reader=False,
-<<<<<<< HEAD
     activation="",
-=======
     preprocess_weights_on_device=True,
->>>>>>> 55b6f9b444 (#0: First commit for loading weights on device)
 ):
     if isinstance(device, ttnn.MeshDevice):
         assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh"

From 4af8f4077c5694298613ace4f3db02a180b6eadf Mon Sep 17 00:00:00 2001
From: Sankar Manoj <smanoj@tenstorrent.com>
Date: Mon, 3 Mar 2025 17:44:03 +0000
Subject: [PATCH 3/3] #0: Input Channels alignment = 16 only for HS

---
 .../ttnn/operations/conv/conv2d/conv2d_utils.cpp    |  6 +++---
 .../conv/conv2d/prepare_conv2d_weights.cpp          | 13 -------------
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
index 7bdc858a526..bf27d1efbf2 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -870,9 +870,9 @@ std::tuple<OptimizedConvParallelizationConfig, OptimizedConvBlockConfig, MemoryC
             get_num_cores_channels_from_parallel_config(largest_parallel_config));
 
     uint32_t input_channels_alignment =
-        (input_parallel_config.shard_scheme == tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED)
-            ? 32
-            : conv_config.input_channels_alignment;
+        (input_parallel_config.shard_scheme == tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED)
+            ? conv_config.input_channels_alignment
+            : constants::TILE_WIDTH;
     uint32_t in_channels_padded = tt::round_up(
         in_channels, get_num_cores_channels_from_parallel_config(input_parallel_config) * input_channels_alignment);
 
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp
index 6964612fb39..8686ec186eb 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp
@@ -891,19 +891,6 @@ std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases
 
         ttnn::Shape bias_target_shape(std::array<uint32_t, 4>{1, 1, 1, out_channels});
         bias_tensor_ = ttnn::reshape(bias_tensor_, bias_target_shape, bias_tensor_.get_padded_shape());
-
-        // TT_FATAL(
-        //     bias_tensor_.get_logical_shape()[3] == out_channels,
-        //     "Bias must have the same length as output channels");
-        // bias_tensor_ = conv_bias_layout_convert(
-        //     bias_tensor_,
-        //     weights_bias_dtype,
-        //     weight_block_h_ntiles,
-        //     weight_block_w_ntiles,
-        //     output_parallel_config,
-        //     device,
-        //     out_channels_padded,
-        //     is_non_tile_mul_width);
     }
 
     return {weight_tensor_, bias_tensor.has_value() ? bias_tensor_ : std::optional<ttnn::Tensor>()};