revert perf test changes
amorrisonTT committed Feb 25, 2025
1 parent 5215684 · commit 9532eaf
Showing 4 changed files with 17 additions and 15 deletions.
4 changes: 2 additions & 2 deletions models/demos/distilbert/tests/test_perf_distilbert.py
@@ -152,9 +152,9 @@ def test_distilbert_perf_device(batch_size, test, reset_seeds):
     margin = 0.03
     num_iterations = 1
     if is_grayskull():
-        expected_perf = 292
+        expected_perf = 57.3
     elif is_wormhole_b0():
-        expected_perf = 680
+        expected_perf = 95.5

     command = f"pytest tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert.py::test_distilbert_for_question_answering[sequence_size=768-batch_size=8-model_name=distilbert-base-uncased-distilled-squad]"
     cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
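The reverted numbers above feed a tolerance check rather than an exact comparison: the test declares margin = 0.03, so measured throughput may drift 3% around expected_perf before the test fails. Below is a minimal sketch of such a check; check_perf and measured_perf are hypothetical stand-ins (only margin and expected_perf appear in the diff), and the two-sided margin is an assumption.

def check_perf(measured_perf: float, expected_perf: float, margin: float = 0.03) -> None:
    # Accept any measurement within +/- margin of the expected samples/s.
    lower = expected_perf * (1 - margin)
    upper = expected_perf * (1 + margin)
    assert lower <= measured_perf <= upper, (
        f"perf {measured_perf:.1f} outside [{lower:.1f}, {upper:.1f}]"
    )

check_perf(measured_perf=57.0, expected_perf=57.3)  # Grayskull value after the revert; passes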
24 changes: 13 additions & 11 deletions tests/ttnn/unit_tests/operations/test_slice.py
@@ -12,11 +12,13 @@


 def run_slice_rm_sharded(device, n, c, h, w):
-    torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
+    torch_input_tensor = torch.ones((n, c, h, w), dtype=torch.bfloat16)
     n_unpadded = n
-    c_unpadded = 115
+    c_unpadded = min(c, 115)
     h_unpadded = 115
-    torch_output_tensor = torch_input_tensor[:n_unpadded, :c_unpadded, :h_unpadded, :]
+    w_unpadded = 16
+
+    torch_output_tensor = torch_input_tensor[:n_unpadded, :c_unpadded, :h_unpadded, :w_unpadded]
     tt_input_tensor = ttnn.from_torch(
         torch_input_tensor,
         dtype=ttnn.DataType.BFLOAT16,
@@ -45,27 +47,27 @@ def run_slice_rm_sharded(device, n, c, h, w):
     grid_size = ttnn.CoreGrid(y=num_cores_y, x=num_cores_x)
     grid_coord = ttnn.CoreCoord(grid_size.x - 1, grid_size.y - 1)
     shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), grid_coord)})
-    shard_spec = ttnn.ShardSpec(shard_grid, (shard_h, w), ttnn.ShardOrientation.ROW_MAJOR)
+    shard_spec = ttnn.ShardSpec(shard_grid, (shard_h, w_unpadded), ttnn.ShardOrientation.ROW_MAJOR)
     output_mem_config = ttnn.MemoryConfig(
         ttnn.types.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.types.BufferType.L1, shard_spec
     )

     tt_output_tensor = ttnn.slice(
         tt_input_tensor,
         (0, 0, 0, 0),
-        (n_unpadded, c_unpadded, h_unpadded, w),
+        (n_unpadded, c_unpadded, h_unpadded, w_unpadded),
         memory_config=output_mem_config,
     )
     tt_output_tensor = ttnn.to_memory_config(tt_output_tensor, ttnn.L1_MEMORY_CONFIG)

     tt_output_tensor = ttnn.from_device(tt_output_tensor)
     tt_output_tensor = ttnn.to_torch(tt_output_tensor)
     assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.9999)


-@pytest.mark.parametrize("n", [16])
-@pytest.mark.parametrize("c", [128])
-@pytest.mark.parametrize("h", [128])
-@pytest.mark.parametrize("w", [16])
+@pytest.mark.parametrize("n", [1, 16, 21])
+@pytest.mark.parametrize("c", [1, 128, 133])
+@pytest.mark.parametrize("h", [128, 150])
+@pytest.mark.parametrize("w", [16, 32])
 def test_slice_rm_sharded_with_program_cache(device, n, c, h, w, use_program_cache):
     for _ in range(2):
         run_slice_rm_sharded(device, n, c, h, w)
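The shard_h used in the ShardSpec above is computed in lines collapsed from this diff (the hunk jumps from line 22 to line 45), so the formula below is an assumption: height sharding conventionally splits the output's total stick count across the core grid, each core holding shard_h sticks of width w_unpadded.

import math

def shard_height(n_unpadded: int, c_unpadded: int, h_unpadded: int, num_cores: int) -> int:
    # Total row-major sticks in the sliced output, divided across the grid.
    total_sticks = n_unpadded * c_unpadded * h_unpadded
    return math.ceil(total_sticks / num_cores)

# For the old single parametrization (n=16, c=128 -> 115 unpadded, h=128 -> 115)
# on an assumed 8x8 grid:
print(shard_height(16, 115, 115, 64))  # 3307 sticks per core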
@@ -79,7 +81,7 @@ def test_slice_rm_sharded_with_program_cache(device, n, c, h, w, use_program_cache):
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
-    assert device.num_program_cache_entries() == 3
+    assert device.num_program_cache_entries() == 2


 @pytest.mark.parametrize("n", [16])
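The tightened assertion above (== 2 instead of == 3) counts compiled programs, and the test runs the same slice twice precisely to exercise reuse: the second run should hit the cache and add nothing. ttnn's program cache lives in C++; the dict below is only a sketch of the keying idea, under the assumption that identical op configurations map to a single entry, with the test's two entries plausibly one per distinct op it launches.

# Illustrative only: ttnn's program cache is not a Python dict.
program_cache: dict = {}

def launch(op_name: str, shape: tuple) -> str:
    key = (op_name, shape)
    if key not in program_cache:
        program_cache[key] = f"compiled<{op_name}{shape}>"  # compile on first use
    return program_cache[key]

launch("slice_rm_sharded", (16, 128, 128, 16))
launch("slice_rm_sharded", (16, 128, 128, 16))  # second run reuses the entry
assert len(program_cache) == 1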
2 changes: 1 addition & 1 deletion
@@ -592,7 +592,7 @@ operation::ProgramWithCallbacks slice_rm_multi_core_sharded(
     uint32_t num_unpadded_sticks = output.volume() / output.get_logical_shape()[-1];

     // stick sizes
-    uint32_t W_padded = a.get_logical_shape()[-1];
+    uint32_t W_padded = a.get_padded_shape()[-1];
     uint32_t W_unpadded = output.get_logical_shape()[-1];
     auto stick_size_padded = W_padded * a.element_size();
     auto stick_size_unpadded = W_unpadded * output.element_size();
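The one-line fix above matters because row-major tensors are stored as per-row "sticks", and a padded input carries more bytes per stick than its logical width suggests. A small numeric illustration of the distinction (the 115/128 widths are assumed values, not from the diff):

element_size = 2                 # bytes per bfloat16 element
w_logical, w_padded = 115, 128   # assumed: logical width 115, padded to 128 in memory

stick_size_unpadded = w_logical * element_size  # bytes each output stick needs: 230
stick_size_padded = w_padded * element_size     # bytes each stored input stick spans: 256

# Deriving stick_size_padded from the logical width (the old code) would advance
# 230 bytes per stick through a buffer laid out in 256-byte sticks, misreading
# every row after the first.
assert stick_size_padded >= stick_size_unpadded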
2 changes: 1 addition & 1 deletion ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp
@@ -107,7 +107,7 @@ ttnn::Tensor SliceOperation::invoke(
         aligned_ends &= slice_aligned_to_tile(modified_ends) || (modified_ends[input_rank - 1] == input_shape[-1] &&
                                                                  modified_ends[input_rank - 2] == input_shape[-2]);
     }
-    rm_only = !no_step || !aligned_begins || !aligned_ends || one_dimensional;
+    rm_only = !no_step || !aligned_begins || !aligned_ends || one_dimensional || input_tensor.is_sharded();
     if (rm_only) {
         if (!no_step) {
             TT_FATAL(
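Net effect of the last hunk: a sharded input now always takes the row-major slice path, as the rm_only name suggests, even when the slice is tile-aligned and has no step. A Python rendering of the predicate before and after (variable names mirror the C++):

def rm_only_before(no_step, aligned_begins, aligned_ends, one_dimensional, is_sharded):
    return (not no_step) or (not aligned_begins) or (not aligned_ends) or one_dimensional

def rm_only_after(no_step, aligned_begins, aligned_ends, one_dimensional, is_sharded):
    return rm_only_before(no_step, aligned_begins, aligned_ends, one_dimensional, is_sharded) or is_sharded

# A tile-aligned, no-step slice of a sharded tensor: only the new predicate
# routes it to the row-major implementation.
args = dict(no_step=True, aligned_begins=True, aligned_ends=True, one_dimensional=False, is_sharded=True)
print(rm_only_before(**args), rm_only_after(**args))  # False True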
