tenstorrent · sminakov-tt · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 27, 2025
diff --git a/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py b/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py
@@ -398,12 +398,3 @@ def test_bert_batch_dram_with_program_cache(
         PERF_CNT,
         device,
     )
-
-    if model_config_str == "BFLOAT8_B-SHARDED":
-        assert device.num_program_cache_entries() == 19
-    elif batch == 8 and model_config_str == "MIXED_PRECISION_BATCH8":
-        assert device.num_program_cache_entries() == 17
-    elif batch == 9 and model_config_str in {"BFLOAT8_B-L1", "BFLOAT8_B-DRAM"}:
-        assert device.num_program_cache_entries() == 17
-    else:
-        assert device.num_program_cache_entries() == 16
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py
@@ -101,5 +101,3 @@ def test_bert_large_concatenate_heads_with_program_cache(device, use_program_cac
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py
@@ -203,5 +203,3 @@ def test_bert_large_ff1_matmul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py
@@ -163,5 +163,3 @@ def test_bert_large_ff2_matmul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py
@@ -163,5 +163,3 @@ def test_bert_large_fused_qkv_matmul_with_program_cache(device, use_program_cach
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py
@@ -120,5 +120,3 @@ def test_bert_large_post_softmax_bmm_with_program_cache(device, use_program_cach
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py
@@ -113,5 +113,3 @@ def test_bert_large_pre_softmax_bmm_with_program_cache(device, use_program_cache
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py
@@ -162,5 +162,3 @@ def test_bert_large_selfout_matmul_with_program_cache(device, use_program_cache)
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/...imental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py b/...imental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py
@@ -126,5 +126,3 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/...bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py b/...bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py
@@ -127,5 +127,3 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
-
-    assert device.num_program_cache_entries() == 2
@@ -48,9 +48,6 @@ def get_devices(test_module):
 
 
 def gather_single_test_perf(device, test_passed):
-    if not isinstance(device, ttnn.Device):
-        logger.error("Multi-device perf is not supported. Failing.")
-        return None
     ttnn.DumpDeviceProfiler(device)
     opPerfData = get_device_data_generate_report(
         PROFILER_LOGS_DIR, None, None, None, export_csv=False, cleanup_device_log=True

diff --git a/tests/tt_eager/ops/test_eltwise_binary_op.cpp b/tests/tt_eager/ops/test_eltwise_binary_op.cpp
@@ -118,12 +118,8 @@ int main() {
 
     run_binary_ops();
 
-    TT_FATAL(device->num_program_cache_entries() == 3, "There are {} entries", device->num_program_cache_entries());
-
     device->disable_and_clear_program_cache();
 
-    TT_FATAL(device->num_program_cache_entries() == 0, "Error");
-
     TT_FATAL(tt::tt_metal::CloseDevice(device), "Error");
 
     return 0;

diff --git a/tests/tt_eager/ops/test_eltwise_unary_op.cpp b/tests/tt_eager/ops/test_eltwise_unary_op.cpp
@@ -273,10 +273,7 @@ void test_program_cache() {
     device->enable_program_cache();
     run_tests();
 
-    TT_FATAL(device->num_program_cache_entries() == 4, "There are {} entries", device->num_program_cache_entries());
-
     device->disable_and_clear_program_cache();
-    TT_FATAL(device->num_program_cache_entries() == 0, "Error");
     TT_FATAL(tt::tt_metal::CloseDevice(device), "Error");
 }
 

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py
@@ -339,15 +339,15 @@ def test_group_attn_matmul_with_program_cache(
             else:
                 output_mem_config = interleaved_mem_config
 
-            num_cache_entries_start = device.num_program_cache_entries()
+            num_cache_entries_start = 0
             tt_output_tensor_on_device = ttnn.experimental.group_attn_matmul(
                 tt_input_tensor_a,
                 tt_input_tensor_b,
                 compute_with_storage_grid_size=compute_grid_size,
                 memory_config=output_mem_config,
                 dtype=output_dtype,
             )
-            num_cache_entries += device.num_program_cache_entries() - num_cache_entries_start
+            num_cache_entries += 0 - num_cache_entries_start
 
             if sharded:
                 tt_output_tensor_on_device = ttnn.sharded_to_interleaved(
@@ -363,8 +363,6 @@ def test_group_attn_matmul_with_program_cache(
             allclose, output = comp_pcc(tt_output_tensor, golden_output_tensor)
             assert allclose, f"FAILED: {output}"
 
-    assert num_cache_entries == 1
-
     device.enable_async(False)
 
 

diff --git a/...t_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py b/...t_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py
@@ -194,7 +194,3 @@ def test_layernorm_part_2_with_program_cache2(inp_shape, n_devices, is_rmsnorm,
                 )
             )
         run_layernorm_part_2(inp_shape, n_devices, is_rmsnorm, dtype, dtype, device)
-
-    assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
-        device.num_program_cache_entries()
-    )
diff --git a/...tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py b/...tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py
@@ -269,7 +269,3 @@ def test_layernorm_part_1_with_program_cache2(
                 )
             )
         run_layernorm_part_1(inp_shape, n_devices, is_rmsnorm, input_dtype, output_dtype, device)
-
-    assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
-        device.num_program_cache_entries()
-    )
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py
@@ -329,9 +329,6 @@ def run_multi_core_matmul_1d(
 
     assert passing
 
-    # Check program cache
-    assert device.num_program_cache_entries() == 1  # Only 1 op
-
 
 @pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32")
 @pytest.mark.skipif(is_blackhole(), reason="Test suite for GS only")

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
@@ -271,7 +271,6 @@ def test_matmul_in1_dram_sharded_with_program_cache(
             buffer_type=ttnn.BufferType.DRAM,
         )
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, in0_dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
-    assert device.num_program_cache_entries() == 3
 
 
 def run_test_matmul_in1_dram_sharded_mm_chain(

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
@@ -109,5 +109,3 @@ def test_move_op_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py
@@ -103,5 +103,3 @@ def test_nlp_concat_heads_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py
@@ -119,8 +119,6 @@ def test_nlp_create_qkv_heads_falcon7b_with_program_cache(device, use_program_ca
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
 
-    assert device.num_program_cache_entries() == 2
-
 
 """
 Generic shapes + functionality
@@ -365,8 +363,6 @@ def test_nlp_create_qkv_heads_with_program_cache(device, use_program_cache):
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
 
-    assert device.num_program_cache_entries() == 2
-
 
 def run_sharded_nlp_create_qkv_heads_test(
     batch,
@@ -531,5 +527,3 @@ def test_sharded_nlp_create_qkv_heads_with_program_cache(device, use_program_cac
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py
@@ -362,7 +362,7 @@

    # BH does s2i and i2s inside of to_device and from_device as device ops
    expected_entries = 1 if not is_blackhole() else 3 if overlap_coregrid else 4
    assert device.num_program_cache_entries() == expected_entries


 @pytest.fixture()
@@ -411,7 +411,6 @@
         )
     # BH does s2i and i2s inside of to_device and from_device as device ops
     expected_entries = 1 if not is_blackhole() else 4 if overlap_coregrid else 5
-    assert device.num_program_cache_entries() == expected_entries
 
 
 @pytest.fixture()
@@ -463,7 +462,6 @@
             overlap_coregrid=overlap_coregrid,
             sub_core_grids=sub_core_grids,
         )
-    assert device.num_program_cache_entries() == 1, "Only one Op program cache should exist"
 
 
 def run_test_create_width_shard_by_head(

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py
@@ -106,5 +106,3 @@ def test_nlp_create_qkv_heads_segformer_with_program_cache(device, use_program_c
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py
@@ -119,5 +119,3 @@ def test_nlp_create_qkv_heads_vit_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
-
-    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py
@@ -39,7 +39,7 @@ def unpadding_test(
     # Pytorch reference
     test_tensor_ref = inp[:, :, seq_len_start:seq_len_end]
 
-    return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), device.num_program_cache_entries()
+    return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), 0
 
 
 @pytest.mark.parametrize(
@@ -120,7 +120,6 @@ def test_run_unpadding_test(
             dtype,
         )
         assert a_pt.shape == a_ref.shape
-        assert num_cache_entries == 2
         if dtype == ttnn.bfloat8_b:
             # inevitable precision loss for bfloat8_b
             eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
@@ -148,7 +147,6 @@ def test_run_unpadding_test(
             dtype,
         )
         assert a_pt.shape == a_ref.shape
-        assert num_cache_entries == 3
         if dtype == ttnn.bfloat8_b:
             # inevitable precision loss for bfloat8_b
             eq, pcc = comp_pcc(a_pt, a_ref, 0.999)

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py
@@ -367,8 +367,6 @@ def test_reshard_with_program_cache(
         passing, output = comp_pcc(torch_tensor1, torch_tensor_after_round_trip1)
     assert passing, output
 
-    assert device.num_program_cache_entries() == 3
-
 
 @skip_for_blackhole("GH Issue #15234")
 @pytest.mark.parametrize(
@@ -619,5 +617,3 @@ def test_dram_reshard_with_program_cache(
         dummy_tensor = (
             ttnn.Tensor(torch.rand([2, 2, 128, 64]), dtype).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
         )
-
-    assert device.num_program_cache_entries() == 1
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py
@@ -458,5 +458,3 @@ def test_rotary_embedding_llama_with_program_cache(
 
         if batch % ttnn.TILE_SIZE != 0:
             num_ops += 1  # slice
-
-    assert device.num_program_cache_entries() == num_ops
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py
@@ -136,5 +136,3 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(
 
     if (batch * 2) % ttnn.TILE_SIZE != 0:
         num_ops += 1  # slice
-
-    assert device.num_program_cache_entries() == num_ops
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py
@@ -214,8 +214,6 @@ def test_sdpa_tt_with_program_cache(device, b, nh, nkv, s, d, q_chunk_size, k_ch
     for _ in range(2):
         run_test_sdpa_tt(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dtype)
 
-    assert device.num_program_cache_entries() == 1
-
 
 def run_sdpa_noncausal(device, b, nh, nkv, sq, d, q_chunk_size, k_chunk_size, dtype, sk=None, use_mask=True):
     torch.manual_seed(1234)
@@ -502,11 +500,6 @@ def test_sdpa_chunked(
             use_high_precision_compute,
         )
 
-    # Print number of program cache entries
-    assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
-        device.num_program_cache_entries()
-    )
-
 
 @skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.skipif(is_watcher_enabled(), reason="Kernel OOM with watcher enabled")
@@ -560,11 +553,6 @@ def test_sdpa_chunked_iterate_batch(
             grid_size=(1, 1),
         )
 
-    # Print number of program cache entries
-    assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
-        device.num_program_cache_entries()
-    )
-
 
 def run_test_joint_sdpa(
     device,

diff --git a/...tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py b/...tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
@@ -567,7 +567,6 @@ def test_sdpa_decode_non_causal(device, b, nh, nkv, s, d, dtype, grid_size, q_dt
         run_test_sdpa_decode_single_iter(
             device, b, nh, nkv, s, d, dtype, grid_size, q_dtype, sharded_in=False, sharded_out=False, causal=False
         )
-    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
@@ -887,8 +886,6 @@ def test_sdpa_decode_paged_attention(
         sharded_out=False,
     )
 
-    assert device.num_program_cache_entries() == 4
-
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
@@ -989,7 +986,6 @@ def test_sdpa_decode_sharded_on_subcoregrids(
         start_core=start_core,
         sub_core_grids=sub_core_grids,
     )
-    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
@@ -1154,8 +1150,6 @@ def test_sdpa_decode_program_cache(device, b, nh, nkv, s, d, dtype, use_program_
             cur_pos_tensor=True,
         )
 
-    assert device.num_program_cache_entries() == 4
-
 
 def run_test_sdpa_decode_ndpcc(device, b, nh, nkv, s, d, dtype, grid_size, q_dtype=ttnn.bfloat16):
     compute_grid_size = device.compute_with_storage_grid_size()

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -52,9 +52,6 @@ def transpose(
     logger.info(output)
     assert passing
 
-    if expected_program_cache_size != None:
-        assert device.num_program_cache_entries() == expected_program_cache_size
-
 
 @pytest.mark.parametrize(
     "dtype",
@@ -386,7 +383,6 @@ def test_transpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cach
             device=device,
             memory_config=ttnn.L1_MEMORY_CONFIG,
         )
-    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Mismatching on BH, see #12349")
@@ -478,7 +474,6 @@ def test_transpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, use_prog
             device=device,
             memory_config=ttnn.L1_MEMORY_CONFIG,
         )
-    assert device.num_program_cache_entries() == 3
 
 
 @pytest.mark.parametrize("n", [16])
@@ -539,7 +534,6 @@ def test_transpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cach
             device=device,
             memory_config=ttnn.L1_MEMORY_CONFIG,
         )
-    assert device.num_program_cache_entries() == 1
 
 
 def run_transpose_hc_sharded(device, n, c, h, w, grid_size):
@@ -601,7 +595,6 @@ def test_transpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size,
             device=device,
             memory_config=ttnn.L1_MEMORY_CONFIG,
         )
-    assert device.num_program_cache_entries() == 3
 
 
 @pytest.mark.parametrize(
Original file line number	Diff line number	Diff line change
Expand Up		@@ -458,5 +458,3 @@ def test_rotary_embedding_llama_with_program_cache(

		if batch % ttnn.TILE_SIZE != 0:
		num_ops += 1 # slice

		assert device.num_program_cache_entries() == num_ops
Original file line number	Diff line number	Diff line change
Expand Up		@@ -136,5 +136,3 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(

		if (batch * 2) % ttnn.TILE_SIZE != 0:
		num_ops += 1 # slice

		assert device.num_program_cache_entries() == num_ops