Commit 4de44fe

Revert "Revert "Revert "Change tests"""

This reverts commit 01ca8ac.

sminakov-tt committed Feb 28, 2025
1 parent 8475369 commit 4de44fe
Showing 74 changed files with 324 additions and 37 deletions.
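
Across the changed test files the reinstated pattern is the same: run the op under test (typically twice with identical shapes and configs, so the second run exercises the cache), then assert the expected value of device.num_program_cache_entries(). Below is a minimal sketch of that pattern, assuming the device and use_program_cache pytest fixtures used throughout these tests; the op, shapes, and expected entry count are illustrative and not taken from any specific file in this commit.

import torch
import ttnn


def test_some_op_with_program_cache(device, use_program_cache):
    for _ in range(2):
        # Same shape and config on both iterations, so the second run should hit the cache.
        torch_input = torch.randn(1, 1, 32, 32)
        tt_input = ttnn.from_torch(
            torch_input, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device
        )
        tt_output = ttnn.transpose(tt_input, -2, -1)

        # Dummy allocation so tensors created in the two iterations don't share the same address.
        tt_dummy = ttnn.from_torch(torch.randn(32, 32), device=device)

    # Expected count depends on how many distinct device programs ran; here the
    # transpose is assumed to be the only compiled program.
    assert device.num_program_cache_entries() == 1
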
@@ -398,3 +398,12 @@ def test_bert_batch_dram_with_program_cache(
PERF_CNT,
device,
)

if model_config_str == "BFLOAT8_B-SHARDED":
assert device.num_program_cache_entries() == 19
elif batch == 8 and model_config_str == "MIXED_PRECISION_BATCH8":
assert device.num_program_cache_entries() == 17
elif batch == 9 and model_config_str in {"BFLOAT8_B-L1", "BFLOAT8_B-DRAM"}:
assert device.num_program_cache_entries() == 17
else:
assert device.num_program_cache_entries() == 16
@@ -101,3 +101,5 @@ def test_bert_large_concatenate_heads_with_program_cache(device, use_program_cac
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -203,3 +203,5 @@ def test_bert_large_ff1_matmul_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -163,3 +163,5 @@ def test_bert_large_ff2_matmul_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -163,3 +163,5 @@ def test_bert_large_fused_qkv_matmul_with_program_cache(device, use_program_cach
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -120,3 +120,5 @@ def test_bert_large_post_softmax_bmm_with_program_cache(device, use_program_cach
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -113,3 +113,5 @@ def test_bert_large_pre_softmax_bmm_with_program_cache(device, use_program_cache
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -162,3 +162,5 @@ def test_bert_large_selfout_matmul_with_program_cache(device, use_program_cache)
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -126,3 +126,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -127,3 +127,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -339,15 +339,15 @@ def test_group_attn_matmul_with_program_cache(
else:
output_mem_config = interleaved_mem_config

num_cache_entries_start = 0
num_cache_entries_start = device.num_program_cache_entries()
tt_output_tensor_on_device = ttnn.experimental.group_attn_matmul(
tt_input_tensor_a,
tt_input_tensor_b,
compute_with_storage_grid_size=compute_grid_size,
memory_config=output_mem_config,
dtype=output_dtype,
)
num_cache_entries += 0 - num_cache_entries_start
num_cache_entries += device.num_program_cache_entries() - num_cache_entries_start

if sharded:
tt_output_tensor_on_device = ttnn.sharded_to_interleaved(
@@ -363,6 +363,8 @@ def test_group_attn_matmul_with_program_cache(
allclose, output = comp_pcc(tt_output_tensor, golden_output_tensor)
assert allclose, f"FAILED: {output}"

assert num_cache_entries == 1

device.enable_async(False)


@@ -194,3 +194,7 @@ def test_layernorm_part_2_with_program_cache2(inp_shape, n_devices, is_rmsnorm,
)
)
run_layernorm_part_2(inp_shape, n_devices, is_rmsnorm, dtype, dtype, device)

assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
device.num_program_cache_entries()
)
@@ -269,3 +269,7 @@ def test_layernorm_part_1_with_program_cache2(
)
)
run_layernorm_part_1(inp_shape, n_devices, is_rmsnorm, input_dtype, output_dtype, device)

assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
device.num_program_cache_entries()
)
@@ -329,6 +329,9 @@ def run_multi_core_matmul_1d(

assert passing

# Check program cache
assert device.num_program_cache_entries() == 1 # Only 1 op


@pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32")
@pytest.mark.skipif(is_blackhole(), reason="Test suite for GS only")
@@ -271,6 +271,7 @@ def test_matmul_in1_dram_sharded_with_program_cache(
buffer_type=ttnn.BufferType.DRAM,
)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, in0_dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
assert device.num_program_cache_entries() == 3


def run_test_matmul_in1_dram_sharded_mm_chain(
@@ -109,3 +109,5 @@ def test_move_op_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2

Check failure on line 113 in tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
GitHub Actions / fast-dispatch-unit-tests (wormhole_b0, N300) / eager unit tests 4 wormhole_b0 N300
test_move_op_with_program_cache: assert 0 == 2, where 0 = MeshDevice(1x1 grid, 1 devices).num_program_cache_entries()

Check failure on line 113 in tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
GitHub Actions / fast-dispatch-unit-tests (wormhole_b0, N150) / eager unit tests 4 wormhole_b0 N150
test_move_op_with_program_cache: assert 0 == 2, where 0 = MeshDevice(1x1 grid, 1 devices).num_program_cache_entries()
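
The two CI annotations above report an entry count of 0 rather than the expected 2. One plausible cause (an assumption; nothing on this page confirms it) is that the program cache was never enabled for the MeshDevice in that run, in which case every num_program_cache_entries() check would see 0. In these suites that enabling is normally the job of the use_program_cache fixture; a hypothetical sketch of such a fixture, assuming the device exposes enable_program_cache() and disable_and_clear_program_cache():

import pytest


@pytest.fixture()
def use_program_cache(device):
    # Assumed device methods: turn the cache on for the test, then clear it afterwards.
    device.enable_program_cache()
    yield
    device.disable_and_clear_program_cache()
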
@@ -103,3 +103,5 @@ def test_nlp_concat_heads_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -119,6 +119,8 @@ def test_nlp_create_qkv_heads_falcon7b_with_program_cache(device, use_program_ca
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2


"""
Generic shapes + functionality
@@ -363,6 +365,8 @@ def test_nlp_create_qkv_heads_with_program_cache(device, use_program_cache):
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2


def run_sharded_nlp_create_qkv_heads_test(
batch,
@@ -527,3 +531,5 @@ def test_sharded_nlp_create_qkv_heads_with_program_cache(device, use_program_cac
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -411,6 +411,7 @@ def test_create_heads_with_slice(
)
# BH does s2i and i2s inside of to_device and from_device as device ops
expected_entries = 1 if not is_blackhole() else 4 if overlap_coregrid else 5
assert device.num_program_cache_entries() == expected_entries


@pytest.fixture()
@@ -462,6 +463,7 @@ def test_create_min_width_shard_subcoregrid(
overlap_coregrid=overlap_coregrid,
sub_core_grids=sub_core_grids,
)
assert device.num_program_cache_entries() == 1, "Only one Op program cache should exist"


def run_test_create_width_shard_by_head(
@@ -106,3 +106,5 @@ def test_nlp_create_qkv_heads_segformer_with_program_cache(device, use_program_c
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -119,3 +119,5 @@ def test_nlp_create_qkv_heads_vit_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -39,7 +39,7 @@ def unpadding_test(
# Pytorch reference
test_tensor_ref = inp[:, :, seq_len_start:seq_len_end]

return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), 0
return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), device.num_program_cache_entries()


@pytest.mark.parametrize(
@@ -120,6 +120,7 @@ def test_run_unpadding_test(
dtype,
)
assert a_pt.shape == a_ref.shape
assert num_cache_entries == 2
if dtype == ttnn.bfloat8_b:
# inevitable precision loss for bfloat8_b
eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
@@ -147,6 +148,7 @@ def test_run_unpadding_test(
dtype,
)
assert a_pt.shape == a_ref.shape
assert num_cache_entries == 3
if dtype == ttnn.bfloat8_b:
# inevitable precision loss for bfloat8_b
eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
@@ -367,6 +367,8 @@ def test_reshard_with_program_cache(
passing, output = comp_pcc(torch_tensor1, torch_tensor_after_round_trip1)
assert passing, output

assert device.num_program_cache_entries() == 3


@skip_for_blackhole("GH Issue #15234")
@pytest.mark.parametrize(
@@ -617,3 +619,5 @@ def test_dram_reshard_with_program_cache(
dummy_tensor = (
ttnn.Tensor(torch.rand([2, 2, 128, 64]), dtype).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
)

assert device.num_program_cache_entries() == 1
@@ -458,3 +458,5 @@ def test_rotary_embedding_llama_with_program_cache(

if batch % ttnn.TILE_SIZE != 0:
num_ops += 1 # slice

assert device.num_program_cache_entries() == num_ops
@@ -136,3 +136,5 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(

if (batch * 2) % ttnn.TILE_SIZE != 0:
num_ops += 1 # slice

assert device.num_program_cache_entries() == num_ops
@@ -214,6 +214,8 @@ def test_sdpa_tt_with_program_cache(device, b, nh, nkv, s, d, q_chunk_size, k_ch
for _ in range(2):
run_test_sdpa_tt(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dtype)

assert device.num_program_cache_entries() == 1


def run_sdpa_noncausal(device, b, nh, nkv, sq, d, q_chunk_size, k_chunk_size, dtype, sk=None, use_mask=True):
torch.manual_seed(1234)
@@ -500,6 +502,11 @@ def test_sdpa_chunked(
use_high_precision_compute,
)

# Check number of program cache entries
assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
device.num_program_cache_entries()
)


@skip_for_blackhole("Mismatching on BH, see #12349")
@pytest.mark.skipif(is_watcher_enabled(), reason="Kernel OOM with watcher enabled")
@@ -553,6 +560,11 @@ def test_sdpa_chunked_iterate_batch(
grid_size=(1, 1),
)

# Check number of program cache entries
assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
device.num_program_cache_entries()
)


def run_test_joint_sdpa(
device,
@@ -567,6 +567,7 @@ def test_sdpa_decode_non_causal(device, b, nh, nkv, s, d, dtype, grid_size, q_dt
run_test_sdpa_decode_single_iter(
device, b, nh, nkv, s, d, dtype, grid_size, q_dtype, sharded_in=False, sharded_out=False, causal=False
)
assert device.num_program_cache_entries() == 1


@skip_for_blackhole("Unsupported on BH, see #12349")
@@ -886,6 +887,8 @@ def test_sdpa_decode_paged_attention(
sharded_out=False,
)

assert device.num_program_cache_entries() == 4


@skip_for_blackhole("Unsupported on BH, see #12349")
@skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
@@ -986,6 +989,7 @@ def test_sdpa_decode_sharded_on_subcoregrids(
start_core=start_core,
sub_core_grids=sub_core_grids,
)
assert device.num_program_cache_entries() == 1


@skip_for_blackhole("Unsupported on BH, see #12349")
@@ -1150,6 +1154,8 @@ def test_sdpa_decode_program_cache(device, b, nh, nkv, s, d, dtype, use_program_
cur_pos_tensor=True,
)

assert device.num_program_cache_entries() == 4


def run_test_sdpa_decode_ndpcc(device, b, nh, nkv, s, d, dtype, grid_size, q_dtype=ttnn.bfloat16):
compute_grid_size = device.compute_with_storage_grid_size()
@@ -52,6 +52,9 @@ def transpose(
logger.info(output)
assert passing

if expected_program_cache_size != None:
assert device.num_program_cache_entries() == expected_program_cache_size


@pytest.mark.parametrize(
"dtype",
@@ -383,6 +386,7 @@ def test_transpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cach
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 1


@skip_for_blackhole("Mismatching on BH, see #12349")
@@ -474,6 +478,7 @@ def test_transpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, use_prog
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 3


@pytest.mark.parametrize("n", [16])
@@ -534,6 +539,7 @@ def test_transpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cach
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 1


def run_transpose_hc_sharded(device, n, c, h, w, grid_size):
@@ -595,6 +601,7 @@ def test_transpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size,
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 3


@pytest.mark.parametrize(
2 changes: 2 additions & 0 deletions tests/ttnn/unit_tests/operations/test_backward_embedding.py
@@ -120,3 +120,5 @@ def test_embedding_bw_with_program_cache(

logger.debug(comp_out)
assert comp_pass

assert device.num_program_cache_entries() == 1
4 changes: 3 additions & 1 deletion tests/ttnn/unit_tests/operations/test_bernoulli.py
@@ -95,11 +95,13 @@ def test_bernoulli_callback(shape, seed, in_dtype, out_dtype, device, is_out_all
run_bernoulli(shape, in_dtype, out_dtype, device, seed=seed, is_out_alloc=is_out_alloc)
# Add a dummy tensor to make sure that tensors created in the 2 iterations don't share the same addr
tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
num_program_cache_entries_list.append(0)
num_program_cache_entries_list.append(device.num_program_cache_entries())
# Cache must hit when we change the seed, since the seed runtime arg is overridden
seed = seed + 1

logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
assert num_program_cache_entries_list[0] > 0
assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]


@skip_for_grayskull("Requires wormhole_b0 to run")
4 changes: 3 additions & 1 deletion tests/ttnn/unit_tests/operations/test_clone.py
@@ -243,5 +243,7 @@ def test_clone_callback(
)
torch_dummy = torch.randn([32, 32])
ttnn_dummy = ttnn.from_torch(torch_dummy, device=device)
num_program_cache_entries_list.append(0)
num_program_cache_entries_list.append(device.num_program_cache_entries())
logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
assert num_program_cache_entries_list[0] > 0
assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
2 changes: 2 additions & 0 deletions tests/ttnn/unit_tests/operations/test_convert_to_chw.py
@@ -110,3 +110,5 @@ def test_convert_to_chw_with_program_cache(device, use_program_cache):
tt_dummy_tensor = (
ttnn.Tensor(py_dummy_tensor, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
)

assert device.num_program_cache_entries() == 2
@@ -269,6 +269,7 @@ def run_pre_allgather_layernorm(
tt_ex2, torch_ex2, atol=max_atol_ex2
), f"E(x^2) mismatch for device {d} (atol: {atol_delta_ex2})"

assert device.num_program_cache_entries() == 2, "Program cache not working as expected"
logger.info("Pre-allgather layernorm test passed for all devices")

