From 4de44fe678ec51b1c5ccfe36c2cf741ed6b15263 Mon Sep 17 00:00:00 2001
From: Stanislav Minakov
Date: Fri, 28 Feb 2025 15:22:24 -0800
Subject: [PATCH] Revert "Revert "Revert "Change tests"""

This reverts commit 01ca8ac87795b8da10cee6c383b2ba1a134dc9d8.
---
 .../tests/test_bert_batch_dram.py | 9 ++++++++
 .../test_bert_large_concatenate_heads.py | 2 ++
 .../unit_tests/test_bert_large_ff1_matmul.py | 2 ++
 .../unit_tests/test_bert_large_ff2_matmul.py | 2 ++
 .../test_bert_large_fused_qkv_matmul.py | 2 ++
 .../test_bert_large_post_softmax_bmm.py | 2 ++
 .../test_bert_large_pre_softmax_bmm.py | 2 ++
 .../test_bert_large_selfout_matmul.py | 2 ++
 ...ert_large_split_and_transform_qkv_heads.py | 2 ++
 ...e_split_query_key_value_and_split_heads.py | 2 ++
 .../unit_testing/misc/test_attn_matmul.py | 6 +++--
 ...st_distributed_layernorm_post_allgather.py | 4 ++++
 ...est_distributed_layernorm_pre_allgather.py | 4 ++++
 .../misc/test_matmul_1d_gather_in0.py | 3 +++
 .../misc/test_matmul_dram_sharded.py | 1 +
 .../unit_testing/misc/test_move.py | 2 ++
 .../misc/test_nlp_concat_heads.py | 2 ++
 .../misc/test_nlp_create_qkv_heads.py | 6 +++++
 .../misc/test_nlp_create_qkv_heads_decode.py | 2 ++
 .../test_nlp_create_qkv_heads_segformer.py | 2 ++
 .../misc/test_nlp_create_qkv_heads_vit.py | 2 ++
 .../misc/test_nlp_kv_cache_load_slice.py | 4 +++-
 .../unit_testing/misc/test_reshard.py | 4 ++++
 .../misc/test_rotary_embedding_llama.py | 2 ++
 .../test_rotary_embedding_llama_fused_qk.py | 2 ++
 .../misc/test_scaled_dot_product_attention.py | 12 ++++++++
 ...est_scaled_dot_product_attention_decode.py | 6 +++++
 .../unit_testing/misc/test_transpose.py | 7 ++++++
 .../operations/test_backward_embedding.py | 2 ++
 .../unit_tests/operations/test_bernoulli.py | 4 +++-
 .../ttnn/unit_tests/operations/test_clone.py | 4 +++-
 .../operations/test_convert_to_chw.py | 2 ++
 .../test_distributed_layernorm_sharded.py | 1 +
 .../operations/test_fast_reduce_nc.py | 2 ++
 .../unit_tests/operations/test_full_like.py | 5 +++++
 .../unit_tests/operations/test_index_fill.py | 5 +++++
 .../ttnn/unit_tests/operations/test_matmul.py | 4 ++++
 .../unit_tests/operations/test_moreh_adam.py | 10 ++++++++-
 .../unit_tests/operations/test_moreh_adamw.py | 14 +++++++++---
 .../operations/test_moreh_arange.py | 4 +++-
 .../unit_tests/operations/test_moreh_bmm.py | 8 +++++--
 .../operations/test_moreh_cumsum.py | 2 ++
 .../unit_tests/operations/test_moreh_dot.py | 5 +++++
 .../operations/test_moreh_dot_backward.py | 4 +++-
 .../unit_tests/operations/test_moreh_fold.py | 4 +++-
 .../unit_tests/operations/test_moreh_full.py | 5 +++++
 .../operations/test_moreh_getitem.py | 8 +++++--
 .../operations/test_moreh_group_norm.py | 8 +++++--
 .../operations/test_moreh_layer_norm.py | 8 +++++--
 .../operations/test_moreh_linear.py | 8 +++++--
 .../operations/test_moreh_logsoftmax.py | 10 +++++++++
 .../operations/test_moreh_matmul.py | 1 +
 .../unit_tests/operations/test_moreh_mean.py | 8 +++++--
 .../operations/test_moreh_nll_loss.py | 12 ++++++++--
 .../test_moreh_nll_loss_unreduced.py | 12 ++++++++--
 .../unit_tests/operations/test_moreh_norm.py | 8 +++++--
 .../unit_tests/operations/test_moreh_sgd.py | 4 +++-
 .../operations/test_moreh_softmax.py | 10 +++++++++
 .../operations/test_moreh_softmin.py | 10 +++++++++
 .../unit_tests/operations/test_moreh_sum.py | 2 ++
 .../unit_tests/operations/test_new_conv2d.py | 2 +-
 tests/ttnn/unit_tests/operations/test_pad.py | 2 ++
 .../test_paged_fused_update_cache.py | 2 ++
 .../operations/test_paged_update_cache.py | 8 +++++++
 .../ttnn/unit_tests/operations/test_repeat.py | 22 +++++++++++++++++++
 .../unit_tests/operations/test_sampling.py | 8 +++++--
 .../ttnn/unit_tests/operations/test_slice.py | 7 +++++-
 .../unit_tests/operations/test_softmax.py | 2 ++
 .../operations/test_ssm_1d_sum_reduce.py | 2 ++
 .../operations/test_ssm_prefix_scan.py | 2 ++
 ...t_ssm_repeat_and_interleave_eltwise_mul.py | 2 ++
 .../unit_tests/operations/test_uniform.py | 4 +++-
 tests/ttnn/unit_tests/test_expand.py | 5 ++++-
 tests/ttnn/unit_tests/test_reshape.py | 1 +
 74 files changed, 324 insertions(+), 37 deletions(-)

diff --git a/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py b/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py
index a5c98018f8a..050e7b9dce4 100644
--- a/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py
+++ b/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py
@@ -398,3 +398,12 @@ def test_bert_batch_dram_with_program_cache(
         PERF_CNT,
         device,
     )
+
+    if model_config_str == "BFLOAT8_B-SHARDED":
+        assert device.num_program_cache_entries() == 19
+    elif batch == 8 and model_config_str == "MIXED_PRECISION_BATCH8":
+        assert device.num_program_cache_entries() == 17
+    elif batch == 9 and model_config_str in {"BFLOAT8_B-L1", "BFLOAT8_B-DRAM"}:
+        assert device.num_program_cache_entries() == 17
+    else:
+        assert device.num_program_cache_entries() == 16
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py
index 4714dbab498..4c99e3974bd 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py
@@ -101,3 +101,5 @@ def test_bert_large_concatenate_heads_with_program_cache(device, use_program_cac
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py
index 0cbf02e4d5a..96cb04e56e5 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py
@@ -203,3 +203,5 @@ def test_bert_large_ff1_matmul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py
index 34c9a5900d7..8b212c5699d 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py
@@ -163,3 +163,5 @@ def test_bert_large_ff2_matmul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py
index 56a138dcec4..4d2c3660b9b 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py
@@ -163,3 +163,5 @@ def test_bert_large_fused_qkv_matmul_with_program_cach
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py
index 3d296be58bb..890d6cea1ee 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py
@@ -120,3 +120,5 @@ def test_bert_large_post_softmax_bmm_with_program_cach
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py
index f383215a01f..3ade03968a3 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py
@@ -113,3 +113,5 @@ def test_bert_large_pre_softmax_bmm_with_program_cache
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py
index f03dc887d4d..bcb2b165ba8 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py
@@ -162,3 +162,5 @@ def test_bert_large_selfout_matmul_with_program_cache(device, use_program_cache)
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py
index 410513f29d4..fcfafffec55 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py
@@ -126,3 +126,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py
index 1c230f32d41..a4a0d76844f 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py
@@ -127,3 +127,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py
index 631fd135d04..68bdc6501ad 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py
@@ -339,7 +339,7 @@ def test_group_attn_matmul_with_program_cache(
     else:
         output_mem_config = interleaved_mem_config
 
-    num_cache_entries_start = 0
+    num_cache_entries_start = device.num_program_cache_entries()
     tt_output_tensor_on_device = ttnn.experimental.group_attn_matmul(
         tt_input_tensor_a,
         tt_input_tensor_b,
@@ -347,7 +347,7 @@
         memory_config=output_mem_config,
         dtype=output_dtype,
     )
-    num_cache_entries += 0 - num_cache_entries_start
+    num_cache_entries += device.num_program_cache_entries() - num_cache_entries_start
 
     if sharded:
         tt_output_tensor_on_device = ttnn.sharded_to_interleaved(
@@ -363,6 +363,8 @@
     allclose, output = comp_pcc(tt_output_tensor, golden_output_tensor)
     assert allclose, f"FAILED: {output}"
 
+    assert num_cache_entries == 1
+
     device.enable_async(False)
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py
index efdc4e98355..16834c94dbb 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py
@@ -194,3 +194,7 @@ def test_layernorm_part_2_with_program_cache2(inp_shape, n_devices, is_rmsnorm,
         )
     )
     run_layernorm_part_2(inp_shape, n_devices, is_rmsnorm, dtype, dtype, device)
+
+    assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
+        device.num_program_cache_entries()
+    )
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py
index 17596a94169..83648cdf223 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py
@@ -269,3 +269,7 @@ def test_layernorm_part_1_with_program_cache2(
         )
     )
     run_layernorm_part_1(inp_shape, n_devices, is_rmsnorm, input_dtype, output_dtype, device)
+
+    assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
+        device.num_program_cache_entries()
+    )
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py
index cf375694fea..028583664b0 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py
@@ -329,6 +329,9 @@ def run_multi_core_matmul_1d(
     assert passing
 
+    # Check program cache
+    assert device.num_program_cache_entries() == 1  # Only 1 op
+
 
 @pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32")
 @pytest.mark.skipif(is_blackhole(), reason="Test suite for GS only")
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
index 44100d8e44d..423861dd172 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
@@ -271,6 +271,7 @@ def test_matmul_in1_dram_sharded_with_program_cache(
             buffer_type=ttnn.BufferType.DRAM,
         )
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, in0_dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+    assert device.num_program_cache_entries() == 3
 
 
 def run_test_matmul_in1_dram_sharded_mm_chain(
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
index 20142bffdd5..9ff5dd915a4 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
@@ -109,3 +109,5 @@ def test_move_op_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py
index d7013094e18..bce8e5e91cf 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py
@@ -103,3 +103,5 @@ def test_nlp_concat_heads_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py
index 6e21fb642f9..e0eb4589021 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py
@@ -119,6 +119,8 @@ def test_nlp_create_qkv_heads_falcon7b_with_program_ca
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
 
+    assert device.num_program_cache_entries() == 2
+
 
 """
 Generic shapes + functionality
@@ -363,6 +365,8 @@ def test_nlp_create_qkv_heads_with_program_cache(device, use_program_cache):
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
 
+    assert device.num_program_cache_entries() == 2
+
 
 def run_sharded_nlp_create_qkv_heads_test(
     batch,
@@ -527,3 +531,5 @@ def test_sharded_nlp_create_qkv_heads_with_program_cache(device, use_program_cac
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py
index c361ec37be0..bcfce9a534d 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py
@@ -411,6 +411,7 @@ def test_create_heads_with_slice(
     )
     # BH does s2i and i2s inside of to_device and from_device as device ops
    expected_entries = 1 if not is_blackhole() else 4 if overlap_coregrid else 5
+    assert device.num_program_cache_entries() == expected_entries
 
 
 @pytest.fixture()
@@ -462,6 +463,7 @@ def test_create_min_width_shard_subcoregrid(
         overlap_coregrid=overlap_coregrid,
         sub_core_grids=sub_core_grids,
     )
+    assert device.num_program_cache_entries() == 1, "Only one Op program cache should exist"
 
 
 def run_test_create_width_shard_by_head(
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py
index 7e9ccd8d61f..7135cb3517f 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py
@@ -106,3 +106,5 @@ def test_nlp_create_qkv_heads_segformer_with_program_c
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py
index e6c21497a19..5e71490bfc0 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py
@@ -119,3 +119,5 @@ def test_nlp_create_qkv_heads_vit_with_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py
index 661ba847ad4..12f87d94346 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py
@@ -39,7 +39,7 @@ def unpadding_test(
     # Pytorch reference
     test_tensor_ref = inp[:, :, seq_len_start:seq_len_end]
 
-    return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), 0
+    return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), device.num_program_cache_entries()
 
 
 @pytest.mark.parametrize(
@@ -120,6 +120,7 @@ def test_run_unpadding_test(
         dtype,
     )
     assert a_pt.shape == a_ref.shape
+    assert num_cache_entries == 2
     if dtype == ttnn.bfloat8_b:
         # inevitable precision loss for bfloat8_b
         eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
@@ -147,6 +148,7 @@ def test_run_unpadding_test(
         dtype,
     )
     assert a_pt.shape == a_ref.shape
+    assert num_cache_entries == 3
     if dtype == ttnn.bfloat8_b:
         # inevitable precision loss for bfloat8_b
         eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py
index 07210afcc1d..1f51f5eb372 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py
@@ -367,6 +367,8 @@ def test_reshard_with_program_cache(
     passing, output = comp_pcc(torch_tensor1, torch_tensor_after_round_trip1)
     assert passing, output
 
+    assert device.num_program_cache_entries() == 3
+
 
 @skip_for_blackhole("GH Issue #15234")
 @pytest.mark.parametrize(
@@ -617,3 +619,5 @@ def test_dram_reshard_with_program_cache(
     dummy_tensor = (
         ttnn.Tensor(torch.rand([2, 2, 128, 64]), dtype).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
     )
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py
index 616d1e61fb7..01ea4b5858a 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py
@@ -458,3 +458,5 @@ def test_rotary_embedding_llama_with_program_cache(
 
     if batch % ttnn.TILE_SIZE != 0:
         num_ops += 1  # slice
+
+    assert device.num_program_cache_entries() == num_ops
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py
index c400ee590c3..1f4aaca24a8 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py
@@ -136,3 +136,5 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(
 
     if (batch * 2) % ttnn.TILE_SIZE != 0:
         num_ops += 1  # slice
+
+    assert device.num_program_cache_entries() == num_ops
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py
index 47e75d22d21..9bc75655c85 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py
@@ -214,6 +214,8 @@ def test_sdpa_tt_with_program_cache(device, b, nh, nkv, s, d, q_chunk_size, k_ch
     for _ in range(2):
         run_test_sdpa_tt(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dtype)
 
+    assert device.num_program_cache_entries() == 1
+
 
 def run_sdpa_noncausal(device, b, nh, nkv, sq, d, q_chunk_size, k_chunk_size, dtype, sk=None, use_mask=True):
     torch.manual_seed(1234)
@@ -500,6 +502,11 @@ def test_sdpa_chunked(
         use_high_precision_compute,
     )
 
+    # Print number of program cache entries
+    assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
+        device.num_program_cache_entries()
+    )
+
 
 @skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.skipif(is_watcher_enabled(), reason="Kernel OOM with watcher enabled")
@@ -553,6 +560,11 @@ def test_sdpa_chunked_iterate_batch(
         grid_size=(1, 1),
     )
 
+    # Print number of program cache entries
+    assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
+        device.num_program_cache_entries()
+    )
+
 
 def run_test_joint_sdpa(
     device,
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
index f32420f6c0c..1ac916d2413 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
@@ -567,6 +567,7 @@ def test_sdpa_decode_non_causal(device, b, nh, nkv, s, d, dtype, grid_size, q_dt
     run_test_sdpa_decode_single_iter(
         device, b, nh, nkv, s, d, dtype, grid_size, q_dtype, sharded_in=False, sharded_out=False, causal=False
     )
+    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
@@ -886,6 +887,8 @@ def test_sdpa_decode_paged_attention(
         sharded_out=False,
     )
 
+    assert device.num_program_cache_entries() == 4
+
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
@@ -986,6 +989,7 @@ def test_sdpa_decode_sharded_on_subcoregrids(
         start_core=start_core,
         sub_core_grids=sub_core_grids,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
@@ -1150,6 +1154,8 @@ def test_sdpa_decode_program_cache(device, b, nh, nkv, s, d, dtype, use_program_
         cur_pos_tensor=True,
     )
 
+    assert device.num_program_cache_entries() == 4
+
 
 def run_test_sdpa_decode_ndpcc(device, b, nh, nkv, s, d, dtype, grid_size, q_dtype=ttnn.bfloat16):
     compute_grid_size = device.compute_with_storage_grid_size()
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
index eee3c1c9d04..3cd7f275927 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -52,6 +52,9 @@ def transpose(
         logger.info(output)
         assert passing
 
+    if expected_program_cache_size != None:
+        assert device.num_program_cache_entries() == expected_program_cache_size
+
 
 @pytest.mark.parametrize(
     "dtype",
@@ -383,6 +386,7 @@ def test_transpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cach
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Mismatching on BH, see #12349")
@@ -474,6 +478,7 @@ def test_transpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, use_prog
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3
 
 
 @pytest.mark.parametrize("n", [16])
@@ -534,6 +539,7 @@ def test_transpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cach
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 def run_transpose_hc_sharded(device, n, c, h, w, grid_size):
@@ -595,6 +601,7 @@ def test_transpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size,
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3
 
 
 @pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/test_backward_embedding.py b/tests/ttnn/unit_tests/operations/test_backward_embedding.py
index d2ded07ad16..102490cee3f 100644
--- a/tests/ttnn/unit_tests/operations/test_backward_embedding.py
+++ b/tests/ttnn/unit_tests/operations/test_backward_embedding.py
@@ -120,3 +120,5 @@ def test_embedding_bw_with_program_cache(
         logger.debug(comp_out)
 
     assert comp_pass
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/ttnn/unit_tests/operations/test_bernoulli.py b/tests/ttnn/unit_tests/operations/test_bernoulli.py
index 58d6af052aa..c2c21a61f6e 100644
--- a/tests/ttnn/unit_tests/operations/test_bernoulli.py
+++ b/tests/ttnn/unit_tests/operations/test_bernoulli.py
@@ -95,11 +95,13 @@ def test_bernoulli_callback(shape, seed, in_dtype, out_dtype, device, is_out_all
         run_bernoulli(shape, in_dtype, out_dtype, device, seed=seed, is_out_alloc=is_out_alloc)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())
         # Cache must hit when we change seed and seed runtime arg is overrode
         seed = seed + 1
 
     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
 
 
 @skip_for_grayskull("Requires wormhole_b0 to run")
diff --git a/tests/ttnn/unit_tests/operations/test_clone.py b/tests/ttnn/unit_tests/operations/test_clone.py
index 45f69d45511..e928d6e29c9 100644
--- a/tests/ttnn/unit_tests/operations/test_clone.py
+++ b/tests/ttnn/unit_tests/operations/test_clone.py
@@ -243,5 +243,7 @@ def test_clone_callback(
         )
         torch_dummy = torch.randn([32, 32])
         ttnn_dummy = ttnn.from_torch(torch_dummy, device=device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())
     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/operations/test_convert_to_chw.py b/tests/ttnn/unit_tests/operations/test_convert_to_chw.py
index ebbc39f0029..66bbee701f6 100644
--- a/tests/ttnn/unit_tests/operations/test_convert_to_chw.py
+++ b/tests/ttnn/unit_tests/operations/test_convert_to_chw.py
@@ -110,3 +110,5 @@ def test_convert_to_chw_with_program_cache(device, use_program_cache):
         tt_dummy_tensor = (
             ttnn.Tensor(py_dummy_tensor, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
         )
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py b/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
index 881628ffe67..bc12c5d61c6 100644
--- a/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
+++ b/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
@@ -269,6 +269,7 @@ def run_pre_allgather_layernorm(
             tt_ex2, torch_ex2, atol=max_atol_ex2
         ), f"E(x^2) mismatch for device {d} (atol: {atol_delta_ex2})"
 
+    assert device.num_program_cache_entries() == 2, "Program cache not working as expected"
     logger.info("Pre-allgather layernorm test passed for all devices")
 
 
diff --git a/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py b/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py
index 5e8a6072c3f..f1b5f8306fa 100644
--- a/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py
+++ b/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py
@@ -164,6 +164,7 @@ def test_fast_reduce_nc_with_prgm_caching(dims, device, use_program_cache):
         logger.debug(f"Output pcc={output_pcc}")
 
         assert passing
+        assert device.num_program_cache_entries() == len(dims) + 1
 
     input_shape_2 = [1, 8, 32, 32]
     output_shape_2 = input_shape_2.copy()
@@ -189,3 +190,4 @@ def test_fast_reduce_nc_with_prgm_caching(dims, device, use_program_cache):
         logger.debug(f"Output pcc={output_pcc}")
 
         assert passing
+        assert device.num_program_cache_entries() == 2 * len(dims) + 1
diff --git a/tests/ttnn/unit_tests/operations/test_full_like.py b/tests/ttnn/unit_tests/operations/test_full_like.py
index 5ff2693c3f4..cbac2a9d28b 100644
--- a/tests/ttnn/unit_tests/operations/test_full_like.py
+++ b/tests/ttnn/unit_tests/operations/test_full_like.py
@@ -111,6 +111,11 @@ def test_full_like_callback(device, input_shape, fill_value, layout, use_program
         tt_output = ttnn.moreh_full_like(tt_input, fill_value)
         assert ttnn.is_tensor_storage_on_device(tt_output)
         tt_output_cpu = ttnn.to_torch(tt_output)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries
         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_index_fill.py b/tests/ttnn/unit_tests/operations/test_index_fill.py
index a8b7be1ba4e..8935f5c5bab 100644
--- a/tests/ttnn/unit_tests/operations/test_index_fill.py
+++ b/tests/ttnn/unit_tests/operations/test_index_fill.py
@@ -129,5 +129,10 @@ def test_index_fill_callback(shape, dim, value, device, use_program_cache):
     torch.manual_seed(2024)
     for i in range(2):
         run_index_fill_test(shape, dim, value, torch.int32, device)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries
         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py b/tests/ttnn/unit_tests/operations/test_matmul.py
index 1468d0e12d5..1bb4cb64bf6 100644
--- a/tests/ttnn/unit_tests/operations/test_matmul.py
+++ b/tests/ttnn/unit_tests/operations/test_matmul.py
@@ -631,6 +631,7 @@ def test_matmul_2d_multiple_output_blocks_per_core(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 def run_matmul_2d_tiny_tile(
@@ -791,6 +792,7 @@ def test_matmul_2d_tiny_tile(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 def run_matmul_1d_tiny_tile(
@@ -954,6 +956,7 @@ def test_matmul_1d_tiny_tile(
         device=device,
memory_config=ttnn.L1_MEMORY_CONFIG, ) + assert device.num_program_cache_entries() == 1 def run_matmul_1d_multiple_output_blocks_per_core( @@ -1179,6 +1182,7 @@ def test_matmul_1d_multiple_output_blocks_per_core( device=device, memory_config=ttnn.L1_MEMORY_CONFIG, ) + assert device.num_program_cache_entries() == 1 @pytest.mark.parametrize("side", ["height", "width"]) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_adam.py b/tests/ttnn/unit_tests/operations/test_moreh_adam.py index 9c6caecefbe..ba0095d4339 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_adam.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_adam.py @@ -170,8 +170,10 @@ def test_moreh_adam_callback(params, device, use_program_cache): run_moreh_adam(shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -190,8 +192,11 @@ def test_moreh_adam_caching(params, device, use_program_cache): run_moreh_adam(shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en, device, step=i) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] num_program_cache_entries_list = [] for i in range(4): @@ -203,5 +208,8 @@ def test_moreh_adam_caching(params, device, use_program_cache): run_moreh_adam(shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_adamw.py b/tests/ttnn/unit_tests/operations/test_moreh_adamw.py index 1ee583ae00d..0e7f65510b4 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_adamw.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_adamw.py @@ -224,8 +224,10 @@ def test_moreh_adamw_callback(shape, lr, betas, eps, weight_decay, amsgrad, step run_moreh_adamw(shape, lr, betas, eps, weight_decay, amsgrad, step, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -277,8 +279,11 @@ def test_moreh_adamw_cache(shape, lr, betas, eps, weight_decay, amsgrad, device, run_moreh_adamw(shape, lr, betas, eps, weight_decay, amsgrad, step, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + 
num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] num_program_cache_entries_list = [] for _ in range(4): @@ -288,5 +293,8 @@ def test_moreh_adamw_cache(shape, lr, betas, eps, weight_decay, amsgrad, device, run_moreh_adamw(shape, lr, betas, eps, weight_decay, amsgrad, 8, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_arange.py b/tests/ttnn/unit_tests/operations/test_moreh_arange.py index 82a03ddcbf0..f7717e59ff1 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_arange.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_arange.py @@ -123,5 +123,7 @@ def test_arange_callback(start_end_step, optional_output, dtype, device, use_pro run_moreh_arange(start_end_step, optional_output, dtype, True, device) torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_bmm.py b/tests/ttnn/unit_tests/operations/test_moreh_bmm.py index f03207155ce..72c1e6bc8fb 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_bmm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_bmm.py @@ -257,8 +257,10 @@ def test_moreh_bmm_callback(shape, device, use_program_cache): run_moreh_bmm(shape, True, False if is_grayskull() else True, device) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -336,5 +338,7 @@ def test_moreh_bmm_backward_callback(requires_grad, device, use_program_cache): run_moreh_bmm_backward([7, 511, 313, 765], requires_grad, False if is_grayskull() else True, device) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py b/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py index 6b533e3f4bf..34048c0ec00 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py +++ 
b/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py @@ -189,6 +189,7 @@ def test_moreh_cumsum_callback(input_shape, dim, device, use_program_cache): logger.debug(f"Output pcc={output_pcc}") assert passing + assert device.num_program_cache_entries() == 1 @pytest.mark.parametrize( @@ -240,3 +241,4 @@ def test_moreh_cumsum_backward_callback(input_shape, dim, device, use_program_ca logger.debug(f"Output pcc={output_pcc}") assert passing + assert device.num_program_cache_entries() == 1 diff --git a/tests/ttnn/unit_tests/operations/test_moreh_dot.py b/tests/ttnn/unit_tests/operations/test_moreh_dot.py index dd18f43f7c2..3719883defe 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_dot.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_dot.py @@ -119,6 +119,11 @@ def test_moreh_matmul_1d_callback(input_shape, dtype, device, use_program_cache) run_moreh_dot_test(input_shape, dtype, device) torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == num_program_cache_entries @pytest.mark.parametrize( diff --git a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py index 00274fd53df..51b443396bf 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py @@ -182,8 +182,10 @@ def test_moreh_dot_backward_callback( num_program_in_cache = [] for i in range(2): run_moreh_dot_backward(input_shape, requires_grad, device) - num_program_in_cache.append(0) + num_program_in_cache.append(device.num_program_cache_entries()) dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(dummy, device=device) logger.info(f"num_program_in_cache={num_program_in_cache}") + assert num_program_in_cache[0] > 0 + assert num_program_in_cache[0] == num_program_in_cache[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_fold.py b/tests/ttnn/unit_tests/operations/test_moreh_fold.py index f5707164d4d..4da4c78b14a 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_fold.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_fold.py @@ -84,5 +84,7 @@ def test_fold_callback( # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_full.py b/tests/ttnn/unit_tests/operations/test_moreh_full.py index 8af9c3ff813..12e08f19166 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_full.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_full.py @@ -101,5 +101,10 @@ def test_full_callback(device, input_shape, fill_value, layout, use_program_cach tt_output_cpu = ttnn.to_torch(tt_output) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == 
num_program_cache_entries assert torch.equal(torch_output, tt_output_cpu) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_getitem.py b/tests/ttnn/unit_tests/operations/test_moreh_getitem.py index 8d5770bbedd..f42b323b4af 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_getitem.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_getitem.py @@ -275,8 +275,10 @@ def test_getitem_RAW_MAJOR_callback(shape_index_dim, dtype, index_size, device, run_getitem_RAW_MAJOR(shape_index_dim, dtype, index_size, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_blackhole("Mismatching on Blackhole, see #12349") @@ -823,5 +825,7 @@ def test_getitem_tilized_one_index_callback( run_moreh_geitem_tilized_one_index(shape_index_dim, dtype, index_size, row_major_index, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py b/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py index 92e70d4bfa0..ee35d8244e7 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py @@ -330,8 +330,10 @@ def test_moreh_group_norm_callback(N, C_num_groups, HW, eps, affine, compute_mea run_test_moreh_group_norm(N, C_num_groups, HW, eps, affine, compute_mean_rstd, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] def run_test_moreh_group_norm_backward( @@ -539,5 +541,7 @@ def test_moreh_group_norm_backward_callback( ) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py b/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py index 16ee75454be..f5bec45afd9 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py @@ -656,8 +656,10 @@ def test_moreh_layer_norm_callback( run_moreh_layer_norm(input_shape_normalized_dims, elementwise_affine, eps, dtype, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + 
num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_grayskull("Using the transpose function in copy_tile causes a hang.") @@ -695,8 +697,10 @@ def test_moreh_layer_norm_backward_callback( run_moreh_layer_norm_backward(input_shape_normalized_dims, elementwise_affine, eps, dtype, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_grayskull("Using the transpose function in copy_tile causes a hang.") diff --git a/tests/ttnn/unit_tests/operations/test_moreh_linear.py b/tests/ttnn/unit_tests/operations/test_moreh_linear.py index f28c35403d3..d5368466b29 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_linear.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_linear.py @@ -223,8 +223,10 @@ def test_moreh_linear_enable_cache(shapes, device, use_program_cache): assert passing torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] def moreh_linear_backward( @@ -384,8 +386,10 @@ def test_moreh_linear_backward_enable_cache(shapes, device, use_program_cache): assert passing torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_grayskull("GS does not support fp32") diff --git a/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py b/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py index 39786a172af..94723e12e30 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py @@ -482,6 +482,11 @@ def test_logsoftmax_callback(shape_dim_strategy, dtype, device, use_program_cach for i in range(2): run_moreh_logsoftmax_test(shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == num_program_cache_entries torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) @@ -512,5 +517,10 @@ def test_logsoftmax_backward_callback(shape_dim_strategy, dtype, device, use_pro run_moreh_logsoftmax_backward_test( shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy ) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert 
num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == num_program_cache_entries torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_matmul.py b/tests/ttnn/unit_tests/operations/test_moreh_matmul.py index 5749686c305..bc0c95a2cfb 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_matmul.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_matmul.py @@ -317,6 +317,7 @@ def test_moreh_matmul_enable_cache(params, device, use_program_cache): params = tuple(param_list) passing = moreh_matmul(params, False, None, device) assert passing + assert device.num_program_cache_entries() == 2 @skip_for_grayskull("GS does not support fp32") diff --git a/tests/ttnn/unit_tests/operations/test_moreh_mean.py b/tests/ttnn/unit_tests/operations/test_moreh_mean.py index 69f66a5abf7..19db48091d9 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_mean.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_mean.py @@ -195,8 +195,10 @@ def test_moreh_mean_callback(input_shape_dim, device, use_program_cache): run_moreh_mean(input_shape_dim, device, keepdim=True) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -264,8 +266,10 @@ def test_moreh_mean_backward_callback(input_shape_dim, device, use_program_cache run_moreh_mean_backward(input_shape_dim, device, keepdim=True) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( diff --git a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py index 887a9db9d17..9015b77d65d 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py @@ -204,9 +204,13 @@ def test_moreh_nll_loss_callback(shape, reduction, device, use_program_cache): torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) @pytest.mark.parametrize( @@ -280,9 +284,13 @@ def test_moreh_nll_loss_backward_test_callback(shape, reduction_mean, device, us torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == 
num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) @pytest.mark.parametrize( diff --git a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py index 15af1d6145f..1f3e9b701aa 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py @@ -196,9 +196,13 @@ def test_moreh_nll_loss_unreduced_callback(shape, device, use_program_cache): torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) @pytest.mark.parametrize( @@ -250,6 +254,10 @@ def test_moreh_nll_loss_unreduced_backward_test_callback(shape, none_weight, dev torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_norm.py b/tests/ttnn/unit_tests/operations/test_moreh_norm.py index e4487797305..8526d30d5d0 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_norm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_norm.py @@ -387,8 +387,10 @@ def test_moreh_norm_callback(dim_rtol_atol, keepdim, device, is_linalg_vector_no ) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize("p", [2.0, 2.5, -2.5]) @@ -521,5 +523,7 @@ def test_moreh_norm_backward_callback(dim_rtol_atol, keepdim, device, is_linalg_ ) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py index 7f4c43a12a6..841da226892 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py @@ -299,7 +299,7 @@ def forward(self, x): ) torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) assert dev_param_in.shape == list(model.weight.shape) # check param_out @@ 
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py
index 7f4c43a12a6..841da226892 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py
@@ -299,7 +299,7 @@ def forward(self, x):
         )
         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

         assert dev_param_in.shape == list(model.weight.shape)

         # check param_out
@@ -322,3 +322,5 @@ def forward(self, x):
         assert passing

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_softmax.py b/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
index 5e9fb768800..5c414f9b191 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
@@ -438,6 +438,11 @@ def test_softmax_callback(shape_dim_strategy, dtype, device, use_program_cache):
     for i in range(2):
         run_moreh_softmax_test(shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
@@ -467,6 +472,11 @@ def test_softmax_backward_callback(shape_dim_strategy, dtype, device, use_progra
         run_moreh_softmax_backward_test(
             shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy
         )
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_softmin.py b/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
index 1e2fd275808..ba9f6e317f8 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
@@ -475,6 +475,11 @@ def test_softmin_callback(shape_dim_strategy, dtype, device, use_program_cache):
     rtol = atol = 0.05
     for i in range(2):
         run_moreh_softmin_test(shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
@@ -504,5 +509,10 @@ def test_softmin_backward_callback(shape_dim_strategy, dtype, device, use_progra
         run_moreh_softmin_backward_test(
             shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy
         )
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_sum.py b/tests/ttnn/unit_tests/operations/test_moreh_sum.py
index ca77c19d234..5e8616b99a8 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_sum.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_sum.py
@@ -251,6 +251,7 @@ def test_moreh_sum_enable_cache(input_shape, dim, device, use_program_cache):
     for i in range(2):
         passing = moreh_sum(input_shape, dim, keepdim[i], use_provide_output[i], False, device)
         assert passing
+    assert device.num_program_cache_entries() == 2


 @pytest.mark.parametrize(
@@ -434,6 +435,7 @@ def test_moreh_sum_backward_enable_cache(input_shape, dim, device, use_program_c
     for i in range(2):
         passing = moreh_sum_backward(input_shape, dim, keepdim[i], use_provide_output[i], False, device)
         assert passing
+    assert device.num_program_cache_entries() == num_cache_entires[dim]


 @pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index bc19ff76498..362db0c940c 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -75,11 +75,11 @@ def run_conv(
     activation="",
 ):
     if isinstance(device, ttnn.MeshDevice):
-        num_devices = len(device.get_device_ids())
         if num_devices != 1:
             assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh"
             assert weight_mesh_mapper is not None, "Expected mesh mapper for weight tensors when using device mesh"
             assert output_mesh_composer is not None, "Expected mesh composer for output tensor when using device mesh"
+        num_devices = len(device.get_device_ids())
         total_batch_size = num_devices * batch_size  # Batch size across all devices
         logger.info(f"Using {num_devices} devices for this test")
     else:
diff --git a/tests/ttnn/unit_tests/operations/test_pad.py b/tests/ttnn/unit_tests/operations/test_pad.py
index f245489fd2e..00ef1461791 100644
--- a/tests/ttnn/unit_tests/operations/test_pad.py
+++ b/tests/ttnn/unit_tests/operations/test_pad.py
@@ -72,6 +72,7 @@ def test_pad_rm_with_program_cache(device, n, c, h, w, padding, torch_padding, v
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 def run_pad_rm_sharded(device, n, c, h, w, padding, torch_padding, value, shard_orient):
@@ -270,6 +271,7 @@ def test_pad_rm_sharded(device, n, c, h, w, padding, torch_padding, value, shard
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3


 @pytest.mark.parametrize("h", [32])
diff --git a/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py b/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py
index e36b249d485..d41489271b4 100644
--- a/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py
+++ b/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py
@@ -293,3 +293,5 @@ def test_paged_fused_update_cache_decode_program_caching(
             device,
             pcc,
         )
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/ttnn/unit_tests/operations/test_paged_update_cache.py b/tests/ttnn/unit_tests/operations/test_paged_update_cache.py
index 97785a0f62f..5b0a628c578 100644
--- a/tests/ttnn/unit_tests/operations/test_paged_update_cache.py
+++ b/tests/ttnn/unit_tests/operations/test_paged_update_cache.py
@@ -251,6 +251,8 @@ def test_update_cache_decode_program_cache(
             cache_idx + 1, False, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )

+    assert device.num_program_cache_entries() == 1
+

 def run_test_tensor_index_update_cache_decode(
     cache_idx, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
@@ -359,6 +361,8 @@ def test_tensor_index_update_cache_decode_program_cache(
             cache_idx, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )

+    assert device.num_program_cache_entries() == 1
+

 def run_test_paged_update_cache_decode(
     cache_idx, block_size, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
@@ -553,6 +557,8 @@ def test_paged_update_cache_decode_program_caching(
             cache_idx + 10, block_size, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )

+    assert device.num_program_cache_entries() == 1
+

 def run_test_paged_fill_cache(
     block_size, head_dim, user_seq_len, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
@@ -664,3 +670,5 @@ def test_paged_fill_cache_program_cache(
         run_test_paged_fill_cache(
             block_size, head_dim, user_seq_len, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )
+
+    assert device.num_program_cache_entries() == 1
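[Review note: the test_repeat.py hunks below keep the failure message outside
the parenthesized condition, i.e. "assert (cond), msg" rather than
"assert (cond, msg)": the latter asserts a two-element tuple, which is always
truthy, so the check could never fail. A minimal illustration:

    assert (1 == 2, "oops")  # passes silently: a non-empty tuple is truthy
    assert 1 == 2, "oops"    # fails with AssertionError: oops, as intended
]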
diff --git a/tests/ttnn/unit_tests/operations/test_repeat.py b/tests/ttnn/unit_tests/operations/test_repeat.py
index 6a975a050fb..c10efdff258 100644
--- a/tests/ttnn/unit_tests/operations/test_repeat.py
+++ b/tests/ttnn/unit_tests/operations/test_repeat.py
@@ -97,6 +97,13 @@ def test_pc_repeat(device, layout, shape, repeat_shape, use_program_cache):
         ), f"Output shape {output.shape} does not match torch shape {torch_results[i].shape}"

         assert_with_pcc(torch_results[i], output, 0.9999)
+        if i == 0:
+            base_program_cache_entries = device.num_program_cache_entries()
+        else:
+            assert (
+                device.num_program_cache_entries()
+                == base_program_cache_entries
+            ), "program cache entries differ on same configs"


 # 17975 test cases
@@ -105,6 +112,7 @@ def test_pc_repeat(device, layout, shape, repeat_shape, use_program_cache):
 def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
     y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16)
     y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+    base_program_cache_entries = device.num_program_cache_entries()
     x = torch.zeros((64, 1, 256, 384), dtype=torch.bfloat16)
     x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -116,6 +124,10 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
         assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y"

     for _ in range(num_iters):
         y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+        assert (
+            device.num_program_cache_entries()
+            == base_program_cache_entries
+        ), "program cache entries differ on same configs"
         x = torch.zeros((64, 1, 256, 384), dtype=torch.bfloat16)
         x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -128,6 +140,7 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):

     y = torch.rand((1, 1, 32, 32), dtype=torch.bfloat16)
     y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+    base_program_cache_entries = device.num_program_cache_entries()
     x = torch.zeros((4, 1, 32, 32), dtype=torch.bfloat16)
     x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -141,6 +154,10 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
         assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y"

     for _ in range(num_iters):
         y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+        assert (
+            device.num_program_cache_entries()
+            == base_program_cache_entries
+        ), "program cache entries differ on same configs"
         x = torch.zeros((4, 1, 32, 32), dtype=torch.bfloat16)
         x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -155,6 +172,7 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):

     y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16)
     y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+    base_program_cache_entries = device.num_program_cache_entries()

     z_tt = ttnn.repeat(y_tt, ttnn.Shape([64, 1, 1, 1]))
     for i in range(64):
@@ -163,6 +181,10 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
     for _ in range(num_iters):
         y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16)
         y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+        assert (
+            device.num_program_cache_entries()
+            == base_program_cache_entries
+        ), "program cache entries differ on same configs"

         z_tt = ttnn.repeat(y_tt, ttnn.Shape([64, 1, 1, 1]))
         for i in range(64):
diff --git a/tests/ttnn/unit_tests/operations/test_sampling.py b/tests/ttnn/unit_tests/operations/test_sampling.py
index 86cab3ff93a..e9cf04a54a0 100644
--- a/tests/ttnn/unit_tests/operations/test_sampling.py
+++ b/tests/ttnn/unit_tests/operations/test_sampling.py
@@ -140,9 +140,11 @@ def test_sampling_callback(shape, k, p, seed, device, use_program_cache):
         run_sampling(shape, k, p, seed, device)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]


 @skip_for_grayskull("Requires wormhole_b0 to run")
@@ -166,6 +168,8 @@ def test_sampling_subcores_callback(shape, k, p, seed, device, sub_core_grids, u
         run_sampling(shape, k, p, seed, device, sub_core_grids)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/operations/test_slice.py b/tests/ttnn/unit_tests/operations/test_slice.py
index c10273c3748..0d6a7d043f9 100644
--- a/tests/ttnn/unit_tests/operations/test_slice.py
+++ b/tests/ttnn/unit_tests/operations/test_slice.py
@@ -79,6 +79,7 @@ def test_slice_rm_sharded_with_program_cache(device, n, c, h, w, use_program_cac
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3


 @pytest.mark.parametrize("n", [16])
@@ -138,7 +139,7 @@ def slice_test(
             output_tensor_start[3] : output_tensor_end[3],
         ]

-    return a_pt, a_ref, 0
+    return a_pt, a_ref, device.num_program_cache_entries()


 @pytest.mark.parametrize(
@@ -202,6 +203,7 @@ def test_run_slice_test(
     assert a_pt.shape == a_ref.shape
     eq = torch.equal(a_pt, a_ref)
     assert eq
+    assert num_cache_entries == 1

     a_pt, a_ref, num_cache_entries = slice_test(
         ttnn.ROW_MAJOR_LAYOUT,
@@ -217,6 +219,7 @@ def test_run_slice_test(
     eq = torch.equal(a_pt, a_ref)
     assert eq
     # different width for row major
+    assert num_cache_entries == 2

     a_pt, a_ref, num_cache_entries = slice_test(
         ttnn.TILE_LAYOUT,
@@ -229,6 +232,7 @@ def test_run_slice_test(
         dtype,
     )
     # change from RM to TILE
+    assert num_cache_entries == 3
     assert a_pt.shape == a_ref.shape
     eq = torch.equal(a_pt, a_ref)
     assert eq
@@ -244,6 +248,7 @@ def test_run_slice_test(
         dtype,
     )
     # CACHE HIT
+    assert num_cache_entries == 4
     assert a_pt.shape == a_ref.shape
     eq = torch.equal(a_pt, a_ref)
     assert eq
diff --git a/tests/ttnn/unit_tests/operations/test_softmax.py b/tests/ttnn/unit_tests/operations/test_softmax.py
index 29c9785b4b0..c5378effb42 100644
--- a/tests/ttnn/unit_tests/operations/test_softmax.py
+++ b/tests/ttnn/unit_tests/operations/test_softmax.py
@@ -123,6 +123,7 @@ def test_softmax_stable_with_program_cache(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 def run_softmax_sharded_stable(
@@ -220,6 +221,7 @@ def test_softmax_sharded_stable_with_program_cache(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 @pytest.mark.parametrize("batch_size", [1, 16])
diff --git a/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py b/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py
index 0cdfc767821..195f4799201 100644
--- a/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py
+++ b/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py
@@ -77,3 +77,5 @@ def test_ssm_1d_sum_reduce_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py b/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py
index 7aec14a3445..bd9a5114f85 100644
--- a/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py
+++ b/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py
@@ -184,3 +184,5 @@ def test_ssm_prefix_scan_with_program_cache(device, use_program_cache):
         run_ssm_prefix_scan(L, E, N, num_cores, dtype, device)
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, dummy_memory_config)
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py b/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py
index 1a11e1d5df8..569eafadd97 100644
--- a/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py
+++ b/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py
@@ -107,3 +107,5 @@ def test_ssm_eltwise_mul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 3
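[Review note: in the test_uniform.py hunk below, the seed is a runtime argument
of the uniform program, so changing it between iterations must hit the cached
program rather than compile a new one. A sketch of that expectation (shape,
range, dtype, and starting seed are illustrative, and the exact count assumes
an empty program cache beforehand):

    for seed in (2024, 2025):
        run_uniform([1, 1, 32, 32], (0, 1), ttnn.bfloat16, device, seed=seed)
    assert device.num_program_cache_entries() == 1  # one program, two seeds
]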
diff --git a/tests/ttnn/unit_tests/operations/test_uniform.py b/tests/ttnn/unit_tests/operations/test_uniform.py
index 14a12d61bfd..032854ca282 100644
--- a/tests/ttnn/unit_tests/operations/test_uniform.py
+++ b/tests/ttnn/unit_tests/operations/test_uniform.py
@@ -130,12 +130,14 @@ def test_uniform_callback(shape, rand_range, dtype, seed, device, use_program_ca
         run_uniform(shape, rand_range, dtype, device, seed=seed)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

         # Cache must hit when we change seed and seed runtime arg is overrode
         seed = seed + 1

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]


 @skip_for_grayskull("Requires wormhole_b0 to run")
diff --git a/tests/ttnn/unit_tests/test_expand.py b/tests/ttnn/unit_tests/test_expand.py
index 40b99dbd4a5..e19daba93e8 100644
--- a/tests/ttnn/unit_tests/test_expand.py
+++ b/tests/ttnn/unit_tests/test_expand.py
@@ -47,4 +47,7 @@ def test_expand_callback(tensor_layout, device, use_program_cache):
     num_program_cache_entries_list = []
     for i in range(2):
         test_expand([32, 1], [32, 32], tensor_layout, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())
+
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/test_reshape.py b/tests/ttnn/unit_tests/test_reshape.py
index 9ff10beba95..40fd7c15052 100644
--- a/tests/ttnn/unit_tests/test_reshape.py
+++ b/tests/ttnn/unit_tests/test_reshape.py
@@ -211,6 +211,7 @@ def test_reshape_hw_rm_with_program_cache(device, n, c, h, w, use_program_cache)
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 @pytest.mark.parametrize("h", [32])