From 4de44fe678ec51b1c5ccfe36c2cf741ed6b15263 Mon Sep 17 00:00:00 2001
From: Stanislav Minakov
Date: Fri, 28 Feb 2025 15:22:24 -0800
Subject: [PATCH] Revert "Revert "Revert "Change tests"""

This reverts commit 01ca8ac87795b8da10cee6c383b2ba1a134dc9d8.
---
 .../tests/test_bert_batch_dram.py | 9 ++++++++
 .../test_bert_large_concatenate_heads.py | 2 ++
 .../unit_tests/test_bert_large_ff1_matmul.py | 2 ++
 .../unit_tests/test_bert_large_ff2_matmul.py | 2 ++
 .../test_bert_large_fused_qkv_matmul.py | 2 ++
 .../test_bert_large_post_softmax_bmm.py | 2 ++
 .../test_bert_large_pre_softmax_bmm.py | 2 ++
 .../test_bert_large_selfout_matmul.py | 2 ++
 ...ert_large_split_and_transform_qkv_heads.py | 2 ++
 ...e_split_query_key_value_and_split_heads.py | 2 ++
 .../unit_testing/misc/test_attn_matmul.py | 6 +++--
 ...st_distributed_layernorm_post_allgather.py | 4 ++++
 ...est_distributed_layernorm_pre_allgather.py | 4 ++++
 .../misc/test_matmul_1d_gather_in0.py | 3 +++
 .../misc/test_matmul_dram_sharded.py | 1 +
 .../unit_testing/misc/test_move.py | 2 ++
 .../misc/test_nlp_concat_heads.py | 2 ++
 .../misc/test_nlp_create_qkv_heads.py | 6 +++++
 .../misc/test_nlp_create_qkv_heads_decode.py | 2 ++
 .../test_nlp_create_qkv_heads_segformer.py | 2 ++
 .../misc/test_nlp_create_qkv_heads_vit.py | 2 ++
 .../misc/test_nlp_kv_cache_load_slice.py | 4 +++-
 .../unit_testing/misc/test_reshard.py | 4 ++++
 .../misc/test_rotary_embedding_llama.py | 2 ++
 .../test_rotary_embedding_llama_fused_qk.py | 2 ++
 .../misc/test_scaled_dot_product_attention.py | 12 ++++++++
 ...est_scaled_dot_product_attention_decode.py | 6 +++++
 .../unit_testing/misc/test_transpose.py | 7 ++++++
 .../operations/test_backward_embedding.py | 2 ++
 .../unit_tests/operations/test_bernoulli.py | 4 +++-
 .../ttnn/unit_tests/operations/test_clone.py | 4 +++-
 .../operations/test_convert_to_chw.py | 2 ++
 .../test_distributed_layernorm_sharded.py | 1 +
 .../operations/test_fast_reduce_nc.py | 2 ++
 .../unit_tests/operations/test_full_like.py | 5 +++++
 .../unit_tests/operations/test_index_fill.py | 5 +++++
 .../ttnn/unit_tests/operations/test_matmul.py | 4 ++++
 .../unit_tests/operations/test_moreh_adam.py | 10 ++++++++-
 .../unit_tests/operations/test_moreh_adamw.py | 14 +++++++++---
 .../operations/test_moreh_arange.py | 4 +++-
 .../unit_tests/operations/test_moreh_bmm.py | 8 +++++--
 .../operations/test_moreh_cumsum.py | 2 ++
 .../unit_tests/operations/test_moreh_dot.py | 5 +++++
 .../operations/test_moreh_dot_backward.py | 4 +++-
 .../unit_tests/operations/test_moreh_fold.py | 4 +++-
 .../unit_tests/operations/test_moreh_full.py | 5 +++++
 .../operations/test_moreh_getitem.py | 8 +++++--
 .../operations/test_moreh_group_norm.py | 8 +++++--
 .../operations/test_moreh_layer_norm.py | 8 +++++--
 .../operations/test_moreh_linear.py | 8 +++++--
 .../operations/test_moreh_logsoftmax.py | 10 +++++++++
 .../operations/test_moreh_matmul.py | 1 +
 .../unit_tests/operations/test_moreh_mean.py | 8 +++++--
 .../operations/test_moreh_nll_loss.py | 12 ++++++++--
 .../test_moreh_nll_loss_unreduced.py | 12 ++++++++--
 .../unit_tests/operations/test_moreh_norm.py | 8 +++++--
 .../unit_tests/operations/test_moreh_sgd.py | 4 +++-
 .../operations/test_moreh_softmax.py | 10 +++++++++
 .../operations/test_moreh_softmin.py | 10 +++++++++
 .../unit_tests/operations/test_moreh_sum.py | 2 ++
 .../unit_tests/operations/test_new_conv2d.py | 2 +-
 tests/ttnn/unit_tests/operations/test_pad.py | 2 ++
 .../test_paged_fused_update_cache.py | 2 ++
 .../operations/test_paged_update_cache.py | 8 +++++++
 .../ttnn/unit_tests/operations/test_repeat.py | 22 +++++++++++++++++++
 .../unit_tests/operations/test_sampling.py | 8 +++++--
 .../ttnn/unit_tests/operations/test_slice.py | 7 +++++-
 .../unit_tests/operations/test_softmax.py | 2 ++
 .../operations/test_ssm_1d_sum_reduce.py | 2 ++
 .../operations/test_ssm_prefix_scan.py | 2 ++
 ...t_ssm_repeat_and_interleave_eltwise_mul.py | 2 ++
 .../unit_tests/operations/test_uniform.py | 4 +++-
 tests/ttnn/unit_tests/test_expand.py | 5 ++++-
 tests/ttnn/unit_tests/test_reshape.py | 1 +
 74 files changed, 324 insertions(+), 37 deletions(-)

diff --git a/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py b/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py
index a5c98018f8a..050e7b9dce4 100644
--- a/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py
+++ b/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py
@@ -398,3 +398,12 @@ def test_bert_batch_dram_with_program_cache(
         PERF_CNT,
         device,
     )
+
+    if model_config_str == "BFLOAT8_B-SHARDED":
+        assert device.num_program_cache_entries() == 19
+    elif batch == 8 and model_config_str == "MIXED_PRECISION_BATCH8":
+        assert device.num_program_cache_entries() == 17
+    elif batch == 9 and model_config_str in {"BFLOAT8_B-L1", "BFLOAT8_B-DRAM"}:
+        assert device.num_program_cache_entries() == 17
+    else:
+        assert device.num_program_cache_entries() == 16
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py
index 4714dbab498..4c99e3974bd 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_concatenate_heads.py
@@ -101,3 +101,5 @@ def test_bert_large_concatenate_heads_with_program_cache(device, use_program_cac
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py
index 0cbf02e4d5a..96cb04e56e5 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff1_matmul.py
@@ -203,3 +203,5 @@ def test_bert_large_ff1_matmul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py
index 34c9a5900d7..8b212c5699d 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_ff2_matmul.py
@@ -163,3 +163,5 @@ def test_bert_large_ff2_matmul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py
index 56a138dcec4..4d2c3660b9b 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_fused_qkv_matmul.py
@@ -163,3 +163,5 @@ def test_bert_large_fused_qkv_matmul_with_program_cach
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py
index 3d296be58bb..890d6cea1ee 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_post_softmax_bmm.py
@@ -120,3 +120,5 @@ def test_bert_large_post_softmax_bmm_with_program_cach
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py
index f383215a01f..3ade03968a3 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_pre_softmax_bmm.py
@@ -113,3 +113,5 @@ def test_bert_large_pre_softmax_bmm_with_program_cache
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py
index f03dc887d4d..bcb2b165ba8 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_selfout_matmul.py
@@ -162,3 +162,5 @@ def test_bert_large_selfout_matmul_with_program_cache(device, use_program_cache)
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py
index 410513f29d4..fcfafffec55 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_and_transform_qkv_heads.py
@@ -126,3 +126,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py
index 1c230f32d41..a4a0d76844f 100644
--- a/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py
+++ b/models/experimental/bert_large_performant/unit_tests/test_bert_large_split_query_key_value_and_split_heads.py
@@ -127,3 +127,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py
index 631fd135d04..68bdc6501ad 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_attn_matmul.py
@@ -339,7 +339,7 @@ def test_group_attn_matmul_with_program_cache(
     else:
         output_mem_config = interleaved_mem_config
 
-    num_cache_entries_start = 0
+    num_cache_entries_start = device.num_program_cache_entries()
     tt_output_tensor_on_device = ttnn.experimental.group_attn_matmul(
         tt_input_tensor_a,
         tt_input_tensor_b,
@@ -347,7 +347,7 @@
         memory_config=output_mem_config,
         dtype=output_dtype,
     )
-    num_cache_entries += 0 - num_cache_entries_start
+    num_cache_entries += device.num_program_cache_entries() - num_cache_entries_start
 
     if sharded:
         tt_output_tensor_on_device = ttnn.sharded_to_interleaved(
@@ -363,6 +363,8 @@
     allclose, output = comp_pcc(tt_output_tensor, golden_output_tensor)
     assert allclose, f"FAILED: {output}"
 
+    assert num_cache_entries == 1
+
     device.enable_async(False)
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py
index efdc4e98355..16834c94dbb 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_post_allgather.py
@@ -194,3 +194,7 @@ def test_layernorm_part_2_with_program_cache2(inp_shape, n_devices, is_rmsnorm,
         )
     )
     run_layernorm_part_2(inp_shape, n_devices, is_rmsnorm, dtype, dtype, device)
+
+    assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
+        device.num_program_cache_entries()
+    )
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py
index 17596a94169..83648cdf223 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_distributed_layernorm_pre_allgather.py
@@ -269,3 +269,7 @@ def test_layernorm_part_1_with_program_cache2(
         )
     )
     run_layernorm_part_1(inp_shape, n_devices, is_rmsnorm, input_dtype, output_dtype, device)
+
+    assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
+        device.num_program_cache_entries()
+    )
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py
index cf375694fea..028583664b0 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_gather_in0.py
@@ -329,6 +329,9 @@ def run_multi_core_matmul_1d(
     assert passing
 
+    # Check program cache
+    assert device.num_program_cache_entries() == 1  # Only 1 op
+
 
 @pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32")
 @pytest.mark.skipif(is_blackhole(), reason="Test suite for GS only")
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
index 44100d8e44d..423861dd172 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
@@ -271,6 +271,7 @@ def test_matmul_in1_dram_sharded_with_program_cache(
             buffer_type=ttnn.BufferType.DRAM,
         )
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, in0_dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+    assert device.num_program_cache_entries() == 3
 
 
 def run_test_matmul_in1_dram_sharded_mm_chain(
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
index 20142bffdd5..9ff5dd915a4 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
@@ -109,3 +109,5 @@ def test_move_op_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py
index d7013094e18..bce8e5e91cf 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_concat_heads.py
@@ -103,3 +103,5 @@ def test_nlp_concat_heads_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py
index 6e21fb642f9..e0eb4589021 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads.py
@@ -119,6 +119,8 @@ def test_nlp_create_qkv_heads_falcon7b_with_program_ca
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
 
+    assert device.num_program_cache_entries() == 2
+
 
 """
 Generic shapes + functionality
@@ -363,6 +365,8 @@ def test_nlp_create_qkv_heads_with_program_cache(device, use_program_cache):
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
 
+    assert device.num_program_cache_entries() == 2
+
 
 def run_sharded_nlp_create_qkv_heads_test(
     batch,
@@ -527,3 +531,5 @@ def test_sharded_nlp_create_qkv_heads_with_program_cache(device, use_program_cac
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py
index c361ec37be0..bcfce9a534d 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_decode.py
@@ -411,6 +411,7 @@ def test_create_heads_with_slice(
     )
     # BH does s2i and i2s inside of to_device and from_device as device ops
    expected_entries = 1 if not is_blackhole() else 4 if overlap_coregrid else 5
+    assert device.num_program_cache_entries() == expected_entries
 
 
 @pytest.fixture()
@@ -462,6 +463,7 @@ def test_create_min_width_shard_subcoregrid(
         overlap_coregrid=overlap_coregrid,
         sub_core_grids=sub_core_grids,
     )
+    assert device.num_program_cache_entries() == 1, "Only one Op program cache should exist"
 
 
 def run_test_create_width_shard_by_head(
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py
index 7e9ccd8d61f..7135cb3517f 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py
@@ -106,3 +106,5 @@ def test_nlp_create_qkv_heads_segformer_with_program_c
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py
index e6c21497a19..5e71490bfc0 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py
@@ -119,3 +119,5 @@ def test_nlp_create_qkv_heads_vit_with_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py
index 661ba847ad4..12f87d94346 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_kv_cache_load_slice.py
@@ -39,7 +39,7 @@ def unpadding_test(
     # Pytorch reference
     test_tensor_ref = inp[:, :, seq_len_start:seq_len_end]
 
-    return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), 0
+    return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), device.num_program_cache_entries()
 
 
 @pytest.mark.parametrize(
@@ -120,6 +120,7 @@ def test_run_unpadding_test(
         dtype,
     )
     assert a_pt.shape == a_ref.shape
+    assert num_cache_entries == 2
     if dtype == ttnn.bfloat8_b:
         # inevitable precision loss for bfloat8_b
         eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
@@ -147,6 +148,7 @@ def test_run_unpadding_test(
         dtype,
     )
     assert a_pt.shape == a_ref.shape
+    assert num_cache_entries == 3
     if dtype == ttnn.bfloat8_b:
         # inevitable precision loss for bfloat8_b
         eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py
index 07210afcc1d..1f51f5eb372 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_reshard.py
@@ -367,6 +367,8 @@ def test_reshard_with_program_cache(
     passing, output = comp_pcc(torch_tensor1, torch_tensor_after_round_trip1)
     assert passing, output
 
+    assert device.num_program_cache_entries() == 3
+
 
 @skip_for_blackhole("GH Issue #15234")
 @pytest.mark.parametrize(
@@ -617,3 +619,5 @@ def test_dram_reshard_with_program_cache(
     dummy_tensor = (
         ttnn.Tensor(torch.rand([2, 2, 128, 64]), dtype).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
     )
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py
index 616d1e61fb7..01ea4b5858a 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py
@@ -458,3 +458,5 @@ def test_rotary_embedding_llama_with_program_cache(
 
     if batch % ttnn.TILE_SIZE != 0:
         num_ops += 1  # slice
+
+    assert device.num_program_cache_entries() == num_ops
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py
index c400ee590c3..1f4aaca24a8 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py
@@ -136,3 +136,5 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(
 
     if (batch * 2) % ttnn.TILE_SIZE != 0:
         num_ops += 1  # slice
+
+    assert device.num_program_cache_entries() == num_ops
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py
index 47e75d22d21..9bc75655c85 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py
@@ -214,6 +214,8 @@ def test_sdpa_tt_with_program_cache(device, b, nh, nkv, s, d, q_chunk_size, k_ch
     for _ in range(2):
         run_test_sdpa_tt(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dtype)
 
+    assert device.num_program_cache_entries() == 1
+
 
 def run_sdpa_noncausal(device, b, nh, nkv, sq, d, q_chunk_size, k_chunk_size, dtype, sk=None, use_mask=True):
     torch.manual_seed(1234)
@@ -500,6 +502,11 @@ def test_sdpa_chunked(
         use_high_precision_compute,
     )
 
+    # Print number of program cache entries
+    assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
+        device.num_program_cache_entries()
+    )
+
 
 @skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.skipif(is_watcher_enabled(), reason="Kernel OOM with watcher enabled")
@@ -553,6 +560,11 @@ def test_sdpa_chunked_iterate_batch(
         grid_size=(1, 1),
     )
 
+    # Print number of program cache entries
+    assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
+        device.num_program_cache_entries()
+    )
+
 
 def run_test_joint_sdpa(
     device,
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
index f32420f6c0c..1ac916d2413 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
@@ -567,6 +567,7 @@ def test_sdpa_decode_non_causal(device, b, nh, nkv, s, d, dtype, grid_size, q_dt
     run_test_sdpa_decode_single_iter(
         device, b, nh, nkv, s, d, dtype, grid_size, q_dtype, sharded_in=False, sharded_out=False, causal=False
     )
+    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
@@ -886,6 +887,8 @@ def test_sdpa_decode_paged_attention(
         sharded_out=False,
     )
 
+    assert device.num_program_cache_entries() == 4
+
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
@@ -986,6 +989,7 @@ def test_sdpa_decode_sharded_on_subcoregrids(
         start_core=start_core,
         sub_core_grids=sub_core_grids,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Unsupported on BH, see #12349")
@@ -1150,6 +1154,8 @@ def test_sdpa_decode_program_cache(device, b, nh, nkv, s, d, dtype, use_program_
         cur_pos_tensor=True,
     )
 
+    assert device.num_program_cache_entries() == 4
+
 
 def run_test_sdpa_decode_ndpcc(device, b, nh, nkv, s, d, dtype, grid_size, q_dtype=ttnn.bfloat16):
     compute_grid_size = device.compute_with_storage_grid_size()
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
index eee3c1c9d04..3cd7f275927 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -52,6 +52,9 @@ def transpose(
         logger.info(output)
         assert passing
 
+    if expected_program_cache_size != None:
+        assert device.num_program_cache_entries() == expected_program_cache_size
+
 
 @pytest.mark.parametrize(
     "dtype",
@@ -383,6 +386,7 @@ def test_transpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cach
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 @skip_for_blackhole("Mismatching on BH, see #12349")
@@ -474,6 +478,7 @@ def test_transpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, use_prog
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3
 
 
 @pytest.mark.parametrize("n", [16])
@@ -534,6 +539,7 @@ def test_transpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cach
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 def run_transpose_hc_sharded(device, n, c, h, w, grid_size):
@@ -595,6 +601,7 @@ def test_transpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size,
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3
 
 
 @pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/test_backward_embedding.py b/tests/ttnn/unit_tests/operations/test_backward_embedding.py
index d2ded07ad16..102490cee3f 100644
--- a/tests/ttnn/unit_tests/operations/test_backward_embedding.py
+++ b/tests/ttnn/unit_tests/operations/test_backward_embedding.py
@@ -120,3 +120,5 @@ def test_embedding_bw_with_program_cache(
         logger.debug(comp_out)
 
     assert comp_pass
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/ttnn/unit_tests/operations/test_bernoulli.py b/tests/ttnn/unit_tests/operations/test_bernoulli.py
index 58d6af052aa..c2c21a61f6e 100644
--- a/tests/ttnn/unit_tests/operations/test_bernoulli.py
+++ b/tests/ttnn/unit_tests/operations/test_bernoulli.py
@@ -95,11 +95,13 @@ def test_bernoulli_callback(shape, seed, in_dtype, out_dtype, device, is_out_all
         run_bernoulli(shape, in_dtype, out_dtype, device, seed=seed, is_out_alloc=is_out_alloc)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())
         # Cache must hit when we change seed and seed runtime arg is overrode
         seed = seed + 1
 
     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
 
 
 @skip_for_grayskull("Requires wormhole_b0 to run")
diff --git a/tests/ttnn/unit_tests/operations/test_clone.py b/tests/ttnn/unit_tests/operations/test_clone.py
index 45f69d45511..e928d6e29c9 100644
--- a/tests/ttnn/unit_tests/operations/test_clone.py
+++ b/tests/ttnn/unit_tests/operations/test_clone.py
@@ -243,5 +243,7 @@ def test_clone_callback(
         )
         torch_dummy = torch.randn([32, 32])
         ttnn_dummy = ttnn.from_torch(torch_dummy, device=device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())
     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/operations/test_convert_to_chw.py b/tests/ttnn/unit_tests/operations/test_convert_to_chw.py
index ebbc39f0029..66bbee701f6 100644
--- a/tests/ttnn/unit_tests/operations/test_convert_to_chw.py
+++ b/tests/ttnn/unit_tests/operations/test_convert_to_chw.py
@@ -110,3 +110,5 @@ def test_convert_to_chw_with_program_cache(device, use_program_cache):
         tt_dummy_tensor = (
             ttnn.Tensor(py_dummy_tensor, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
         )
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py b/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
index 881628ffe67..bc12c5d61c6 100644
--- a/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
+++ b/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
@@ -269,6 +269,7 @@ def run_pre_allgather_layernorm(
             tt_ex2, torch_ex2, atol=max_atol_ex2
         ), f"E(x^2) mismatch for device {d} (atol: {atol_delta_ex2})"
 
+    assert device.num_program_cache_entries() == 2, "Program cache not working as expected"
     logger.info("Pre-allgather layernorm test passed for all devices")
 
 
diff --git a/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py b/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py
index 5e8a6072c3f..f1b5f8306fa 100644
--- a/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py
+++ b/tests/ttnn/unit_tests/operations/test_fast_reduce_nc.py
@@ -164,6 +164,7 @@ def test_fast_reduce_nc_with_prgm_caching(dims, device, use_program_cache):
         logger.debug(f"Output pcc={output_pcc}")
 
         assert passing
+        assert device.num_program_cache_entries() == len(dims) + 1
 
     input_shape_2 = [1, 8, 32, 32]
     output_shape_2 = input_shape_2.copy()
@@ -189,3 +190,4 @@ def test_fast_reduce_nc_with_prgm_caching(dims, device, use_program_cache):
         logger.debug(f"Output pcc={output_pcc}")
 
         assert passing
+        assert device.num_program_cache_entries() == 2 * len(dims) + 1
diff --git a/tests/ttnn/unit_tests/operations/test_full_like.py b/tests/ttnn/unit_tests/operations/test_full_like.py
index 5ff2693c3f4..cbac2a9d28b 100644
--- a/tests/ttnn/unit_tests/operations/test_full_like.py
+++ b/tests/ttnn/unit_tests/operations/test_full_like.py
@@ -111,6 +111,11 @@ def test_full_like_callback(device, input_shape, fill_value, layout, use_program
         tt_output = ttnn.moreh_full_like(tt_input, fill_value)
         assert ttnn.is_tensor_storage_on_device(tt_output)
         tt_output_cpu = ttnn.to_torch(tt_output)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries
         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_index_fill.py b/tests/ttnn/unit_tests/operations/test_index_fill.py
index a8b7be1ba4e..8935f5c5bab 100644
--- a/tests/ttnn/unit_tests/operations/test_index_fill.py
+++ b/tests/ttnn/unit_tests/operations/test_index_fill.py
@@ -129,5 +129,10 @@ def test_index_fill_callback(shape, dim, value, device, use_program_cache):
     torch.manual_seed(2024)
     for i in range(2):
         run_index_fill_test(shape, dim, value, torch.int32, device)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries
         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py b/tests/ttnn/unit_tests/operations/test_matmul.py
index 1468d0e12d5..1bb4cb64bf6 100644
--- a/tests/ttnn/unit_tests/operations/test_matmul.py
+++ b/tests/ttnn/unit_tests/operations/test_matmul.py
@@ -631,6 +631,7 @@ def test_matmul_2d_multiple_output_blocks_per_core(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 def run_matmul_2d_tiny_tile(
@@ -791,6 +792,7 @@ def test_matmul_2d_tiny_tile(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1
 
 
 def run_matmul_1d_tiny_tile(
@@ -954,6 +956,7 @@ def test_matmul_1d_tiny_tile(
         device=device,
memory_config=ttnn.L1_MEMORY_CONFIG, ) + assert device.num_program_cache_entries() == 1 def run_matmul_1d_multiple_output_blocks_per_core( @@ -1179,6 +1182,7 @@ def test_matmul_1d_multiple_output_blocks_per_core( device=device, memory_config=ttnn.L1_MEMORY_CONFIG, ) + assert device.num_program_cache_entries() == 1 @pytest.mark.parametrize("side", ["height", "width"]) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_adam.py b/tests/ttnn/unit_tests/operations/test_moreh_adam.py index 9c6caecefbe..ba0095d4339 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_adam.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_adam.py @@ -170,8 +170,10 @@ def test_moreh_adam_callback(params, device, use_program_cache): run_moreh_adam(shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -190,8 +192,11 @@ def test_moreh_adam_caching(params, device, use_program_cache): run_moreh_adam(shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en, device, step=i) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] num_program_cache_entries_list = [] for i in range(4): @@ -203,5 +208,8 @@ def test_moreh_adam_caching(params, device, use_program_cache): run_moreh_adam(shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_adamw.py b/tests/ttnn/unit_tests/operations/test_moreh_adamw.py index 1ee583ae00d..0e7f65510b4 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_adamw.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_adamw.py @@ -224,8 +224,10 @@ def test_moreh_adamw_callback(shape, lr, betas, eps, weight_decay, amsgrad, step run_moreh_adamw(shape, lr, betas, eps, weight_decay, amsgrad, step, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -277,8 +279,11 @@ def test_moreh_adamw_cache(shape, lr, betas, eps, weight_decay, amsgrad, device, run_moreh_adamw(shape, lr, betas, eps, weight_decay, amsgrad, step, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + 
num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] num_program_cache_entries_list = [] for _ in range(4): @@ -288,5 +293,8 @@ def test_moreh_adamw_cache(shape, lr, betas, eps, weight_decay, amsgrad, device, run_moreh_adamw(shape, lr, betas, eps, weight_decay, amsgrad, 8, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + for i in range(1, 4): + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[i] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_arange.py b/tests/ttnn/unit_tests/operations/test_moreh_arange.py index 82a03ddcbf0..f7717e59ff1 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_arange.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_arange.py @@ -123,5 +123,7 @@ def test_arange_callback(start_end_step, optional_output, dtype, device, use_pro run_moreh_arange(start_end_step, optional_output, dtype, True, device) torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_bmm.py b/tests/ttnn/unit_tests/operations/test_moreh_bmm.py index f03207155ce..72c1e6bc8fb 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_bmm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_bmm.py @@ -257,8 +257,10 @@ def test_moreh_bmm_callback(shape, device, use_program_cache): run_moreh_bmm(shape, True, False if is_grayskull() else True, device) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -336,5 +338,7 @@ def test_moreh_bmm_backward_callback(requires_grad, device, use_program_cache): run_moreh_bmm_backward([7, 511, 313, 765], requires_grad, False if is_grayskull() else True, device) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py b/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py index 6b533e3f4bf..34048c0ec00 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py +++ 
b/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py @@ -189,6 +189,7 @@ def test_moreh_cumsum_callback(input_shape, dim, device, use_program_cache): logger.debug(f"Output pcc={output_pcc}") assert passing + assert device.num_program_cache_entries() == 1 @pytest.mark.parametrize( @@ -240,3 +241,4 @@ def test_moreh_cumsum_backward_callback(input_shape, dim, device, use_program_ca logger.debug(f"Output pcc={output_pcc}") assert passing + assert device.num_program_cache_entries() == 1 diff --git a/tests/ttnn/unit_tests/operations/test_moreh_dot.py b/tests/ttnn/unit_tests/operations/test_moreh_dot.py index dd18f43f7c2..3719883defe 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_dot.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_dot.py @@ -119,6 +119,11 @@ def test_moreh_matmul_1d_callback(input_shape, dtype, device, use_program_cache) run_moreh_dot_test(input_shape, dtype, device) torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == num_program_cache_entries @pytest.mark.parametrize( diff --git a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py index 00274fd53df..51b443396bf 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py @@ -182,8 +182,10 @@ def test_moreh_dot_backward_callback( num_program_in_cache = [] for i in range(2): run_moreh_dot_backward(input_shape, requires_grad, device) - num_program_in_cache.append(0) + num_program_in_cache.append(device.num_program_cache_entries()) dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(dummy, device=device) logger.info(f"num_program_in_cache={num_program_in_cache}") + assert num_program_in_cache[0] > 0 + assert num_program_in_cache[0] == num_program_in_cache[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_fold.py b/tests/ttnn/unit_tests/operations/test_moreh_fold.py index f5707164d4d..4da4c78b14a 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_fold.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_fold.py @@ -84,5 +84,7 @@ def test_fold_callback( # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_full.py b/tests/ttnn/unit_tests/operations/test_moreh_full.py index 8af9c3ff813..12e08f19166 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_full.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_full.py @@ -101,5 +101,10 @@ def test_full_callback(device, input_shape, fill_value, layout, use_program_cach tt_output_cpu = ttnn.to_torch(tt_output) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == 
num_program_cache_entries assert torch.equal(torch_output, tt_output_cpu) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_getitem.py b/tests/ttnn/unit_tests/operations/test_moreh_getitem.py index 8d5770bbedd..f42b323b4af 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_getitem.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_getitem.py @@ -275,8 +275,10 @@ def test_getitem_RAW_MAJOR_callback(shape_index_dim, dtype, index_size, device, run_getitem_RAW_MAJOR(shape_index_dim, dtype, index_size, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_blackhole("Mismatching on Blackhole, see #12349") @@ -823,5 +825,7 @@ def test_getitem_tilized_one_index_callback( run_moreh_geitem_tilized_one_index(shape_index_dim, dtype, index_size, row_major_index, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py b/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py index 92e70d4bfa0..ee35d8244e7 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_group_norm.py @@ -330,8 +330,10 @@ def test_moreh_group_norm_callback(N, C_num_groups, HW, eps, affine, compute_mea run_test_moreh_group_norm(N, C_num_groups, HW, eps, affine, compute_mean_rstd, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] def run_test_moreh_group_norm_backward( @@ -539,5 +541,7 @@ def test_moreh_group_norm_backward_callback( ) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py b/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py index 16ee75454be..f5bec45afd9 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_layer_norm.py @@ -656,8 +656,10 @@ def test_moreh_layer_norm_callback( run_moreh_layer_norm(input_shape_normalized_dims, elementwise_affine, eps, dtype, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + 
num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_grayskull("Using the transpose function in copy_tile causes a hang.") @@ -695,8 +697,10 @@ def test_moreh_layer_norm_backward_callback( run_moreh_layer_norm_backward(input_shape_normalized_dims, elementwise_affine, eps, dtype, device) torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_grayskull("Using the transpose function in copy_tile causes a hang.") diff --git a/tests/ttnn/unit_tests/operations/test_moreh_linear.py b/tests/ttnn/unit_tests/operations/test_moreh_linear.py index f28c35403d3..d5368466b29 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_linear.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_linear.py @@ -223,8 +223,10 @@ def test_moreh_linear_enable_cache(shapes, device, use_program_cache): assert passing torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] def moreh_linear_backward( @@ -384,8 +386,10 @@ def test_moreh_linear_backward_enable_cache(shapes, device, use_program_cache): assert passing torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @skip_for_grayskull("GS does not support fp32") diff --git a/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py b/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py index 39786a172af..94723e12e30 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py @@ -482,6 +482,11 @@ def test_logsoftmax_callback(shape_dim_strategy, dtype, device, use_program_cach for i in range(2): run_moreh_logsoftmax_test(shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == num_program_cache_entries torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) @@ -512,5 +517,10 @@ def test_logsoftmax_backward_callback(shape_dim_strategy, dtype, device, use_pro run_moreh_logsoftmax_backward_test( shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy ) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert 
num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == num_program_cache_entries torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_matmul.py b/tests/ttnn/unit_tests/operations/test_moreh_matmul.py index 5749686c305..bc0c95a2cfb 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_matmul.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_matmul.py @@ -317,6 +317,7 @@ def test_moreh_matmul_enable_cache(params, device, use_program_cache): params = tuple(param_list) passing = moreh_matmul(params, False, None, device) assert passing + assert device.num_program_cache_entries() == 2 @skip_for_grayskull("GS does not support fp32") diff --git a/tests/ttnn/unit_tests/operations/test_moreh_mean.py b/tests/ttnn/unit_tests/operations/test_moreh_mean.py index 69f66a5abf7..19db48091d9 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_mean.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_mean.py @@ -195,8 +195,10 @@ def test_moreh_mean_callback(input_shape_dim, device, use_program_cache): run_moreh_mean(input_shape_dim, device, keepdim=True) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( @@ -264,8 +266,10 @@ def test_moreh_mean_backward_callback(input_shape_dim, device, use_program_cache run_moreh_mean_backward(input_shape_dim, device, keepdim=True) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize( diff --git a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py index 887a9db9d17..9015b77d65d 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss.py @@ -204,9 +204,13 @@ def test_moreh_nll_loss_callback(shape, reduction, device, use_program_cache): torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) @pytest.mark.parametrize( @@ -280,9 +284,13 @@ def test_moreh_nll_loss_backward_test_callback(shape, reduction_mean, device, us torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == 
num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) @pytest.mark.parametrize( diff --git a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py index 15af1d6145f..1f3e9b701aa 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_nll_loss_unreduced.py @@ -196,9 +196,13 @@ def test_moreh_nll_loss_unreduced_callback(shape, device, use_program_cache): torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) @pytest.mark.parametrize( @@ -250,6 +254,10 @@ def test_moreh_nll_loss_unreduced_backward_test_callback(shape, none_weight, dev torch_dummy = torch.randn([32, 32]) tt_dummy = to_ttnn(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert ( + num_program_cache_entries_list[0] == num_program_cache_entries_list[1] + and num_program_cache_entries_list[2] == num_program_cache_entries_list[3] + ) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_norm.py b/tests/ttnn/unit_tests/operations/test_moreh_norm.py index e4487797305..8526d30d5d0 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_norm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_norm.py @@ -387,8 +387,10 @@ def test_moreh_norm_callback(dim_rtol_atol, keepdim, device, is_linalg_vector_no ) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] @pytest.mark.parametrize("p", [2.0, 2.5, -2.5]) @@ -521,5 +523,7 @@ def test_moreh_norm_backward_callback(dim_rtol_atol, keepdim, device, is_linalg_ ) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] diff --git a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py index 7f4c43a12a6..841da226892 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py @@ -299,7 +299,7 @@ def forward(self, x): ) torch_dummy = torch.randn([32, 32]) tt_dummy = ttnn.from_torch(torch_dummy, device=device) - num_program_cache_entries_list.append(0) + num_program_cache_entries_list.append(device.num_program_cache_entries()) assert dev_param_in.shape == list(model.weight.shape) # check param_out @@ 
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py
index 7f4c43a12a6..841da226892 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_sgd.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_sgd.py
@@ -299,7 +299,7 @@ def forward(self, x):
         )
         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

         assert dev_param_in.shape == list(model.weight.shape)

         # check param_out
@@ -322,3 +322,5 @@ def forward(self, x):
         assert passing

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_softmax.py b/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
index 5e9fb768800..5c414f9b191 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
@@ -438,6 +438,11 @@ def test_softmax_callback(shape_dim_strategy, dtype, device, use_program_cache):
     for i in range(2):
         run_moreh_softmax_test(shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
@@ -467,6 +472,11 @@ def test_softmax_backward_callback(shape_dim_strategy, dtype, device, use_progra
         run_moreh_softmax_backward_test(
             shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy
         )
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_softmin.py b/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
index 1e2fd275808..ba9f6e317f8 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
@@ -475,6 +475,11 @@ def test_softmin_callback(shape_dim_strategy, dtype, device, use_program_cache):
     rtol = atol = 0.05
     for i in range(2):
         run_moreh_softmin_test(shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
@@ -504,5 +509,10 @@ def test_softmin_backward_callback(shape_dim_strategy, dtype, device, use_progra
         run_moreh_softmin_backward_test(
             shape, dim, dtype, ttnn.TILE_LAYOUT, device, rtol, atol, True, strategy=strategy
         )
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries

         torch_dummy = torch.randn([32, 32])
         tt_dummy = ttnn.from_torch(torch_dummy, device=device)
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_sum.py b/tests/ttnn/unit_tests/operations/test_moreh_sum.py
index ca77c19d234..5e8616b99a8 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_sum.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_sum.py
@@ -251,6 +251,7 @@ def test_moreh_sum_enable_cache(input_shape, dim, device, use_program_cache):
     for i in range(2):
         passing = moreh_sum(input_shape, dim, keepdim[i], use_provide_output[i], False, device)
         assert passing
+    assert device.num_program_cache_entries() == 2


 @pytest.mark.parametrize(
@@ -434,6 +435,7 @@ def test_moreh_sum_backward_enable_cache(input_shape, dim, device, use_program_c
     for i in range(2):
         passing = moreh_sum_backward(input_shape, dim, keepdim[i], use_provide_output[i], False, device)
         assert passing
+    assert device.num_program_cache_entries() == num_cache_entires[dim]


 @pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index bc19ff76498..362db0c940c 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -75,11 +75,11 @@ def run_conv(
     activation="",
 ):
     if isinstance(device, ttnn.MeshDevice):
-        num_devices = len(device.get_device_ids())
         if num_devices != 1:
             assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh"
             assert weight_mesh_mapper is not None, "Expected mesh mapper for weight tensors when using device mesh"
             assert output_mesh_composer is not None, "Expected mesh composer for output tensor when using device mesh"
+        num_devices = len(device.get_device_ids())
         total_batch_size = num_devices * batch_size  # Batch size across all devices
         logger.info(f"Using {num_devices} devices for this test")
     else:
diff --git a/tests/ttnn/unit_tests/operations/test_pad.py b/tests/ttnn/unit_tests/operations/test_pad.py
index f245489fd2e..00ef1461791 100644
--- a/tests/ttnn/unit_tests/operations/test_pad.py
+++ b/tests/ttnn/unit_tests/operations/test_pad.py
@@ -72,6 +72,7 @@ def test_pad_rm_with_program_cache(device, n, c, h, w, padding, torch_padding, v
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 def run_pad_rm_sharded(device, n, c, h, w, padding, torch_padding, value, shard_orient):
@@ -270,6 +271,7 @@ def test_pad_rm_sharded(device, n, c, h, w, padding, torch_padding, value, shard
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3


 @pytest.mark.parametrize("h", [32])
diff --git a/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py b/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py
index e36b249d485..d41489271b4 100644
--- a/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py
+++ b/tests/ttnn/unit_tests/operations/test_paged_fused_update_cache.py
@@ -293,3 +293,5 @@ def test_paged_fused_update_cache_decode_program_caching(
             device,
             pcc,
         )
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/ttnn/unit_tests/operations/test_paged_update_cache.py b/tests/ttnn/unit_tests/operations/test_paged_update_cache.py
index 97785a0f62f..5b0a628c578 100644
--- a/tests/ttnn/unit_tests/operations/test_paged_update_cache.py
+++ b/tests/ttnn/unit_tests/operations/test_paged_update_cache.py
@@ -251,6 +251,8 @@ def test_update_cache_decode_program_cache(
             cache_idx + 1, False, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )

+    assert device.num_program_cache_entries() == 1
+

 def run_test_tensor_index_update_cache_decode(
     cache_idx, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
@@ -359,6 +361,8 @@ def test_tensor_index_update_cache_decode_program_cache(
             cache_idx, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )

+    assert device.num_program_cache_entries() == 1
+

 def run_test_paged_update_cache_decode(
     cache_idx, block_size, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
@@ -553,6 +557,8 @@ def test_paged_update_cache_decode_program_caching(
             cache_idx + 10, block_size, head_dim, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )

+    assert device.num_program_cache_entries() == 1
+

 def run_test_paged_fill_cache(
     block_size, head_dim, user_seq_len, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
@@ -664,3 +670,5 @@ def test_paged_fill_cache_program_cache(
         run_test_paged_fill_cache(
             block_size, head_dim, user_seq_len, max_seq_len, num_users, num_heads, input_dtype, cache_dtype, device
         )
+
+    assert device.num_program_cache_entries() == 1
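[Review note: the test_repeat.py hunks below keep the failure message outside
the parenthesized condition, i.e. "assert (cond), msg" rather than
"assert (cond, msg)": the latter asserts a two-element tuple, which is always
truthy, so the check could never fail. A minimal illustration:

    assert (1 == 2, "oops")  # passes silently: a non-empty tuple is truthy
    assert 1 == 2, "oops"    # fails with AssertionError: oops, as intended
]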
diff --git a/tests/ttnn/unit_tests/operations/test_repeat.py b/tests/ttnn/unit_tests/operations/test_repeat.py
index 6a975a050fb..c10efdff258 100644
--- a/tests/ttnn/unit_tests/operations/test_repeat.py
+++ b/tests/ttnn/unit_tests/operations/test_repeat.py
@@ -97,6 +97,13 @@ def test_pc_repeat(device, layout, shape, repeat_shape, use_program_cache):
         ), f"Output shape {output.shape} does not match torch shape {torch_results[i].shape}"

         assert_with_pcc(torch_results[i], output, 0.9999)
+        if i == 0:
+            base_program_cache_entries = device.num_program_cache_entries()
+        else:
+            assert (
+                device.num_program_cache_entries()
+                == base_program_cache_entries
+            ), "program cache entries differ on same configs"


 # 17975 test cases
@@ -105,6 +112,7 @@ def test_pc_repeat(device, layout, shape, repeat_shape, use_program_cache):
 def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
     y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16)
     y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+    base_program_cache_entries = device.num_program_cache_entries()
     x = torch.zeros((64, 1, 256, 384), dtype=torch.bfloat16)
     x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -116,6 +124,10 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
         assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y"

     for _ in range(num_iters):
         y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+        assert (
+            device.num_program_cache_entries()
+            == base_program_cache_entries
+        ), "program cache entries differ on same configs"
         x = torch.zeros((64, 1, 256, 384), dtype=torch.bfloat16)
         x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -128,6 +140,7 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):

     y = torch.rand((1, 1, 32, 32), dtype=torch.bfloat16)
     y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+    base_program_cache_entries = device.num_program_cache_entries()
     x = torch.zeros((4, 1, 32, 32), dtype=torch.bfloat16)
     x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -141,6 +154,10 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
         assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y"

     for _ in range(num_iters):
         y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+        assert (
+            device.num_program_cache_entries()
+            == base_program_cache_entries
+        ), "program cache entries differ on same configs"
         x = torch.zeros((4, 1, 32, 32), dtype=torch.bfloat16)
         x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
@@ -155,6 +172,7 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):

     y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16)
     y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+    base_program_cache_entries = device.num_program_cache_entries()

     z_tt = ttnn.repeat(y_tt, ttnn.Shape([64, 1, 1, 1]))
     for i in range(64):
@@ -163,6 +181,10 @@ def test_pc_with_different_shapes_in_sequence(device, use_program_cache):
     for _ in range(num_iters):
         y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16)
         y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+        assert (
+            device.num_program_cache_entries()
+            == base_program_cache_entries
+        ), "program cache entries differ on same configs"

         z_tt = ttnn.repeat(y_tt, ttnn.Shape([64, 1, 1, 1]))
         for i in range(64):
diff --git a/tests/ttnn/unit_tests/operations/test_sampling.py b/tests/ttnn/unit_tests/operations/test_sampling.py
index 86cab3ff93a..e9cf04a54a0 100644
--- a/tests/ttnn/unit_tests/operations/test_sampling.py
+++ b/tests/ttnn/unit_tests/operations/test_sampling.py
@@ -140,9 +140,11 @@ def test_sampling_callback(shape, k, p, seed, device, use_program_cache):
         run_sampling(shape, k, p, seed, device)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]


 @skip_for_grayskull("Requires wormhole_b0 to run")
@@ -166,6 +168,8 @@ def test_sampling_subcores_callback(shape, k, p, seed, device, sub_core_grids, u
         run_sampling(shape, k, p, seed, device, sub_core_grids)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/operations/test_slice.py b/tests/ttnn/unit_tests/operations/test_slice.py
index c10273c3748..0d6a7d043f9 100644
--- a/tests/ttnn/unit_tests/operations/test_slice.py
+++ b/tests/ttnn/unit_tests/operations/test_slice.py
@@ -79,6 +79,7 @@ def test_slice_rm_sharded_with_program_cache(device, n, c, h, w, use_program_cac
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 3


 @pytest.mark.parametrize("n", [16])
@@ -138,7 +139,7 @@ def slice_test(
             output_tensor_start[3] : output_tensor_end[3],
         ]

-    return a_pt, a_ref, 0
+    return a_pt, a_ref, device.num_program_cache_entries()


 @pytest.mark.parametrize(
@@ -202,6 +203,7 @@ def test_run_slice_test(
     assert a_pt.shape == a_ref.shape
     eq = torch.equal(a_pt, a_ref)
     assert eq
+    assert num_cache_entries == 1

     a_pt, a_ref, num_cache_entries = slice_test(
         ttnn.ROW_MAJOR_LAYOUT,
@@ -217,6 +219,7 @@ def test_run_slice_test(
     eq = torch.equal(a_pt, a_ref)
     assert eq
     # different width for row major
+    assert num_cache_entries == 2

     a_pt, a_ref, num_cache_entries = slice_test(
         ttnn.TILE_LAYOUT,
@@ -229,6 +232,7 @@ def test_run_slice_test(
         dtype,
     )
     # change from RM to TILE
+    assert num_cache_entries == 3
     assert a_pt.shape == a_ref.shape
     eq = torch.equal(a_pt, a_ref)
     assert eq
@@ -244,6 +248,7 @@ def test_run_slice_test(
         dtype,
     )
     # CACHE HIT
+    assert num_cache_entries == 4
     assert a_pt.shape == a_ref.shape
     eq = torch.equal(a_pt, a_ref)
     assert eq
diff --git a/tests/ttnn/unit_tests/operations/test_softmax.py b/tests/ttnn/unit_tests/operations/test_softmax.py
index 29c9785b4b0..c5378effb42 100644
--- a/tests/ttnn/unit_tests/operations/test_softmax.py
+++ b/tests/ttnn/unit_tests/operations/test_softmax.py
@@ -123,6 +123,7 @@ def test_softmax_stable_with_program_cache(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 def run_softmax_sharded_stable(
@@ -220,6 +221,7 @@ def test_softmax_sharded_stable_with_program_cache(
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 @pytest.mark.parametrize("batch_size", [1, 16])
diff --git a/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py b/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py
index 0cdfc767821..195f4799201 100644
--- a/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py
+++ b/tests/ttnn/unit_tests/operations/test_ssm_1d_sum_reduce.py
@@ -77,3 +77,5 @@ def test_ssm_1d_sum_reduce_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 2
diff --git a/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py b/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py
index 7aec14a3445..bd9a5114f85 100644
--- a/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py
+++ b/tests/ttnn/unit_tests/operations/test_ssm_prefix_scan.py
@@ -184,3 +184,5 @@ def test_ssm_prefix_scan_with_program_cache(device, use_program_cache):
         run_ssm_prefix_scan(L, E, N, num_cores, dtype, device)
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, dummy_memory_config)
+
+    assert device.num_program_cache_entries() == 1
diff --git a/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py b/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py
index 1a11e1d5df8..569eafadd97 100644
--- a/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py
+++ b/tests/ttnn/unit_tests/operations/test_ssm_repeat_and_interleave_eltwise_mul.py
@@ -107,3 +107,5 @@ def test_ssm_eltwise_mul_with_program_cache(device, use_program_cache):
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
         tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
+
+    assert device.num_program_cache_entries() == 3
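[Review note: in the test_uniform.py hunk below, the seed is a runtime argument
of the uniform program, so changing it between iterations must hit the cached
program rather than compile a new one. A sketch of that expectation (shape,
range, dtype, and starting seed are illustrative, and the exact count assumes
an empty program cache beforehand):

    for seed in (2024, 2025):
        run_uniform([1, 1, 32, 32], (0, 1), ttnn.bfloat16, device, seed=seed)
    assert device.num_program_cache_entries() == 1  # one program, two seeds
]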
diff --git a/tests/ttnn/unit_tests/operations/test_uniform.py b/tests/ttnn/unit_tests/operations/test_uniform.py
index 14a12d61bfd..032854ca282 100644
--- a/tests/ttnn/unit_tests/operations/test_uniform.py
+++ b/tests/ttnn/unit_tests/operations/test_uniform.py
@@ -130,12 +130,14 @@ def test_uniform_callback(shape, rand_range, dtype, seed, device, use_program_ca
         run_uniform(shape, rand_range, dtype, device, seed=seed)
         # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
         tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())

         # Cache must hit when we change seed and seed runtime arg is overrode
         seed = seed + 1

     logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]


 @skip_for_grayskull("Requires wormhole_b0 to run")
diff --git a/tests/ttnn/unit_tests/test_expand.py b/tests/ttnn/unit_tests/test_expand.py
index 40b99dbd4a5..e19daba93e8 100644
--- a/tests/ttnn/unit_tests/test_expand.py
+++ b/tests/ttnn/unit_tests/test_expand.py
@@ -47,4 +47,7 @@ def test_expand_callback(tensor_layout, device, use_program_cache):
     num_program_cache_entries_list = []
     for i in range(2):
         test_expand([32, 1], [32, 32], tensor_layout, device)
-        num_program_cache_entries_list.append(0)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())
+
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/test_reshape.py b/tests/ttnn/unit_tests/test_reshape.py
index 9ff10beba95..40fd7c15052 100644
--- a/tests/ttnn/unit_tests/test_reshape.py
+++ b/tests/ttnn/unit_tests/test_reshape.py
@@ -211,6 +211,7 @@ def test_reshape_hw_rm_with_program_cache(device, n, c, h, w, use_program_cache)
         device=device,
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
+    assert device.num_program_cache_entries() == 1


 @pytest.mark.parametrize("h", [32])