Commit 4de44fe

Revert "Revert "Revert "Change tests"""

This reverts commit 01ca8ac.

sminakov-tt committed Feb 28, 2025
1 parent 8475369 commit 4de44fe
Showing 74 changed files with 324 additions and 37 deletions.
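
Across the changed test files the reinstated pattern is the same: run the op under test (typically twice with identical shapes and configs, so the second run exercises the cache), then assert the expected value of device.num_program_cache_entries(). Below is a minimal sketch of that pattern, assuming the device and use_program_cache pytest fixtures used throughout these tests; the op, shapes, and expected entry count are illustrative and not taken from any specific file in this commit.

import torch
import ttnn


def test_some_op_with_program_cache(device, use_program_cache):
    for _ in range(2):
        # Same shape and config on both iterations, so the second run should hit the cache.
        torch_input = torch.randn(1, 1, 32, 32)
        tt_input = ttnn.from_torch(
            torch_input, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device
        )
        tt_output = ttnn.transpose(tt_input, -2, -1)

        # Dummy allocation so tensors created in the two iterations don't share the same address.
        tt_dummy = ttnn.from_torch(torch.randn(32, 32), device=device)

    # Expected count depends on how many distinct device programs ran; here the
    # transpose is assumed to be the only compiled program.
    assert device.num_program_cache_entries() == 1
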
@@ -398,3 +398,12 @@ def test_bert_batch_dram_with_program_cache(
PERF_CNT,
device,
)

if model_config_str == "BFLOAT8_B-SHARDED":
assert device.num_program_cache_entries() == 19
elif batch == 8 and model_config_str == "MIXED_PRECISION_BATCH8":
assert device.num_program_cache_entries() == 17
elif batch == 9 and model_config_str in {"BFLOAT8_B-L1", "BFLOAT8_B-DRAM"}:
assert device.num_program_cache_entries() == 17
else:
assert device.num_program_cache_entries() == 16
@@ -101,3 +101,5 @@ def test_bert_large_concatenate_heads_with_program_cache(device, use_program_cac
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -203,3 +203,5 @@ def test_bert_large_ff1_matmul_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -163,3 +163,5 @@ def test_bert_large_ff2_matmul_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -163,3 +163,5 @@ def test_bert_large_fused_qkv_matmul_with_program_cache(device, use_program_cach
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -120,3 +120,5 @@ def test_bert_large_post_softmax_bmm_with_program_cache(device, use_program_cach
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -113,3 +113,5 @@ def test_bert_large_pre_softmax_bmm_with_program_cache(device, use_program_cache
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -162,3 +162,5 @@ def test_bert_large_selfout_matmul_with_program_cache(device, use_program_cache)
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -126,3 +126,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype, device, ttnn.TILE_LAYOUT, mem_config)

assert device.num_program_cache_entries() == 2
@@ -127,3 +127,5 @@ def test_split_query_key_value_and_split_heads_with_program_cache(device, use_pr
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -339,15 +339,15 @@ def test_group_attn_matmul_with_program_cache(
else:
output_mem_config = interleaved_mem_config

num_cache_entries_start = 0
num_cache_entries_start = device.num_program_cache_entries()
tt_output_tensor_on_device = ttnn.experimental.group_attn_matmul(
tt_input_tensor_a,
tt_input_tensor_b,
compute_with_storage_grid_size=compute_grid_size,
memory_config=output_mem_config,
dtype=output_dtype,
)
num_cache_entries += 0 - num_cache_entries_start
num_cache_entries += device.num_program_cache_entries() - num_cache_entries_start

if sharded:
tt_output_tensor_on_device = ttnn.sharded_to_interleaved(
@@ -363,6 +363,8 @@ def test_group_attn_matmul_with_program_cache(
allclose, output = comp_pcc(tt_output_tensor, golden_output_tensor)
assert allclose, f"FAILED: {output}"

assert num_cache_entries == 1

device.enable_async(False)


@@ -194,3 +194,7 @@ def test_layernorm_part_2_with_program_cache2(inp_shape, n_devices, is_rmsnorm,
)
)
run_layernorm_part_2(inp_shape, n_devices, is_rmsnorm, dtype, dtype, device)

assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
device.num_program_cache_entries()
)
@@ -269,3 +269,7 @@ def test_layernorm_part_1_with_program_cache2(
)
)
run_layernorm_part_1(inp_shape, n_devices, is_rmsnorm, input_dtype, output_dtype, device)

assert device.num_program_cache_entries() == 1, "Program cache should have only one entry" + str(
device.num_program_cache_entries()
)
@@ -329,6 +329,9 @@ def run_multi_core_matmul_1d(

assert passing

# Check program cache
assert device.num_program_cache_entries() == 1 # Only 1 op


@pytest.mark.skipif(is_grayskull(), reason="GS does not support fp32")
@pytest.mark.skipif(is_blackhole(), reason="Test suite for GS only")
@@ -271,6 +271,7 @@ def test_matmul_in1_dram_sharded_with_program_cache(
buffer_type=ttnn.BufferType.DRAM,
)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, in0_dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)
assert device.num_program_cache_entries() == 3


def run_test_matmul_in1_dram_sharded_mm_chain(
@@ -109,3 +109,5 @@ def test_move_op_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2

Check failure on line 113 in tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
GitHub Actions / fast-dispatch-unit-tests (wormhole_b0, N300) / eager unit tests 4 wormhole_b0 N300
test_move_op_with_program_cache: assert 0 == 2, where 0 = MeshDevice(1x1 grid, 1 devices).num_program_cache_entries()

Check failure on line 113 in tests/tt_eager/python_api_testing/unit_testing/misc/test_move.py
GitHub Actions / fast-dispatch-unit-tests (wormhole_b0, N150) / eager unit tests 4 wormhole_b0 N150
test_move_op_with_program_cache: assert 0 == 2, where 0 = MeshDevice(1x1 grid, 1 devices).num_program_cache_entries()
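
The two CI annotations above report an entry count of 0 rather than the expected 2. One plausible cause (an assumption; nothing on this page confirms it) is that the program cache was never enabled for the MeshDevice in that run, in which case every num_program_cache_entries() check would see 0. In these suites that enabling is normally the job of the use_program_cache fixture; a hypothetical sketch of such a fixture, assuming the device exposes enable_program_cache() and disable_and_clear_program_cache():

import pytest


@pytest.fixture()
def use_program_cache(device):
    # Assumed device methods: turn the cache on for the test, then clear it afterwards.
    device.enable_program_cache()
    yield
    device.disable_and_clear_program_cache()
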
@@ -103,3 +103,5 @@ def test_nlp_concat_heads_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -119,6 +119,8 @@ def test_nlp_create_qkv_heads_falcon7b_with_program_cache(device, use_program_ca
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2


"""
Generic shapes + functionality
@@ -363,6 +365,8 @@ def test_nlp_create_qkv_heads_with_program_cache(device, use_program_cache):
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2


def run_sharded_nlp_create_qkv_heads_test(
batch,
@@ -527,3 +531,5 @@ def test_sharded_nlp_create_qkv_heads_with_program_cache(device, use_program_cac
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -411,6 +411,7 @@ def test_create_heads_with_slice(
)
# BH does s2i and i2s inside of to_device and from_device as device ops
expected_entries = 1 if not is_blackhole() else 4 if overlap_coregrid else 5
assert device.num_program_cache_entries() == expected_entries


@pytest.fixture()
@@ -462,6 +463,7 @@ def test_create_min_width_shard_subcoregrid(
overlap_coregrid=overlap_coregrid,
sub_core_grids=sub_core_grids,
)
assert device.num_program_cache_entries() == 1, "Only one Op program cache should exist"


def run_test_create_width_shard_by_head(
@@ -106,3 +106,5 @@ def test_nlp_create_qkv_heads_segformer_with_program_cache(device, use_program_c
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -119,3 +119,5 @@ def test_nlp_create_qkv_heads_vit_with_program_cache(device, use_program_cache):
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config)

assert device.num_program_cache_entries() == 2
@@ -39,7 +39,7 @@ def unpadding_test(
# Pytorch reference
test_tensor_ref = inp[:, :, seq_len_start:seq_len_end]

return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), 0
return test_tensor_pt, test_tensor_ref, test_tensor_tt.memory_config(), device.num_program_cache_entries()


@pytest.mark.parametrize(
@@ -120,6 +120,7 @@ def test_run_unpadding_test(
dtype,
)
assert a_pt.shape == a_ref.shape
assert num_cache_entries == 2
if dtype == ttnn.bfloat8_b:
# inevitable precision loss for bfloat8_b
eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
@@ -147,6 +148,7 @@ def test_run_unpadding_test(
dtype,
)
assert a_pt.shape == a_ref.shape
assert num_cache_entries == 3
if dtype == ttnn.bfloat8_b:
# inevitable precision loss for bfloat8_b
eq, pcc = comp_pcc(a_pt, a_ref, 0.999)
@@ -367,6 +367,8 @@ def test_reshard_with_program_cache(
passing, output = comp_pcc(torch_tensor1, torch_tensor_after_round_trip1)
assert passing, output

assert device.num_program_cache_entries() == 3


@skip_for_blackhole("GH Issue #15234")
@pytest.mark.parametrize(
@@ -617,3 +619,5 @@ def test_dram_reshard_with_program_cache(
dummy_tensor = (
ttnn.Tensor(torch.rand([2, 2, 128, 64]), dtype).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
)

assert device.num_program_cache_entries() == 1
@@ -458,3 +458,5 @@ def test_rotary_embedding_llama_with_program_cache(

if batch % ttnn.TILE_SIZE != 0:
num_ops += 1 # slice

assert device.num_program_cache_entries() == num_ops
@@ -136,3 +136,5 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(

if (batch * 2) % ttnn.TILE_SIZE != 0:
num_ops += 1 # slice

assert device.num_program_cache_entries() == num_ops
@@ -214,6 +214,8 @@ def test_sdpa_tt_with_program_cache(device, b, nh, nkv, s, d, q_chunk_size, k_ch
for _ in range(2):
run_test_sdpa_tt(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dtype)

assert device.num_program_cache_entries() == 1


def run_sdpa_noncausal(device, b, nh, nkv, sq, d, q_chunk_size, k_chunk_size, dtype, sk=None, use_mask=True):
torch.manual_seed(1234)
@@ -500,6 +502,11 @@ def test_sdpa_chunked(
use_high_precision_compute,
)

# Check number of program cache entries
assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
device.num_program_cache_entries()
)


@skip_for_blackhole("Mismatching on BH, see #12349")
@pytest.mark.skipif(is_watcher_enabled(), reason="Kernel OOM with watcher enabled")
@@ -553,6 +560,11 @@ def test_sdpa_chunked_iterate_batch(
grid_size=(1, 1),
)

# Check number of program cache entries
assert device.num_program_cache_entries() == 1, "Program cache should only have 1 entry but has {}".format(
device.num_program_cache_entries()
)


def run_test_joint_sdpa(
device,
@@ -567,6 +567,7 @@ def test_sdpa_decode_non_causal(device, b, nh, nkv, s, d, dtype, grid_size, q_dt
run_test_sdpa_decode_single_iter(
device, b, nh, nkv, s, d, dtype, grid_size, q_dtype, sharded_in=False, sharded_out=False, causal=False
)
assert device.num_program_cache_entries() == 1


@skip_for_blackhole("Unsupported on BH, see #12349")
@@ -886,6 +887,8 @@ def test_sdpa_decode_paged_attention(
sharded_out=False,
)

assert device.num_program_cache_entries() == 4


@skip_for_blackhole("Unsupported on BH, see #12349")
@skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
@@ -986,6 +989,7 @@ def test_sdpa_decode_sharded_on_subcoregrids(
start_core=start_core,
sub_core_grids=sub_core_grids,
)
assert device.num_program_cache_entries() == 1


@skip_for_blackhole("Unsupported on BH, see #12349")
@@ -1150,6 +1154,8 @@ def test_sdpa_decode_program_cache(device, b, nh, nkv, s, d, dtype, use_program_
cur_pos_tensor=True,
)

assert device.num_program_cache_entries() == 4


def run_test_sdpa_decode_ndpcc(device, b, nh, nkv, s, d, dtype, grid_size, q_dtype=ttnn.bfloat16):
compute_grid_size = device.compute_with_storage_grid_size()
@@ -52,6 +52,9 @@ def transpose(
logger.info(output)
assert passing

if expected_program_cache_size != None:
assert device.num_program_cache_entries() == expected_program_cache_size


@pytest.mark.parametrize(
"dtype",
@@ -383,6 +386,7 @@ def test_transpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cach
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 1


@skip_for_blackhole("Mismatching on BH, see #12349")
@@ -474,6 +478,7 @@ def test_transpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, use_prog
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 3


@pytest.mark.parametrize("n", [16])
@@ -534,6 +539,7 @@ def test_transpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cach
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 1


def run_transpose_hc_sharded(device, n, c, h, w, grid_size):
@@ -595,6 +601,7 @@ def test_transpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size,
device=device,
memory_config=ttnn.L1_MEMORY_CONFIG,
)
assert device.num_program_cache_entries() == 3


@pytest.mark.parametrize(
2 changes: 2 additions & 0 deletions tests/ttnn/unit_tests/operations/test_backward_embedding.py
@@ -120,3 +120,5 @@ def test_embedding_bw_with_program_cache(

logger.debug(comp_out)
assert comp_pass

assert device.num_program_cache_entries() == 1
4 changes: 3 additions & 1 deletion tests/ttnn/unit_tests/operations/test_bernoulli.py
@@ -95,11 +95,13 @@ def test_bernoulli_callback(shape, seed, in_dtype, out_dtype, device, is_out_all
run_bernoulli(shape, in_dtype, out_dtype, device, seed=seed, is_out_alloc=is_out_alloc)
# Add a dummy tensor to make sure that tensors created in the 2 iterations don't share the same addr
tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
num_program_cache_entries_list.append(0)
num_program_cache_entries_list.append(device.num_program_cache_entries())
# Cache must hit when we change the seed, since the seed runtime arg is overridden
seed = seed + 1

logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
assert num_program_cache_entries_list[0] > 0
assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]


@skip_for_grayskull("Requires wormhole_b0 to run")
4 changes: 3 additions & 1 deletion tests/ttnn/unit_tests/operations/test_clone.py
@@ -243,5 +243,7 @@ def test_clone_callback(
)
torch_dummy = torch.randn([32, 32])
ttnn_dummy = ttnn.from_torch(torch_dummy, device=device)
num_program_cache_entries_list.append(0)
num_program_cache_entries_list.append(device.num_program_cache_entries())
logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
assert num_program_cache_entries_list[0] > 0
assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
2 changes: 2 additions & 0 deletions tests/ttnn/unit_tests/operations/test_convert_to_chw.py
@@ -110,3 +110,5 @@ def test_convert_to_chw_with_program_cache(device, use_program_cache):
tt_dummy_tensor = (
ttnn.Tensor(py_dummy_tensor, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device, ttnn.L1_MEMORY_CONFIG)
)

assert device.num_program_cache_entries() == 2
@@ -269,6 +269,7 @@ def run_pre_allgather_layernorm(
tt_ex2, torch_ex2, atol=max_atol_ex2
), f"E(x^2) mismatch for device {d} (atol: {atol_delta_ex2})"

assert device.num_program_cache_entries() == 2, "Program cache not working as expected"
logger.info("Pre-allgather layernorm test passed for all devices")

