#0: Disable performance checks in functional tests
- Don't check performance in nightly test
- Update device performance target
esmalTT committed Oct 7, 2024
1 parent 221674d commit bcad30a
Showing 3 changed files with 25 additions and 21 deletions.
14 changes: 10 additions & 4 deletions models/demos/wormhole/mamba/demo/demo.py
@@ -219,6 +219,7 @@ def run_mamba_demo(
     cache_dir: Optional[str] = None,
     display: bool = True,
     prefill_chunk_size: int = 32,
+    assert_on_performance_measurements: bool = True,
 ):
     profiler = BenchmarkProfiler()
     profiler.start("run")
@@ -345,17 +346,19 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
     prefill_time_to_token_per_user = prefill_stats.mean_throughput_per_user
     decode_time_to_token_per_user = decode_stats.mean_throughput_per_user
 
+    time_to_first_token = 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user)  # t/s/u
+
     measurements = {
         "total_demo_time": profiler.get_duration("run"),
         "compile_prefill": profiler.get_duration("compile_prefill"),
         "compile_decode": profiler.get_duration("compile_decode"),
         "inference_prefill": prefill_stats.total_time,
         "inference_decode": decode_stats.total_time,
         "prefill_t/s": prefill_stats.mean_throughput,
-        "prefill_time_to_token": prefill_stats.total_time,
+        "prefill_time_to_token": time_to_first_token,
         "decode_t/s": decode_stats.mean_throughput,
         "decode_t/s/u": decode_stats.mean_throughput_per_user,
-        "prefill_decode_t/s/u": 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user),  # t/s/u
+        "prefill_decode_t/s/u": time_to_first_token,
         "token_verification": 1,  # This is checked by the caller - but we could also do a match here
     }
 
@@ -367,7 +370,7 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
     logger.info(
         f"Decode throughput: {decode_stats.mean_throughput:.1f} t/s, {decode_stats.mean_throughput_per_user:.2f} t/s/u"
     )
-    logger.info(f"Time to first token: {(1e3 * measurements['prefill_decode_t/s/u']):.2f} ms")
+    logger.info(f"Time to first token: {(1e3 * time_to_first_token):.2f} ms")
 
     chunk_size_to_prefill_targets_tok_per_s = {32: 135.0, 128: 270.0}  # perf is different for different chunk sizes
     targets = {
@@ -390,7 +393,10 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
         output_sequence_length=tokenized_prompts.shape[1] + generated_sequence_length,
     )
 
-    verify_perf(measurements, targets)
+    if assert_on_performance_measurements:
+        verify_perf(measurements, targets)
+    else:
+        logger.warning(f"Skipping performance checks (this is expected for functional tests)")
 
     return DemoResult(generated_text=token_display.sequences)
 
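For reference, the new time_to_first_token value is the reciprocal of the summed per-user throughputs (t/s/u) from the prefill and decode phases. A minimal sketch of that arithmetic with assumed throughput numbers (the real values come from prefill_stats and decode_stats):

# Hypothetical per-user throughputs (t/s/u) standing in for the measured stats
prefill_time_to_token_per_user = 270.0  # assumed prefill throughput per user
decode_time_to_token_per_user = 13.0  # assumed decode throughput per user

# Reciprocal of the summed throughputs, exactly as the hunk above computes it
time_to_first_token = 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user)
print(f"Time to first token: {1e3 * time_to_first_token:.2f} ms")  # ~3.53 ms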
1 change: 1 addition & 0 deletions models/demos/wormhole/mamba/tests/test_mamba_demo.py
@@ -55,6 +55,7 @@ def test_demo(
         display=True,
         cache_dir=get_tt_cache_path(model_version),
         prefill_chunk_size=prefill_chunk_size,
+        assert_on_performance_measurements=False,  # Don't check performance for functional tests
     )
 
     expected = user_input[0] + expected_output[0]
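The gating that the new keyword argument introduces reduces to a simple conditional around verify_perf. A self-contained sketch of the pattern, with a hypothetical checker standing in for verify_perf (whose real implementation lives elsewhere in the repo and may differ):

from loguru import logger

def check_or_skip(measurements: dict, targets: dict, assert_on_performance_measurements: bool) -> None:
    # Hypothetical stand-in for the verify_perf gating: only compare measured
    # values against their targets when asserting is enabled.
    if assert_on_performance_measurements:
        for key, target in targets.items():
            assert measurements[key] >= target, f"{key}: {measurements[key]} below target {target}"
    else:
        logger.warning("Skipping performance checks (this is expected for functional tests)")

# Functional tests pass False, so a perf regression cannot fail the test
check_or_skip({"decode_t/s": 410.0}, {"decode_t/s": 400.0}, assert_on_performance_measurements=False)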
31 changes: 14 additions & 17 deletions models/demos/wormhole/mamba/tests/test_mamba_perf.py
@@ -129,7 +129,7 @@ def test_mamba_perf_e2e(
     upper_margin = MARGIN
     if not is_nearby(inference_time, expected_inference_time, lower_margin=lower_margin, upper_margin=upper_margin):
         logger.warning(
-            "Inference time does not match (within some margin) the expected value (was {inference_time:2f} but expected {expected_inference_time:2f})"
+            f"Inference time does not match (within some margin) the expected value (was {inference_time:2f} but expected {expected_inference_time:2f})"
         )
 
     if not is_nearby(compile_time, expected_compile_time, lower_margin=lower_margin, upper_margin=upper_margin):
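For context, is_nearby (defined elsewhere in this test module) takes asymmetric margins. A plausible reconstruction, hedged because the actual helper may compute its bounds differently:

def is_nearby(value: float, expected: float, lower_margin: float, upper_margin: float) -> bool:
    # Assumed semantics: accept values within [-lower_margin, +upper_margin]
    # of the expectation, with margins expressed as fractions of expected.
    return expected * (1 - lower_margin) <= value <= expected * (1 + upper_margin)

print(is_nearby(1.05, 1.0, lower_margin=0.03, upper_margin=0.03))  # False: 5% off with a 3% margin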
@@ -142,33 +142,30 @@ def test_mamba_perf_e2e(
 @pytest.mark.timeout(600)
 @pytest.mark.models_device_performance_bare_metal
 @pytest.mark.parametrize(
-    "batch, warmup, expected_device_fw_duration_ms",
-    ((32, True, 1.66),),
+    "batch, expected_layer_duration_ms",
+    ((32, 1.71),),
 )
-def test_mamba_perf_device(batch, warmup, expected_device_fw_duration_ms, reset_seeds):
+def test_mamba_perf_device(batch, expected_layer_duration_ms):
     subdir = "ttnn_mamba"
-    margin = 0.03
-    if warmup:
-        inference_iterations = 2
-    else:
-        inference_iterations = 1
-    command = f"pytest models/demos/wormhole/mamba/tests/test_mamba_model.py::test_device_perf[{inference_iterations}]"
+    margin = 0.01
+    command = f"pytest models/demos/wormhole/mamba/tests/test_mamba_model.py::test_device_perf[1]"
     cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
 
     # Convert expected perf (ms) to samples/s
-    expected_device_fw_duration_ns = expected_device_fw_duration_ms * 1e6  # ms to ns
-    expected_total_device_fw_samples = get_samples_per_s(expected_device_fw_duration_ns * inference_iterations, batch)
-
-    inference_time_key = "AVG DEVICE FW SAMPLES/S"
-    expected_perf_cols = {inference_time_key: expected_total_device_fw_samples}
+    expected_layer_duration_ns = expected_layer_duration_ms * 1e6  # ms to ns
+    expected_total_layer_samples_per_s = get_samples_per_s(expected_layer_duration_ns, batch)
+    inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
+    expected_perf_cols = {inference_time_key: expected_total_layer_samples_per_s}
 
     post_processed_results = run_device_perf(command, subdir, 1, cols, batch)
+    logger.info(
+        f"Checking device performance... Expecting {expected_total_layer_samples_per_s} samples/sec (equivalent to {expected_layer_duration_ms} ms per layer)"
+    )
     expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols, assert_on_fail=True)
-    comment = ""
     prep_device_perf_report(
         model_name=f"mamba-2.8b_batch_{batch}",
         batch_size=batch,
         post_processed_results=post_processed_results,
         expected_results=expected_results,
-        comments=comment,
+        comments="",
     )
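The updated target works backwards from a per-layer latency: get_samples_per_s converts the expected duration and batch size into the throughput compared against the profiler's AVG DEVICE KERNEL SAMPLES/S column. A worked sketch under the assumption that the conversion is simply batch divided by duration (the repo's helper may differ):

def get_samples_per_s_sketch(duration_ns: float, batch: int) -> float:
    # Assumed definition: samples/s = batch / (duration converted from ns to s)
    return batch / (duration_ns * 1e-9)

expected_layer_duration_ns = 1.71 * 1e6  # 1.71 ms per layer, from the parametrization above
print(get_samples_per_s_sketch(expected_layer_duration_ns, batch=32))  # ~18713 samples/s

At margin = 0.01, the device test therefore tolerates roughly a 1% swing around that expected throughput before check_device_perf fails.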
