#0: Disable performance checks in functional tests
- Don't check performance in nightly test
- Update device performance target
esmalTT committed Oct 7, 2024
1 parent 221674d commit bcad30a
Showing 3 changed files with 25 additions and 21 deletions.
14 changes: 10 additions & 4 deletions models/demos/wormhole/mamba/demo/demo.py
@@ -219,6 +219,7 @@ def run_mamba_demo(
     cache_dir: Optional[str] = None,
     display: bool = True,
     prefill_chunk_size: int = 32,
+    assert_on_performance_measurements: bool = True,
 ):
     profiler = BenchmarkProfiler()
     profiler.start("run")
@@ -345,17 +346,19 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
     prefill_time_to_token_per_user = prefill_stats.mean_throughput_per_user
     decode_time_to_token_per_user = decode_stats.mean_throughput_per_user
 
+    time_to_first_token = 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user)  # t/s/u
+
     measurements = {
         "total_demo_time": profiler.get_duration("run"),
         "compile_prefill": profiler.get_duration("compile_prefill"),
         "compile_decode": profiler.get_duration("compile_decode"),
         "inference_prefill": prefill_stats.total_time,
         "inference_decode": decode_stats.total_time,
         "prefill_t/s": prefill_stats.mean_throughput,
-        "prefill_time_to_token": prefill_stats.total_time,
+        "prefill_time_to_token": time_to_first_token,
         "decode_t/s": decode_stats.mean_throughput,
         "decode_t/s/u": decode_stats.mean_throughput_per_user,
-        "prefill_decode_t/s/u": 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user),  # t/s/u
+        "prefill_decode_t/s/u": time_to_first_token,
         "token_verification": 1,  # This is checked by the caller - but we could also do a match here
     }
 
@@ -367,7 +370,7 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
     logger.info(
         f"Decode throughput: {decode_stats.mean_throughput:.1f} t/s, {decode_stats.mean_throughput_per_user:.2f} t/s/u"
     )
-    logger.info(f"Time to first token: {(1e3 * measurements['prefill_decode_t/s/u']):.2f} ms")
+    logger.info(f"Time to first token: {(1e3 * time_to_first_token):.2f} ms")
 
     chunk_size_to_prefill_targets_tok_per_s = {32: 135.0, 128: 270.0}  # perf is different for different chunk sizes
     targets = {
@@ -390,7 +393,10 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
         output_sequence_length=tokenized_prompts.shape[1] + generated_sequence_length,
     )
 
-    verify_perf(measurements, targets)
+    if assert_on_performance_measurements:
+        verify_perf(measurements, targets)
+    else:
+        logger.warning(f"Skipping performance checks (this is expected for functional tests)")
 
     return DemoResult(generated_text=token_display.sequences)
 
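For reference, the new time_to_first_token value is the reciprocal of the summed per-user throughputs (t/s/u) from the prefill and decode phases. A minimal sketch of that arithmetic with assumed throughput numbers (the real values come from prefill_stats and decode_stats):

# Hypothetical per-user throughputs (t/s/u) standing in for the measured stats
prefill_time_to_token_per_user = 270.0  # assumed prefill throughput per user
decode_time_to_token_per_user = 13.0  # assumed decode throughput per user

# Reciprocal of the summed throughputs, exactly as the hunk above computes it
time_to_first_token = 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user)
print(f"Time to first token: {1e3 * time_to_first_token:.2f} ms")  # ~3.53 ms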
1 change: 1 addition & 0 deletions models/demos/wormhole/mamba/tests/test_mamba_demo.py
@@ -55,6 +55,7 @@ def test_demo(
         display=True,
         cache_dir=get_tt_cache_path(model_version),
         prefill_chunk_size=prefill_chunk_size,
+        assert_on_performance_measurements=False,  # Don't check performance for functional tests
     )
 
     expected = user_input[0] + expected_output[0]
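The gating that the new keyword argument introduces reduces to a simple conditional around verify_perf. A self-contained sketch of the pattern, with a hypothetical checker standing in for verify_perf (whose real implementation lives elsewhere in the repo and may differ):

from loguru import logger

def check_or_skip(measurements: dict, targets: dict, assert_on_performance_measurements: bool) -> None:
    # Hypothetical stand-in for the verify_perf gating: only compare measured
    # values against their targets when asserting is enabled.
    if assert_on_performance_measurements:
        for key, target in targets.items():
            assert measurements[key] >= target, f"{key}: {measurements[key]} below target {target}"
    else:
        logger.warning("Skipping performance checks (this is expected for functional tests)")

# Functional tests pass False, so a perf regression cannot fail the test
check_or_skip({"decode_t/s": 410.0}, {"decode_t/s": 400.0}, assert_on_performance_measurements=False)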
31 changes: 14 additions & 17 deletions models/demos/wormhole/mamba/tests/test_mamba_perf.py
@@ -129,7 +129,7 @@ def test_mamba_perf_e2e(
     upper_margin = MARGIN
     if not is_nearby(inference_time, expected_inference_time, lower_margin=lower_margin, upper_margin=upper_margin):
         logger.warning(
-            "Inference time does not match (within some margin) the expected value (was {inference_time:2f} but expected {expected_inference_time:2f})"
+            f"Inference time does not match (within some margin) the expected value (was {inference_time:2f} but expected {expected_inference_time:2f})"
         )
 
     if not is_nearby(compile_time, expected_compile_time, lower_margin=lower_margin, upper_margin=upper_margin):
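For context, is_nearby (defined elsewhere in this test module) takes asymmetric margins. A plausible reconstruction, hedged because the actual helper may compute its bounds differently:

def is_nearby(value: float, expected: float, lower_margin: float, upper_margin: float) -> bool:
    # Assumed semantics: accept values within [-lower_margin, +upper_margin]
    # of the expectation, with margins expressed as fractions of expected.
    return expected * (1 - lower_margin) <= value <= expected * (1 + upper_margin)

print(is_nearby(1.05, 1.0, lower_margin=0.03, upper_margin=0.03))  # False: 5% off with a 3% margin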
@@ -142,33 +142,30 @@ def test_mamba_perf_e2e(
 @pytest.mark.timeout(600)
 @pytest.mark.models_device_performance_bare_metal
 @pytest.mark.parametrize(
-    "batch, warmup, expected_device_fw_duration_ms",
-    ((32, True, 1.66),),
+    "batch, expected_layer_duration_ms",
+    ((32, 1.71),),
 )
-def test_mamba_perf_device(batch, warmup, expected_device_fw_duration_ms, reset_seeds):
+def test_mamba_perf_device(batch, expected_layer_duration_ms):
     subdir = "ttnn_mamba"
-    margin = 0.03
-    if warmup:
-        inference_iterations = 2
-    else:
-        inference_iterations = 1
-    command = f"pytest models/demos/wormhole/mamba/tests/test_mamba_model.py::test_device_perf[{inference_iterations}]"
+    margin = 0.01
+    command = f"pytest models/demos/wormhole/mamba/tests/test_mamba_model.py::test_device_perf[1]"
     cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
 
     # Convert expected perf (ms) to samples/s
-    expected_device_fw_duration_ns = expected_device_fw_duration_ms * 1e6  # ms to ns
-    expected_total_device_fw_samples = get_samples_per_s(expected_device_fw_duration_ns * inference_iterations, batch)
-
-    inference_time_key = "AVG DEVICE FW SAMPLES/S"
-    expected_perf_cols = {inference_time_key: expected_total_device_fw_samples}
+    expected_layer_duration_ns = expected_layer_duration_ms * 1e6  # ms to ns
+    expected_total_layer_samples_per_s = get_samples_per_s(expected_layer_duration_ns, batch)
+    inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
+    expected_perf_cols = {inference_time_key: expected_total_layer_samples_per_s}
 
     post_processed_results = run_device_perf(command, subdir, 1, cols, batch)
+    logger.info(
+        f"Checking device performance... Expecting {expected_total_layer_samples_per_s} samples/sec (equivalent to {expected_layer_duration_ms} ms per layer)"
+    )
     expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols, assert_on_fail=True)
-    comment = ""
     prep_device_perf_report(
         model_name=f"mamba-2.8b_batch_{batch}",
         batch_size=batch,
         post_processed_results=post_processed_results,
         expected_results=expected_results,
-        comments=comment,
+        comments="",
     )
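The updated target works backwards from a per-layer latency: get_samples_per_s converts the expected duration and batch size into the throughput compared against the profiler's AVG DEVICE KERNEL SAMPLES/S column. A worked sketch under the assumption that the conversion is simply batch divided by duration (the repo's helper may differ):

def get_samples_per_s_sketch(duration_ns: float, batch: int) -> float:
    # Assumed definition: samples/s = batch / (duration converted from ns to s)
    return batch / (duration_ns * 1e-9)

expected_layer_duration_ns = 1.71 * 1e6  # 1.71 ms per layer, from the parametrization above
print(get_samples_per_s_sketch(expected_layer_duration_ns, batch=32))  # ~18713 samples/s

At margin = 0.01, the device test therefore tolerates roughly a 1% swing around that expected throughput before check_device_perf fails.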
