Add comparison of PCC/atol for all outputs

Add an _extract_outputs function to ModelTester that subclasses may need to implement; this function should return a tuple of torch.Tensors. Trim unused code in tests/utils.py. Move verify_against_golden to verify.py. Replace self.model with self.framework_model and self.compiled_model.
tenstorrent · Jan 21, 2025 · cf83a2f · cf83a2f
1 parent 061e3fb
commit cf83a2f
Show file tree

Hide file tree

Showing 36 changed files with 314 additions and 543 deletions.
diff --git a/tests/models/MobileNetV2/test_MobileNetV2.py b/tests/models/MobileNetV2/test_MobileNetV2.py
@@ -45,7 +45,9 @@ def test_MobileNetV2(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         # Print the top 5 predictions

diff --git a/tests/models/Qwen/test_qwen2_token_classification.py b/tests/models/Qwen/test_qwen2_token_classification.py
@@ -45,15 +45,16 @@ def test_qwen2_token_classification(record_property, model_name, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.01, compiler_config=cc)
     with torch.no_grad():
         results = tester.test_model()
 
     if mode == "eval":
         logits = results.logits
         predicted_token_class_ids = logits.argmax(-1)
         predicted_tokens_classes = [
-            tester.model.config.id2label[t.item()] for t in predicted_token_class_ids[0]
+            tester.framework_model.config.id2label[t.item()]
+            for t in predicted_token_class_ids[0]
         ]
         input_ids = tester.inputs["input_ids"]
         tokens = tester.tokenizer.convert_ids_to_tokens(input_ids[0])

diff --git a/tests/models/albert/test_albert_masked_lm.py b/tests/models/albert/test_albert_masked_lm.py
@@ -58,7 +58,9 @@ def test_albert_masked_lm(record_property, model_name, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/albert/test_albert_question_answering.py b/tests/models/albert/test_albert_question_answering.py
@@ -41,7 +41,7 @@ def test_albert_question_answering(record_property, model_name, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.01, compiler_config=cc)
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/albert/test_albert_sequence_classification.py b/tests/models/albert/test_albert_sequence_classification.py
@@ -41,13 +41,15 @@ def test_albert_sequence_classification(record_property, model_name, mode, night
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     if mode == "eval":
         logits = results.logits
         predicted_class_id = logits.argmax().item()
-        predicted_label = tester.model.config.id2label[predicted_class_id]
+        predicted_label = tester.framework_model.config.id2label[predicted_class_id]
 
         print(
             f"Model: {model_name} | Input: {tester.input_text} | Label: {predicted_label}"

diff --git a/tests/models/albert/test_albert_token_classification.py b/tests/models/albert/test_albert_token_classification.py
@@ -43,7 +43,9 @@ def test_albert_token_classification(record_property, model_name, mode, nightly)
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/beit/test_beit_image_classification.py b/tests/models/beit/test_beit_image_classification.py
@@ -57,14 +57,20 @@ def test_beit_image_classification(record_property, model_name, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    required_atol = 0.032 if model_name == "microsoft/beit-base-patch16-224" else 0.05
+    tester = ThisTester(
+        model_name, mode, required_atol=required_atol, compiler_config=cc
+    )
     results = tester.test_model()
 
     if mode == "eval":
         logits = results.logits
 
         # model predicts one of the 1000 ImageNet classes
         predicted_class_idx = logits.argmax(-1).item()
-        print("Predicted class:", tester.model.config.id2label[predicted_class_idx])
+        print(
+            "Predicted class:",
+            tester.framework_model.config.id2label[predicted_class_idx],
+        )
 
     record_property("torch_ttnn", (tester, results))
diff --git a/tests/models/bert/test_bert.py b/tests/models/bert/test_bert.py
@@ -53,7 +53,7 @@ def test_bert(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.012, compiler_config=cc)
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/bloom/test_bloom.py b/tests/models/bloom/test_bloom.py
@@ -49,7 +49,13 @@ def test_bloom(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name,
+        mode,
+        relative_atol=0.01,
+        assert_on_output_mismatch=False,
+        compiler_config=cc,
+    )
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/clip/test_clip.py b/tests/models/clip/test_clip.py
@@ -72,7 +72,7 @@ def test_clip(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.013, compiler_config=cc)
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/deit/test_deit.py b/tests/models/deit/test_deit.py
@@ -59,13 +59,16 @@ def test_deit(record_property, model_name, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.01, compiler_config=cc)
     results = tester.test_model()
 
     if mode == "eval":
         logits = results.logits
         # model predicts one of the 1000 ImageNet classes
         predicted_class_idx = logits.argmax(-1).item()
-        print("Predicted class:", tester.model.config.id2label[predicted_class_idx])
+        print(
+            "Predicted class:",
+            tester.framework_model.config.id2label[predicted_class_idx],
+        )
 
     record_property("torch_ttnn", (tester, results))
diff --git a/tests/models/detr/test_detr.py b/tests/models/detr/test_detr.py
@@ -36,6 +36,9 @@ def _load_inputs(self):
         input_batch = input_tensor.unsqueeze(0).to(torch.bfloat16)
         return input_batch
 
+    def _extract_outputs(self, output_object):
+        return (output_object["pred_logits"], output_object["pred_boxes"])
+
 
 @pytest.mark.parametrize(
     "mode",
@@ -54,7 +57,9 @@ def test_detr(record_property, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/distilbert/test_distilbert.py b/tests/models/distilbert/test_distilbert.py
@@ -37,7 +37,9 @@ def test_distilbert(record_property, model_name, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/dpr/test_dpr.py b/tests/models/dpr/test_dpr.py
@@ -45,7 +45,9 @@ def test_dpr(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/falcon/test_falcon.py b/tests/models/falcon/test_falcon.py
@@ -42,7 +42,7 @@ def test_falcon(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.013, compiler_config=cc)
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/glpn_kitti/test_glpn_kitti.py b/tests/models/glpn_kitti/test_glpn_kitti.py
@@ -45,7 +45,7 @@ def test_glpn_kitti(record_property, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.013, compiler_config=cc)
     results = tester.test_model()
     if mode == "eval":
         predicted_depth = results.predicted_depth

diff --git a/tests/models/gpt2/test_gpt2.py b/tests/models/gpt2/test_gpt2.py
@@ -44,7 +44,7 @@ def test_gpt2(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.013, compiler_config=cc)
     results = tester.test_model()
     if mode == "eval":
         # Helper function to decode output to human-readable text

diff --git a/tests/models/hardnet/test_hardnet.py b/tests/models/hardnet/test_hardnet.py
@@ -63,7 +63,7 @@ def test_hardnet(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.01, compiler_config=cc)
     results = tester.test_model()
     if mode == "eval":
         # Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes

diff --git a/tests/models/llama/test_llama.py b/tests/models/llama/test_llama.py
@@ -50,7 +50,9 @@ def test_llama(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         # Helper function to decode output to human-readable text

diff --git a/tests/models/mgp-str-base/test_mgp_str_base.py b/tests/models/mgp-str-base/test_mgp_str_base.py
@@ -52,7 +52,7 @@ def test_mgp_str_base(record_property, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.01, compiler_config=cc)
     results = tester.test_model()
 
     if mode == "eval":

diff --git a/tests/models/mlpmixer/test_mlpmixer.py b/tests/models/mlpmixer/test_mlpmixer.py
@@ -47,6 +47,8 @@ def test_mlpmixer(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
     record_property("torch_ttnn", (tester, results))
diff --git a/tests/models/mnist/test_mnist.py b/tests/models/mnist/test_mnist.py
@@ -73,7 +73,9 @@ def test_mnist_train(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     record_property("torch_ttnn", (tester, results))
diff --git a/tests/models/openpose/test_openpose_v2.py b/tests/models/openpose/test_openpose_v2.py
@@ -63,7 +63,9 @@ def test_openpose_v2(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         print(f"Output: {results}")

diff --git a/tests/models/perceiver_io/test_perceiver_io.py b/tests/models/perceiver_io/test_perceiver_io.py
@@ -50,7 +50,9 @@ def test_perceiver_io(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         logits = results.logits

diff --git a/tests/models/resnet/test_resnet.py b/tests/models/resnet/test_resnet.py
@@ -37,7 +37,9 @@ def test_resnet(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     # Check inference result

diff --git a/tests/models/resnet50/test_resnet50.py b/tests/models/resnet50/test_resnet50.py
@@ -52,7 +52,7 @@ def test_resnet(record_property, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, required_atol=0.03, compiler_config=cc)
     results = tester.test_model()
     if mode == "eval":
         # Print the top 5 predictions

diff --git a/tests/models/roberta/test_roberta.py b/tests/models/roberta/test_roberta.py
@@ -38,7 +38,7 @@ def test_roberta(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.012, compiler_config=cc)
     results = tester.test_model()
     if mode == "eval":
         logits = results.logits

diff --git a/tests/models/segformer/test_segformer.py b/tests/models/segformer/test_segformer.py
@@ -60,7 +60,7 @@ def test_segformer(record_property, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.01, compiler_config=cc)
     results = tester.test_model()
     if mode == "eval":
         logits = results.logits  # shape (batch_size, num_labels, height/4, width/4)

diff --git a/tests/models/stable_diffusion/test_stable_diffusion_v2.py b/tests/models/stable_diffusion/test_stable_diffusion_v2.py
@@ -40,7 +40,7 @@ def _load_inputs(self):
         batch_size = text_embeddings.shape[0]
         height, width = 512, 512  # Output image size
         latents = torch.randn(
-            (batch_size, self.model.in_channels, height // 8, width // 8)
+            (batch_size, self.framework_model.in_channels, height // 8, width // 8)
         )
 
         # Set number of diffusion steps
@@ -75,7 +75,9 @@ def test_stable_diffusion_v2(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         noise_pred = results.sample

diff --git a/tests/models/vilt/test_vilt.py b/tests/models/vilt/test_vilt.py
@@ -50,11 +50,11 @@ def test_vilt(record_property, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.01, compiler_config=cc)
     results = tester.test_model()
     if mode == "eval":
         logits = results.logits
         idx = logits.argmax(-1).item()
-        print("Predicted answer:", tester.model.config.id2label[idx])
+        print("Predicted answer:", tester.framework_model.config.id2label[idx])
 
     record_property("torch_ttnn", (tester, results))
diff --git a/tests/models/xglm/test_xglm.py b/tests/models/xglm/test_xglm.py
@@ -46,7 +46,7 @@ def test_xglm(record_property, mode, nightly):
     else:
         cc.compile_depth = CompileDepth.TTNN_IR
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(model_name, mode, relative_atol=0.02, compiler_config=cc)
     results = tester.test_model()
 
     record_property("torch_ttnn", (tester, results))
diff --git a/tests/models/yolos/test_yolos.py b/tests/models/yolos/test_yolos.py
@@ -48,7 +48,9 @@ def test_yolos(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         # Helper function to decode output to human-readable text

diff --git a/tests/models/yolov3/test_yolov3.py b/tests/models/yolov3/test_yolov3.py
@@ -68,7 +68,9 @@ def test_yolov3(record_property, mode, nightly):
     if nightly:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_name, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+    )
     results = tester.test_model()
 
     record_property("torch_ttnn", (tester, results))