diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f3c315641..82fbb9b67 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,8 +49,6 @@ jobs:
       MILABENCH_DASH: "no"
     
     steps:
-      - uses: actions/checkout@v3
-    
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-activate-base: false
@@ -58,6 +56,12 @@ jobs:
           miniconda-version: "latest"
           activate-environment: test
 
+      # - name: clean
+      #   run: |
+      #     python -c "import shutil; shutil.rmtree('/opt/actions-runner/_work/milabench/milabench')"
+
+      - uses: actions/checkout@v3
+    
       - name: Pytorch Sanity
         run: |
           if [[ "${{ matrix.arch }}" == "rocm" ]]; then
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..012435f51
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,7 @@
+[submodule "benchmarks/mlperf/apex"]
+	path = benchmarks/mlperf/apex
+	url = https://github.com/NVIDIA/apex.git
+
+[submodule "benchmarks/mlperf/training_results_v2.1"]
+	path = benchmarks/mlperf/training_results_v2.1
+	url = https://github.com/mlcommons/training_results_v2.1.git
diff --git a/benchmarks/huggingface/bench/__main__.py b/benchmarks/huggingface/bench/__main__.py
index 4e46f4ffd..3190a0b75 100644
--- a/benchmarks/huggingface/bench/__main__.py
+++ b/benchmarks/huggingface/bench/__main__.py
@@ -12,6 +12,7 @@
 from .synth import SyntheticData, generators
 
 
+
 def is_tf32_allowed(args):
     return "tf32" in args.precision
 
@@ -20,6 +21,24 @@ def is_fp16_allowed(args):
     return "fp16" in args.precision
 
 
+class ModelWrapper(torch.nn.Module):
+    """Adapter exposing a model as ``forward(batch) -> (loss, logits)``.
+
+    The wrapped model is called with the batch entries as keyword arguments
+    and must return a mapping-like output with ``loss`` and ``logits`` keys.
+    """
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, x):
+        # Forward every field of the batch: cherry-picking input_ids/labels
+        # would silently drop any other field the data generator produces.
+        out = self.model(**x)
+        return out['loss'], out['logits']
+
+
 class Runner:
     def __init__(self, args):
         use_cuda = not args.no_cuda and torch.cuda.is_available()
@@ -32,17 +43,30 @@ def __init__(self, args):
         self.device = torch.device("cuda" if use_cuda else "cpu")
         self.batch_size = args.batch_size
         info = models[args.model]()
-        self.model = info.model.to(self.device)
-        self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
-
+        
+        
         self.data = SyntheticData(
             n=args.batch_size,
             repeat=100000,
             generators=generators[info.category](info),
         )
+        
         self.loader = DataLoader(
             self.data, batch_size=args.batch_size, num_workers=args.num_workers
         )
+        
+        example = next(iter(self.loader))
+        example = {k: x.to(self.device) for k, x in example.items()}
+
+        model = ModelWrapper(info.model).to(self.device)
+        
+        jit = False
+        if jit:
+            model = torch.jit.trace(model, example)
+        
+        self.model = model
+        self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
+
 
         self.amp_scaler = torch.cuda.amp.GradScaler(enabled=is_fp16_allowed(args))
         if is_fp16_allowed(args):
@@ -52,9 +76,7 @@ def __init__(self, args):
 
     def step(self, data):
         with self.amp_context():
-            outputs = self.model(**data)
-
-        loss = outputs.loss
+            loss, _ = self.model(data)
 
         self.amp_scaler.scale(loss).backward()
         self.amp_scaler.step(self.optimizer)
diff --git a/benchmarks/mlperf/apex b/benchmarks/mlperf/apex
new file mode 160000
index 000000000..05091d498
--- /dev/null
+++ b/benchmarks/mlperf/apex
@@ -0,0 +1 @@
+Subproject commit 05091d498d21058a0fe736b828c43431d4f0dda2
diff --git a/benchmarks/mlperf/benchfile.py b/benchmarks/mlperf/benchfile.py
new file mode 100644
index 000000000..8e69d4a95
--- /dev/null
+++ b/benchmarks/mlperf/benchfile.py
@@ -0,0 +1,10 @@
+from milabench.pack import Package
+
+
+class MLPerfBenchmark(Package):
+    """Milabench pack for MLPerf: names the requirements file and main script."""
+    base_requirements = "requirements.in"
+    main_script = "main.py"
+
+
+__pack__ = MLPerfBenchmark
diff --git a/benchmarks/mlperf/main.py b/benchmarks/mlperf/main.py
new file mode 100644
index 000000000..c665b653e
--- /dev/null
+++ b/benchmarks/mlperf/main.py
@@ -0,0 +1,13 @@
+"""Milabench entry script that loads the vendored MLPerf ``run_squad`` module."""
+import os
+import sys
+
+
+FOLDER = os.path.dirname(os.path.abspath(__file__))
+BENCH = "training_results_v2.1/NVIDIA/benchmarks/bert/implementations/pytorch-preview"
+
+# The benchmark sources live in a git submodule; expose them on the import path.
+sys.path.append(os.path.join(FOLDER, BENCH))
+
+# NOTE(review): a plain import only runs module-level code; confirm run_squad starts itself.
+import run_squad  # noqa: E402,F401
diff --git a/benchmarks/mlperf/requirements.in b/benchmarks/mlperf/requirements.in
new file mode 100644
index 000000000..fef907ca0
--- /dev/null
+++ b/benchmarks/mlperf/requirements.in
@@ -0,0 +1,4 @@
+git+https://github.com/NVIDIA/mlperf-common.git
+git+https://github.com/NVIDIA/apex.git
+git+https://github.com/mlcommons/logging.git
+boto3
diff --git a/benchmarks/mlperf/training_results_v2.1 b/benchmarks/mlperf/training_results_v2.1
new file mode 160000
index 000000000..158189d4c
--- /dev/null
+++ b/benchmarks/mlperf/training_results_v2.1
@@ -0,0 +1 @@
+Subproject commit 158189d4cbfbee366c10da1f0f086c85d8f15b5f
diff --git a/config/base.yaml b/config/base.yaml
index ffa1fe629..9fd435cd2 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -10,6 +10,16 @@ _defaults:
       gpu_load_threshold: 0.5
       gpu_mem_threshold: 0.5
 
+# MLPerf benchmark; its package definition lives in ../benchmarks/mlperf.
+mlperf:
+  inherits: _defaults
+  definition: ../benchmarks/mlperf
+  group: mlperf
+  install_group: torch
+  plan:
+    method: per_gpu  # presumably one run per GPU; confirm against milabench plans
+
+
 _torchvision:
   inherits: _defaults
   definition: ../benchmarks/torchvision
@@ -92,7 +102,9 @@ resnet50:
   
   argv:
     --model: resnet50
-    --batch-size: 64
+    --batch-size: 256
+    --synthetic-data: true
+    --precision: 'fp16'
 
 efficientnet_b4:
   inherits: _torchvision
@@ -172,7 +184,7 @@ _bert-base:
     - precision-showcase
   argv:
     --model: "Bert"
-    --batch-size: 32
+    --batch-size: 48
   voir:
     options:
       stop: 30