From 5c2ede43cdbb54421b121e3bc0043f1daa8e2875 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 19:50:40 -0500 Subject: [PATCH 1/4] Add missing tags to tests config --- tests/config/argerror.yaml | 2 ++ tests/config/benchio.yaml | 4 +++- tests/config/benchio_bad.yaml | 3 ++- tests/config/scaling.yaml | 2 ++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/config/argerror.yaml b/tests/config/argerror.yaml index 49ad733cc..59041b72f 100644 --- a/tests/config/argerror.yaml +++ b/tests/config/argerror.yaml @@ -9,3 +9,5 @@ benchio: n: 1 argv: --start: 0 + tags: + - monogpu \ No newline at end of file diff --git a/tests/config/benchio.yaml b/tests/config/benchio.yaml index f2c694e22..50c352ca8 100644 --- a/tests/config/benchio.yaml +++ b/tests/config/benchio.yaml @@ -4,4 +4,6 @@ benchio: weight: 2 plan: method: njobs - n: 2 \ No newline at end of file + n: 2 + tags: + - monogpu \ No newline at end of file diff --git a/tests/config/benchio_bad.yaml b/tests/config/benchio_bad.yaml index ac0b2f820..51b15ac4b 100644 --- a/tests/config/benchio_bad.yaml +++ b/tests/config/benchio_bad.yaml @@ -8,4 +8,5 @@ benchio: argv: --bad: true - + tags: + - monogpu \ No newline at end of file diff --git a/tests/config/scaling.yaml b/tests/config/scaling.yaml index 664996f79..3f3b032e9 100644 --- a/tests/config/scaling.yaml +++ b/tests/config/scaling.yaml @@ -5,3 +5,5 @@ benchio: 64: 12Go 128: 24Go 256: 48Go + tags: + - monogpu \ No newline at end of file From 490a1e7c7409e19c8e41b584d33a2d6c88dfd3d9 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 20:08:57 -0500 Subject: [PATCH 2/4] Add revision --- benchmarks/llava/main.py | 6 +++++- benchmarks/llava/prepare.py | 6 +++++- milabench/sizer.py | 7 +++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/benchmarks/llava/main.py b/benchmarks/llava/main.py index 233ae2ebb..6c49b04a6 100755 --- a/benchmarks/llava/main.py +++ b/benchmarks/llava/main.py @@ -62,8 +62,12 @@ def main(): "llava-hf/llava-1.5-7b-hf", torch_dtype=torch.bfloat16, device_map=compat.device_type, + revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb" + ) + processor = AutoProcessor.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb" ) - processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") # Load dataset and create DataLoader dataset = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"] diff --git a/benchmarks/llava/prepare.py b/benchmarks/llava/prepare.py index afa480b86..5e8b018f3 100755 --- a/benchmarks/llava/prepare.py +++ b/benchmarks/llava/prepare.py @@ -11,8 +11,12 @@ def main(): "llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float32, # Change to float32 device_map="auto", + revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb" + ) + _ = AutoProcessor.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb" ) - _ = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") # Load dataset and create DataLoader _ = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"] diff --git a/milabench/sizer.py b/milabench/sizer.py index 00d6d2b67..46a9983fc 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -53,9 +53,10 @@ def to_octet(value: str) -> float: class Sizer: """Automatically scale the batch size to match GPU spec""" - def __init__(self, scaling_config=None): + def __init__(self, sizer=None, scaling_config=None): self.path = scaling_config - + self.sizer_override = sizer + if scaling_config 
is None: scaling_config = default_scaling_config @@ -64,6 +65,8 @@ def __init__(self, scaling_config=None): @property def options(self): + if self.sizer_override: + return self.sizer_override return SizerOptions() def benchscaling(self, benchmark): From 40bf7715095016be569499d6b152c88cbb6a4a28 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 20:11:15 -0500 Subject: [PATCH 3/4] Add regression files --- .../test_command_reg_one_node.txt | 619 ++++++++++++++++ .../test_command_reg_two_nodes.txt | 676 ++++++++++++++++++ 2 files changed, 1295 insertions(+) create mode 100644 tests/test_command_reg/test_command_reg_one_node.txt create mode 100644 tests/test_command_reg/test_command_reg_two_nodes.txt diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt new file mode 100644 index 000000000..af21f4cde --- /dev/null +++ b/tests/test_command_reg/test_command_reg_one_node.txt @@ -0,0 +1,619 @@ +#!/bin/sh + +echo "---" +echo "Virtual Env" +echo "===========" +export VIRTUAL_ENV=$BASE/venv/torch + +source $VIRTUAL_ENV/bin/activate +echo "---" +echo "Milabench" +echo "=========" +export MILABENCH_DIR_BASE=$BASE +export MILABENCH_DIR_VENV=$BASE/venv/torch +export MILABENCH_DIR_DATA=$BASE/data +export MILABENCH_DIR_RUNS=$BASE/runs +export MILABENCH_DIR_EXTRA=$BASE/extra/llm +export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=0 +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 3600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' + +echo "---" +echo "llama" +echo "=====" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 
--dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + 
CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet 
--model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp-gpus" +echo "==================" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data 
$BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + 
CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout 
--epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 
--num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda 
--val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "brax" +echo "====" +time ( + python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & + wait +) + +echo "---" +echo "diffusion-single" +echo "================" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 
8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-gpus" +echo "==============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-nodes" +echo "===============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "lightning" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "lightning-gpus" +echo "==============" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "dinov2-giant-single" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + 
CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "llm-lora-single" +echo "===============" +time ( + CUDA_VISIBLE_DEVICES=0 
$SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output 
tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-nodes" +echo "==================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- 
$SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-nodes" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "dqn" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + wait +) + +echo "---" +echo "ppo" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + 
CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + wait +) + +echo "---" +echo "pna" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + wait +) + +echo "---" +echo "dimenet" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + wait +) + +echo "---" +echo "recursiongfn" +echo "============" +time ( + 
CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + wait +) + +echo "---" +echo "torchatari" +echo "==========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + wait +) + +echo "---" +echo "llava-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + 
CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + wait +) + +echo "---" +echo "rlhf-single" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "rlhf-gpus" +echo "=========" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "vjepa-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv 
--output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + wait +) + +echo "---" +echo "vjepa-gpus" +echo "==========" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & + wait +) + diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt new file mode 100644 index 000000000..5e516e3f9 --- /dev/null +++ b/tests/test_command_reg/test_command_reg_two_nodes.txt @@ -0,0 +1,676 @@ +#!/bin/sh + +echo "---" +echo "Virtual Env" +echo "===========" +export VIRTUAL_ENV=$BASE/venv/torch + +source $VIRTUAL_ENV/bin/activate +echo "---" +echo "Milabench" +echo "=========" +export MILABENCH_DIR_BASE=$BASE +export MILABENCH_DIR_VENV=$BASE/venv/torch +export MILABENCH_DIR_DATA=$BASE/data +export MILABENCH_DIR_RUNS=$BASE/runs +export MILABENCH_DIR_EXTRA=$BASE/extra/llm +export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=0 +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 3600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' + +echo "---" +echo "llama" +echo "=====" +time ( + 
CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 
--num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp-gpus" +echo "==================" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data 
$BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + 
CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 
--num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 
--num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "brax" +echo "====" +time ( + python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & + wait +) + +echo "---" +echo "diffusion-single" +echo "================" +time ( + CUDA_VISIBLE_DEVICES=0 
$SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-gpus" +echo "==============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-nodes" +echo "===============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator
$BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "lightning" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "lightning-gpus" +echo "==============" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "dinov2-giant-single" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml 
train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "llm-lora-single" +echo "===============" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config 
$SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original 
checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-nodes" +echo "==================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 --
$SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-nodes" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400
--master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "dqn" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + wait +) + +echo "---" +echo "ppo" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10
--num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + wait +) + +echo "---" +echo "pna" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + wait +) + +echo "---" +echo "dimenet" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 
--use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + wait +) + +echo "---" +echo "recursiongfn" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + wait +) + +echo "---" +echo "torchatari" +echo "==========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + wait +) + +echo "---" +echo "llava-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + 
CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + wait +) + +echo "---" +echo "rlhf-single" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "rlhf-gpus" +echo "=========" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "vjepa-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 
24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + wait +) + +echo "---" +echo "vjepa-gpus" +echo "==========" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & + wait +) + From 0527b6ee89f3cdc28014e529be9a0d8d10844c0c Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 20:58:32 -0500 Subject: [PATCH 4/4] update test_memory_tracking --- milabench/sizer.py | 4 +- milabench/system.py | 5 ++ tests/test_scaler.py | 41 ++++++----- tests/test_summary/test_compare.txt | 4 +- tests/test_summary/test_report.txt | 2 +- .../test_report_folder_does_average.txt | 2 +- tests/test_validation.py | 71 ++++++++++--------- 7 files changed, 70 insertions(+), 59 deletions(-) diff --git a/milabench/sizer.py b/milabench/sizer.py index 46a9983fc..4bd62bc7f 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -53,7 +53,7 @@ def to_octet(value: str) -> float: class Sizer: """Automatically scale the batch size to match GPU spec""" - def __init__(self, sizer=None, scaling_config=None): + def __init__(self, sizer=None, scaling_config=option("sizer.config", etype=str)): self.path = scaling_config self.sizer_override = sizer @@ -255,7 +255,7 @@ class MemoryUsageExtractor(ValidationLayer): def __init__(self): self.filepath = option("sizer.save", str, None) - sizer = batch_sizer() + sizer = Sizer() self.memory = deepcopy(sizer.scaling_config) self.scaling = None self.benchname = None diff --git a/milabench/system.py b/milabench/system.py index bd8298c6e..691d06bd9 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -121,6 +121,11 @@ def apply_system(config: dict): system = system_global.get() old = deepcopy(system) + if system is None: + system = dict() + system_global.set(system) + system = system_global.get() + for k, v in config.items(): frags = k.split(".") diff --git a/tests/test_scaler.py 
b/tests/test_scaler.py
index f00a89793..07cdb2ed2 100644
--- a/tests/test_scaler.py
+++ b/tests/test_scaler.py
@@ -76,28 +76,27 @@ def fakeexec(pack):
 def test_scaler_enabled(multipack, config):
     from milabench.system import system_global
-    import contextvars
-
-    ctx = contextvars.copy_context()
-
-    def update_ctx():
-        sizer = Sizer(
-            SizerOptions(
-                size=None,
-                autoscale=True,
-                multiple=8,
-            ),
-            config("scaling"),
-        )
-        sizer_global.set(sizer)
-        system = system_global.get()
-        gpu = system.setdefault("gpu", dict())
-        gpu["capacity"] = "41920 MiB"
-
-    ctx.run(update_ctx)
+    from milabench.system import apply_system
+
+    conf = {
+        "gpu": {
+            "capacity": "41920 MiB"
+        },
+        "options": {
+            "sizer": {
+                "multiple": 8
+            }
+        }
+    }
 
     for k, pack in multipack.packs.items():
-        assert ctx.run(lambda: fakeexec(pack)) == ["--batch_size", "232"]
+        # Sizer is only enabled when config is applied
+        assert fakeexec(pack) == []
+
+    with apply_system(conf):
+        for k, pack in multipack.packs.items():
+            assert fakeexec(pack) == ["--batch_size", "232"]
 
-    # Sizer is only enabled inside the context
-    assert fakeexec(pack) == []
+    for k, pack in multipack.packs.items():
+        # Sizer is only enabled when config is applied
+        assert fakeexec(pack) == []
diff --git a/tests/test_summary/test_compare.txt b/tests/test_summary/test_compare.txt
index c4dd7f6dc..c3bb5bf0f 100644
--- a/tests/test_summary/test_compare.txt
+++ b/tests/test_summary/test_compare.txt
@@ -1,5 +1,5 @@
          | rijubigo | sedumoje
-         | 2023-03-24 | 2023-03-24
-bench | metric | 13:45:27 | 13:57:35
+         | 2024-08-23 | 2024-08-23
+bench | metric | 09:22:03 | 09:22:03
 ----------------------------------------------------------------
 benchio | train_rate | 8780.41 | 8286.03
diff --git a/tests/test_summary/test_report.txt b/tests/test_summary/test_report.txt
index b9f6ce02a..2f4d3fe4b 100644
--- a/tests/test_summary/test_report.txt
+++ b/tests/test_summary/test_report.txt
@@ -11,4 +11,4 @@ benchio | 0 | 4 | 0 | 7979.82 | 2.9% | 17.2% | nan | 79
 Scores
 ------
 Failure rate: 0.00% (PASS)
-Score: 7979.82
+Score: 7980.82
diff --git a/tests/test_summary/test_report_folder_does_average.txt b/tests/test_summary/test_report_folder_does_average.txt
index 9fda7a9c2..8884a73a6 100644
--- a/tests/test_summary/test_report_folder_does_average.txt
+++ b/tests/test_summary/test_report_folder_does_average.txt
@@ -11,4 +11,4 @@ benchio | 0 | 6 | 0 | 7878.45 | 2.5% | 18.0% | 24456 | 78
 Scores
 ------
 Failure rate: 0.00% (PASS)
-Score: 7878.45
+Score: 7879.45
diff --git a/tests/test_validation.py b/tests/test_validation.py
index d5f1007b8..9ed9000aa 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -76,39 +76,41 @@ def test_planning_layer_per_gpu_bad(replayfolder, monkeypatch):
 def test_memory_tracking(replayfolder, config, tmp_path):
     import contextvars
-
-    from milabench.sizer import (
-        MemoryUsageExtractor,
-        Sizer,
-        SizerOptions,
-        sizer_global,
-        system_global,
-    )
-
-    ctx = contextvars.copy_context()
-
-    def update_ctx():
-        sizer = Sizer(
-            SizerOptions(
-                size=None,
-                autoscale=True,
-                multiple=8,
-            ),
-            config("scaling"),
+    import yaml
+    from milabench.system import apply_system, option
+
+    conf = {
+        "gpu": {
+            "capacity": "41920 MiB"
+        },
+        "options": {
+            "sizer": {
+                "multiple": 8,
+                "autoscale": 1
+            }
+        }
+    }
+
+    with apply_system(conf):
+        from milabench.sizer import (
+            MemoryUsageExtractor,
+            Sizer,
+            SizerOptions,
+            sizer_global,
+            system_global,
         )
-        sizer_global.set(sizer)
-        system_global.set({"gpu": {"capacity": "41920 MiB"}})
-
-    ctx.run(update_ctx)
 
-    layer = ctx.run(lambda: 
MemoryUsageExtractor())
-
-    layer.filepath = f"{tmp_path}/dummy"
-
-    assert 123 not in layer.memory["benchio"]["model"]
-
-    ctx.run(lambda: replay_validation_scenario(replayfolder, layer, filename="usage"))
-
-    assert 123 in layer.memory["benchio"]["model"]
+
+        layer = MemoryUsageExtractor()
+        with open(config("scaling"), "r") as sconf:
+            layer.memory = yaml.safe_load(sconf)
+
+        layer.filepath = f"{tmp_path}/dummy"
+
+        assert 123 not in layer.memory["benchio"]["model"]
+
+        replay_validation_scenario(replayfolder, layer, filename="usage")
+
+        assert 123 in layer.memory["benchio"]["model"]
 
 
 def test_exception_tracking(replayfolder, file_regression, capsys):
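
Usage note (illustrative sketch, not part of the diffs above): the updated tests treat apply_system() as a context manager that installs a system config for the duration of the block, with option() lookups resolving against it. A minimal sketch, assuming only the apply_system() and option() helpers as they appear in the hunks above:

    # Hypothetical sketch: enabling the sizer through apply_system().
    # Assumes option() reads from the config installed by apply_system()
    # and falls back to its defaults once the context exits.
    from milabench.system import apply_system, option

    conf = {
        "gpu": {"capacity": "41920 MiB"},
        "options": {"sizer": {"multiple": 8, "autoscale": 1}},
    }

    with apply_system(conf):
        # Inside the context, sizer options come from `conf`,
        # so batch-size auto-scaling is in effect.
        assert option("sizer.multiple", etype=int) == 8
    # Outside the context, the previous system config is restored and the
    # sizer is disabled again (fakeexec(pack) returns [] in the tests).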