Merge pull request #309 from mila-iqia/H100
H100
Delaunay authored Nov 21, 2024
2 parents 08eebc1 + 8498689 commit 4fdf736
Showing 4 changed files with 19 additions and 28 deletions.
6 changes: 3 additions & 3 deletions milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""

-__tag__ = "v0.1.0-129-ga60a3aa"
-__commit__ = "a60a3aae21e87e46bcce403620a3f56c12878554"
-__date__ = "2024-11-06 22:52:12 -0500"
+__tag__ = "v1.0.0_RC1-12-g3b87cb4"
+__commit__ = "3b87cb465e855be452953273c314ab01024e0925"
+__date__ = "2024-10-09 12:04:43 -0400"
3 changes: 3 additions & 0 deletions milabench/compare.py
@@ -27,6 +27,9 @@ def fetch_runs(folder, filter):
     runs = []
     ignored = 0
     for run in os.listdir(folder):
+        if run.startswith("install") or run.startswith("prepare"):
+            continue
+
         if filter is not None and (not fnmatch.fnmatch(run, filter)):
             ignored += 1
             continue
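The change above makes fetch_runs skip directories produced by the install and prepare phases before the optional glob filter is applied. A minimal self-contained sketch of the resulting behaviour; the function body outside the shown hunk is assumed, so the name fetch_runs_sketch and the return value are hypothetical:

import fnmatch
import os

def fetch_runs_sketch(folder, filter):
    # Hypothetical condensation of milabench.compare.fetch_runs after this commit.
    runs = []
    ignored = 0
    for run in os.listdir(folder):
        # New: install/prepare output directories are never counted as runs.
        if run.startswith("install") or run.startswith("prepare"):
            continue
        # Pre-existing: apply the optional glob filter, counting rejects.
        if filter is not None and (not fnmatch.fnmatch(run, filter)):
            ignored += 1
            continue
        runs.append(run)
    return runs, ignored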
1 change: 1 addition & 0 deletions milabench/system.py
@@ -406,6 +406,7 @@ def resolve_hostname(ip):
         if is_loopback(ip):
             return hostname, True

+        return socket.gethostname(), hostname.startswith(socket.gethostname())
         return hostname, hostname == socket.gethostname()

     except:
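The line added above returns before the pre-existing return, which becomes unreachable: the function now reports the local hostname together with a prefix match (so a resolved "node1.example.com" counts as local when socket.gethostname() is "node1") instead of an exact equality check. A minimal sketch of the resulting flow, assuming the address lookup uses socket.gethostbyaddr and substituting a stand-in for the module's is_loopback helper:

import ipaddress
import socket

def is_loopback(ip):
    # Stand-in for the helper in milabench.system (assumed semantics).
    return ipaddress.ip_address(ip).is_loopback

def resolve_hostname_sketch(ip):
    # Hypothetical condensation of milabench.system.resolve_hostname after this commit.
    try:
        hostname, _, _ = socket.gethostbyaddr(ip)  # assumed lookup
        if is_loopback(ip):
            return hostname, True
        # Added in this commit: prefix comparison against the local hostname.
        return socket.gethostname(), hostname.startswith(socket.gethostname())
        # Pre-existing exact comparison, now dead code:
        # return hostname, hostname == socket.gethostname()
    except OSError:
        # The original uses a bare except; the fallback value here is an assumption.
        return ip, False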
37 changes: 12 additions & 25 deletions scripts/article/run_cuda.sh
@@ -49,7 +49,7 @@ install_prepare() {
     # Install milabench's benchmarks in their venv
     #
     # pip install torch
-    milabench pin --variant cuda --from-scratch $ARGS
+    # milabench pin --variant cuda --from-scratch $ARGS
     milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS

     which pip
@@ -70,9 +70,9 @@ install_prepare() {
     milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
 }

-module load cuda/12.3.2
+# module load cuda/12.3.2

-if [ ! -d "$MILABENCH_WORDIR/results" ]; then
+if [ ! -d "$MILABENCH_WORDIR/env" ]; then
     install_prepare
 else
     echo "Reusing previous install"
@@ -87,8 +87,9 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     # pip install torch
     # milabench pin --variant cuda --from-scratch
     # rm -rf $MILABENCH_WORDIR/results/venv/
-    # rm -rf $MILABENCH_WORDIR/results/extra
-    # milabench install --system $MILABENCH_WORDIR/system.yaml
+    rm -rf $MILABENCH_WORDIR/results/extra
+
+    milabench install --system $MILABENCH_WORDIR/system.yaml
     milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS

     (
@@ -98,28 +99,14 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
         # pip install torchao --no-input
     )

-    # pip install torch
-    # milabench pin --variant cuda --from-scratch
-    # milabench install --system $MILABENCH_WORDIR/system.yaml --force $ARGS
-    # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
-
-    # ARGS="--select resnet50-noio,brax,lightning,dinov2-giant-single,dinov2-giant-gpus,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-full-mp-gpus,llm-full-mp-nodes,dqn,ppo,dimenet,llava-single,rlhf-single,rlhf-gpus,vjepa-single,vjepa-gpus"
-
-    # MEMORY_CAPACITY=("4Go" "8Go" "16Go" "32Go" "64Go" "80Go")
-    # # MEMORY_CAPACITY=("2048" "4096" "8192")
-
-    # # Run the benchmakrs
-    # for CAPACITY in "${MEMORY_CAPACITY[@]}"; do
-    #     export MILABENCH_SIZER_AUTO=1
-    #     export MILABENCH_SIZER_MULTIPLE=8
-    #     export MILABENCH_SIZER_CAPACITY=$CAPACITY
-    #     # export MILABENCH_SIZER_BATCH_SIZE=$CAPACITY
-    #     milabench run --run-name "c$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml $ARGS || true
-    # done
-
     milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS

     #
     # Display report
     milabench report --runs $MILABENCH_WORDIR/results/runs
 fi
 fi


# rsync -av [email protected]:~/rocm/results/cache ~/cuda/results/cache
# rsync -av [email protected]:~/rocm/results/data ~/cuda/results/data
# rsync -av [email protected]:~/rocm/results/cache ~/cuda/results/cache
