From 1bb2d16d39e4f8e2c7bdcf10cf674e0432f31b83 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Fri, 24 May 2024 11:20:33 -0700
Subject: [PATCH 01/20] add upstream pax 2-node Grok test

---
 .github/container/test-pax.sh             | 4 ++++
 .github/workflows/_test_upstream_pax.yaml | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 91cb926b4..a11c62edf 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -382,6 +382,10 @@ elif [[ ${MODEL_TYPE} == "5B" ]]; then
 elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then
   CONFIG=ci_configs.LLaMA70BSyntheticSmall
   ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}"
+## hard-code ICI mesh shape for Grok
+elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then
+  CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy
+  ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1,1] --fdl.ICI_MESH_SHAPE=[1,1,8,1] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} --fdl.NUM_LAYERS=2"
 else
   echo "Unsupported model ${MODEL_TYPE}"
   exit 1
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index c07749b12..423a9f5de 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -210,6 +210,11 @@ jobs:
             BATCH_SIZE: 4
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
+          - TEST_NAME: Grok
+            PARALLEL_CONFIG: [1, 2, 8, 1] ## note: only used to compute num nodes
+            BATCH_SIZE: 4
+            EVALUATE: true
+            ADDITIONAL_ARGS: "--model-type GrokProxy"
       fail-fast: false
 
     runs-on: ubuntu-22.04

From 17a7eb46cff8f54c63266b3e52d287c729e14278 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 29 May 2024 16:57:49 -0700
Subject: [PATCH 02/20] update pax tests to pass in ICI and DCN mesh shapes
 explicitly

---
 .github/container/test-pax.sh             |  99 +++++-------------
 .github/workflows/_test_pax_rosetta.yaml  | 122 +++++++++++++---------
 .github/workflows/_test_upstream_pax.yaml |  87 ++++++++-------
 3 files changed, 149 insertions(+), 159 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index a11c62edf..d4ad9b069 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -22,11 +22,9 @@ usage() {
     echo "  --evaluate                 Whether to test evaluation rather than training."
     echo "  -s, --steps                Number of steps to run, defaults to 500."
     echo "  --multiprocess             Enable the multiprocess GPU mode."
+    echo "  --ici                      ICI mesh shape."
+    echo "  --dcn                      DCN mesh shape."
     echo "  -o, --output NAME          Name for the output folder, a temporary folder will be created if none specified."
-    echo "  --data-parallel            Data parallelism to use. Defaults to 1."
-    echo "  --fsdp                     Fully-sharded data parallelism to use. Defaults to 1."
-    echo "  --tensor-parallel          Tensor parallelism to use. Defaults to 1."
-    echo "  --pipeline-parallel        Pipeline parallelism to use. Defaults to 1 for no pipelining." 
     echo "  -n, --nodes                Number of nodes."
     echo "  -h, --help                 Print usage."
     exit $1
@@ -44,10 +42,8 @@ OUTPUT=$(mktemp -d)
 BATCH_PER_GPU=4
 DTYPE="bfloat16"
 STEPS=500
-DP=1
-FSDP=1
-TP=1
-PP=1
+ICI="[1,1,1]"
+DCN="[1,1,1]"
 NODES=1
 ENABLE_TE=0
 MODEL_TYPE=126M
@@ -103,20 +99,12 @@ while [ : ]; do
             OUTPUT=$2
             shift 2
             ;;
-        --data-parallel)
-            DP="$2"
+        --ici)
+            ICI="$2"
             shift 2
             ;;
-        --fsdp)
-            FSDP="$2"
-            shift 2
-            ;;
-        --tensor-parallel)
-            TP="$2"
-            shift 2
-            ;;
-        --pipeline-parallel)
-            PP="$2"
+        --dcn)
+            DCN="$2"
             shift 2
             ;;
         -n | --nodes)
@@ -152,10 +140,8 @@ print_var ENABLE_TE
 print_var NVTE_FUSED_ATTN
 print_var EVALUATE
 print_var DROPOUT
-print_var DP
-print_var FSDP
-print_var TP
-print_var PP
+print_var ICI
+print_var DCN
 
 PAXML_DIR=$(dirname `python -c 'import paxml; print(*paxml.__path__)'`)
 pushd ${PAXML_DIR}
@@ -174,16 +160,7 @@ from paxml.tasks.lm.params.lm_cloud import SyntheticDataset
 from praxis import base_layer
 from praxis import layers
 
-dp = ${DP}
-fsdp = ${FSDP}
-tp = ${TP}
-pp = ${PP}
-num_gpus = ${NGPUS}
-percore_batch_size = ${BATCH_PER_GPU}
-dtype = "${DTYPE}"
-dropout = float(${DROPOUT})
-
-assert num_gpus == dp*fsdp*tp*pp, f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {dp*fsdp*tp*pp}'
+assert num_gpus == np.prod(${ICI}) * np.prod(${DCN}), f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {np.prod(${ICI}) * np.prod(${DCN})}'
 
 ## heuristics to get ici and dcn mesh shapes.
 ## these heuristics only support one parallel strategy across nodes
@@ -301,11 +278,6 @@ class LLaMA70BSyntheticSmall(BaseLLaMA, SyntheticDataset):
     USE_MQA = True
     NUM_KV_HEADS = 8
 
-    PERCORE_BATCH_SIZE = 4
-
-    ICI_MESH_SHAPE = [1, 8, 1]
-    DCN_MESH_SHAPE = [1, 1, 1]
-
     def task(self):
       task_p = super().task()
       task_p.train.always_use_train_for_model_init=False
@@ -317,11 +289,8 @@ if pp > 1:
   @experiment_registry.register
   class Synthetic126MCI(GPT126MPP, SyntheticDataset):
     
-    ICI_MESH_SHAPE = [pp, dp, fsdp, tp]
-    DCN_MESH_SHAPE = [dcn_pp, dcn_dp, dcn_fsdp, 1]
     MICROBATCH_SIZE = 2
     NUM_STAGES = pp
-    PERCORE_BATCH_SIZE = percore_batch_size
     FRPOP_DTYPE = dtype
     
     def task(self):
@@ -334,9 +303,6 @@ else:
   @experiment_registry.register
   class Synthetic126MCI(Synthetic126M):
     
-    ICI_MESH_SHAPE = [dp, fsdp, tp]
-    DCN_MESH_SHAPE = [dcn_dp, dcn_fsdp, 1]
-    PERCORE_BATCH_SIZE = percore_batch_size
     FRPOP_DTYPE = dtype
 
     DROPOUT_PROB = dropout
@@ -378,52 +344,41 @@ if [[ ${MODEL_TYPE} == "126M" ]]; then
   CONFIG=ci_configs.Synthetic126MCI
 elif [[ ${MODEL_TYPE} == "5B" ]]; then
   CONFIG=paxml.contrib.gpu.scripts_gpu.configs.Synthetic5B
-  ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}"
 elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then
   CONFIG=ci_configs.LLaMA70BSyntheticSmall
-  ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}"
 ## hard-code ICI mesh shape for Grok
 elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then
   CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy
-  ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1,1] --fdl.ICI_MESH_SHAPE=[1,1,8,1] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} --fdl.NUM_LAYERS=2"
+  ADDITIONAL_ARGS="--fdl.NUM_LAYERS=2"
 else
   echo "Unsupported model ${MODEL_TYPE}"
   exit 1
 fi
 
+CMD_LINE_FLAGS = "--fdl_config=${CONFIG} \
+    --fdl.MAX_STEPS=0 \
+    --job_log_dir=${OUTPUT} \
+    --alsologtostderr \
+    --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \
+    --fdl.ICI_MESH_SHAPE=${ICI} \
+    --fdl.DCN_MESH_SHAPE=${DCN} \
+    $ADDITIONAL_ARGS \
+    $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)"
+
 if [[ ${EVALUATE} -ne 0 ]]; then
 
   trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT
 
   ## train for 0 steps to generate an initial checkpoint
-  python -m paxml.main \
-    --fdl_config=${CONFIG} \
-    --fdl.MAX_STEPS=0 \
-    --job_log_dir=${OUTPUT} \
-    --alsologtostderr \
-    $ADDITIONAL_ARGS \
-    $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
+  python -m paxml.main ${CMD_LINE_FLAGS}
 
   ## restore from initial checkpoint for eval
-  python -m paxml.main \
-    --fdl_config=${CONFIG} \
-    --job_log_dir=${OUTPUT} \
-    --mode='eval' \
-    --fdl.MAX_STEPS=0 \
-    --alsologtostderr \
-    --enable_checkpoint_saving=False \
-    $ADDITIONAL_ARGS \
-    $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
+  python -m paxml.main ${CMD_LINE_FLAGS} \
+    --enable_checkpoint_saving=False
 
 else
-  python -m paxml.main \
-    --fdl_config=${CONFIG} \
-    --job_log_dir=${OUTPUT} \
-    --alsologtostderr \
-    --fdl.MAX_STEPS=${STEPS} \
-    --enable_checkpoint_saving=False \
-    $ADDITIONAL_ARGS \
-    $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
+  python -m paxml.main ${CMD_LINE_FLAGS} \
+    --enable_checkpoint_saving=False
 fi
 
 set +x
diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 7b80bfa60..8a05f6a11 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -33,9 +33,13 @@ jobs:
   single-process-multi-device-te:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
-        - [1, 1, 2, 4]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_single_process_TE
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
+          - TEST_NAME: 1DP2FSDP4TP1PP_single_process_TE
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -70,7 +74,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process_TE
+          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
           MAX_GPUS_PER_NODE=8
           GPUS_PER_NODE=8
           NODES=1
@@ -118,10 +122,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }}
+              --dcn ${{ matrix.DCN }}
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te
           EOF
@@ -226,36 +228,52 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 1, 1, 1]
+            ICI: [1, 1, 1]
+            DCN: [1, 1, 1]
             BATCH_SIZE: 4
+            TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 8, 1, 1]
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
             ADDITIONAL_ARGS: ""
             BATCH_SIZE: 4
+            TASKS: 8
           - TEST_NAME: 1DP8FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: [1, 8, 1]
+            DCN: [1, 1, 1]
             BATCH_SIZE: 4
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP_TE
-            PARALLEL_CONFIG: [1, 4, 1, 2]
+            ICI: [4, 1, 1]
+            DCN: [1, 1, 1]
             BATCH_SIZE: 4
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP_TE
-            PARALLEL_CONFIG: [1, 16, 1, 1]
+            ICI: [8, 1, 1]
+            DCN: [2, 1, 1]
             BATCH_SIZE: 4
+            TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 5B_fused_attn_1
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: [1, 8, 1]
+            DCN: [1, 1, 1]
             BATCH_SIZE: 2
+            TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B --enable-fused-attn"
           - TEST_NAME: 5B_fused_attn_0
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: [1, 8, 1]
+            DCN: [1, 1, 1]
             BATCH_SIZE: 2
+            TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B"
           - TEST_NAME: LLaMA_eval_TE
-            PARALLEL_CONFIG: [1, 1, 8, 1]
+            ICI: [1, 8, 1]
+            DCN: [1, 1, 1]
             BATCH_SIZE: 4
+            TASKS: 8
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
       fail-fast: false
@@ -274,7 +292,6 @@ jobs:
 
       - name: Check out the repository under ${GITHUB_WORKSPACE}
         uses: actions/checkout@v4
-
       - name: Setup SSH known hosts
         id: ssh-known-hosts
         run: |
@@ -292,7 +309,7 @@ jobs:
           cd $GITHUB_WORKSPACE
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
           TEST_CASE_NAME=${{ matrix.TEST_NAME }}
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -340,10 +357,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
               --steps 300 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matric.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te \
               --additional-args "--fdl.PACKED_INPUT=False"  \
@@ -451,11 +466,23 @@ jobs:
   rosetta-pax-multi-node:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
-        - [1, 4, 1, 2]
-        - [4, 2, 1, 1]
-        - [4, 2, 1, 2]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
+            TASKS: 1
+          - TEST_NAME: 4DP1FSDP2TP1PP
+            ICI: [4, 1, 2]
+            DCN: [1, 1, 1]
+            TASKS: 8
+          - TEST_NAME: 4DP1FSDP1TP2PP
+            ICI: [4, 2, 1, 1]
+            DCN: [1, 1, 1, 1]
+            TASKS: 8
+          - TEST_NAME: 4DP1FSDP2TP2PP
+            ICI: [2, 2, 1, 2]
+            DCN: [2, 1, 1, 1]
+            TASKS: 16
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -488,8 +515,8 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
+          TOTAL_TASKS=${{ matrix.TASKS] }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -538,10 +565,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF
@@ -646,8 +671,11 @@ jobs:
   rosetta-pax-single-node-dropout-te:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
+        include:
+          - TEST_NAME: 8DP_TE_dropout
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
+            TASKS: 8
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -681,8 +709,8 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP_TE_dropout
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -730,10 +758,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-dropout \
               --enable-te \
@@ -840,8 +866,10 @@ jobs:
   single-process-evaluation-te:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_eval_TE
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -874,7 +902,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_eval_TE
+          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
           MAX_GPUS_PER_NODE=8
           GPUS_PER_NODE=8
 
@@ -923,10 +951,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes 1 \
               --enable-te
           EOF
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 423a9f5de..e05341647 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -33,9 +33,13 @@ jobs:
   single-process-multi-device:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
-        - [1, 1, 2, 4]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_single_process
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
+          - TEST_NAME: 1DP2FSDP4TP1PP_single_process
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -67,7 +71,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process
+          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
           MAX_GPUS_PER_NODE=8
           NODES=1
           GPUS_PER_NODE=8
@@ -114,10 +118,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }}
           EOF
           )
@@ -177,42 +179,51 @@ jobs:
   pax-multi-node:
     strategy:
       matrix:
-        include:
           - TEST_NAME: 1DP1FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 1, 1, 1]
-            BATCH_SIZE: 4
+            ICI: [1, 1, 1]
+            DCN: [1, 1, 1]
+            TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 8, 1, 1]
-            BATCH_SIZE: 4
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 1DP8FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 1, 8, 1]
-            BATCH_SIZE: 4
+            ICI: [1, 8, 1]
+            DCN: [1, 1, 1]
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP1TP4PP
-            PARALLEL_CONFIG: [4, 2, 1, 1]
-            BATCH_SIZE: 4
+            ICI: [4, 2, 1, 1]
+            DCN: [1, 1, 1, 1]
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP
-            PARALLEL_CONFIG: [1, 4, 1, 2]
-            BATCH_SIZE: 4
+            ICI: [4, 2, 1]
+            DCN: [1, 1, 1]
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP
-            PARALLEL_CONFIG: [1, 16, 1, 1]
-            BATCH_SIZE: 4
+            ICI: [8, 1, 1]
+            DCN: [2, 1, 1]
+            TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP2TP4PP
-            PARALLEL_CONFIG: [4, 2, 1, 2]
-            BATCH_SIZE: 4
+            ICI: [1, 1, 2, 4]
+            DCN: [2, 1, 1, 1]
+            TASKS: 16
+            ADDITIONAL_ARGS: ""
           - TEST_NAME: LLaMA_eval
-            PARALLEL_CONFIG: [1, 1, 8, 1]
-            BATCH_SIZE: 4
+            ICI: [1, 8, 1]
+            DCN: [1, 1, 1]
+            TASKS: 16
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
           - TEST_NAME: Grok
-            PARALLEL_CONFIG: [1, 2, 8, 1] ## note: only used to compute num nodes
-            BATCH_SIZE: 4
+            ICI: [1, 1, 8, 1]
+            DCN: [1, 2, 1, 1]
+            TASKS: 16
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type GrokProxy"
       fail-fast: false
@@ -247,7 +258,7 @@ jobs:
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
           TEST_CASE_NAME=${{ matrix.TEST_NAME }}
-          TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }}))
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
@@ -295,10 +306,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \
               ${{ matrix.ADDITIONAL_ARGS }}
@@ -362,8 +371,10 @@ jobs:
   single-process-evaluation:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP_eval
+            ICI: [8, 1, 1]
+            DCN: [1, 1, 1]
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -395,7 +406,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_eval
+          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}PP
           TOTAL_TASKS=1
           NODES=1
           GPUS_PER_NODE=8
@@ -444,10 +455,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
-              --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
-              --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
-              --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF

From 374a6b01d228dba633ff1e335c135edf3ba13e99 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 29 May 2024 17:01:41 -0700
Subject: [PATCH 03/20] fix getopt long options and Grok ADDITIONAL_ARGS

---
 .github/container/test-pax.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index d4ad9b069..3efd01c4d 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -30,7 +30,7 @@ usage() {
     exit $1
 }
 
-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,ici:,dcn:,nodes: -- "$@")
 if [[ $? -ne 0 ]]; then
     exit $1
 fi
@@ -349,7 +349,7 @@ elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then
 ## hard-code ICI mesh shape for Grok
 elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then
   CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy
-  ADDITIONAL_ARGS="--fdl.NUM_LAYERS=2"
+  ADDITIONAL_ARGS+="--fdl.NUM_LAYERS=2"
 else
   echo "Unsupported model ${MODEL_TYPE}"
   exit 1

From 2b87465106b6db6120d77409692774ec6bb5fbcc Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 29 May 2024 17:06:28 -0700
Subject: [PATCH 04/20] restore matrix include key in pax-multi-node job

---
 .github/workflows/_test_upstream_pax.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index e05341647..ac6519ffa 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -179,6 +179,7 @@ jobs:
   pax-multi-node:
     strategy:
       matrix:
+        include:
           - TEST_NAME: 1DP1FSDP1TP1PP
             ICI: [1, 1, 1]
             DCN: [1, 1, 1]

From 67dbc3728773380a5ad0571634a7c3faab3f9713 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 29 May 2024 17:07:41 -0700
Subject: [PATCH 05/20] fix matric -> matrix typo in rosetta --dcn argument

---
 .github/workflows/_test_pax_rosetta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 8a05f6a11..85d7cdc4c 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -358,7 +358,7 @@ jobs:
               --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
               --steps 300 \
               --ici ${{ matrix.ICI }} \
-              --dcn ${{ matric.DCN }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te \
               --additional-args "--fdl.PACKED_INPUT=False"  \

From 86f043a096140abdd3b63969279e30f8ac1f106b Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 29 May 2024 17:08:39 -0700
Subject: [PATCH 06/20] remove stray bracket from TOTAL_TASKS expression

---
 .github/workflows/_test_pax_rosetta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 85d7cdc4c..be30c22b6 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -516,7 +516,7 @@ jobs:
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
           TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
-          TOTAL_TASKS=${{ matrix.TASKS] }}
+          TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))

From 0ac6db74ca9017ab3f17a7af4e595d103d8c914c Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 29 May 2024 22:38:05 -0700
Subject: [PATCH 07/20] remove spaces around = in CMD_LINE_FLAGS assignment

---
 .github/container/test-pax.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 3efd01c4d..fe8f460f4 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -355,7 +355,7 @@ else
   exit 1
 fi
 
-CMD_LINE_FLAGS = "--fdl_config=${CONFIG} \
+CMD_LINE_FLAGS="--fdl_config=${CONFIG} \
     --fdl.MAX_STEPS=0 \
     --job_log_dir=${OUTPUT} \
     --alsologtostderr \

From e6c9aebdf64101ef56df6bf22d95371c9ab12bf7 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 May 2024 09:14:00 -0700
Subject: [PATCH 08/20] quote ICI and DCN mesh shape arguments in workflows

---
 .github/workflows/_test_pax_rosetta.yaml  | 20 ++++++++++----------
 .github/workflows/_test_upstream_pax.yaml | 12 ++++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index be30c22b6..1f1bef8af 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -122,8 +122,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici ${{ matrix.ICI }}
-              --dcn ${{ matrix.DCN }}
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te
           EOF
@@ -357,8 +357,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
               --steps 300 \
-              --ici ${{ matrix.ICI }} \
-              --dcn ${{ matrix.DCN }} \
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te \
               --additional-args "--fdl.PACKED_INPUT=False"  \
@@ -565,8 +565,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --ici ${{ matrix.ICI }} \
-              --dcn ${{ matrix.DCN }} \
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF
@@ -758,8 +758,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --ici ${{ matrix.ICI }} \
-              --dcn ${{ matrix.DCN }} \
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-dropout \
               --enable-te \
@@ -951,8 +951,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --ici ${{ matrix.ICI }} \
-              --dcn ${{ matrix.DCN }} \
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes 1 \
               --enable-te
           EOF
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index ac6519ffa..4fe2d107d 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -118,8 +118,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici ${{ matrix.ICI }} \
-              --dcn ${{ matrix.DCN }} \
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }}
           EOF
           )
@@ -307,8 +307,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici ${{ matrix.ICI }} \
-              --dcn ${{ matrix.DCN }} \
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \
               ${{ matrix.ADDITIONAL_ARGS }}
@@ -456,8 +456,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --ici ${{ matrix.ICI }} \
-              --dcn ${{ matrix.DCN }} \
+              --ici "\"${{ matrix.ICI }}\"" \
+              --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF

From f226c03c3c665d0a791c3dc27bf4d44ab65e86e5 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 May 2024 17:04:36 -0700
Subject: [PATCH 09/20] more fixes

---
 .github/workflows/_test_pax_rosetta.yaml  | 72 +++++++++++------------
 .github/workflows/_test_upstream_pax.yaml | 52 ++++++++--------
 2 files changed, 62 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 1f1bef8af..8193abbcf 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -35,11 +35,11 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_single_process_TE
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
           - TEST_NAME: 1DP2FSDP4TP1PP_single_process_TE
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -74,7 +74,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           MAX_GPUS_PER_NODE=8
           GPUS_PER_NODE=8
           NODES=1
@@ -228,50 +228,50 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP_TE
-            ICI: [1, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 1, 1]"
+            DCN: "[1, 1, 1]"
             BATCH_SIZE: 4
             TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP_TE
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
             ADDITIONAL_ARGS: ""
             BATCH_SIZE: 4
             TASKS: 8
           - TEST_NAME: 1DP8FSDP1TP1PP_TE
-            ICI: [1, 8, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 8, 1]"
+            DCN: "[1, 1, 1]"
             BATCH_SIZE: 4
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP_TE
-            ICI: [4, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[4, 1, 1]"
+            DCN: "[1, 1, 1]"
             BATCH_SIZE: 4
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP_TE
-            ICI: [8, 1, 1]
-            DCN: [2, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[2, 1, 1]"
             BATCH_SIZE: 4
             TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 5B_fused_attn_1
-            ICI: [1, 8, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 8, 1]"
+            DCN: "[1, 1, 1]"
             BATCH_SIZE: 2
             TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B --enable-fused-attn"
           - TEST_NAME: 5B_fused_attn_0
-            ICI: [1, 8, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 8, 1]"
+            DCN: "[1, 1, 1]"
             BATCH_SIZE: 2
             TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B"
           - TEST_NAME: LLaMA_eval_TE
-            ICI: [1, 8, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 8, 1]"
+            DCN: "[1, 1, 1]"
             BATCH_SIZE: 4
             TASKS: 8
             EVALUATE: true
@@ -468,20 +468,20 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
             TASKS: 1
           - TEST_NAME: 4DP1FSDP2TP1PP
-            ICI: [4, 1, 2]
-            DCN: [1, 1, 1]
+            ICI: "[4, 1, 2]"
+            DCN: "[1, 1, 1]"
             TASKS: 8
           - TEST_NAME: 4DP1FSDP1TP2PP
-            ICI: [4, 2, 1, 1]
-            DCN: [1, 1, 1, 1]
+            ICI: "[4, 2, 1, 1]"
+            DCN: "[1, 1, 1, 1]"
             TASKS: 8
           - TEST_NAME: 4DP1FSDP2TP2PP
-            ICI: [2, 2, 1, 2]
-            DCN: [2, 1, 1, 1]
+            ICI: "[2, 2, 1, 2]"
+            DCN: "[2, 1, 1, 1]"
             TASKS: 16
       fail-fast: false
 
@@ -515,7 +515,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
@@ -673,8 +673,8 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP_TE_dropout
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
             TASKS: 8
       fail-fast: false
 
@@ -709,7 +709,7 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           TOTAL_TASKS=${{ matrix.TASKS }}
           MAX_GPUS_PER_NODE=8
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
@@ -868,8 +868,8 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_eval_TE
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -902,7 +902,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           MAX_GPUS_PER_NODE=8
           GPUS_PER_NODE=8
 
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 4fe2d107d..d69dcf5c2 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -35,11 +35,11 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_single_process
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
           - TEST_NAME: 1DP2FSDP4TP1PP_single_process
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -71,7 +71,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           MAX_GPUS_PER_NODE=8
           NODES=1
           GPUS_PER_NODE=8
@@ -181,49 +181,49 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP
-            ICI: [1, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 1, 1]"
+            DCN: "[1, 1, 1]"
             TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 1DP8FSDP1TP1PP
-            ICI: [1, 8, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 8, 1]"
+            DCN: "[1, 1, 1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP1TP4PP
-            ICI: [4, 2, 1, 1]
-            DCN: [1, 1, 1, 1]
+            ICI: "[4, 2, 1, 1]"
+            DCN: "[1, 1, 1, 1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP
-            ICI: [4, 2, 1]
-            DCN: [1, 1, 1]
+            ICI: "[4, 2, 1]"
+            DCN: "[1, 1, 1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP
-            ICI: [8, 1, 1]
-            DCN: [2, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[2, 1, 1]"
             TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP2TP4PP
-            ICI: [1, 1, 2, 4]
-            DCN: [2, 1, 1, 1]
+            ICI: "[1, 1, 2, 4]"
+            DCN: "[2, 1, 1, 1]"
             TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: LLaMA_eval
-            ICI: [1, 8, 1]
-            DCN: [1, 1, 1]
+            ICI: "[1, 8, 1]"
+            DCN: "[1, 1, 1]"
             TASKS: 16
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
           - TEST_NAME: Grok
-            ICI: [1, 1, 8, 1]
-            DCN: [1, 2, 1, 1]
+            ICI: "[1, 1, 8, 1]"
+            DCN: "[1, 2, 1, 1]"
             TASKS: 16
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type GrokProxy"
@@ -374,8 +374,8 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_eval
-            ICI: [8, 1, 1]
-            DCN: [1, 1, 1]
+            ICI: "[8, 1, 1]"
+            DCN: "[1, 1, 1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -407,7 +407,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-          TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}PP
+          TEST_CASE_NAME=${{ matrix.TEST_NAME }}
           TOTAL_TASKS=1
           NODES=1
           GPUS_PER_NODE=8

From e5bd8511fbc239d0f3b135b8422677e8da212312 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 May 2024 22:12:04 -0700
Subject: [PATCH 10/20] fix max_steps

---
 .github/container/test-pax.sh | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index fe8f460f4..d49b640fe 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -356,7 +356,6 @@ else
 fi
 
 CMD_LINE_FLAGS="--fdl_config=${CONFIG} \
-    --fdl.MAX_STEPS=0 \
     --job_log_dir=${OUTPUT} \
     --alsologtostderr \
     --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \
@@ -370,15 +369,13 @@ if [[ ${EVALUATE} -ne 0 ]]; then
   trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT
 
   ## train for 0 steps to generate an initial checkpoint
-  python -m paxml.main ${CMD_LINE_FLAGS}
+  python -m paxml.main ${CMD_LINE_FLAGS} --fdl.MAX_STEPS=0
 
   ## restore from initial checkpoint for eval
-  python -m paxml.main ${CMD_LINE_FLAGS} \
-    --enable_checkpoint_saving=False
+  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False
 
 else
-  python -m paxml.main ${CMD_LINE_FLAGS} \
-    --enable_checkpoint_saving=False
+  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False
 fi
 
 set +x

From 817e3512637be80d5a1b6ad62621b81043aab418 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 May 2024 22:13:44 -0700
Subject: [PATCH 11/20] fix

---
 .github/container/test-pax.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index d49b640fe..a1ca25f35 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -372,7 +372,7 @@ if [[ ${EVALUATE} -ne 0 ]]; then
   python -m paxml.main ${CMD_LINE_FLAGS} --fdl.MAX_STEPS=0
 
   ## restore from initial checkpoint for eval
-  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False
+  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=0
 
 else
   python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False

From e98cc98a500410e9c751365e40e915ec860f259f Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 May 2024 22:16:01 -0700
Subject: [PATCH 12/20] fix command line args

---
 .github/container/test-pax.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index a1ca25f35..8ac2ac79f 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -372,10 +372,10 @@ if [[ ${EVALUATE} -ne 0 ]]; then
   python -m paxml.main ${CMD_LINE_FLAGS} --fdl.MAX_STEPS=0
 
   ## restore from initial checkpoint for eval
-  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=0
+  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=0 --mode='eval'
 
 else
-  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False
+  python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=${STEPS}
 fi
 
 set +x

From c53fab31f6acd30f83180e94583c4cee30f4b834 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Fri, 31 May 2024 09:10:01 -0700
Subject: [PATCH 13/20] cleanup, bug fixes

---
 .github/container/test-pax.sh             | 44 +++++++++++------------
 .github/workflows/_test_pax_rosetta.yaml  |  5 +++
 .github/workflows/_test_upstream_pax.yaml |  4 +--
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 8ac2ac79f..3aa711df9 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -24,13 +24,14 @@ usage() {
     echo "  --multiprocess             Enable the multiprocess GPU mode."
     echo "  --ici                      ICI mesh shape."
     echo "  --dcn                      DCN mesh shape."
+    echo "  --enable-pipeline-parallel Whether to use pipeline parallelism."
     echo "  -o, --output NAME          Name for the output folder, a temporary folder will be created if none specified."
     echo "  -n, --nodes                Number of nodes."
     echo "  -h, --help                 Print usage."
     exit $1
 }
 
-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,ici:,dcn:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,enable-pipeline-parallel,model-type:,evaluate,steps:,help,multiprocess,output:,ici:,dcn:,nodes: -- "$@")
 if [[ $? -ne 0 ]]; then
     exit $1
 fi
@@ -48,6 +49,7 @@ NODES=1
 ENABLE_TE=0
 MODEL_TYPE=126M
 NVTE_FUSED_ATTN=0
+ENABLE_PP=0
 DROPOUT=0
 EVALUATE=0
 ADDITIONAL_ARGS=""
@@ -79,6 +81,10 @@ while [ : ]; do
             NVTE_FUSED_ATTN=1
             shift 1
             ;;
+	--enable-pipeline-parallel)
+	    ENABLE_PP=1
+	    shift 1
+	    ;;
         --model-type)
             MODEL_TYPE=$2
             shift 2
@@ -149,6 +155,7 @@ pushd ${PAXML_DIR}
 ## Create configs file
 cat > ci_configs.py <<EOF
 import math
+import numpy as np
 from paxml import tasks_lib, experiment_registry
 from paxml.contrib.gpu.scripts_gpu.configs import (
     BaseLLaMA,
@@ -160,25 +167,12 @@ from paxml.tasks.lm.params.lm_cloud import SyntheticDataset
 from praxis import base_layer
 from praxis import layers
 
-assert num_gpus == np.prod(${ICI}) * np.prod(${DCN}), f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {np.prod(${ICI}) * np.prod(${DCN})}'
+num_gpus = ${NGPUS}
+dropout = float(${DROPOUT})
+dtype = "${DTYPE}"
+pp = $ENABLE_PP
 
-## heuristics to get ici and dcn mesh shapes.
-## these heuristics only support one parallel strategy across nodes
-## but should be sufficient for now
-dcn_factor = math.ceil(num_gpus / 8)
-dcn_dp = 1
-dcn_fsdp = 1
-dcn_pp = 1
-if dcn_factor > 1:
-  if dp % dcn_factor == 0:
-    dcn_dp = dcn_factor
-    dp = int(dp / dcn_factor)
-  elif fsdp % dcn_factor == 0: 
-    dcn_fsdp = dcn_factor
-    fsdp = int(fsdp / dcn_factor)
-  elif pp % dcn_factor == 0: 
-    dcn_pp = dcn_factor
-    pp = int(pp / dcn_factor)
+assert num_gpus == np.prod(${ICI}) * np.prod(${DCN}), f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {np.prod(${ICI}) * np.prod(${DCN})}'
 
 WeightInit = base_layer.WeightInit
 
@@ -285,12 +279,12 @@ class LLaMA70BSyntheticSmall(BaseLLaMA, SyntheticDataset):
       return task_p
 
 
-if pp > 1:
+if pp == 1:
   @experiment_registry.register
   class Synthetic126MCI(GPT126MPP, SyntheticDataset):
     
     MICROBATCH_SIZE = 2
-    NUM_STAGES = pp
+    NUM_STAGES = ${ICI}[0]
     FRPOP_DTYPE = dtype
     
     def task(self):
@@ -355,14 +349,18 @@ else
   exit 1
 fi
 
+echo "HERE"
+
 CMD_LINE_FLAGS="--fdl_config=${CONFIG} \
     --job_log_dir=${OUTPUT} \
     --alsologtostderr \
     --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \
     --fdl.ICI_MESH_SHAPE=${ICI} \
     --fdl.DCN_MESH_SHAPE=${DCN} \
-    $ADDITIONAL_ARGS \
-    $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)"
+    $ADDITIONAL_ARGS"
+if [[ $MULTIPROCESS != 0 ]]; then
+  CMD_LINE_FLAGS+=" --multiprocess_gpu"
+fi
 
 if [[ ${EVALUATE} -ne 0 ]]; then
 
diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 8193abbcf..8996ea8fb 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -471,18 +471,22 @@ jobs:
             ICI: "[8, 1, 1]"
             DCN: "[1, 1, 1]"
             TASKS: 1
+            ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP
             ICI: "[4, 1, 2]"
             DCN: "[1, 1, 1]"
             TASKS: 8
+            ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP1TP2PP
             ICI: "[4, 2, 1, 1]"
             DCN: "[1, 1, 1, 1]"
             TASKS: 8
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: 4DP1FSDP2TP2PP
             ICI: "[2, 2, 1, 2]"
             DCN: "[2, 1, 1, 1]"
             TASKS: 16
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -568,6 +572,7 @@ jobs:
               --ici "\"${{ matrix.ICI }}\"" \
               --dcn "\"${{ matrix.DCN }}\"" \
               --nodes ${{ steps.meta.outputs.NODES }} \
+              ${{ matrix.ADDITIONAL_ARGS }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF
           )
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index d69dcf5c2..dd1d301d8 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -199,7 +199,7 @@ jobs:
             ICI: "[4, 2, 1, 1]"
             DCN: "[1, 1, 1, 1]"
             TASKS: 8
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: 4DP1FSDP2TP1PP
             ICI: "[4, 2, 1]"
             DCN: "[1, 1, 1]"
@@ -214,7 +214,7 @@ jobs:
             ICI: "[1, 1, 2, 4]"
             DCN: "[2, 1, 1, 1]"
             TASKS: 16
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: LLaMA_eval
             ICI: "[1, 8, 1]"
             DCN: "[1, 1, 1]"

From ad61b488e8595db774505aa8c89c16e0476e7824 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Fri, 31 May 2024 16:32:29 -0700
Subject: [PATCH 14/20] attempt to fix ici and dcn mesh shape formatting

---
 .github/workflows/_test_pax_rosetta.yaml  | 20 ++++++++++----------
 .github/workflows/_test_upstream_pax.yaml | 12 ++++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 8996ea8fb..a6e934935 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -122,8 +122,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te
           EOF
@@ -357,8 +357,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
               --steps 300 \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te \
               --additional-args "--fdl.PACKED_INPUT=False"  \
@@ -569,8 +569,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               ${{ matrix.ADDITIONAL_ARGS }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
@@ -763,8 +763,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-dropout \
               --enable-te \
@@ -956,8 +956,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes 1 \
               --enable-te
           EOF
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index dd1d301d8..78aaa586f 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -118,8 +118,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes ${{ steps.meta.outputs.NODES }}
           EOF
           )
@@ -307,8 +307,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \
               ${{ matrix.ADDITIONAL_ARGS }}
@@ -456,8 +456,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --ici "\"${{ matrix.ICI }}\"" \
-              --dcn "\"${{ matrix.DCN }}\"" \
+              --ici "${{ matrix.ICI }}" \
+              --dcn "${{ matrix.DCN }}" \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF

From a64bfad695e7aace6cf5ddbcec1f0c4355e8edd1 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Sat, 1 Jun 2024 12:51:23 -0700
Subject: [PATCH 15/20] fix mesh shapes

---
 .github/container/test-pax.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 3aa711df9..7b87394f7 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -355,8 +355,8 @@ CMD_LINE_FLAGS="--fdl_config=${CONFIG} \
     --job_log_dir=${OUTPUT} \
     --alsologtostderr \
     --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \
-    --fdl.ICI_MESH_SHAPE=${ICI} \
-    --fdl.DCN_MESH_SHAPE=${DCN} \
+    --fdl.ICI_MESH_SHAPE=\"${ICI}\" \
+    --fdl.DCN_MESH_SHAPE=\"${DCN}\" \
     $ADDITIONAL_ARGS"
 if [[ $MULTIPROCESS != 0 ]]; then
   CMD_LINE_FLAGS+=" --multiprocess_gpu"

From 5101dbb61f0e89aff0bafbccf558584f84640692 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Sun, 2 Jun 2024 20:48:27 -0700
Subject: [PATCH 16/20] ici/dcn syntax fix

---
 .github/workflows/_test_pax_rosetta.yaml  | 64 +++++++++++------------
 .github/workflows/_test_upstream_pax.yaml | 48 ++++++++---------
 2 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index a6e934935..d3d167045 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -35,11 +35,11 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_single_process_TE
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
           - TEST_NAME: 1DP2FSDP4TP1PP_single_process_TE
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -228,50 +228,50 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP_TE
-            ICI: "[1, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,1,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
             TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP_TE
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
             ADDITIONAL_ARGS: ""
             BATCH_SIZE: 4
             TASKS: 8
           - TEST_NAME: 1DP8FSDP1TP1PP_TE
-            ICI: "[1, 8, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP_TE
-            ICI: "[4, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[4,1,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP_TE
-            ICI: "[8, 1, 1]"
-            DCN: "[2, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[2,1,1]"
             BATCH_SIZE: 4
             TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 5B_fused_attn_1
-            ICI: "[1, 8, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 2
             TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B --enable-fused-attn"
           - TEST_NAME: 5B_fused_attn_0
-            ICI: "[1, 8, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 2
             TASKS: 8
             ADDITIONAL_ARGS: "--model-type 5B"
           - TEST_NAME: LLaMA_eval_TE
-            ICI: "[1, 8, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             BATCH_SIZE: 4
             TASKS: 8
             EVALUATE: true
@@ -468,23 +468,23 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
             TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP
-            ICI: "[4, 1, 2]"
-            DCN: "[1, 1, 1]"
+            ICI: "[4,1,2]"
+            DCN: "[1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP1TP2PP
-            ICI: "[4, 2, 1, 1]"
-            DCN: "[1, 1, 1, 1]"
+            ICI: "[4,2,1,1]"
+            DCN: "[1,1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: 4DP1FSDP2TP2PP
-            ICI: "[2, 2, 1, 2]"
-            DCN: "[2, 1, 1, 1]"
+            ICI: "[2,2,1,2]"
+            DCN: "[2,1,1,1]"
             TASKS: 16
             ADDITIONAL_ARGS: "--enable-pipeline-parallel"
       fail-fast: false
@@ -678,8 +678,8 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP_TE_dropout
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
             TASKS: 8
       fail-fast: false
 
@@ -873,8 +873,8 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_eval_TE
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 78aaa586f..e8357cdf1 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -35,11 +35,11 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_single_process
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
           - TEST_NAME: 1DP2FSDP4TP1PP_single_process
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -181,49 +181,49 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP
-            ICI: "[1, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,1,1]"
+            DCN: "[1,1,1]"
             TASKS: 1
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 8DP1FSDP1TP1PP
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 1DP8FSDP1TP1PP
-            ICI: "[1, 8, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP1TP4PP
-            ICI: "[4, 2, 1, 1]"
-            DCN: "[1, 1, 1, 1]"
+            ICI: "[4,2,1,1]"
+            DCN: "[1,1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: 4DP1FSDP2TP1PP
-            ICI: "[4, 2, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[4,2,1]"
+            DCN: "[1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 16DP1FSDP1TP1PP
-            ICI: "[8, 1, 1]"
-            DCN: "[2, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[2,1,1]"
             TASKS: 16
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 2DP1FSDP2TP4PP
-            ICI: "[1, 1, 2, 4]"
-            DCN: "[2, 1, 1, 1]"
+            ICI: "[1,1,2,4]"
+            DCN: "[2,1,1,1]"
             TASKS: 16
             ADDITIONAL_ARGS: "--enable-pipeline-parallel"
           - TEST_NAME: LLaMA_eval
-            ICI: "[1, 8, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[1,8,1]"
+            DCN: "[1,1,1]"
             TASKS: 16
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
           - TEST_NAME: Grok
-            ICI: "[1, 1, 8, 1]"
-            DCN: "[1, 2, 1, 1]"
+            ICI: "[1,1,8,1]"
+            DCN: "[1,2,1,1]"
             TASKS: 16
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type GrokProxy"
@@ -374,8 +374,8 @@ jobs:
       matrix:
         include:
           - TEST_NAME: 8DP1FSDP1TP1PP_eval
-            ICI: "[8, 1, 1]"
-            DCN: "[1, 1, 1]"
+            ICI: "[8,1,1]"
+            DCN: "[1,1,1]"
       fail-fast: false
 
     runs-on: ubuntu-22.04

From 9e3b1aad4c2be5aa8c99d7118a91e6725b164675 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Mon, 3 Jun 2024 11:19:21 -0700
Subject: [PATCH 17/20] fix mesh shapes

---
 .github/workflows/_test_pax_rosetta.yaml  | 20 ++++++++++----------
 .github/workflows/_test_upstream_pax.yaml | 12 ++++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index d3d167045..005ba0a90 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -122,8 +122,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te
           EOF
@@ -357,8 +357,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
               --steps 300 \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-te \
               --additional-args "--fdl.PACKED_INPUT=False"  \
@@ -569,8 +569,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               ${{ matrix.ADDITIONAL_ARGS }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
@@ -763,8 +763,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 300 \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               --enable-dropout \
               --enable-te \
@@ -956,8 +956,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes 1 \
               --enable-te
           EOF
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index e8357cdf1..2967edab3 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -118,8 +118,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }}
           EOF
           )
@@ -307,8 +307,8 @@ jobs:
               --dtype bfloat16 \
               --batch-per-gpu 4 \
               --steps 500 \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \
               ${{ matrix.ADDITIONAL_ARGS }}
@@ -456,8 +456,8 @@ jobs:
               --batch-per-gpu 4 \
               --steps 500 \
               --evaluate \
-              --ici "${{ matrix.ICI }}" \
-              --dcn "${{ matrix.DCN }}" \
+              --ici ${{ matrix.ICI }} \
+              --dcn ${{ matrix.DCN }} \
               --nodes ${{ steps.meta.outputs.NODES }} \
               $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess)
           EOF

From 99e8ca6ee45e72214c64758d1ee62f140a8d3721 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Mon, 3 Jun 2024 13:45:40 -0700
Subject: [PATCH 18/20] fix: drop extra quoting of ICI/DCN mesh shapes in test-pax.sh

---
 .github/container/test-pax.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 7b87394f7..724501b03 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -106,11 +106,11 @@ while [ : ]; do
             shift 2
             ;;
         --ici)
-            ICI="$2"
+            ICI=$2
             shift 2
             ;;
         --dcn)
-            DCN="$2"
+            DCN=$2
             shift 2
             ;;
         -n | --nodes)
@@ -355,8 +355,8 @@ CMD_LINE_FLAGS="--fdl_config=${CONFIG} \
     --job_log_dir=${OUTPUT} \
     --alsologtostderr \
     --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \
-    --fdl.ICI_MESH_SHAPE=\"${ICI}\" \
-    --fdl.DCN_MESH_SHAPE=\"${DCN}\" \
+    --fdl.ICI_MESH_SHAPE=${ICI} \
+    --fdl.DCN_MESH_SHAPE=${DCN} \
     $ADDITIONAL_ARGS"
 if [[ $MULTIPROCESS != 0 ]]; then
   CMD_LINE_FLAGS+=" --multiprocess_gpu"

From d8ea091b6f0a8c221ba258654c71b093a70a91d6 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 4 Jun 2024 10:01:57 -0700
Subject: [PATCH 19/20] fix ICI mesh shape, TASKS counts, and test names in pax workflows

---
 .github/workflows/_test_pax_rosetta.yaml  | 8 ++++----
 .github/workflows/_test_upstream_pax.yaml | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 005ba0a90..fef938315 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -246,7 +246,7 @@ jobs:
             TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP_TE
-            ICI: "[4,1,1]"
+            ICI: "[4,1,2]"
             DCN: "[1,1,1]"
             BATCH_SIZE: 4
             TASKS: 8
@@ -470,19 +470,19 @@ jobs:
           - TEST_NAME: 8DP1FSDP1TP1PP
             ICI: "[8,1,1]"
             DCN: "[1,1,1]"
-            TASKS: 1
+            TASKS: 8
             ADDITIONAL_ARGS: ""
           - TEST_NAME: 4DP1FSDP2TP1PP
             ICI: "[4,1,2]"
             DCN: "[1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: ""
-          - TEST_NAME: 4DP1FSDP1TP2PP
+          - TEST_NAME: 2DP1FSDP1TP4PP
             ICI: "[4,2,1,1]"
             DCN: "[1,1,1,1]"
             TASKS: 8
             ADDITIONAL_ARGS: "--enable-pipeline-parallel"
-          - TEST_NAME: 4DP1FSDP2TP2PP
+          - TEST_NAME: 2DP1FSDP2TP4PP
             ICI: "[2,2,1,2]"
             DCN: "[2,1,1,1]"
             TASKS: 16
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index 2967edab3..7e0f0e1c4 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -218,7 +218,7 @@ jobs:
           - TEST_NAME: LLaMA_eval
             ICI: "[1,8,1]"
             DCN: "[1,1,1]"
-            TASKS: 16
+            TASKS: 8
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
           - TEST_NAME: Grok

From 3b8a66c93b3e8db7482d8038e886bb801fe671fa Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Fri, 7 Jun 2024 09:26:04 -0700
Subject: [PATCH 20/20] reduce num_layers for grok

---
 .github/container/test-pax.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 724501b03..407e0b817 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -343,7 +343,7 @@ elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then
 ## hard-code ICI mesh shape for Grok
 elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then
   CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy
-  ADDITIONAL_ARGS+="--fdl.NUM_LAYERS=2"
+  ADDITIONAL_ARGS+=" --fdl.NUM_LAYERS=1"  # leading space keeps this flag separate from any caller-supplied --additional-args
 else
   echo "Unsupported model ${MODEL_TYPE}"
   exit 1