From 1bb2d16d39e4f8e2c7bdcf10cf674e0432f31b83 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 24 May 2024 11:20:33 -0700 Subject: [PATCH 01/20] add upstream pax 2-node Grok test --- .github/container/test-pax.sh | 4 ++++ .github/workflows/_test_upstream_pax.yaml | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 91cb926b4..a11c62edf 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -382,6 +382,10 @@ elif [[ ${MODEL_TYPE} == "5B" ]]; then elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then CONFIG=ci_configs.LLaMA70BSyntheticSmall ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}" +## hard-code ICI mesh shape for Grok +elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then + CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy + ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1,1] --fdl.ICI_MESH_SHAPE=[1,1,8,1] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} --fdl.NUM_LAYERS=2" else echo "Unsupported model ${MODEL_TYPE}" exit 1 diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index c07749b12..423a9f5de 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -210,6 +210,11 @@ jobs: BATCH_SIZE: 4 EVALUATE: true ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" + - TEST_NAME: Grok + PARALLEL_CONFIG: [1, 2, 8, 1] ## note: only used to compute num nodes + BATCH_SIZE: 4 + EVALUATE: true + ADDITIONAL_ARGS: "--model-type GrokProxy" fail-fast: false runs-on: ubuntu-22.04 From 17a7eb46cff8f54c63266b3e52d287c729e14278 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 29 May 2024 16:57:49 -0700 Subject: [PATCH 02/20] update pax tests to pass in ICI and DCN mesh shapes explicitly --- .github/container/test-pax.sh | 99 +++++------------- .github/workflows/_test_pax_rosetta.yaml | 122 +++++++++++++--------- .github/workflows/_test_upstream_pax.yaml | 87 ++++++++------- 3 files changed, 149 insertions(+), 159 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index a11c62edf..d4ad9b069 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -22,11 +22,9 @@ usage() { echo " --evaluate Whether to test evaluation rather than training." echo " -s, --steps Number of steps to run, defaults to 500." echo " --multiprocess Enable the multiprocess GPU mode." + echo " --ici ICI mesh shape." + echo " --dcn DCN mesh shape." echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified." - echo " --data-parallel Data parallelism to use. Defaults to 1." - echo " --fsdp Fully-sharded data parallelism to use. Defaults to 1." - echo " --tensor-parallel Tensor parallelism to use. Defaults to 1." - echo " --pipeline-parallel Pipeline parallelism to use. Defaults to 1 for no pipelining." echo " -n, --nodes Number of nodes." echo " -h, --help Print usage." exit $1 @@ -44,10 +42,8 @@ OUTPUT=$(mktemp -d) BATCH_PER_GPU=4 DTYPE="bfloat16" STEPS=500 -DP=1 -FSDP=1 -TP=1 -PP=1 +ICI="[1,1,1]" +DCN="[1,1,1]" NODES=1 ENABLE_TE=0 MODEL_TYPE=126M @@ -103,20 +99,12 @@ while [ : ]; do OUTPUT=$2 shift 2 ;; - --data-parallel) - DP="$2" + --ici) + ICI="$2" shift 2 ;; - --fsdp) - FSDP="$2" - shift 2 - ;; - --tensor-parallel) - TP="$2" - shift 2 - ;; - --pipeline-parallel) - PP="$2" + --dcn) + DCN="$2" shift 2 ;; -n | --nodes) @@ -152,10 +140,8 @@ print_var ENABLE_TE print_var NVTE_FUSED_ATTN print_var EVALUATE print_var DROPOUT -print_var DP -print_var FSDP -print_var TP -print_var PP +print_var ICI +print_var DCN PAXML_DIR=$(dirname `python -c 'import paxml; print(*paxml.__path__)'`) pushd ${PAXML_DIR} @@ -174,16 +160,7 @@ from paxml.tasks.lm.params.lm_cloud import SyntheticDataset from praxis import base_layer from praxis import layers -dp = ${DP} -fsdp = ${FSDP} -tp = ${TP} -pp = ${PP} -num_gpus = ${NGPUS} -percore_batch_size = ${BATCH_PER_GPU} -dtype = "${DTYPE}" -dropout = float(${DROPOUT}) - -assert num_gpus == dp*fsdp*tp*pp, f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {dp*fsdp*tp*pp}' +assert num_gpus == np.prod(${ICI}) * np.prod(${DCN}), f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {np.prod(${ICI}) * np.prod(${DCN})}' ## heuristics to get ici and dcn mesh shapes. ## these heuristics only support one parallel strategy across nodes @@ -301,11 +278,6 @@ class LLaMA70BSyntheticSmall(BaseLLaMA, SyntheticDataset): USE_MQA = True NUM_KV_HEADS = 8 - PERCORE_BATCH_SIZE = 4 - - ICI_MESH_SHAPE = [1, 8, 1] - DCN_MESH_SHAPE = [1, 1, 1] - def task(self): task_p = super().task() task_p.train.always_use_train_for_model_init=False @@ -317,11 +289,8 @@ if pp > 1: @experiment_registry.register class Synthetic126MCI(GPT126MPP, SyntheticDataset): - ICI_MESH_SHAPE = [pp, dp, fsdp, tp] - DCN_MESH_SHAPE = [dcn_pp, dcn_dp, dcn_fsdp, 1] MICROBATCH_SIZE = 2 NUM_STAGES = pp - PERCORE_BATCH_SIZE = percore_batch_size FRPOP_DTYPE = dtype def task(self): @@ -334,9 +303,6 @@ else: @experiment_registry.register class Synthetic126MCI(Synthetic126M): - ICI_MESH_SHAPE = [dp, fsdp, tp] - DCN_MESH_SHAPE = [dcn_dp, dcn_fsdp, 1] - PERCORE_BATCH_SIZE = percore_batch_size FRPOP_DTYPE = dtype DROPOUT_PROB = dropout @@ -378,52 +344,41 @@ if [[ ${MODEL_TYPE} == "126M" ]]; then CONFIG=ci_configs.Synthetic126MCI elif [[ ${MODEL_TYPE} == "5B" ]]; then CONFIG=paxml.contrib.gpu.scripts_gpu.configs.Synthetic5B - ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}" elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then CONFIG=ci_configs.LLaMA70BSyntheticSmall - ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1] --fdl.ICI_MESH_SHAPE=[${DP},${FSDP},${TP}] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU}" ## hard-code ICI mesh shape for Grok elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy - ADDITIONAL_ARGS="--fdl.DCN_MESH_SHAPE=[1,${NODES},1,1] --fdl.ICI_MESH_SHAPE=[1,1,8,1] ${ADDITIONAL_ARGS} --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} --fdl.NUM_LAYERS=2" + ADDITIONAL_ARGS="--fdl.NUM_LAYERS=2" else echo "Unsupported model ${MODEL_TYPE}" exit 1 fi +CMD_LINE_FLAGS = "--fdl_config=${CONFIG} \ + --fdl.MAX_STEPS=0 \ + --job_log_dir=${OUTPUT} \ + --alsologtostderr \ + --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \ + --fdl.ICI_MESH_SHAPE=${ICI} \ + --fdl.DCN_MESH_SHAPE=${DCN} \ + $ADDITIONAL_ARGS \ + $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)" + if [[ ${EVALUATE} -ne 0 ]]; then trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT ## train for 0 steps to generate an initial checkpoint - python -m paxml.main \ - --fdl_config=${CONFIG} \ - --fdl.MAX_STEPS=0 \ - --job_log_dir=${OUTPUT} \ - --alsologtostderr \ - $ADDITIONAL_ARGS \ - $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) + python -m paxml.main ${CMD_LINE_FLAGS} ## restore from initial checkpoint for eval - python -m paxml.main \ - --fdl_config=${CONFIG} \ - --job_log_dir=${OUTPUT} \ - --mode='eval' \ - --fdl.MAX_STEPS=0 \ - --alsologtostderr \ - --enable_checkpoint_saving=False \ - $ADDITIONAL_ARGS \ - $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) + python -m paxml.main ${CMD_LINE_FLAGS} \ + --enable_checkpoint_saving=False else - python -m paxml.main \ - --fdl_config=${CONFIG} \ - --job_log_dir=${OUTPUT} \ - --alsologtostderr \ - --fdl.MAX_STEPS=${STEPS} \ - --enable_checkpoint_saving=False \ - $ADDITIONAL_ARGS \ - $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) + python -m paxml.main ${CMD_LINE_FLAGS} \ + --enable_checkpoint_saving=False fi set +x diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 7b80bfa60..8a05f6a11 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -33,9 +33,13 @@ jobs: single-process-multi-device-te: strategy: matrix: - PARALLEL_CONFIG: - - [1, 8, 1, 1] - - [1, 1, 2, 4] + include: + - TEST_NAME: 8DP1FSDP1TP1PP_single_process_TE + ICI: [8, 1, 1] + DCN: [1, 1, 1] + - TEST_NAME: 1DP2FSDP4TP1PP_single_process_TE + ICI: [8, 1, 1] + DCN: [1, 1, 1] fail-fast: false runs-on: ubuntu-22.04 @@ -70,7 +74,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process_TE + TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} MAX_GPUS_PER_NODE=8 GPUS_PER_NODE=8 NODES=1 @@ -118,10 +122,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --ici ${{ matrix.ICI }} + --dcn ${{ matrix.DCN }} --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te EOF @@ -226,36 +228,52 @@ jobs: matrix: include: - TEST_NAME: 1DP1FSDP1TP1PP_TE - PARALLEL_CONFIG: [1, 1, 1, 1] + ICI: [1, 1, 1] + DCN: [1, 1, 1] BATCH_SIZE: 4 + TASKS: 1 ADDITIONAL_ARGS: "" - TEST_NAME: 8DP1FSDP1TP1PP_TE - PARALLEL_CONFIG: [1, 8, 1, 1] + ICI: [8, 1, 1] + DCN: [1, 1, 1] ADDITIONAL_ARGS: "" BATCH_SIZE: 4 + TASKS: 8 - TEST_NAME: 1DP8FSDP1TP1PP_TE - PARALLEL_CONFIG: [1, 1, 8, 1] + ICI: [1, 8, 1] + DCN: [1, 1, 1] BATCH_SIZE: 4 + TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP_TE - PARALLEL_CONFIG: [1, 4, 1, 2] + ICI: [4, 1, 1] + DCN: [1, 1, 1] BATCH_SIZE: 4 + TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 16DP1FSDP1TP1PP_TE - PARALLEL_CONFIG: [1, 16, 1, 1] + ICI: [8, 1, 1] + DCN: [2, 1, 1] BATCH_SIZE: 4 + TASKS: 16 ADDITIONAL_ARGS: "" - TEST_NAME: 5B_fused_attn_1 - PARALLEL_CONFIG: [1, 1, 8, 1] + ICI: [1, 8, 1] + DCN: [1, 1, 1] BATCH_SIZE: 2 + TASKS: 8 ADDITIONAL_ARGS: "--model-type 5B --enable-fused-attn" - TEST_NAME: 5B_fused_attn_0 - PARALLEL_CONFIG: [1, 1, 8, 1] + ICI: [1, 8, 1] + DCN: [1, 1, 1] BATCH_SIZE: 2 + TASKS: 8 ADDITIONAL_ARGS: "--model-type 5B" - TEST_NAME: LLaMA_eval_TE - PARALLEL_CONFIG: [1, 1, 8, 1] + ICI: [1, 8, 1] + DCN: [1, 1, 1] BATCH_SIZE: 4 + TASKS: 8 EVALUATE: true ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" fail-fast: false @@ -274,7 +292,6 @@ jobs: - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v4 - - name: Setup SSH known hosts id: ssh-known-hosts run: | @@ -292,7 +309,7 @@ jobs: cd $GITHUB_WORKSPACE IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=${{ matrix.TEST_NAME }} - TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }})) + TOTAL_TASKS=${{ matrix.TASKS }} MAX_GPUS_PER_NODE=8 NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) @@ -340,10 +357,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu ${{ matrix.BATCH_SIZE }} \ --steps 300 \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matric.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te \ --additional-args "--fdl.PACKED_INPUT=False" \ @@ -451,11 +466,23 @@ jobs: rosetta-pax-multi-node: strategy: matrix: - PARALLEL_CONFIG: - - [1, 8, 1, 1] - - [1, 4, 1, 2] - - [4, 2, 1, 1] - - [4, 2, 1, 2] + include: + - TEST_NAME: 8DP1FSDP1TP1PP + ICI: [8, 1, 1] + DCN: [1, 1, 1] + TASKS: 1 + - TEST_NAME: 4DP1FSDP2TP1PP + ICI: [4, 1, 2] + DCN: [1, 1, 1] + TASKS: 8 + - TEST_NAME: 4DP1FSDP1TP2PP + ICI: [4, 2, 1, 1] + DCN: [1, 1, 1, 1] + TASKS: 8 + - TEST_NAME: 4DP1FSDP2TP2PP + ICI: [2, 2, 1, 2] + DCN: [2, 1, 1, 1] + TASKS: 16 fail-fast: false runs-on: ubuntu-22.04 @@ -488,8 +515,8 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP - TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }})) + TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} + TOTAL_TASKS=${{ matrix.TASKS] }} MAX_GPUS_PER_NODE=8 NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) @@ -538,10 +565,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF @@ -646,8 +671,11 @@ jobs: rosetta-pax-single-node-dropout-te: strategy: matrix: - PARALLEL_CONFIG: - - [1, 8, 1, 1] + include: + - TEST_NAME: 8DP_TE_dropout + ICI: [8, 1, 1] + DCN: [1, 1, 1] + TASKS: 8 fail-fast: false runs-on: ubuntu-22.04 @@ -681,8 +709,8 @@ jobs: run: | cd $GITHUB_WORKSPACE IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP_TE_dropout - TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }})) + TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} + TOTAL_TASKS=${{ matrix.TASKS }} MAX_GPUS_PER_NODE=8 NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) @@ -730,10 +758,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-dropout \ --enable-te \ @@ -840,8 +866,10 @@ jobs: single-process-evaluation-te: strategy: matrix: - PARALLEL_CONFIG: - - [1, 8, 1, 1] + include: + - TEST_NAME: 8DP1FSDP1TP1PP_eval_TE + ICI: [8, 1, 1] + DCN: [1, 1, 1] fail-fast: false runs-on: ubuntu-22.04 @@ -874,7 +902,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_eval_TE + TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} MAX_GPUS_PER_NODE=8 GPUS_PER_NODE=8 @@ -923,10 +951,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes 1 \ --enable-te EOF diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 423a9f5de..e05341647 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -33,9 +33,13 @@ jobs: single-process-multi-device: strategy: matrix: - PARALLEL_CONFIG: - - [1, 8, 1, 1] - - [1, 1, 2, 4] + include: + - TEST_NAME: 8DP1FSDP1TP1PP_single_process + ICI: [8, 1, 1] + DCN: [1, 1, 1] + - TEST_NAME: 1DP2FSDP4TP1PP_single_process + ICI: [8, 1, 1] + DCN: [1, 1, 1] fail-fast: false runs-on: ubuntu-22.04 @@ -67,7 +71,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process + TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} MAX_GPUS_PER_NODE=8 NODES=1 GPUS_PER_NODE=8 @@ -114,10 +118,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} EOF ) @@ -177,42 +179,51 @@ jobs: pax-multi-node: strategy: matrix: - include: - TEST_NAME: 1DP1FSDP1TP1PP - PARALLEL_CONFIG: [1, 1, 1, 1] - BATCH_SIZE: 4 + ICI: [1, 1, 1] + DCN: [1, 1, 1] + TASKS: 1 ADDITIONAL_ARGS: "" - TEST_NAME: 8DP1FSDP1TP1PP - PARALLEL_CONFIG: [1, 8, 1, 1] - BATCH_SIZE: 4 + ICI: [8, 1, 1] + DCN: [1, 1, 1] + TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 1DP8FSDP1TP1PP - PARALLEL_CONFIG: [1, 1, 8, 1] - BATCH_SIZE: 4 + ICI: [1, 8, 1] + DCN: [1, 1, 1] + TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 2DP1FSDP1TP4PP - PARALLEL_CONFIG: [4, 2, 1, 1] - BATCH_SIZE: 4 + ICI: [4, 2, 1, 1] + DCN: [1, 1, 1, 1] + TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP - PARALLEL_CONFIG: [1, 4, 1, 2] - BATCH_SIZE: 4 + ICI: [4, 2, 1] + DCN: [1, 1, 1] + TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 16DP1FSDP1TP1PP - PARALLEL_CONFIG: [1, 16, 1, 1] - BATCH_SIZE: 4 + ICI: [8, 1, 1] + DCN: [2, 1, 1] + TASKS: 16 ADDITIONAL_ARGS: "" - TEST_NAME: 2DP1FSDP2TP4PP - PARALLEL_CONFIG: [4, 2, 1, 2] - BATCH_SIZE: 4 + ICI: [1, 1, 2, 4] + DCN: [2, 1, 1, 1] + TASKS: 16 + ADDITIONAL_ARGS: "" - TEST_NAME: LLaMA_eval - PARALLEL_CONFIG: [1, 1, 8, 1] - BATCH_SIZE: 4 + ICI: [1, 8, 1] + DCN: [1, 1, 1] + TASKS: 16 EVALUATE: true ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" - TEST_NAME: Grok - PARALLEL_CONFIG: [1, 2, 8, 1] ## note: only used to compute num nodes - BATCH_SIZE: 4 + ICI: [1, 1, 8, 1] + DCN: [1, 2, 1, 1] + TASKS: 16 EVALUATE: true ADDITIONAL_ARGS: "--model-type GrokProxy" fail-fast: false @@ -247,7 +258,7 @@ jobs: run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=${{ matrix.TEST_NAME }} - TOTAL_TASKS=$((${{ matrix.PARALLEL_CONFIG[0] }} * ${{ matrix.PARALLEL_CONFIG[1] }} * ${{ matrix.PARALLEL_CONFIG[2] }} * ${{ matrix.PARALLEL_CONFIG[3] }})) + TOTAL_TASKS=${{ matrix.TASKS }} MAX_GPUS_PER_NODE=8 NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) @@ -295,10 +306,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \ ${{ matrix.ADDITIONAL_ARGS }} @@ -362,8 +371,10 @@ jobs: single-process-evaluation: strategy: matrix: - PARALLEL_CONFIG: - - [1, 8, 1, 1] + include: + - TEST_NAME: 8DP1FSDP1TP1PP_eval + ICI: [8, 1, 1] + DCN: [1, 1, 1] fail-fast: false runs-on: ubuntu-22.04 @@ -395,7 +406,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_eval + TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}PP TOTAL_TASKS=1 NODES=1 GPUS_PER_NODE=8 @@ -444,10 +455,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ - --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ - --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ - --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF From 374a6b01d228dba633ff1e335c135edf3ba13e99 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 29 May 2024 17:01:41 -0700 Subject: [PATCH 03/20] fixes --- .github/container/test-pax.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index d4ad9b069..3efd01c4d 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -30,7 +30,7 @@ usage() { exit $1 } -args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@") +args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,ici:,dcn:,nodes: -- "$@") if [[ $? -ne 0 ]]; then exit $1 fi @@ -349,7 +349,7 @@ elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then ## hard-code ICI mesh shape for Grok elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy - ADDITIONAL_ARGS="--fdl.NUM_LAYERS=2" + ADDITIONAL_ARGS+="--fdl.NUM_LAYERS=2" else echo "Unsupported model ${MODEL_TYPE}" exit 1 From 2b87465106b6db6120d77409692774ec6bb5fbcc Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 29 May 2024 17:06:28 -0700 Subject: [PATCH 04/20] fix --- .github/workflows/_test_upstream_pax.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index e05341647..ac6519ffa 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -179,6 +179,7 @@ jobs: pax-multi-node: strategy: matrix: + include: - TEST_NAME: 1DP1FSDP1TP1PP ICI: [1, 1, 1] DCN: [1, 1, 1] From 67dbc3728773380a5ad0571634a7c3faab3f9713 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 29 May 2024 17:07:41 -0700 Subject: [PATCH 05/20] fix --- .github/workflows/_test_pax_rosetta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 8a05f6a11..85d7cdc4c 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -358,7 +358,7 @@ jobs: --batch-per-gpu ${{ matrix.BATCH_SIZE }} \ --steps 300 \ --ici ${{ matrix.ICI }} \ - --dcn ${{ matric.DCN }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te \ --additional-args "--fdl.PACKED_INPUT=False" \ From 86f043a096140abdd3b63969279e30f8ac1f106b Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 29 May 2024 17:08:39 -0700 Subject: [PATCH 06/20] fix --- .github/workflows/_test_pax_rosetta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 85d7cdc4c..be30c22b6 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -516,7 +516,7 @@ jobs: run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} - TOTAL_TASKS=${{ matrix.TASKS] }} + TOTAL_TASKS=${{ matrix.TASKS }} MAX_GPUS_PER_NODE=8 NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) From 0ac6db74ca9017ab3f17a7af4e595d103d8c914c Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 29 May 2024 22:38:05 -0700 Subject: [PATCH 07/20] fix syntax --- .github/container/test-pax.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 3efd01c4d..fe8f460f4 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -355,7 +355,7 @@ else exit 1 fi -CMD_LINE_FLAGS = "--fdl_config=${CONFIG} \ +CMD_LINE_FLAGS="--fdl_config=${CONFIG} \ --fdl.MAX_STEPS=0 \ --job_log_dir=${OUTPUT} \ --alsologtostderr \ From e6c9aebdf64101ef56df6bf22d95371c9ab12bf7 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 May 2024 09:14:00 -0700 Subject: [PATCH 08/20] fix ICI and DCN mesh shapes --- .github/workflows/_test_pax_rosetta.yaml | 20 ++++++++++---------- .github/workflows/_test_upstream_pax.yaml | 12 ++++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index be30c22b6..1f1bef8af 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -122,8 +122,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici ${{ matrix.ICI }} - --dcn ${{ matrix.DCN }} + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te EOF @@ -357,8 +357,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu ${{ matrix.BATCH_SIZE }} \ --steps 300 \ - --ici ${{ matrix.ICI }} \ - --dcn ${{ matrix.DCN }} \ + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te \ --additional-args "--fdl.PACKED_INPUT=False" \ @@ -565,8 +565,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --ici ${{ matrix.ICI }} \ - --dcn ${{ matrix.DCN }} \ + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF @@ -758,8 +758,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --ici ${{ matrix.ICI }} \ - --dcn ${{ matrix.DCN }} \ + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-dropout \ --enable-te \ @@ -951,8 +951,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --ici ${{ matrix.ICI }} \ - --dcn ${{ matrix.DCN }} \ + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes 1 \ --enable-te EOF diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index ac6519ffa..4fe2d107d 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -118,8 +118,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici ${{ matrix.ICI }} \ - --dcn ${{ matrix.DCN }} \ + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} EOF ) @@ -307,8 +307,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici ${{ matrix.ICI }} \ - --dcn ${{ matrix.DCN }} \ + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \ ${{ matrix.ADDITIONAL_ARGS }} @@ -456,8 +456,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --ici ${{ matrix.ICI }} \ - --dcn ${{ matrix.DCN }} \ + --ici "\"${{ matrix.ICI }}\"" \ + --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF From f226c03c3c665d0a791c3dc27bf4d44ab65e86e5 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 May 2024 17:04:36 -0700 Subject: [PATCH 09/20] more fixes --- .github/workflows/_test_pax_rosetta.yaml | 72 +++++++++++------------ .github/workflows/_test_upstream_pax.yaml | 52 ++++++++-------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 1f1bef8af..8193abbcf 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -35,11 +35,11 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_single_process_TE - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" - TEST_NAME: 1DP2FSDP4TP1PP_single_process_TE - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" fail-fast: false runs-on: ubuntu-22.04 @@ -74,7 +74,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} + TEST_CASE_NAME=${{ matrix.TEST_NAME }} MAX_GPUS_PER_NODE=8 GPUS_PER_NODE=8 NODES=1 @@ -228,50 +228,50 @@ jobs: matrix: include: - TEST_NAME: 1DP1FSDP1TP1PP_TE - ICI: [1, 1, 1] - DCN: [1, 1, 1] + ICI: "[1, 1, 1]" + DCN: "[1, 1, 1]" BATCH_SIZE: 4 TASKS: 1 ADDITIONAL_ARGS: "" - TEST_NAME: 8DP1FSDP1TP1PP_TE - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" ADDITIONAL_ARGS: "" BATCH_SIZE: 4 TASKS: 8 - TEST_NAME: 1DP8FSDP1TP1PP_TE - ICI: [1, 8, 1] - DCN: [1, 1, 1] + ICI: "[1, 8, 1]" + DCN: "[1, 1, 1]" BATCH_SIZE: 4 TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP_TE - ICI: [4, 1, 1] - DCN: [1, 1, 1] + ICI: "[4, 1, 1]" + DCN: "[1, 1, 1]" BATCH_SIZE: 4 TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 16DP1FSDP1TP1PP_TE - ICI: [8, 1, 1] - DCN: [2, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[2, 1, 1]" BATCH_SIZE: 4 TASKS: 16 ADDITIONAL_ARGS: "" - TEST_NAME: 5B_fused_attn_1 - ICI: [1, 8, 1] - DCN: [1, 1, 1] + ICI: "[1, 8, 1]" + DCN: "[1, 1, 1]" BATCH_SIZE: 2 TASKS: 8 ADDITIONAL_ARGS: "--model-type 5B --enable-fused-attn" - TEST_NAME: 5B_fused_attn_0 - ICI: [1, 8, 1] - DCN: [1, 1, 1] + ICI: "[1, 8, 1]" + DCN: "[1, 1, 1]" BATCH_SIZE: 2 TASKS: 8 ADDITIONAL_ARGS: "--model-type 5B" - TEST_NAME: LLaMA_eval_TE - ICI: [1, 8, 1] - DCN: [1, 1, 1] + ICI: "[1, 8, 1]" + DCN: "[1, 1, 1]" BATCH_SIZE: 4 TASKS: 8 EVALUATE: true @@ -468,20 +468,20 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" TASKS: 1 - TEST_NAME: 4DP1FSDP2TP1PP - ICI: [4, 1, 2] - DCN: [1, 1, 1] + ICI: "[4, 1, 2]" + DCN: "[1, 1, 1]" TASKS: 8 - TEST_NAME: 4DP1FSDP1TP2PP - ICI: [4, 2, 1, 1] - DCN: [1, 1, 1, 1] + ICI: "[4, 2, 1, 1]" + DCN: "[1, 1, 1, 1]" TASKS: 8 - TEST_NAME: 4DP1FSDP2TP2PP - ICI: [2, 2, 1, 2] - DCN: [2, 1, 1, 1] + ICI: "[2, 2, 1, 2]" + DCN: "[2, 1, 1, 1]" TASKS: 16 fail-fast: false @@ -515,7 +515,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} + TEST_CASE_NAME=${{ matrix.TEST_NAME }} TOTAL_TASKS=${{ matrix.TASKS }} MAX_GPUS_PER_NODE=8 NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) @@ -673,8 +673,8 @@ jobs: matrix: include: - TEST_NAME: 8DP_TE_dropout - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" TASKS: 8 fail-fast: false @@ -709,7 +709,7 @@ jobs: run: | cd $GITHUB_WORKSPACE IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} + TEST_CASE_NAME=${{ matrix.TEST_NAME }} TOTAL_TASKS=${{ matrix.TASKS }} MAX_GPUS_PER_NODE=8 NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) @@ -868,8 +868,8 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_eval_TE - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" fail-fast: false runs-on: ubuntu-22.04 @@ -902,7 +902,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} + TEST_CASE_NAME=${{ matrix.TEST_NAME }} MAX_GPUS_PER_NODE=8 GPUS_PER_NODE=8 diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 4fe2d107d..d69dcf5c2 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -35,11 +35,11 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_single_process - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" - TEST_NAME: 1DP2FSDP4TP1PP_single_process - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" fail-fast: false runs-on: ubuntu-22.04 @@ -71,7 +71,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }} + TEST_CASE_NAME=${{ matrix.TEST_NAME }} MAX_GPUS_PER_NODE=8 NODES=1 GPUS_PER_NODE=8 @@ -181,49 +181,49 @@ jobs: matrix: include: - TEST_NAME: 1DP1FSDP1TP1PP - ICI: [1, 1, 1] - DCN: [1, 1, 1] + ICI: "[1, 1, 1]" + DCN: "[1, 1, 1]" TASKS: 1 ADDITIONAL_ARGS: "" - TEST_NAME: 8DP1FSDP1TP1PP - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 1DP8FSDP1TP1PP - ICI: [1, 8, 1] - DCN: [1, 1, 1] + ICI: "[1, 8, 1]" + DCN: "[1, 1, 1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 2DP1FSDP1TP4PP - ICI: [4, 2, 1, 1] - DCN: [1, 1, 1, 1] + ICI: "[4, 2, 1, 1]" + DCN: "[1, 1, 1, 1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP - ICI: [4, 2, 1] - DCN: [1, 1, 1] + ICI: "[4, 2, 1]" + DCN: "[1, 1, 1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 16DP1FSDP1TP1PP - ICI: [8, 1, 1] - DCN: [2, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[2, 1, 1]" TASKS: 16 ADDITIONAL_ARGS: "" - TEST_NAME: 2DP1FSDP2TP4PP - ICI: [1, 1, 2, 4] - DCN: [2, 1, 1, 1] + ICI: "[1, 1, 2, 4]" + DCN: "[2, 1, 1, 1]" TASKS: 16 ADDITIONAL_ARGS: "" - TEST_NAME: LLaMA_eval - ICI: [1, 8, 1] - DCN: [1, 1, 1] + ICI: "[1, 8, 1]" + DCN: "[1, 1, 1]" TASKS: 16 EVALUATE: true ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" - TEST_NAME: Grok - ICI: [1, 1, 8, 1] - DCN: [1, 2, 1, 1] + ICI: "[1, 1, 8, 1]" + DCN: "[1, 2, 1, 1]" TASKS: 16 EVALUATE: true ADDITIONAL_ARGS: "--model-type GrokProxy" @@ -374,8 +374,8 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_eval - ICI: [8, 1, 1] - DCN: [1, 1, 1] + ICI: "[8, 1, 1]" + DCN: "[1, 1, 1]" fail-fast: false runs-on: ubuntu-22.04 @@ -407,7 +407,7 @@ jobs: shell: bash -x -e {0} run: | IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_CASE_NAME }}PP + TEST_CASE_NAME=${{ matrix.TEST_NAME }} TOTAL_TASKS=1 NODES=1 GPUS_PER_NODE=8 From e5bd8511fbc239d0f3b135b8422677e8da212312 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 May 2024 22:12:04 -0700 Subject: [PATCH 10/20] fix max_steps --- .github/container/test-pax.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index fe8f460f4..d49b640fe 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -356,7 +356,6 @@ else fi CMD_LINE_FLAGS="--fdl_config=${CONFIG} \ - --fdl.MAX_STEPS=0 \ --job_log_dir=${OUTPUT} \ --alsologtostderr \ --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \ @@ -370,15 +369,13 @@ if [[ ${EVALUATE} -ne 0 ]]; then trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT ## train for 0 steps to generate an initial checkpoint - python -m paxml.main ${CMD_LINE_FLAGS} + python -m paxml.main ${CMD_LINE_FLAGS} --fdl.MAX_STEPS=0 ## restore from initial checkpoint for eval - python -m paxml.main ${CMD_LINE_FLAGS} \ - --enable_checkpoint_saving=False + python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False else - python -m paxml.main ${CMD_LINE_FLAGS} \ - --enable_checkpoint_saving=False + python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False fi set +x From 817e3512637be80d5a1b6ad62621b81043aab418 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 May 2024 22:13:44 -0700 Subject: [PATCH 11/20] fix --- .github/container/test-pax.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index d49b640fe..a1ca25f35 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -372,7 +372,7 @@ if [[ ${EVALUATE} -ne 0 ]]; then python -m paxml.main ${CMD_LINE_FLAGS} --fdl.MAX_STEPS=0 ## restore from initial checkpoint for eval - python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False + python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=0 else python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False From e98cc98a500410e9c751365e40e915ec860f259f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 May 2024 22:16:01 -0700 Subject: [PATCH 12/20] fix command line args --- .github/container/test-pax.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index a1ca25f35..8ac2ac79f 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -372,10 +372,10 @@ if [[ ${EVALUATE} -ne 0 ]]; then python -m paxml.main ${CMD_LINE_FLAGS} --fdl.MAX_STEPS=0 ## restore from initial checkpoint for eval - python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=0 + python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=0 --mode='eval' else - python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False + python -m paxml.main ${CMD_LINE_FLAGS} --enable_checkpoint_saving=False --fdl.MAX_STEPS=${STEPS} fi set +x From c53fab31f6acd30f83180e94583c4cee30f4b834 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 31 May 2024 09:10:01 -0700 Subject: [PATCH 13/20] cleanup, bug fixes --- .github/container/test-pax.sh | 44 +++++++++++------------ .github/workflows/_test_pax_rosetta.yaml | 5 +++ .github/workflows/_test_upstream_pax.yaml | 4 +-- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 8ac2ac79f..3aa711df9 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -24,13 +24,14 @@ usage() { echo " --multiprocess Enable the multiprocess GPU mode." echo " --ici ICI mesh shape." echo " --dcn DCN mesh shape." + echo " --enable-pipeline-parallel Whether to use pipeline parallelism." echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified." echo " -n, --nodes Number of nodes." echo " -h, --help Print usage." exit $1 } -args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,ici:,dcn:,nodes: -- "$@") +args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,enable-fused-attn,enable-pipeline-parallel,model-type:,evaluate,steps:,help,multiprocess,output:,ici:,dcn:,nodes: -- "$@") if [[ $? -ne 0 ]]; then exit $1 fi @@ -48,6 +49,7 @@ NODES=1 ENABLE_TE=0 MODEL_TYPE=126M NVTE_FUSED_ATTN=0 +ENABLE_PP=0 DROPOUT=0 EVALUATE=0 ADDITIONAL_ARGS="" @@ -79,6 +81,10 @@ while [ : ]; do NVTE_FUSED_ATTN=1 shift 1 ;; + --enable-pipeline-parallel) + ENABLE_PP=1 + shift 1 + ;; --model-type) MODEL_TYPE=$2 shift 2 @@ -149,6 +155,7 @@ pushd ${PAXML_DIR} ## Create configs file cat > ci_configs.py < 1: - if dp % dcn_factor == 0: - dcn_dp = dcn_factor - dp = int(dp / dcn_factor) - elif fsdp % dcn_factor == 0: - dcn_fsdp = dcn_factor - fsdp = int(fsdp / dcn_factor) - elif pp % dcn_factor == 0: - dcn_pp = dcn_factor - pp = int(pp / dcn_factor) +assert num_gpus == np.prod(${ICI}) * np.prod(${DCN}), f'product of parallel strategies should equal number of available gpus. Have {num_gpus} gpus, but product of parallel strategies is {np.prod(${ICI}) * np.prod(${DCN})}' WeightInit = base_layer.WeightInit @@ -285,12 +279,12 @@ class LLaMA70BSyntheticSmall(BaseLLaMA, SyntheticDataset): return task_p -if pp > 1: +if pp == 1: @experiment_registry.register class Synthetic126MCI(GPT126MPP, SyntheticDataset): MICROBATCH_SIZE = 2 - NUM_STAGES = pp + NUM_STAGES = ${ICI}[0] FRPOP_DTYPE = dtype def task(self): @@ -355,14 +349,18 @@ else exit 1 fi +echo "HERE" + CMD_LINE_FLAGS="--fdl_config=${CONFIG} \ --job_log_dir=${OUTPUT} \ --alsologtostderr \ --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \ --fdl.ICI_MESH_SHAPE=${ICI} \ --fdl.DCN_MESH_SHAPE=${DCN} \ - $ADDITIONAL_ARGS \ - $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)" + $ADDITIONAL_ARGS" +if [[ $MULTIPROCESS != 0 ]]; then + CMD_LINE_FLAGS+=" --multiprocess_gpu" +fi if [[ ${EVALUATE} -ne 0 ]]; then diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 8193abbcf..8996ea8fb 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -471,18 +471,22 @@ jobs: ICI: "[8, 1, 1]" DCN: "[1, 1, 1]" TASKS: 1 + ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP ICI: "[4, 1, 2]" DCN: "[1, 1, 1]" TASKS: 8 + ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP1TP2PP ICI: "[4, 2, 1, 1]" DCN: "[1, 1, 1, 1]" TASKS: 8 + ADDITIONAL_ARGS: "--enable-pipeline-parallel" - TEST_NAME: 4DP1FSDP2TP2PP ICI: "[2, 2, 1, 2]" DCN: "[2, 1, 1, 1]" TASKS: 16 + ADDITIONAL_ARGS: "--enable-pipeline-parallel" fail-fast: false runs-on: ubuntu-22.04 @@ -568,6 +572,7 @@ jobs: --ici "\"${{ matrix.ICI }}\"" \ --dcn "\"${{ matrix.DCN }}\"" \ --nodes ${{ steps.meta.outputs.NODES }} \ + ${{ matrix.ADDITIONAL_ARGS }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF ) diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index d69dcf5c2..dd1d301d8 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -199,7 +199,7 @@ jobs: ICI: "[4, 2, 1, 1]" DCN: "[1, 1, 1, 1]" TASKS: 8 - ADDITIONAL_ARGS: "" + ADDITIONAL_ARGS: "--enable-pipeline-parallel" - TEST_NAME: 4DP1FSDP2TP1PP ICI: "[4, 2, 1]" DCN: "[1, 1, 1]" @@ -214,7 +214,7 @@ jobs: ICI: "[1, 1, 2, 4]" DCN: "[2, 1, 1, 1]" TASKS: 16 - ADDITIONAL_ARGS: "" + ADDITIONAL_ARGS: "--enable-pipeline-parallel" - TEST_NAME: LLaMA_eval ICI: "[1, 8, 1]" DCN: "[1, 1, 1]" From ad61b488e8595db774505aa8c89c16e0476e7824 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 31 May 2024 16:32:29 -0700 Subject: [PATCH 14/20] attempt tp fix ici and dcn mesh shape formatting --- .github/workflows/_test_pax_rosetta.yaml | 20 ++++++++++---------- .github/workflows/_test_upstream_pax.yaml | 12 ++++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 8996ea8fb..a6e934935 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -122,8 +122,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te EOF @@ -357,8 +357,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu ${{ matrix.BATCH_SIZE }} \ --steps 300 \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te \ --additional-args "--fdl.PACKED_INPUT=False" \ @@ -569,8 +569,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes ${{ steps.meta.outputs.NODES }} \ ${{ matrix.ADDITIONAL_ARGS }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) @@ -763,8 +763,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-dropout \ --enable-te \ @@ -956,8 +956,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes 1 \ --enable-te EOF diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index dd1d301d8..78aaa586f 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -118,8 +118,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes ${{ steps.meta.outputs.NODES }} EOF ) @@ -307,8 +307,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \ ${{ matrix.ADDITIONAL_ARGS }} @@ -456,8 +456,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --ici "\"${{ matrix.ICI }}\"" \ - --dcn "\"${{ matrix.DCN }}\"" \ + --ici "${{ matrix.ICI }}" \ + --dcn "${{ matrix.DCN }}" \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF From a64bfad695e7aace6cf5ddbcec1f0c4355e8edd1 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Sat, 1 Jun 2024 12:51:23 -0700 Subject: [PATCH 15/20] fix mesh shapes --- .github/container/test-pax.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 3aa711df9..7b87394f7 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -355,8 +355,8 @@ CMD_LINE_FLAGS="--fdl_config=${CONFIG} \ --job_log_dir=${OUTPUT} \ --alsologtostderr \ --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \ - --fdl.ICI_MESH_SHAPE=${ICI} \ - --fdl.DCN_MESH_SHAPE=${DCN} \ + --fdl.ICI_MESH_SHAPE=\"${ICI}\" \ + --fdl.DCN_MESH_SHAPE=\"${DCN}\" \ $ADDITIONAL_ARGS" if [[ $MULTIPROCESS != 0 ]]; then CMD_LINE_FLAGS+=" --multiprocess_gpu" From 5101dbb61f0e89aff0bafbccf558584f84640692 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Sun, 2 Jun 2024 20:48:27 -0700 Subject: [PATCH 16/20] ici/dcn syntax fix --- .github/workflows/_test_pax_rosetta.yaml | 64 +++++++++++------------ .github/workflows/_test_upstream_pax.yaml | 48 ++++++++--------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index a6e934935..d3d167045 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -35,11 +35,11 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_single_process_TE - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" - TEST_NAME: 1DP2FSDP4TP1PP_single_process_TE - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" fail-fast: false runs-on: ubuntu-22.04 @@ -228,50 +228,50 @@ jobs: matrix: include: - TEST_NAME: 1DP1FSDP1TP1PP_TE - ICI: "[1, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,1,1]" + DCN: "[1,1,1]" BATCH_SIZE: 4 TASKS: 1 ADDITIONAL_ARGS: "" - TEST_NAME: 8DP1FSDP1TP1PP_TE - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" ADDITIONAL_ARGS: "" BATCH_SIZE: 4 TASKS: 8 - TEST_NAME: 1DP8FSDP1TP1PP_TE - ICI: "[1, 8, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,8,1]" + DCN: "[1,1,1]" BATCH_SIZE: 4 TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP_TE - ICI: "[4, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[4,1,1]" + DCN: "[1,1,1]" BATCH_SIZE: 4 TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 16DP1FSDP1TP1PP_TE - ICI: "[8, 1, 1]" - DCN: "[2, 1, 1]" + ICI: "[8,1,1]" + DCN: "[2,1,1]" BATCH_SIZE: 4 TASKS: 16 ADDITIONAL_ARGS: "" - TEST_NAME: 5B_fused_attn_1 - ICI: "[1, 8, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,8,1]" + DCN: "[1,1,1]" BATCH_SIZE: 2 TASKS: 8 ADDITIONAL_ARGS: "--model-type 5B --enable-fused-attn" - TEST_NAME: 5B_fused_attn_0 - ICI: "[1, 8, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,8,1]" + DCN: "[1,1,1]" BATCH_SIZE: 2 TASKS: 8 ADDITIONAL_ARGS: "--model-type 5B" - TEST_NAME: LLaMA_eval_TE - ICI: "[1, 8, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,8,1]" + DCN: "[1,1,1]" BATCH_SIZE: 4 TASKS: 8 EVALUATE: true @@ -468,23 +468,23 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" TASKS: 1 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP - ICI: "[4, 1, 2]" - DCN: "[1, 1, 1]" + ICI: "[4,1,2]" + DCN: "[1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP1TP2PP - ICI: "[4, 2, 1, 1]" - DCN: "[1, 1, 1, 1]" + ICI: "[4,2,1,1]" + DCN: "[1,1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "--enable-pipeline-parallel" - TEST_NAME: 4DP1FSDP2TP2PP - ICI: "[2, 2, 1, 2]" - DCN: "[2, 1, 1, 1]" + ICI: "[2,2,1,2]" + DCN: "[2,1,1,1]" TASKS: 16 ADDITIONAL_ARGS: "--enable-pipeline-parallel" fail-fast: false @@ -678,8 +678,8 @@ jobs: matrix: include: - TEST_NAME: 8DP_TE_dropout - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" TASKS: 8 fail-fast: false @@ -873,8 +873,8 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_eval_TE - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" fail-fast: false runs-on: ubuntu-22.04 diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 78aaa586f..e8357cdf1 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -35,11 +35,11 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_single_process - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" - TEST_NAME: 1DP2FSDP4TP1PP_single_process - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" fail-fast: false runs-on: ubuntu-22.04 @@ -181,49 +181,49 @@ jobs: matrix: include: - TEST_NAME: 1DP1FSDP1TP1PP - ICI: "[1, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,1,1]" + DCN: "[1,1,1]" TASKS: 1 ADDITIONAL_ARGS: "" - TEST_NAME: 8DP1FSDP1TP1PP - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 1DP8FSDP1TP1PP - ICI: "[1, 8, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,8,1]" + DCN: "[1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 2DP1FSDP1TP4PP - ICI: "[4, 2, 1, 1]" - DCN: "[1, 1, 1, 1]" + ICI: "[4,2,1,1]" + DCN: "[1,1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "--enable-pipeline-parallel" - TEST_NAME: 4DP1FSDP2TP1PP - ICI: "[4, 2, 1]" - DCN: "[1, 1, 1]" + ICI: "[4,2,1]" + DCN: "[1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 16DP1FSDP1TP1PP - ICI: "[8, 1, 1]" - DCN: "[2, 1, 1]" + ICI: "[8,1,1]" + DCN: "[2,1,1]" TASKS: 16 ADDITIONAL_ARGS: "" - TEST_NAME: 2DP1FSDP2TP4PP - ICI: "[1, 1, 2, 4]" - DCN: "[2, 1, 1, 1]" + ICI: "[1,1,2,4]" + DCN: "[2,1,1,1]" TASKS: 16 ADDITIONAL_ARGS: "--enable-pipeline-parallel" - TEST_NAME: LLaMA_eval - ICI: "[1, 8, 1]" - DCN: "[1, 1, 1]" + ICI: "[1,8,1]" + DCN: "[1,1,1]" TASKS: 16 EVALUATE: true ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" - TEST_NAME: Grok - ICI: "[1, 1, 8, 1]" - DCN: "[1, 2, 1, 1]" + ICI: "[1,1,8,1]" + DCN: "[1,2,1,1]" TASKS: 16 EVALUATE: true ADDITIONAL_ARGS: "--model-type GrokProxy" @@ -374,8 +374,8 @@ jobs: matrix: include: - TEST_NAME: 8DP1FSDP1TP1PP_eval - ICI: "[8, 1, 1]" - DCN: "[1, 1, 1]" + ICI: "[8,1,1]" + DCN: "[1,1,1]" fail-fast: false runs-on: ubuntu-22.04 From 9e3b1aad4c2be5aa8c99d7118a91e6725b164675 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 3 Jun 2024 11:19:21 -0700 Subject: [PATCH 17/20] fix mesh shapes --- .github/workflows/_test_pax_rosetta.yaml | 20 ++++++++++---------- .github/workflows/_test_upstream_pax.yaml | 12 ++++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index d3d167045..005ba0a90 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -122,8 +122,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te EOF @@ -357,8 +357,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu ${{ matrix.BATCH_SIZE }} \ --steps 300 \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-te \ --additional-args "--fdl.PACKED_INPUT=False" \ @@ -569,8 +569,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ ${{ matrix.ADDITIONAL_ARGS }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) @@ -763,8 +763,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 300 \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ --enable-dropout \ --enable-te \ @@ -956,8 +956,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes 1 \ --enable-te EOF diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index e8357cdf1..2967edab3 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -118,8 +118,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} EOF ) @@ -307,8 +307,8 @@ jobs: --dtype bfloat16 \ --batch-per-gpu 4 \ --steps 500 \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) \ ${{ matrix.ADDITIONAL_ARGS }} @@ -456,8 +456,8 @@ jobs: --batch-per-gpu 4 \ --steps 500 \ --evaluate \ - --ici "${{ matrix.ICI }}" \ - --dcn "${{ matrix.DCN }}" \ + --ici ${{ matrix.ICI }} \ + --dcn ${{ matrix.DCN }} \ --nodes ${{ steps.meta.outputs.NODES }} \ $([[ ${{ steps.meta.outputs.TOTAL_TASKS }} > 1 ]] && echo --multiprocess) EOF From 99e8ca6ee45e72214c64758d1ee62f140a8d3721 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 3 Jun 2024 13:45:40 -0700 Subject: [PATCH 18/20] fix --- .github/container/test-pax.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 7b87394f7..724501b03 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -106,11 +106,11 @@ while [ : ]; do shift 2 ;; --ici) - ICI="$2" + ICI=$2 shift 2 ;; --dcn) - DCN="$2" + DCN=$2 shift 2 ;; -n | --nodes) @@ -355,8 +355,8 @@ CMD_LINE_FLAGS="--fdl_config=${CONFIG} \ --job_log_dir=${OUTPUT} \ --alsologtostderr \ --fdl.PERCORE_BATCH_SIZE=${BATCH_PER_GPU} \ - --fdl.ICI_MESH_SHAPE=\"${ICI}\" \ - --fdl.DCN_MESH_SHAPE=\"${DCN}\" \ + --fdl.ICI_MESH_SHAPE=${ICI} \ + --fdl.DCN_MESH_SHAPE=${DCN} \ $ADDITIONAL_ARGS" if [[ $MULTIPROCESS != 0 ]]; then CMD_LINE_FLAGS+=" --multiprocess_gpu" From d8ea091b6f0a8c221ba258654c71b093a70a91d6 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 4 Jun 2024 10:01:57 -0700 Subject: [PATCH 19/20] bug fixes --- .github/workflows/_test_pax_rosetta.yaml | 8 ++++---- .github/workflows/_test_upstream_pax.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 005ba0a90..fef938315 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -246,7 +246,7 @@ jobs: TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP_TE - ICI: "[4,1,1]" + ICI: "[4,1,2]" DCN: "[1,1,1]" BATCH_SIZE: 4 TASKS: 8 @@ -470,19 +470,19 @@ jobs: - TEST_NAME: 8DP1FSDP1TP1PP ICI: "[8,1,1]" DCN: "[1,1,1]" - TASKS: 1 + TASKS: 8 ADDITIONAL_ARGS: "" - TEST_NAME: 4DP1FSDP2TP1PP ICI: "[4,1,2]" DCN: "[1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "" - - TEST_NAME: 4DP1FSDP1TP2PP + - TEST_NAME: 2DP1FSDP1TP4PP ICI: "[4,2,1,1]" DCN: "[1,1,1,1]" TASKS: 8 ADDITIONAL_ARGS: "--enable-pipeline-parallel" - - TEST_NAME: 4DP1FSDP2TP2PP + - TEST_NAME: 2DP1FSDP2TP4PP ICI: "[2,2,1,2]" DCN: "[2,1,1,1]" TASKS: 16 diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml index 2967edab3..7e0f0e1c4 100644 --- a/.github/workflows/_test_upstream_pax.yaml +++ b/.github/workflows/_test_upstream_pax.yaml @@ -218,7 +218,7 @@ jobs: - TEST_NAME: LLaMA_eval ICI: "[1,8,1]" DCN: "[1,1,1]" - TASKS: 16 + TASKS: 8 EVALUATE: true ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate" - TEST_NAME: Grok From 3b8a66c93b3e8db7482d8038e886bb801fe671fa Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 7 Jun 2024 09:26:04 -0700 Subject: [PATCH 20/20] reduce num_layers for grok --- .github/container/test-pax.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh index 724501b03..407e0b817 100755 --- a/.github/container/test-pax.sh +++ b/.github/container/test-pax.sh @@ -343,7 +343,7 @@ elif [[ ${MODEL_TYPE} == "LLaMA70BProxy" ]]; then ## hard-code ICI mesh shape for Grok elif [[ ${MODEL_TYPE} == "GrokProxy" ]]; then CONFIG=paxml.tasks.lm.params.nvidia.Grok_Proxy - ADDITIONAL_ARGS+="--fdl.NUM_LAYERS=2" + ADDITIONAL_ARGS+="--fdl.NUM_LAYERS=1" else echo "Unsupported model ${MODEL_TYPE}" exit 1