ablation: turn off DP as it was causing trouble during GPU sync
Agustin-Picard committed Jun 24, 2024
1 parent b14bf0c commit 9d41b9f
Showing 1 changed file with 6 additions and 3 deletions.
ablation/perplexity.sh (9 changes: 6 additions & 3 deletions)
@@ -14,9 +14,11 @@ fi
 MASTER_ADDR=127.0.0.1 #$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
 
-GPUS_PER_NODE=2
+GPUS_PER_NODE=1
 NNODES=$SLURM_NNODES
 
+WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
+
 # ----- data
 TOKENIZER_PATH=/datasets/lucie_tokens_65k_grouped/tokenizer
 TOKENS_DIR=/local_data/lucie_tokens_65k_grouped
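
The launch arguments that consume these variables are defined outside this hunk; only the name DISTRIBUTED_ARGS is visible in the torchrun call further down. A sketch of a definition consistent with the variables above (the exact flags are an assumption, not shown in this diff):

# Assumed wiring, for illustration only. With GPUS_PER_NODE=1, torchrun
# starts one process per node, WORLD_SIZE reduces to $NNODES, and there is
# no intra-node data parallelism -- which is what this commit turns off.
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
                  --nnodes $NNODES \
                  --master_addr $MASTER_ADDR \
                  --master_port $MASTER_PORT"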
@@ -60,7 +62,8 @@ cat <<EOT > $config_json
 }
 EOT
 
-CHECKPOINT_PATH=/home/lucas.hervier/Lucie-Training/results/checkpoints/test0
+CHECKPOINT_PATH=/home/agustin-martin.picard/Lucie-Training/results/checkpoints/gyoza-85M
+PERPLEXITY_RESULTS_PATH=/home/agustin-martin.picard/Lucie-Training/results/perplexity_results/gyoza-85M
 
 # ------ Optimizer
 TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
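
PERPLEXITY_RESULTS_PATH is new in this commit, and the code that consumes it is not shown in the diff. If the script writes result files there, the directory presumably has to exist first; a minimal guard (hypothetical, not part of the commit) would be:

# Ensure the results directory exists before the run writes into it.
mkdir -p "$PERPLEXITY_RESULTS_PATH"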
@@ -142,7 +145,7 @@ torchrun $DISTRIBUTED_ARGS \
     --tokenizer-name-or-path $TOKENIZER_PATH \
     --distributed-backend nccl \
     --load $CHECKPOINT_PATH \
-    --load-iteration 3000 \
+    --load-iteration 10000 \
     --inference \
     --finetune \
     $DEEPSPEED_ARGS \
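
Since --load-iteration is bumped from 3000 to 10000, a pre-flight check can fail fast when the requested checkpoint is missing instead of erroring inside torchrun. A sketch, assuming a Megatron-style iter_XXXXXXX directory layout under $CHECKPOINT_PATH (the iter_%07d naming is an assumption, not something shown in this diff):

# Hypothetical guard: verify the requested iteration exists before launching.
ITER_DIR=$(printf '%s/iter_%07d' "$CHECKPOINT_PATH" 10000)
if [ ! -d "$ITER_DIR" ]; then
    echo "iteration 10000 not found under $CHECKPOINT_PATH" >&2
    exit 1
fi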
