From 9d41b9f18c873ec8bfef7f5bc507ceeea3ba1091 Mon Sep 17 00:00:00 2001
From: "agustin-martin.picard"
Date: Mon, 24 Jun 2024 17:51:22 +0200
Subject: [PATCH] ablation: turn off DP as it was causing trouble during GPU sync

---
 ablation/perplexity.sh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ablation/perplexity.sh b/ablation/perplexity.sh
index 59135ab..ab666c5 100755
--- a/ablation/perplexity.sh
+++ b/ablation/perplexity.sh
@@ -14,9 +14,11 @@ fi
 MASTER_ADDR=127.0.0.1 #$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
-GPUS_PER_NODE=2
+GPUS_PER_NODE=1
 NNODES=$SLURM_NNODES
 
+WOLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
+
 # ----- data
 TOKENIZER_PATH=/datasets/lucie_tokens_65k_grouped/tokenizer
 TOKENS_DIR=/local_data/lucie_tokens_65k_grouped
 
@@ -60,7 +62,8 @@ cat < $config_json
 }
 EOT
 
-CHECKPOINT_PATH=/home/lucas.hervier/Lucie-Training/results/checkpoints/test0
+CHECKPOINT_PATH=/home/agustin-martin.picard/Lucie-Training/results/checkpoints/gyoza-85M
+PERPLEXITY_RESULTS_PATH=/home/agustin-martin.picard/Lucie-Training/results/perplexity_results/gyoza-85M
 
 # ------ Optimizer
 TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
@@ -142,7 +145,7 @@ torchrun $DISTRIBUTED_ARGS \
     --tokenizer-name-or-path $TOKENIZER_PATH \
     --distributed-backend nccl \
     --load $CHECKPOINT_PATH \
-    --load-iteration 3000 \
+    --load-iteration 10000 \
     --inference \
     --finetune \
     $DEEPSPEED_ARGS \
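
For context, a minimal sketch of how the launch variables touched by this patch are typically consumed further down perplexity.sh. Only these hunks are shown, so the DISTRIBUTED_ARGS layout and torchrun flags below are assumptions based on standard torchrun usage, not the script's exact contents. With GPUS_PER_NODE=1 on a single node the computed world size is 1, so only a single rank is launched, i.e. no data parallelism during the perplexity run.

    #!/bin/bash
    # Sketch only (assumed layout): build torchrun's rendezvous arguments from
    # the variables set at the top of the script.
    GPUS_PER_NODE=1
    NNODES=${SLURM_NNODES:-1}
    MASTER_ADDR=127.0.0.1
    MASTER_PORT=6000
    # The patch names this variable WOLD_SIZE; WORLD_SIZE is the conventional spelling.
    WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))

    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

    echo "world size: $WORLD_SIZE"
    # The actual launch in perplexity.sh then has the form:
    #   torchrun $DISTRIBUTED_ARGS <training_script.py> ... --distributed-backend nccl ...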