From 9d41b9f18c873ec8bfef7f5bc507ceeea3ba1091 Mon Sep 17 00:00:00 2001
From: "agustin-martin.picard"
Date: Mon, 24 Jun 2024 17:51:22 +0200
Subject: [PATCH] ablation: turn off DP as it was causing trouble during GPU sync

---
 ablation/perplexity.sh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ablation/perplexity.sh b/ablation/perplexity.sh
index 59135ab..ab666c5 100755
--- a/ablation/perplexity.sh
+++ b/ablation/perplexity.sh
@@ -14,9 +14,11 @@ fi
 MASTER_ADDR=127.0.0.1 #$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
-GPUS_PER_NODE=2
+GPUS_PER_NODE=1
 NNODES=$SLURM_NNODES
 
+WOLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
+
 # ----- data
 TOKENIZER_PATH=/datasets/lucie_tokens_65k_grouped/tokenizer
 TOKENS_DIR=/local_data/lucie_tokens_65k_grouped
 
@@ -60,7 +62,8 @@ cat < $config_json
 }
 EOT
 
-CHECKPOINT_PATH=/home/lucas.hervier/Lucie-Training/results/checkpoints/test0
+CHECKPOINT_PATH=/home/agustin-martin.picard/Lucie-Training/results/checkpoints/gyoza-85M
+PERPLEXITY_RESULTS_PATH=/home/agustin-martin.picard/Lucie-Training/results/perplexity_results/gyoza-85M
 
 # ------ Optimizer
 TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
@@ -142,7 +145,7 @@ torchrun $DISTRIBUTED_ARGS \
     --tokenizer-name-or-path $TOKENIZER_PATH \
     --distributed-backend nccl \
     --load $CHECKPOINT_PATH \
-    --load-iteration 3000 \
+    --load-iteration 10000 \
     --inference \
     --finetune \
     $DEEPSPEED_ARGS \
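
For context, a minimal sketch of how the launch variables touched by this patch are typically consumed further down perplexity.sh. Only these hunks are shown, so the DISTRIBUTED_ARGS layout and torchrun flags below are assumptions based on standard torchrun usage, not the script's exact contents. With GPUS_PER_NODE=1 on a single node the computed world size is 1, so only a single rank is launched, i.e. no data parallelism during the perplexity run.

    #!/bin/bash
    # Sketch only (assumed layout): build torchrun's rendezvous arguments from
    # the variables set at the top of the script.
    GPUS_PER_NODE=1
    NNODES=${SLURM_NNODES:-1}
    MASTER_ADDR=127.0.0.1
    MASTER_PORT=6000
    # The patch names this variable WOLD_SIZE; WORLD_SIZE is the conventional spelling.
    WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))

    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

    echo "world size: $WORLD_SIZE"
    # The actual launch in perplexity.sh then has the form:
    #   torchrun $DISTRIBUTED_ARGS <training_script.py> ... --distributed-backend nccl ...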