#!/bin/bash
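# Fine-tunes the chinese-llama-plus-7b checkpoint (Megatron format) on Chinese Common Crawl
# text with Megatron-DeepSpeed: TP=2, PP=4, ZeRO stage 1, fp16, on a single node with 8 GPUs.
# The NV_LIBNCCL_* package variables below are cleared, presumably to neutralize the NCCL
# package pins inherited from the NVIDIA base container image.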
export NV_LIBNCCL_DEV_PACKAGE=
export NV_LIBNCCL_DEV_PACKAGE_VERSION=
export NV_LIBNCCL_DEV_PACKAGE_NAME=
export NV_LIBNCCL_PACKAGE=
export NV_LIBNCCL_PACKAGE_NAME=
export NV_LIBNCCL_PACKAGE_VERSION=
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
SLURM_NNODES=1
SLURM_PROCID=0
NNODES=1
GPUS_PER_NODE=8
SLURM_JOB_ID=23333
MASTER_ADDR=localhost
MASTER_PORT=46282
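# The values above hard-code a single-node, 8-GPU run with a local rendezvous endpoint.
# On a SLURM cluster, the commented block below derives MASTER_ADDR, MASTER_PORT, and the
# node/rank variables from the job environment instead.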
# export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# echo "MASTER_ADDR: " $MASTER_ADDR
# export MASTER_PORT=$(expr 45000 + $(echo -n $SLURM_JOBID | tail -c 4))
# echo "MASTER_PORT: " $MASTER_PORT
# GPUS_PER_NODE=4
# NNODES=$SLURM_NNODES
# echo "NNODES: " $NNODES
#export RANK=$SLURM_PROCID
#echo "RANK: " $RANK
#export LOCAL_RANK=$SLURM_LOCALID
#echo "LOCAL_RANK: " $LOCAL_RANK
#export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
#echo "WORLD_SIZE: " $WORLD_SIZE
# do not remove this workaround, or training will hang and nodes will be lost
export CUDA_LAUNCH_BLOCKING=1
# hide duplicated errors using this hack - will be properly fixed in pt-1.12
export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error-genggui001.json
# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# defining the right environment variables
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export PATH="/root/miniconda3/envs/torch1.13/bin:$PATH"
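# The offline flags make datasets/transformers resolve everything from the local cache
# (no Hub downloads); the PATH prepend selects the torch1.13 conda environment.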
START_DATE=$(date "+%Y_%m_%d_%H:%M:%S")
echo "START TIME: $START_DATE"
variant=main
LOAD_CHECKPOINT_PATH=/mnt/data/smart_health_02/xuekui/pretrain_weights/nlp/chinese-llama-plus-7b-megatron-states
DATA_OUTPUT_PATH=./model_dir/llama_7b_zh
CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints/$variant
REPO_PATH=$DATA_OUTPUT_PATH/experiment
TENSORBOARD_PATH=$REPO_PATH/tensorboard/$variant
LOGS_PATH=$REPO_PATH/logs/$variant
mkdir -p $LOGS_PATH
KILL_SWITCH_PATH=$REPO_PATH/kill-switch
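# KILL_SWITCH_PATH is defined but not passed to the trainer below; in the BigScience
# Megatron-DeepSpeed convention, creating a file at --kill-switch-path stops training cleanly.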
DATA_PATH="/mnt/data/smart_health_02/xuekui/dataset/pretrain_data/zhs/common_crawl_2019_30_tmp_text_document"
TOKENIZER_NAME_OR_PATH=$LOAD_CHECKPOINT_PATH/tokenizer
TP_SIZE=2
PP_SIZE=4
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128 # 128 samples * 2048 tokens = ~0.26M tokens per global batch
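# With TP_SIZE=2 * PP_SIZE=4 = 8, the 8 GPUs form a single model-parallel replica
# (data-parallel size 1), so DeepSpeed accumulates 128 / (1 * 1) = 128 micro-batches
# per optimizer step to reach the global batch size.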
NHIDDEN=4096
FFN_HIDDEN_SIZE=11008
NLAYERS=32
NHEADS=32
SEQ_LEN=2048
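# Hidden size 4096, 32 layers, 32 heads, FFN 11008, and sequence length 2048 match LLaMA-7B.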
SP=12 # loss-scale exponent: passed as --loss-scale and as initial_scale_power (2^12 = 4096) below
ADAPTER_SIZE=0 # adapters disabled; this variable is not used elsewhere in the script
SAVE_INTERVAL=256
TRAIN_SAMPLES=9_437_184 # ~19.3B tokens at seq-len 2048
LR_DECAY_SAMPLES=8_437_120 # decay over the first ~17.3B tokens, then continue at fixed --min-lr
LR_WARMUP_SAMPLES=64 # ~131K tokens
OPTIMIZER_ARGS=" \
--optimizer adam \
--optimizer-split-size 1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 6e-5 \
--min-lr 6e-6 \
--lr-decay-style cosine \
--lr-decay-samples $LR_DECAY_SAMPLES \
--lr-warmup-samples $LR_WARMUP_SAMPLES \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
# exit-duration reference values: 1190 min ~= 20h, 5990 min ~= 100h
# --exit-duration-in-mins 1190
EXIT_OPTS=" \
--exit-duration-in-mins 99999999 \
"
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--ffn-hidden-size $FFN_HIDDEN_SIZE \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--tokenizer-type PretrainedFromHF \
--vocab-file $TOKENIZER_NAME_OR_PATH \
--loss-scale $SP \
--init-method-std 0.0048 \
--fp16 \
--seed 42 \
--checkpoint-activations \
--pad-vocab-size-to 50176 \
$OPTIMIZER_ARGS \
$EXIT_OPTS \
"
# TODO: decide on efficient eval-interval + eval-iters
OUTPUT_ARGS=" \
--log-interval 1 \
--save-total-limit 2 \
--metric-for-best-model 'lm loss' \
--save-interval $SAVE_INTERVAL \
--eval-interval $SAVE_INTERVAL \
--eval-iters 32 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
ZERO_STAGE=1
config_json="./ds_config.$SLURM_JOB_ID.json"
# DeepSpeed derives the gradient-accumulation steps from the global batch size via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"checkpoint": {
"load_universal": false
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": $SP
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
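# ZeRO stage 1 shards only the optimizer states. "loss_scale": 0 enables dynamic fp16 loss
# scaling, starting at 2^initial_scale_power = 2^12 = 4096.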
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CMD=" \
pretrain_llama.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $LOAD_CHECKPOINT_PATH \
--finetune \
--data-path $DATA_PATH \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"
# --skip-train-iteration-range 22528-45056 \
# export CMD=" \
# finetune_dig_gpt.py \
# --tensor-model-parallel-size $TP_SIZE \
# --pipeline-model-parallel-size $PP_SIZE \
# $GPT_ARGS \
# $OUTPUT_ARGS \
# --save $CHECKPOINT_PATH \
# --load $CHECKPOINT_PATH \
# --pretrained-checkpoint $LOAD_CHECKPOINT_PATH \
# --data-path $DATA_PATH \
# --distributed-backend nccl \
# $DEEPSPEED_ARGS \
# "
# export CMD=" \
# finetune_dig_gpt.py \
# --tensor-model-parallel-size $TP_SIZE \
# --pipeline-model-parallel-size $PP_SIZE \
# $GPT_ARGS \
# $OUTPUT_ARGS \
# --save $CHECKPOINT_PATH \
# --load $CHECKPOINT_PATH \
# --data-path $DATA_PATH \
# --distributed-backend nccl \
# $DEEPSPEED_ARGS \
# "
# echo "$LAUNCHER --node_rank $SLURM_PROCID $CMD"
echo "$LAUNCHER --node_rank $SLURM_PROCID $CMD" > $LOGS_PATH/main_${SLURM_PROCID}.log
echo "-----------------------------------------------" > $LOGS_PATH/main_${SLURM_PROCID}.log
bash -c "$LAUNCHER --node_rank $SLURM_PROCID $CMD" 2>&1 | tee -a $LOGS_PATH/main_${SLURM_PROCID}.log