launch_ddp_4N4G.sh
#!/bin/bash
# SLURM script to launch a multi-node, multi-GPU PyTorch DDP training job
# on UF HiPerGator's AI partition
# 11/2022, Yunchao Yang, UF Research Computing
# The Python script is adapted from the PyTorch DDP tutorial.
#
# torchrun on multiple nodes with multiple GPUs:
#   set #SBATCH --ntasks           equal to --nodes (one torchrun launcher per node)
#   set #SBATCH --ntasks-per-node  to 1
#   set #SBATCH --gpus             to the total number of worker processes across all nodes
#   set #SBATCH --gpus-per-task    to --gpus divided by --ntasks
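#
# As a rough illustration (hypothetical numbers, not used by this script): a 2-node run
# with 8 GPUs per node would use --nodes=2, --ntasks=2, --ntasks-per-node=1, --gpus=16,
# --gpus-per-task=8, and torchrun --nproc_per_node 8.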
########################################################################################
# In this example, we request 4 nodes with 1 GPU per node, for a total of 4 GPUs.
# Resource allocation.
#SBATCH --wait-all-nodes=1
#SBATCH --nodes=4 # How many DGX nodes? Each has 8 A100 GPUs
#SBATCH --ntasks=4 # How many tasks? One per GPU
#SBATCH --ntasks-per-node=1 # 1 torchrun per node
#SBATCH --gpus-per-task=1            # GPUs per srun step task
#SBATCH --cpus-per-task=4            # CPU cores per task; match your number of dataloader workers
#SBATCH --mem=24gb # CPU memory per node--up to 2TB (Not GPU memory--that is 80GB per A100 GPU)
#SBATCH --partition=hpg-ai # Specify the HPG AI partition
# Enable the following to limit the allocation to a single SU.
#SBATCH --time=48:00:00
#SBATCH --output=%x.%j.out
module load pytorch/1.10
# or use a conda environment instead:
# module load conda
# conda activate your_env_name
nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
nodes_array=( "${nodes[@]}" )   # copy the full array; $nodes alone would yield only the first hostname
echo "Node list: ${nodes_array[@]}"
#head_node=${nodes_array[0]}
#head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
#echo Node IP: $head_node_ip
# If the srun-based head_node_ip lookup above errors out with hostname, use this simpler
# approach: the batch script itself runs on the first allocated node, so its IP is the head node's.
head_node_ip=$(hostname --ip-address)
echo "HeadNodeIP: $head_node_ip"
head_node_port=29500
export LOGLEVEL=INFO
# For NCCL debugging, uncomment the line below (use INFO for more verbose DDP debugging)
# export NCCL_DEBUG=WARN
pwd; hostname; date
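# Launch one torchrun per node via srun; torchrun spawns one worker process per GPU and
# uses the c10d rendezvous at head_node_ip:head_node_port to form the DDP process group.
# The two positional arguments follow the PyTorch DDP tutorial script (assumed here to be
# total_epochs=50 and save_every=10).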
srun --export=ALL torchrun \
--nnodes 4 \
--nproc_per_node 1 \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node_ip:$head_node_port \
multigpu_torchrun.py 50 10
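# Submit this script to the scheduler with, e.g.: sbatch launch_ddp_4N4G.sh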