forked from cdt-data-science/cluster-scripts
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgpu-usage
executable file
·95 lines (78 loc) · 2.65 KB
/
gpu-usage
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/bin/bash
# Default settings; the option parsing further down may override each of them.
pretty=false
header=true
# Unless -n is given, work on every node name that sinfo reports.
nodes=$(sinfo --format="%N" --noheader)
#######################################
# Print the help text for this script.
# Globals:   none
# Arguments: none (the program name shown is taken from $0)
# Outputs:   usage line, option descriptions, a description of the four
#            output columns and worked examples, all on stdout
#######################################
print_usage () {
# The here-doc delimiter is unquoted on purpose so that $0 expands to the
# name the script was invoked as; a "$" followed by a space stays literal.
cat << EOM
Usage: $0 [-n charles[01-19]] [-h] [-p]
Get gpu usage for all, or specified nodes
Arguments:
-n (optional) specify nodes, either comma separated list, or summarised.
Mirrors slurm command -n usage in sinfo, and -w usage for squeue.
-h (optional) do not print header
-p (optional) make output pretty, ignores -h if specified
Output:
Outputs four columns summarising the current usage of GPUs on the cluster:
-- The 'in_use' column shows the number of GPUs that are currently allocated to a job.
-- The 'usable' column shows the number of GPUs that are not in an error state or otherwise unable to be
allocated to a job.
-- The 'total' column shows the total number of GPUs, including those that are currently unable to be allocated
to a job.
-- The 'free' column shows the number of free GPUs (GPUs not allocated to a job and not in an error state),
i.e. the difference between the 'usable' column and the 'in_use' column.
Examples:
# Get status of all nodes aggregated:
$ ./gpu-usage
> in_use,usable,total,free
> 18,34,55,16
# Suppress header:
$ ./gpu-usage -h
> 18,34,55,16
# Specify specific nodes in comma separated list:
$ ./gpu-usage -n charles01,charles03 -p
> in_use usable total free
> 1 2 4 1
# Same as above but with summarised node format:
$ ./gpu-usage -n charles[01,03] -p
> in_use usable total free
> 1 2 4 1
# Example with split list:
$ ./gpu-usage -p -n charles[01-06,11-19]
> in_use usable total free
> 14 26 47 12
EOM
}
# Parse command-line flags:
#   -n <nodes>  restrict the report to the given node list
#   -h          suppress the header line
#   -p          pretty-print the result
# Any other flag is an error: show the help text on stderr and exit non-zero
# so that callers and pipelines can tell the invocation was rejected.
while getopts 'n:hp' flag; do
  case "${flag}" in
    n) nodes="${OPTARG}" ;;
    h) header=false ;;
    p) pretty=true ;;
    *)
      # getopts has already written its own diagnostic for the bad flag.
      print_usage >&2
      exit 1
      ;;
  esac
done
# Pretty mode: re-run this script in its plain comma-separated form and let
# column(1) align the fields. -h is not forwarded, so the header is always
# shown in this mode.
if [ "$pretty" = true ]; then
  # A node list obtained from sinfo may span several lines (presumably one
  # hostlist per partition -- confirm on the target cluster); join them with
  # commas so the whole list travels as a single -n argument.
  node_list=${nodes//$'\n'/,}
  # Quoted so that a hostlist such as charles[01-19] is neither glob-expanded
  # against files in the current directory nor split into several words.
  "$0" -n "${node_list}" | column -t -s','
  exit 0
fi
#######################################
# Sum the GPU counts found in Slurm GRES strings.
# A count is a ":<digits>" group that is not immediately followed by a letter
# or underscore, so multi-digit counts ("gpu:16") are read in full and type
# names that merely start with a digit ("gpu:1080ti:2") are not counted.
# Parenthesised annotations such as "(S:0-1)" are removed first so that the
# numbers inside them are never added to the total.
# Arguments: none
# Inputs:    GRES strings on stdin, one per line; lines without a count
#            (column headers, "N/A", "(null)") contribute nothing
# Outputs:   the total on stdout, 0 when nothing matched
#######################################
sum_gpu_counts () {
  awk '
    {
      line = $0
      gsub(/\([^)]*\)/, "", line)
      while (match(line, /:[0-9]+/)) {
        follower = substr(line, RSTART + RLENGTH, 1)
        if (follower !~ /[A-Za-z_]/) {
          total += substr(line, RSTART + 1, RLENGTH - 1)
        }
        line = substr(line, RSTART + RLENGTH)
      }
    }
    END { print total + 0 }
  '
}

# A node list obtained from sinfo may span several lines (presumably one
# hostlist per partition -- confirm on the target cluster); join them with
# commas so the list is passed as one argument. It is quoted below so that a
# hostlist such as charles[01-19] is never glob-expanded or word-split.
node_list=${nodes//$'\n'/,}

# Count number of GPUs requested by running jobs
USED=$(squeue -t R -o %b -w "${node_list}" | sum_gpu_counts)

# Count number of GPUs in responding nodes that are idle, mix or alloc
USABLE=$(sinfo -t idle,mix,alloc -r -N -o %G -n "${node_list}" | sum_gpu_counts)

# Count number of GPUs in all nodes
TOTAL=$(sinfo -N -o %G -n "${node_list}" | sum_gpu_counts)

# Get number of free GPUs
FREE=$(( USABLE - USED ))

if [ "$header" = true ]; then
  echo "in_use,usable,total,free"
fi
echo "${USED},${USABLE},${TOTAL},${FREE}"