Skip to content

Commit

Permalink
slurm 22.05 support and multiple-slurmd (#49)
Browse files Browse the repository at this point in the history
* Support Slurm 22.05.X

- also bump openssl, tini and python versions
- emulate nodes with more resources than physically available by
  default, which is nice for testing purposes.

* support multiple slurmd nodes properly

- have by default 3 nodes
- compile slurm with --enable-multiple-slurmd - previously without this
  you cannot have multiple nodes on a single host, it would throw a lot
  of error messages and executing srun inside a batch job didn't work

Also:
- remove the check_running_status function - when doing "supervisorctl
  start" it will print the state anyway, no need to continuously check
  for the state
- remove the check_port_status function - don't really see a proper use case
  here. If the services weren't able to bind to the port, they couldn't
  even start properly.
- remove the gres.conf - doesn't really work to properly "fake" having
  GPUs without a device file
  • Loading branch information
tazend authored Nov 4, 2023
1 parent 4f1bef5 commit 90fa6b8
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 63 deletions.
19 changes: 11 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,16 @@ RUN set -ex \
&& git config --global push.default simple

# Add Tini
ENV TINI_VERSION v0.18.0
ENV TINI_VERSION v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini.asc /tini.asc
RUN gpg --batch --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 595E85A6B1B4779EA4DAAEC70B588DFF0527A9B7 \
&& gpg --batch --verify /tini.asc /tini
RUN chmod +x /tini

# Install OpenSSL1.1.1
# See PEP 644: https://www.python.org/dev/peps/pep-0644/
ARG OPENSSL_VERSION="1.1.1l"
ARG OPENSSL_VERSION="1.1.1s"
RUN set -ex \
&& wget --quiet https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \
&& tar xzf openssl-${OPENSSL_VERSION}.tar.gz \
Expand All @@ -89,12 +92,12 @@ RUN set -ex \
&& echo "/opt/openssl/lib" >> /etc/ld.so.conf.d/openssl.conf \
&& ldconfig \
&& popd \
&& rm -rf openssl-${OPENSSL_VERSION}.tar.gz
&& rm -f openssl-${OPENSSL_VERSION}.tar.gz

# Install supported Python versions and install dependencies.
# Set the default global to the latest supported version.
# Use pyenv inside the container to switch between Python versions.
ARG PYTHON_VERSIONS="3.6.15 3.7.12 3.8.12 3.9.9 3.10.0"
ARG PYTHON_VERSIONS="3.6.15 3.7.16 3.8.16 3.9.16 3.10.9 3.11.1"
ARG CONFIGURE_OPTS="--with-openssl=/opt/openssl"
RUN set -ex \
&& curl https://pyenv.run | bash \
Expand All @@ -110,13 +113,13 @@ RUN set -ex \
done

# Compile, build and install Slurm from Git source
ARG SLURM_TAG=slurm-21-08-8-2
ARG SLURM_TAG=slurm-22-05-8-1
ARG JOBS=4
RUN set -ex \
&& git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
&& pushd slurm \
&& ./configure --prefix=/usr --sysconfdir=/etc/slurm --enable-slurmrestd \
--with-mysql_config=/usr/bin --libdir=/usr/lib64 \
--with-mysql_config=/usr/bin --libdir=/usr/lib64 --enable-multiple-slurmd \
&& sed -e 's|#!/usr/bin/env python3|#!/usr/bin/python|' -i doc/html/shtml2html.py \
&& make -j ${JOBS} install \
&& install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
Expand All @@ -139,9 +142,9 @@ RUN set -ex \
&& /sbin/create-munge-key

RUN dd if=/dev/random of=/etc/slurm/jwt_hs256.key bs=32 count=1 \
&& chmod 600 /etc/slurm/jwt_hs256.key && chown slurm.slurm /etc/slurm/jwt_hs256.key
&& chmod 600 /etc/slurm/jwt_hs256.key && chown slurm:slurm /etc/slurm/jwt_hs256.key

COPY --chown=slurm files/slurm/slurm.conf files/slurm/gres.conf files/slurm/slurmdbd.conf /etc/slurm/
COPY --chown=slurm files/slurm/slurm.conf files/slurm/slurmdbd.conf files/slurm/cgroup.conf /etc/slurm/
COPY files/supervisord.conf /etc/

RUN chmod 0600 /etc/slurm/slurmdbd.conf
Expand Down
50 changes: 9 additions & 41 deletions docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
#!/bin/bash
#
# Prologue: shell options and the port layout used by the rest of the script.
#
# Environment:
#   SLURM_NODE_COUNT - number of emulated slurmd nodes; defaults to 3 and is
#                      exported so child processes can read it.

# Abort on the first failing command, including a failing pipeline stage.
set -eo pipefail

export SLURM_NODE_COUNT=${SLURM_NODE_COUNT:-3}
SLURMD_PORT_BASE=7000

# One port per node: BASE .. BASE + COUNT - 1 (7000-7002 for the default of 3).
# mapfile produces a real array with one element per port; a plain
# `declare -a NAME=$(seq ...)` would store the whole newline-separated list in
# element 0 only.
declare -a SLURMD_PORT_RANGE=()
mapfile -t SLURMD_PORT_RANGE < <(seq "$SLURMD_PORT_BASE" "$((SLURMD_PORT_BASE + SLURM_NODE_COUNT - 1))")

# Fixed service ports followed by the per-node slurmd ports.
# NOTE(review): 6817 is SlurmctldPort; 6819/6820/3306 are presumably slurmdbd,
# slurmrestd and MySQL - confirm against the service configuration.
declare -a SLURM_PORTS=(6817 6819 6820 3306 "${SLURMD_PORT_RANGE[@]}")

function error_with_msg {
if [[ "$count" -eq 0 ]]
then
Expand All @@ -9,39 +16,6 @@ function error_with_msg {
fi
}

# Poll supervisor until the given program reports the RUNNING state.
#
# Arguments:
#   $1 - program name handed to `supervisorctl status`
# Globals:
#   count  (written) - attempt counter, 10 down to 0; deliberately not local,
#                      as in the original (error_with_msg reads "$count")
#   STATUS (written) - second column of the `supervisorctl status` output
# Outputs:
#   One "- <name> is in the <STATE> state." line on stdout per attempt.
# Notes:
#   Makes at most 11 attempts, one second apart, then gives up without
#   reporting an error.
function check_running_status {
  for count in {10..0}; do
    # With `set -eo pipefail` active, a non-zero exit from supervisorctl
    # (binary missing, or the program not running yet) would fail this
    # assignment and abort the whole script instead of retrying.
    STATUS=$(/usr/bin/supervisorctl status "$1" | awk '{print $2}') || true
    echo "- $1 is in the $STATUS state."
    if [[ "$STATUS" == "RUNNING" ]]; then
      break
    fi
    sleep 1
  done
}

# Wait until a TCP port on localhost accepts connections.
#
# Arguments:
#   $1 - TCP port number to probe on localhost
# Globals:
#   count (written) - attempt counter, 10 down to 0; deliberately not local,
#                     as in the original (error_with_msg reads "$count")
# Outputs:
#   "- Port <n> is listening" or "- Port <n> is not listening" on stdout,
#   one line per attempt.
# Notes:
#   Makes at most 11 attempts, one second apart, then gives up without
#   reporting an error.
function check_port_status {
  for count in {10..0}; do
    # Probe via bash's /dev/tcp redirection. The probe is the `if` condition
    # itself so that a refused connection is treated as "not yet" rather than
    # tripping `set -e` before the result can be inspected.
    if { echo >"/dev/tcp/localhost/$1"; } 2>/dev/null; then
      echo "- Port $1 is listening"
      break
    else
      echo "- Port $1 is not listening"
      sleep 1
    fi
  done
}

# Start one supervisor-managed program and wait for it to come up.
#
# Arguments:
#   $1 - program name handed to `supervisorctl start`
# Outputs:
#   "- Starting <name>" on stdout, followed by whatever supervisorctl and
#   check_running_status print.
function start_service {
  echo "- Starting $1"
  # Quote the name so it reaches both commands as a single, unexpanded word
  # (no word splitting, no glob expansion of characters such as '*').
  /usr/bin/supervisorctl start "$1"
  check_running_status "$1"
}

if [ ! -d "/var/lib/mysql/mysql" ]
then
echo "[mysqld]\nskip-host-cache\nskip-name-resolve" > /etc/my.cnf.d/docker.cnf
Expand Down Expand Up @@ -88,15 +62,10 @@ fi
echo "- Starting supervisord process manager"
/usr/bin/supervisord --configuration /etc/supervisord.conf


echo "- Starting all Services..."
for service in munged mysqld slurmdbd slurmctld slurmd slurmrestd
do
start_service $service
done

for port in 6817 6818 6819 6820
do
check_port_status $port
/usr/bin/supervisorctl start "$service:*"
done

echo "- Waiting for the cluster to become available"
Expand All @@ -108,7 +77,6 @@ for count in {10..0}; do
break
fi
done

error_with_msg "Slurm partitions failed to start successfully."

echo "- Cluster is now available"
Expand Down
4 changes: 4 additions & 0 deletions files/slurm/cgroup.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
CgroupAutomount=yes
CgroupPlugin=cgroup/v1
ConstrainCores=no
ConstrainRAMSpace=no
1 change: 0 additions & 1 deletion files/slurm/gres.conf

This file was deleted.

16 changes: 7 additions & 9 deletions files/slurm/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageTRES=gres/gpu,gres/gpu:titanxp,gres/gpu:a100
#AccountingStorageUser=
#AccountingStoreFlags=
#BatchStartTimeout=10
Expand Down Expand Up @@ -53,14 +54,10 @@ KillWait=30
MinJobAge=300
MpiDefault=none
#MpiParams=ports=#-#
NodeName=c1 NodeHostName=slurmctl NodeAddr=127.0.0.1 RealMemory=1000
NodeName=c2 NodeAddr=127.0.0.1 RealMemory=1000
NodeName=c3 NodeAddr=127.0.0.1 RealMemory=1000 Gres=gpu:titanxp:1
NodeName=c4 NodeAddr=127.0.0.1 RealMemory=1000 Gres=gpu:titanxp:1
NodeName=node[001-003] NodeHostName=slurmctl NodeAddr=127.0.0.1 Port=[7000-7002] Sockets=2 CoresPerSocket=28 RealMemory=549756
#OverTimeLimit=0
PartitionName=debug Nodes=c[3-4] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
# PARTITIONS
PartitionName=normal Default=yes Nodes=node[001-003] Priority=50 DefMemPerCPU=500 Shared=NO MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
#PluginDir=
#PlugStackConfig=
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
Expand Down Expand Up @@ -102,10 +99,11 @@ SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=6817
SlurmctldTimeout=120
SlurmdDebug=debug
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdLogFile=/var/log/slurm/slurmd-%n.log
SlurmdParameters=config_overrides
SlurmdPidFile=/var/run/slurm/slurmd-%n.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmdSpoolDir=/var/spool/slurmd/%n
SlurmdTimeout=300
#SlurmdUser=root
#SlurmSchedLogFile=
Expand Down
8 changes: 4 additions & 4 deletions files/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ priority=2

[program:slurmdbd]
user=root
#command=/bin/bash -c "until echo 'SELECT 1' | mysql -h localhost -uslurm -ppassword &> /dev/null; do echo 'Waiting for DB'; sleep 1; done && /usr/sbin/slurmdbd -Dvvv"
command=/usr/sbin/slurmdbd -Dvvv
autostart=false
autorestart=false
Expand All @@ -61,7 +60,6 @@ priority=10

[program:slurmctld]
user=root
#command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6819; do echo 'Waiting for port 6819'; sleep 1; done && /usr/sbin/slurmctld -Dvvv"
command=/usr/sbin/slurmctld -Dvvv
autostart=false
autorestart=false
Expand All @@ -76,8 +74,7 @@ priority=50

[program:slurmd]
user=root
#command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6817; do echo 'Waiting for port 6817'; sleep 1; done && /usr/sbin/slurmd -Dvvv"
command=/usr/sbin/slurmd -Dvvv
command=/usr/sbin/slurmd -Dvvv -N node%(process_num)03d
autostart=false
autorestart=false
exitcodes=0,1,2
Expand All @@ -88,6 +85,9 @@ stderr_logfile=/var/log/supervisor/slurmd.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5
priority=100
process_name=node%(process_num)03d
numprocs=%(ENV_SLURM_NODE_COUNT)s
numprocs_start=1

[program:slurmrestd]
user=root
Expand Down

0 comments on commit 90fa6b8

Please sign in to comment.