diff --git a/Dockerfile b/Dockerfile
index e02969e..96ac47b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -71,13 +71,16 @@ RUN set -ex \
     && git config --global push.default simple
 
 # Add Tini
-ENV TINI_VERSION v0.18.0
+ENV TINI_VERSION v0.19.0
 ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini.asc /tini.asc
+RUN gpg --batch --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 595E85A6B1B4779EA4DAAEC70B588DFF0527A9B7 \
+    && gpg --batch --verify /tini.asc /tini
 RUN chmod +x /tini
 
 # Install OpenSSL1.1.1
 # See PEP 644: https://www.python.org/dev/peps/pep-0644/
-ARG OPENSSL_VERSION="1.1.1l"
+ARG OPENSSL_VERSION="1.1.1s"
 RUN set -ex \
     && wget --quiet https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \
     && tar xzf openssl-${OPENSSL_VERSION}.tar.gz \
@@ -89,12 +92,12 @@ RUN set -ex \
     && echo "/opt/openssl/lib" >> /etc/ld.so.conf.d/openssl.conf \
     && ldconfig \
     && popd \
-    && rm -rf openssl-${OPENSSL_VERSION}.tar.gz
+    && rm -f openssl-${OPENSSL_VERSION}.tar.gz
 
 # Install supported Python versions and install dependencies.
 # Set the default global to the latest supported version.
 # Use pyenv inside the container to switch between Python versions.
-ARG PYTHON_VERSIONS="3.6.15 3.7.12 3.8.12 3.9.9 3.10.0"
+ARG PYTHON_VERSIONS="3.6.15 3.7.16 3.8.16 3.9.16 3.10.9 3.11.1"
 ARG CONFIGURE_OPTS="--with-openssl=/opt/openssl"
 RUN set -ex \
     && curl https://pyenv.run | bash \
@@ -110,13 +113,13 @@ RUN set -ex \
     done
 
 # Compile, build and install Slurm from Git source
-ARG SLURM_TAG=slurm-21-08-8-2
+ARG SLURM_TAG=slurm-22-05-8-1
 ARG JOBS=4
 RUN set -ex \
     && git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
     && pushd slurm \
     && ./configure --prefix=/usr --sysconfdir=/etc/slurm --enable-slurmrestd \
-        --with-mysql_config=/usr/bin --libdir=/usr/lib64 \
+        --with-mysql_config=/usr/bin --libdir=/usr/lib64 --enable-multiple-slurmd \
     && sed -e 's|#!/usr/bin/env python3|#!/usr/bin/python|' -i doc/html/shtml2html.py \
     && make -j ${JOBS} install \
     && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
@@ -139,9 +142,9 @@ RUN set -ex \
     && /sbin/create-munge-key
 
 RUN dd if=/dev/random of=/etc/slurm/jwt_hs256.key bs=32 count=1 \
-    && chmod 600 /etc/slurm/jwt_hs256.key && chown slurm.slurm /etc/slurm/jwt_hs256.key
+    && chmod 600 /etc/slurm/jwt_hs256.key && chown slurm:slurm /etc/slurm/jwt_hs256.key
 
-COPY --chown=slurm files/slurm/slurm.conf files/slurm/gres.conf files/slurm/slurmdbd.conf /etc/slurm/
+COPY --chown=slurm files/slurm/slurm.conf files/slurm/slurmdbd.conf files/slurm/cgroup.conf /etc/slurm/
 COPY files/supervisord.conf /etc/
 
 RUN chmod 0600 /etc/slurm/slurmdbd.conf
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index cade9b2..bf42580 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -1,5 +1,12 @@
 #!/bin/bash
+set -eo pipefail
+
+export SLURM_NODE_COUNT=${SLURM_NODE_COUNT:-3}
+SLURMD_PORT_BASE=7000
+declare -a SLURMD_PORT_RANGE=($(seq -w $SLURMD_PORT_BASE $((SLURMD_PORT_BASE + SLURM_NODE_COUNT - 1))))
+declare -a SLURM_PORTS=(6817 6819 6820 3306 "${SLURMD_PORT_RANGE[@]}")
+
 
 function error_with_msg {
   if [[ "$count" -eq 0 ]]
   then
@@ -9,39 +16,6 @@ function error_with_msg {
   fi
 }
 
-function check_running_status {
-  for count in {10..0}; do
-    STATUS=$(/usr/bin/supervisorctl status $1 | awk '{print $2}')
-    echo "- $1 is in the $STATUS state."
- if [[ "$STATUS" = "RUNNING" ]] - then - break - else - sleep 1 - fi - done -} - -function check_port_status { - for count in {10..0}; do - echo 2>/dev/null >/dev/tcp/localhost/$1 - if [[ "$?" -eq 0 ]] - then - echo "- Port $1 is listening" - break - else - echo "- Port $1 is not listening" - sleep 1 - fi - done -} - -function start_service { - echo "- Starting $1" - /usr/bin/supervisorctl start $1 - check_running_status $1 -} - if [ ! -d "/var/lib/mysql/mysql" ] then echo "[mysqld]\nskip-host-cache\nskip-name-resolve" > /etc/my.cnf.d/docker.cnf @@ -88,15 +62,10 @@ fi echo "- Starting supervisord process manager" /usr/bin/supervisord --configuration /etc/supervisord.conf - +echo "- Starting all Services..." for service in munged mysqld slurmdbd slurmctld slurmd slurmrestd do - start_service $service -done - -for port in 6817 6818 6819 6820 -do - check_port_status $port + /usr/bin/supervisorctl start "$service:*" done echo "- Waiting for the cluster to become available" @@ -108,7 +77,6 @@ for count in {10..0}; do break fi done - error_with_msg "Slurm partitions failed to start successfully." echo "- Cluster is now available" diff --git a/files/slurm/cgroup.conf b/files/slurm/cgroup.conf new file mode 100644 index 0000000..8bdef69 --- /dev/null +++ b/files/slurm/cgroup.conf @@ -0,0 +1,4 @@ +CgroupAutomount=yes +CgroupPlugin=cgroup/v1 +ConstrainCores=no +ConstrainRAMSpace=no diff --git a/files/slurm/gres.conf b/files/slurm/gres.conf deleted file mode 100644 index 48b36c9..0000000 --- a/files/slurm/gres.conf +++ /dev/null @@ -1 +0,0 @@ -Name=gpu Type=titanxp Cores=0 diff --git a/files/slurm/slurm.conf b/files/slurm/slurm.conf index a5230b1..babee8d 100644 --- a/files/slurm/slurm.conf +++ b/files/slurm/slurm.conf @@ -3,6 +3,7 @@ #AccountingStoragePass= #AccountingStoragePort= AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageTRES=gres/gpu,gres/gpu:titanxp,gres/gpu:a100 #AccountingStorageUser= #AccountingStoreFlags= #BatchStartTimeout=10 @@ -53,14 +54,10 @@ KillWait=30 MinJobAge=300 MpiDefault=none #MpiParams=ports=#-# -NodeName=c1 NodeHostName=slurmctl NodeAddr=127.0.0.1 RealMemory=1000 -NodeName=c2 NodeAddr=127.0.0.1 RealMemory=1000 -NodeName=c3 NodeAddr=127.0.0.1 RealMemory=1000 Gres=gpu:titanxp:1 -NodeName=c4 NodeAddr=127.0.0.1 RealMemory=1000 Gres=gpu:titanxp:1 +NodeName=node[001-003] NodeHostName=slurmctl NodeAddr=127.0.0.1 Port=[7000-7002] Sockets=2 CoresPerSocket=28 RealMemory=549756 #OverTimeLimit=0 -PartitionName=debug Nodes=c[3-4] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP -PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP # PARTITIONS +PartitionName=normal Default=yes Nodes=node[001-003] Priority=50 DefMemPerCPU=500 Shared=NO MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP #PluginDir= #PlugStackConfig= # POWER SAVE SUPPORT FOR IDLE NODES (optional) @@ -102,10 +99,11 @@ SlurmctldPidFile=/var/run/slurm/slurmctld.pid SlurmctldPort=6817 SlurmctldTimeout=120 SlurmdDebug=debug -SlurmdLogFile=/var/log/slurm/slurmd.log -SlurmdPidFile=/var/run/slurm/slurmd.pid +SlurmdLogFile=/var/log/slurm/slurmd-%n.log +SlurmdParameters=config_overrides +SlurmdPidFile=/var/run/slurm/slurmd-%n.pid SlurmdPort=6818 -SlurmdSpoolDir=/var/spool/slurmd +SlurmdSpoolDir=/var/spool/slurmd/%n SlurmdTimeout=300 #SlurmdUser=root #SlurmSchedLogFile= diff --git a/files/supervisord.conf b/files/supervisord.conf index 95e8c9a..b4bdb0e 100644 --- 
+++ b/files/supervisord.conf
@@ -46,7 +46,6 @@ priority=2
 
 [program:slurmdbd]
 user=root
-#command=/bin/bash -c "until echo 'SELECT 1' | mysql -h localhost -uslurm -ppassword &> /dev/null; do echo 'Waiting for DB'; sleep 1; done && /usr/sbin/slurmdbd -Dvvv"
 command=/usr/sbin/slurmdbd -Dvvv
 autostart=false
 autorestart=false
@@ -61,7 +60,6 @@ priority=10
 
 [program:slurmctld]
 user=root
-#command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6819; do echo 'Waiting for port 6819'; sleep 1; done && /usr/sbin/slurmctld -Dvvv"
 command=/usr/sbin/slurmctld -Dvvv
 autostart=false
 autorestart=false
@@ -76,8 +74,7 @@ priority=50
 
 [program:slurmd]
 user=root
-#command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6817; do echo 'Waiting for port 6817'; sleep 1; done && /usr/sbin/slurmd -Dvvv"
-command=/usr/sbin/slurmd -Dvvv
+command=/usr/sbin/slurmd -Dvvv -N node%(process_num)03d
 autostart=false
 autorestart=false
 exitcodes=0,1,2
@@ -88,6 +85,9 @@ stderr_logfile=/var/log/supervisor/slurmd.log
 stderr_logfile_maxbytes=1MB
 stderr_logfile_backups=5
 priority=100
+process_name=node%(process_num)03d
+numprocs=%(ENV_SLURM_NODE_COUNT)s
+numprocs_start=1
 
 [program:slurmrestd]
 user=root
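
A minimal smoke test of the multiple-slurmd setup once this diff is applied; the image tag `slurm-docker-cluster` and container name `slurm` below are placeholders, not part of the change:

    docker build -t slurm-docker-cluster .
    docker run -d --name slurm --env SLURM_NODE_COUNT=3 slurm-docker-cluster
    # supervisord spawns one slurmd per emulated node (node001..node003), so
    # sinfo should list three nodes and a three-node job should span them all:
    docker exec slurm sinfo --Node --long
    docker exec slurm srun --nodes=3 hostname

Note that SLURM_NODE_COUNT values other than the default of 3 also require matching NodeName/Port lines in slurm.conf, which this diff pins to node[001-003] on ports 7000-7002.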