Skip to content

Commit

Permalink
feat: add support for Slurm 20.11.7-1 (#29)
Browse files Browse the repository at this point in the history
* feat: add support for Slurm 20.11.7-1

* chore: extend timeout

* style: run black formatter

* refactor: build docker image before install

* refactor: start services sequentially to avoid race conditions

* test: add checks for listening ports

fix: manually chmod slurmdbd.conf
  • Loading branch information
giovtorres authored Oct 30, 2021
1 parent c201a3c commit 5eeab35
Show file tree
Hide file tree
Showing 9 changed files with 225 additions and 77 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ python:
- "3.9"
services:
- docker
before_install:
docker build -t docker-centos7-slurm:test .
install:
pipenv sync --dev
script:
travis_wait pipenv run pytest -v
pipenv run pytest -svv
15 changes: 8 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -80,17 +80,16 @@ RUN set -ex \
&& rm -f /tmp/install-python.sh

# Compile, build and install Slurm from Git source
ARG SLURM_TAG=slurm-20-02-0-1
ARG SLURM_TAG=slurm-20-11-7-1
RUN set -ex \
&& git clone https://github.com/SchedMD/slurm.git \
&& git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
&& pushd slurm \
&& git checkout $SLURM_TAG \
&& ./configure --enable-front-end --prefix=/usr --sysconfdir=/etc/slurm \
--with-mysql_config=/usr/bin --libdir=/usr/lib64 \
&& make install \
&& install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
&& install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
&& install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
&& install -D -m600 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
&& install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
&& popd \
&& rm -rf slurm \
Expand All @@ -104,11 +103,13 @@ RUN set -ex \
/var/log/slurm \
/var/run/slurmd \
&& /sbin/create-munge-key
COPY files/slurm/slurm.conf /etc/slurm/slurm.conf
COPY files/slurm/gres.conf /etc/slurm/gres.conf
COPY files/slurm/slurmdbd.conf /etc/slurm/slurmdbd.conf
COPY --chown=slurm files/slurm/slurm.conf /etc/slurm/slurm.conf
COPY --chown=slurm files/slurm/gres.conf /etc/slurm/gres.conf
COPY --chown=slurm files/slurm/slurmdbd.conf /etc/slurm/slurmdbd.conf
COPY files/supervisord.conf /etc/

RUN chmod 0600 /etc/slurm/slurmdbd.conf

# Mark externally mounted volumes
VOLUME ["/var/lib/mysql", "/var/lib/slurmd", "/var/spool/slurm", "/var/log/slurm"]

Expand Down
4 changes: 4 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
black = "*"
pytest = "*"
pytest-cov = "*"
pytest-sugar = "*"
Expand All @@ -13,3 +14,6 @@ pytest-testinfra = "*"

[requires]
python_version = "3.9"

[pipenv]
allow_prereleases = true
105 changes: 96 additions & 9 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

46 changes: 45 additions & 1 deletion docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,39 @@ function error_with_msg {
fi
}

#######################################
# Poll supervisord until a program reports the RUNNING state.
# Makes up to 11 attempts, sleeping one second after every attempt that did
# not observe RUNNING, and prints the observed state each time.
# Arguments:
#   $1 - name of the supervisord program to query
# Outputs:
#   One "- <name> is in the <state> state." line per attempt, on stdout.
# Returns:
#   Does not signal a timeout: the function still finishes successfully when
#   the program never reaches RUNNING within the attempts.
#######################################
function check_running_status {
  local program=$1
  # Kept local so the loop counter and the last observed state do not leak
  # into (or clobber) variables of the calling script.
  local attempt state

  for attempt in {10..0}; do
    # awk keeps only the second whitespace-separated field of the status
    # line; that field is what gets compared against RUNNING below.
    state=$(/usr/bin/supervisorctl status "$program" | awk '{print $2}')
    echo "- $program is in the $state state."
    if [[ "$state" == "RUNNING" ]]; then
      break
    fi
    sleep 1
  done
}

#######################################
# Wait for a TCP port to accept connections.
# Makes up to 11 connection attempts, sleeping one second after every failed
# attempt, and prints the result of each attempt.
# Arguments:
#   $1 - TCP port number to probe
#   $2 - host to probe (optional, defaults to localhost)
# Outputs:
#   "- Port <port> is listening" or "- Port <port> is not listening" per
#   attempt, on stdout.
# Returns:
#   Does not signal a timeout: the function still finishes successfully when
#   the port never accepts a connection within the attempts.
#######################################
function check_port_status {
  local port=$1
  local host=${2:-localhost}
  local attempt

  for attempt in {10..0}; do
    # Bash opens a TCP connection when a redirection targets
    # /dev/tcp/<host>/<port>. The probe is evaluated directly by `if` so a
    # refused connection cannot abort the script when errexit (set -e) is
    # active, which a bare command followed by a "$?" check would do.
    if echo 2>/dev/null >"/dev/tcp/${host}/${port}"; then
      echo "- Port $port is listening"
      break
    fi
    echo "- Port $port is not listening"
    sleep 1
  done
}

#######################################
# Start one supervisord-managed program and then poll its state.
# Arguments:
#   $1 - name of the supervisord program to start
# Outputs:
#   "- Starting <name>", followed by whatever supervisorctl and
#   check_running_status print.
# Returns:
#   The exit status of check_running_status.
#######################################
function start_service {
  local program=$1

  echo "- Starting $program"
  # Quoted so the name always reaches both commands as a single argument.
  /usr/bin/supervisorctl start "$program"
  check_running_status "$program"
}

if [ ! -d "/var/lib/mysql/mysql" ]
then
echo "[mysqld]\nskip-host-cache\nskip-name-resolve" > /etc/my.cnf.d/docker.cnf
Expand Down Expand Up @@ -52,9 +85,20 @@ then
error_with_msg "MariaDB did not stop"
fi

echo "- Starting all Slurm processes under supervisord"
echo "- Starting supervisord process manager"
/usr/bin/supervisord --configuration /etc/supervisord.conf


# Bring the programs up one at a time, in this fixed order, instead of
# letting supervisord launch them all at once; start_service polls each
# program's state before the next one is started.
for service in munged mysqld slurmdbd slurmctld slurmd; do
  start_service "$service"
done

# Probe the TCP ports before moving on.
# NOTE(review): 6817-6819 are presumably the Slurm daemons' listening ports;
# confirm against slurm.conf / slurmdbd.conf.
for port in 6817 6818 6819; do
  check_port_status "$port"
done

echo "- Waiting for the cluster to become available"
for count in {10..0}; do
if ! grep -q "normal.*idle" <(timeout 1 sinfo)
Expand Down
19 changes: 11 additions & 8 deletions files/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ serverurl=unix:///var/run/supervisor/supervisor.sock
[program:munged]
user=munge
command=/usr/sbin/munged -F
autostart=true
autostart=false
autorestart=true
startsecs=5
startretries=2
Expand All @@ -40,14 +40,15 @@ stderr_logfile=/var/log/supervisor/mysqld.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5
exitcodes=0,1,2
autostart=true
autostart=false
autorestart=false
priority=2

[program:slurmdbd]
user=root
command=/bin/bash -c "until echo 'SELECT 1' | mysql -h localhost -uslurm -ppassword &> /dev/null; do sleep 1; done && /usr/sbin/slurmdbd -Dvvv"
autostart=true
#command=/bin/bash -c "until echo 'SELECT 1' | mysql -h localhost -uslurm -ppassword &> /dev/null; do echo 'Waiting for DB'; sleep 1; done && /usr/sbin/slurmdbd -Dvvv"
command=/usr/sbin/slurmdbd -Dvvv
autostart=false
autorestart=false
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/slurmdbd.log
Expand All @@ -60,8 +61,9 @@ priority=10

[program:slurmctld]
user=root
command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6819; do sleep 1; done && /usr/sbin/slurmctld -Dvvv"
autostart=true
#command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6819; do echo 'Waiting for port 6819'; sleep 1; done && /usr/sbin/slurmctld -Dvvv"
command=/usr/sbin/slurmctld -Dvvv
autostart=false
autorestart=false
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/slurmctld.log
Expand All @@ -74,8 +76,9 @@ priority=50

[program:slurmd]
user=root
command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6817; do sleep 1; done && /usr/sbin/slurmd -Dvvv"
autostart=true
#command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6817; do echo 'Waiting for port 6817'; sleep 1; done && /usr/sbin/slurmd -Dvvv"
command=/usr/sbin/slurmd -Dvvv
autostart=false
autorestart=false
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/slurmd.log
Expand Down
47 changes: 37 additions & 10 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,45 @@

@pytest.fixture(scope="session")
def host(request):
subprocess.check_call(
["docker", "build", "-t", "docker-centos7-slurm:test", "."]
subprocess.run(["docker", "build", "-t", "docker-centos7-slurm:test", "."])

docker_id = (
subprocess.check_output(
[
"docker",
"run",
"-d",
"-it",
"-h",
"slurmctl",
"docker-centos7-slurm:test",
]
)
.decode()
.strip()
)

docker_id = subprocess.check_output(
["docker", "run", "-d", "-it", "-h", "slurmctl", "docker-centos7-slurm:test"]
).decode().strip()

time.sleep(15) # FIXME: needs to be dynamic
time.sleep(20) # FIXME: needs to be dynamic

yield testinfra.get_host(f"docker://{docker_id}")

subprocess.check_call(
["docker", "rm", "-f", docker_id]
)
subprocess.run(["docker", "rm", "-f", docker_id])


@pytest.fixture
def Slow():
    """Provide a helper that retries a check until it passes or times out.

    The fixture value is a function ``slow(check, timeout=30)``. It calls
    ``check()`` repeatedly, asserting that the result is truthy. While the
    assertion fails and fewer than ``timeout`` seconds have elapsed since the
    first call, it sleeps one second and tries again. Once the deadline has
    passed, the last ``AssertionError`` is re-raised. It returns ``None`` as
    soon as one assertion succeeds.
    """

    def slow(check, timeout=30):
        deadline = time.time() + timeout

        while True:
            try:
                assert check()
                return
            except AssertionError as exc:
                # Out of time: surface the most recent failure to the test.
                if time.time() >= deadline:
                    raise exc
                time.sleep(1)

    return slow
Loading

0 comments on commit 5eeab35

Please sign in to comment.