Skip to content

Commit

Permalink
Refactor worker restart to prevent issues with periodic tests
Browse files Browse the repository at this point in the history
  • Loading branch information
aequitas committed Oct 25, 2024
1 parent ad33b86 commit e74e138
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 4 deletions.
8 changes: 6 additions & 2 deletions docker/cron-docker/periodic/15min/restart_nassl_worker
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/bin/sh
# exit immediately if a command exits with a non-zero status
set -e
# find the running nassl worker container(s) by their compose service label
# and restart them all in a single `docker restart` call; xargs does not run
# `docker restart` at all when no container IDs are listed
docker ps --filter label=com.docker.compose.service=worker-nassl --quiet | xargs --no-run-if-empty docker restart
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before KILL is sent
# Collect the container IDs before looping: a failing `docker ps` inside a
# `for` word list is silently ignored (the loop just runs zero times), whereas
# a failing assignment lets us abort with the exit status of `docker ps`.
workers=$(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet) || exit
# word splitting is intended here: one container ID per word
for worker in $workers; do
  docker stop "$worker"
  docker start "$worker"
done
24 changes: 24 additions & 0 deletions docker/cron-docker/periodic/daily/restart_slow_worker
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh

# Restart the slow worker every day to prevent slow memory leaks.
# As the slow worker can run very long tasks (eg: report generation)
# we first start new containers before stopping the previous ones.
#
# Required environment:
#   WORKER_SLOW_REPLICAS  number of worker-slow replicas that should be running

set -e

cd /opt/Internet.nl

SERVICE=worker-slow
# fail with a clear message instead of an arithmetic syntax error further down
REPLICAS=${WORKER_SLOW_REPLICAS:?WORKER_SLOW_REPLICAS must be set}
# expanded unquoted below on purpose, so it splits into command + arguments
COMPOSE_CMD="docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env"

# names of the containers that currently belong to the service, one per line;
# grep exits non-zero when nothing matches, so with `set -e` the script stops
# here if no such container is listed
OLD_CONTAINERS=$($COMPOSE_CMD ps --format "{{ .Name }}" | grep "$SERVICE")

# bring up new containers next to the old ones (double the replica count),
# wait until healthy
$COMPOSE_CMD up --no-deps --no-recreate --wait --scale="$SERVICE=$((REPLICAS * 2))" "$SERVICE"

# graceful shutdown and remove old containers
# `docker rm --force` on a running container uses SIGKILL, so stop the
# containers first to give running tasks the chance to finish.
# OLD_CONTAINERS holds one name per line and must be split into separate
# arguments, hence no quotes.
# shellcheck disable=SC2086
docker stop $OLD_CONTAINERS
# shellcheck disable=SC2086
docker rm --force $OLD_CONTAINERS

# restore replica number to original
$COMPOSE_CMD scale "$SERVICE=$REPLICAS"
8 changes: 6 additions & 2 deletions docker/cron-docker/periodic/hourly/restart_worker
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/bin/sh
# exit immediately if a command exits with a non-zero status
set -e
# find the running worker container(s) by their compose service label and
# restart them all in a single `docker restart` call; xargs does not run
# `docker restart` at all when no container IDs are listed
docker ps --filter label=com.docker.compose.service=worker --quiet | xargs --no-run-if-empty docker restart
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, followed by a 10 minute grace period before KILL is sent
# Collect the container IDs before looping: a failing `docker ps` inside a
# `for` word list is silently ignored (the loop just runs zero times), whereas
# a failing assignment lets us abort with the exit status of `docker ps`.
workers=$(docker ps --filter label=com.docker.compose.service=worker --quiet) || exit
# word splitting is intended here: one container ID per word
for worker in $workers; do
  docker stop "$worker"
  docker start "$worker"
done
5 changes: 5 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,8 @@ services:
# time after which a SIGKILL is sent to celery after a SIGTERM (warm shutdown), default 10s
# a grace period that is too short causes issues on batch when tasks are killed during the hourly worker restart
stop_grace_period: 10m
# SIGTERM is default, but make it explicit
stop_signal: SIGTERM

depends_on:
db-migrate:
Expand Down Expand Up @@ -735,6 +737,9 @@ services:
command: crond -f -d7 -c /etc/crontabs-docker
environment:
- AUTO_UPDATE_TO
- WORKER_SLOW_REPLICAS
- WORKER_REPLICAS
- RELEASE

restart: unless-stopped
logging:
Expand Down

0 comments on commit e74e138

Please sign in to comment.