From 4bba56635ebf976f82f53f231d39cf09507e9aca Mon Sep 17 00:00:00 2001 From: Michael Hammann Date: Fri, 25 Feb 2022 14:14:49 +0100 Subject: [PATCH 1/6] fix: properly shutdown child processes --- supervisor/process.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/supervisor/process.py b/supervisor/process.py index b394be812..d4cc46a91 100644 --- a/supervisor/process.py +++ b/supervisor/process.py @@ -5,6 +5,7 @@ import shlex import time import traceback +import psutil from supervisor.compat import maxint from supervisor.compat import as_bytes @@ -464,6 +465,11 @@ def kill(self, sig): try: try: + # make sure all child processes are properly stopped as well + parent = psutil.Process(abs(pid)) + for child in parent.children(recursive=True): # or parent.children() for recursive=False + # kill child processes with same signal as parent + options.kill(child.pid, sig) options.kill(pid, sig) except OSError as exc: if exc.errno == errno.ESRCH: From 09b82cf67d1cfd16e8c17140c89cf36cd14f4641 Mon Sep 17 00:00:00 2001 From: Michael Hammann Date: Fri, 25 Mar 2022 11:09:41 +0100 Subject: [PATCH 2/6] fix: improve shutdown behaviour. check if processes terminate properly otherwise send SIGKILL after timeout is reached --- supervisor/options.py | 1 + supervisor/process.py | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/supervisor/options.py b/supervisor/options.py index 20c1b07aa..3e34e243d 100644 --- a/supervisor/options.py +++ b/supervisor/options.py @@ -942,6 +942,7 @@ def get(section, opt, *args, **kwargs): serverurl = get(section, 'serverurl', None) if serverurl and serverurl.strip().upper() == 'AUTO': serverurl = None + disable_force_shutdown = get(section, 'disable_force_shutdown', False) # find uid from "user" option user = get(section, 'user', None) diff --git a/supervisor/process.py b/supervisor/process.py index d4cc46a91..14abb1c32 100644 --- a/supervisor/process.py +++ b/supervisor/process.py @@ -467,10 +467,14 @@ def kill(self, sig): try: # make sure all child processes are properly stopped as well parent = psutil.Process(abs(pid)) - for child in parent.children(recursive=True): # or parent.children() for recursive=False - # kill child processes with same signal as parent + children = parent.children(recursive=True) + for child in reversed(children): + # kill all child processes with same signal as parent options.kill(child.pid, sig) + self.kill_process_after_timeout_is_reached(child.pid) options.kill(pid, sig) + self.kill_process_after_timeout_is_reached(pid) + except OSError as exc: if exc.errno == errno.ESRCH: msg = ("unable to signal %s (pid %s), it probably just exited " @@ -492,6 +496,17 @@ def kill(self, sig): return None + def kill_process_after_timeout_is_reached(self, pid, timeout=30): + start_time = time.time() + while time.time() < start_time + timeout: + if psutil.pid_exists(pid): + time.sleep(1) + else: + return + if not self.config.disable_force_shutdown: + # Send SIGKILL if process does not terminate succesfully + self.config.options.kill(pid, signal.SIGKILL) + def signal(self, sig): """Send a signal to the subprocess, without intending to kill it. From c26fc323f09e84206270513e51dad4cfd43ab05b Mon Sep 17 00:00:00 2001 From: Michael Hammann Date: Mon, 28 Mar 2022 17:49:20 +0200 Subject: [PATCH 3/6] fix: make sure the main loop is not stopped while checking for processes which need to be killed --- supervisor/options.py | 3 ++- supervisor/process.py | 32 ++++++++++++-------------------- supervisor/rpcinterface.py | 2 +- supervisor/supervisord.py | 31 +++++++++++++++++++++++++++---- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/supervisor/options.py b/supervisor/options.py index 3e34e243d..a2586e2d5 100644 --- a/supervisor/options.py +++ b/supervisor/options.py @@ -1069,6 +1069,7 @@ def get(section, opt, *args, **kwargs): redirect_stderr=redirect_stderr, environment=environment, serverurl=serverurl) + disable_force_shutdown=disable_force_shutdown, programs.append(pconfig) @@ -1888,7 +1889,7 @@ class ProcessConfig(Config): 'stderr_logfile_backups', 'stderr_logfile_maxbytes', 'stderr_events_enabled', 'stderr_syslog', 'stopsignal', 'stopwaitsecs', 'stopasgroup', 'killasgroup', - 'exitcodes', 'redirect_stderr' ] + 'exitcodes', 'redirect_stderr', 'disable_force_shutdown' ] optional_param_names = [ 'environment', 'serverurl' ] def __init__(self, options, **params): diff --git a/supervisor/process.py b/supervisor/process.py index 14abb1c32..4a78244ec 100644 --- a/supervisor/process.py +++ b/supervisor/process.py @@ -377,11 +377,11 @@ def _check_and_adjust_for_system_clock_rollback(self, test_time): if self.delay > 0 and test_time < (self.delay - self.backoff): self.delay = test_time + self.backoff - def stop(self): + def stop(self, supervisor): """ Administrative stop """ self.administrative_stop = True self.laststopreport = 0 - return self.kill(self.config.stopsignal) + return self.kill(self.config.stopsignal, supervisor) def stop_report(self): """ Log a 'waiting for x to stop' message with throttling. """ @@ -402,7 +402,7 @@ def give_up(self): self._assertInState(ProcessStates.BACKOFF) self.change_state(ProcessStates.FATAL) - def kill(self, sig): + def kill(self, sig, supervisor): """Send a signal to the subprocess with the intention to kill it (to make it exit). This may or may not actually kill it. @@ -471,9 +471,12 @@ def kill(self, sig): for child in reversed(children): # kill all child processes with same signal as parent options.kill(child.pid, sig) - self.kill_process_after_timeout_is_reached(child.pid) + # add to list but only if not disabled + if not self.config.disable_force_shutdown and sig is not signal.SIGKILL: + supervisor.terminated_processes.append((child.pid, time.time())) options.kill(pid, sig) - self.kill_process_after_timeout_is_reached(pid) + if not self.config.disable_force_shutdown and sig is not signal.SIGKILL: + supervisor.terminated_processes.append((pid, time.time())) except OSError as exc: if exc.errno == errno.ESRCH: @@ -496,17 +499,6 @@ def kill(self, sig): return None - def kill_process_after_timeout_is_reached(self, pid, timeout=30): - start_time = time.time() - while time.time() < start_time + timeout: - if psutil.pid_exists(pid): - time.sleep(1) - else: - return - if not self.config.disable_force_shutdown: - # Send SIGKILL if process does not terminate succesfully - self.config.options.kill(pid, signal.SIGKILL) - def signal(self, sig): """Send a signal to the subprocess, without intending to kill it. @@ -736,7 +728,7 @@ def transition(self): self.config.options.logger.warn( 'killing \'%s\' (%s) with SIGKILL' % (processname, self.pid)) - self.kill(signal.SIGKILL) + self.kill(signal.SIGKILL, supervisor) class FastCGISubprocess(Subprocess): """Extends Subprocess class to handle FastCGI subprocesses""" @@ -831,7 +823,7 @@ def reopenlogs(self): for process in self.processes.values(): process.reopenlogs() - def stop_all(self): + def stop_all(self, supervisor): processes = list(self.processes.values()) processes.sort() processes.reverse() # stop in desc priority order @@ -840,10 +832,10 @@ def stop_all(self): state = proc.get_state() if state == ProcessStates.RUNNING: # RUNNING -> STOPPING - proc.stop() + proc.stop(supervisor) elif state == ProcessStates.STARTING: # STARTING -> STOPPING - proc.stop() + proc.stop(supervisor) elif state == ProcessStates.BACKOFF: # BACKOFF -> FATAL proc.give_up() diff --git a/supervisor/rpcinterface.py b/supervisor/rpcinterface.py index 7b32ae8c9..1e394dc2f 100644 --- a/supervisor/rpcinterface.py +++ b/supervisor/rpcinterface.py @@ -411,7 +411,7 @@ def stopProcess(self, name, wait=True): if process.get_state() not in RUNNING_STATES: raise RPCError(Faults.NOT_RUNNING, name) - msg = process.stop() + msg = process.stop(self.supervisord) if msg is not None: raise RPCError(Faults.FAILED, msg) diff --git a/supervisor/supervisord.py b/supervisor/supervisord.py index 138732dd4..cb4c7ec05 100755 --- a/supervisor/supervisord.py +++ b/supervisor/supervisord.py @@ -34,6 +34,7 @@ import os import time import signal +import psutil from supervisor.medusa import asyncore_25 as asyncore @@ -55,6 +56,7 @@ def __init__(self, options): self.options = options self.process_groups = {} self.ticks = {} + self.terminated_processes = [] def main(self): if not self.options.first: @@ -156,7 +158,7 @@ def shutdown_report(self): def ordered_stop_groups_phase_1(self): if self.stop_groups: # stop the last group (the one with the "highest" priority) - self.stop_groups[-1].stop_all() + self.stop_groups[-1].stop_all(supervisor=self) def ordered_stop_groups_phase_2(self): # after phase 1 we've transitioned and reaped, let's see if we @@ -196,9 +198,12 @@ def runforever(self): self.ordered_stop_groups_phase_1() if not self.shutdown_report(): - # if there are no unstopped processes (we're done - # killing everything), it's OK to shutdown or reload - raise asyncore.ExitNow + # make sure all processes are properly terminated + self.kill_process_after_reaching_timeout() + if not self.terminated_processes: + # if there are no unstopped processes (we're done + # killing everything), it's OK to shutdown or reload + raise asyncore.ExitNow for fd, dispatcher in combined_map.items(): if dispatcher.readable(): @@ -241,6 +246,7 @@ def runforever(self): for group in pgroups: group.transition() + self.kill_process_after_reaching_timeout() self.reap() self.handle_signal() self.tick() @@ -316,6 +322,23 @@ def handle_signal(self): def get_state(self): return self.options.mood + def kill_process_after_reaching_timeout(self, timeout=60): + """ + Iterate over list of previously terminated processes and check if they + are still alive. Send SIGKILL to all processes which are not not + properly shutdown before the timeout is reached. + """ + if self.terminated_processes: + current_time = time.time() + for pid, terminate_time in self.terminated_processes: + if psutil.pid_exists(pid): + if current_time - terminate_time > timeout: + # Send SIGKILL if process does not terminate succesfully + os.kill(pid, signal.SIGKILL) + self.terminated_processes.remove((pid, terminate_time)) + else: + self.terminated_processes.remove((pid, terminate_time)) + def timeslice(period, when): return int(when - (when % period)) From 9a2a1270389e146ea3de1b6ddd74d8fbb6244853 Mon Sep 17 00:00:00 2001 From: Michael Hammann Date: Thu, 14 Apr 2022 14:30:57 +0200 Subject: [PATCH 4/6] fix: make sure options are properly parsed --- supervisor/options.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/supervisor/options.py b/supervisor/options.py index a2586e2d5..75f61562b 100644 --- a/supervisor/options.py +++ b/supervisor/options.py @@ -1068,8 +1068,8 @@ def get(section, opt, *args, **kwargs): exitcodes=exitcodes, redirect_stderr=redirect_stderr, environment=environment, - serverurl=serverurl) - disable_force_shutdown=disable_force_shutdown, + serverurl=serverurl, + disable_force_shutdown=disable_force_shutdown) programs.append(pconfig) @@ -1889,8 +1889,8 @@ class ProcessConfig(Config): 'stderr_logfile_backups', 'stderr_logfile_maxbytes', 'stderr_events_enabled', 'stderr_syslog', 'stopsignal', 'stopwaitsecs', 'stopasgroup', 'killasgroup', - 'exitcodes', 'redirect_stderr', 'disable_force_shutdown' ] - optional_param_names = [ 'environment', 'serverurl' ] + 'exitcodes', 'redirect_stderr'] + optional_param_names = [ 'environment', 'serverurl', 'disable_force_shutdown'] def __init__(self, options, **params): self.options = options From e5fe207b7ff64e00ea3008a3f7a1b74c22a9b110 Mon Sep 17 00:00:00 2001 From: Michael Hammann Date: Thu, 14 Apr 2022 14:34:41 +0200 Subject: [PATCH 5/6] fix: update all function signatures and fix tests for improved shutdown behavior --- supervisor/process.py | 20 +++++++++++++------- supervisor/rpcinterface.py | 5 ++++- supervisor/tests/base.py | 9 +++++---- tox.ini | 1 + 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/supervisor/process.py b/supervisor/process.py index 4a78244ec..bc0da7dd6 100644 --- a/supervisor/process.py +++ b/supervisor/process.py @@ -377,7 +377,7 @@ def _check_and_adjust_for_system_clock_rollback(self, test_time): if self.delay > 0 and test_time < (self.delay - self.backoff): self.delay = test_time + self.backoff - def stop(self, supervisor): + def stop(self, supervisor=None): """ Administrative stop """ self.administrative_stop = True self.laststopreport = 0 @@ -402,12 +402,18 @@ def give_up(self): self._assertInState(ProcessStates.BACKOFF) self.change_state(ProcessStates.FATAL) - def kill(self, sig, supervisor): + def kill(self, sig, supervisor=None): """Send a signal to the subprocess with the intention to kill it (to make it exit). This may or may not actually kill it. Return None if the signal was sent, or an error message string if an error occurred or if the subprocess is not running. + Parameters: + sig: signal to kill the process + supervsior: supervisor instance to keep track of signaled + processes. This is needed to check if a process properly + terminated. If no supervisor instance is given, this will + not be checked. """ now = time.time() options = self.config.options @@ -471,11 +477,11 @@ def kill(self, sig, supervisor): for child in reversed(children): # kill all child processes with same signal as parent options.kill(child.pid, sig) - # add to list but only if not disabled - if not self.config.disable_force_shutdown and sig is not signal.SIGKILL: + # add to list but only if not disabled and supervisor instance provided + if not self.config.disable_force_shutdown and sig is not signal.SIGKILL and supervisor is not None: supervisor.terminated_processes.append((child.pid, time.time())) options.kill(pid, sig) - if not self.config.disable_force_shutdown and sig is not signal.SIGKILL: + if not self.config.disable_force_shutdown and sig is not signal.SIGKILL and supervisor is not None: supervisor.terminated_processes.append((pid, time.time())) except OSError as exc: @@ -666,7 +672,7 @@ def __repr__(self): def get_state(self): return self.state - def transition(self): + def transition(self, supervisor=None): now = time.time() state = self.state @@ -823,7 +829,7 @@ def reopenlogs(self): for process in self.processes.values(): process.reopenlogs() - def stop_all(self, supervisor): + def stop_all(self, supervisor=None): processes = list(self.processes.values()) processes.sort() processes.reverse() # stop in desc priority order diff --git a/supervisor/rpcinterface.py b/supervisor/rpcinterface.py index 1e394dc2f..12882df93 100644 --- a/supervisor/rpcinterface.py +++ b/supervisor/rpcinterface.py @@ -411,7 +411,10 @@ def stopProcess(self, name, wait=True): if process.get_state() not in RUNNING_STATES: raise RPCError(Faults.NOT_RUNNING, name) - msg = process.stop(self.supervisord) + try: + msg = process.stop(self.supervisord) + except: + msg = process.stop() if msg is not None: raise RPCError(Faults.FAILED, msg) diff --git a/supervisor/tests/base.py b/supervisor/tests/base.py index 7553a2cce..061a63111 100644 --- a/supervisor/tests/base.py +++ b/supervisor/tests/base.py @@ -434,7 +434,7 @@ def removelogs(self): def get_state(self): return self.state - def stop(self): + def stop(self, supervisor=None): self.stop_called = True self.killing = False from supervisor.process import ProcessStates @@ -443,7 +443,7 @@ def stop(self): def stop_report(self): self.stop_report_called = True - def kill(self, signal): + def kill(self, signal, supervisor=None): self.killed_with = signal def signal(self, signal): @@ -517,7 +517,7 @@ def __init__(self, options, name, command, directory=None, umask=None, stderr_syslog=False, redirect_stderr=False, stopsignal=None, stopwaitsecs=10, stopasgroup=False, killasgroup=False, - exitcodes=(0,), environment=None, serverurl=None): + exitcodes=(0,), environment=None, serverurl=None, disable_force_shutdown=False): self.options = options self.name = name self.command = command @@ -553,6 +553,7 @@ def __init__(self, options, name, command, directory=None, umask=None, self.umask = umask self.autochildlogs_created = False self.serverurl = serverurl + self.disable_force_shutdown = disable_force_shutdown def get_path(self): return ["/bin", "/usr/bin", "/usr/local/bin"] @@ -1049,7 +1050,7 @@ def transition(self): def before_remove(self): self.before_remove_called = True - def stop_all(self): + def stop_all(self, supervisor=None): self.all_stopped = True def get_unstopped_processes(self): diff --git a/tox.ini b/tox.ini index 28953354b..7c1da8daf 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,7 @@ deps = pytest pexpect == 4.7.0 # see https://github.com/Supervisor/supervisor/issues/1327 mock >= 0.5.0 + psutil == 5.9.0 passenv = END_TO_END commands = pytest {posargs} From b243d05d8085597a46fabbd47e55dae001efb0b8 Mon Sep 17 00:00:00 2001 From: Michael Hammann Date: Tue, 26 Apr 2022 15:56:16 +0200 Subject: [PATCH 6/6] fix: add try catch block when killing child processes. this is needed when trying to kill child processes which run in a docker container --- supervisor/process.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/supervisor/process.py b/supervisor/process.py index bc0da7dd6..7d0059275 100644 --- a/supervisor/process.py +++ b/supervisor/process.py @@ -475,11 +475,17 @@ def kill(self, sig, supervisor=None): parent = psutil.Process(abs(pid)) children = parent.children(recursive=True) for child in reversed(children): - # kill all child processes with same signal as parent - options.kill(child.pid, sig) - # add to list but only if not disabled and supervisor instance provided - if not self.config.disable_force_shutdown and sig is not signal.SIGKILL and supervisor is not None: - supervisor.terminated_processes.append((child.pid, time.time())) + try: + # kill all child processes with same signal as parent + options.kill(child.pid, sig) + # add to list but only if not disabled + if not self.config.disable_force_shutdown and sig is not signal.SIGKILL: + supervisor.terminated_processes.append((child.pid, time.time())) + except PermissionError: + msg = ("No permission to signal child process with name '%s' and pid %i of parent process '%s' with pid %i. " + % (child.name(), child.pid, parent.name(), parent.pid) + + "Will continue with sending %s to parent process" % (sig.name)) + options.logger.warn(msg) options.kill(pid, sig) if not self.config.disable_force_shutdown and sig is not signal.SIGKILL and supervisor is not None: supervisor.terminated_processes.append((pid, time.time()))