Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved logging when plugins are cancelled for taking too long #323

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
appease flake8
yrro committed Mar 12, 2024
commit 47473e0ed7ced817ba59da0810edc11108271c47
32 changes: 25 additions & 7 deletions src/ipahealthcheck/core/core.py
Original file line number Diff line number Diff line change
@@ -47,17 +47,25 @@ def find_plugins(name, registry):
def run_plugin(plugin, available=(), timeout=constants.DEFAULT_TIMEOUT):
plugin_name = F"{plugin.__class__.__module__}:{plugin.__class__.__name__}"
timed_out = []

def signal_handler(signum, frame):
# Our exception will likely be caught by code called by the plugin,
# which may not log the traceback, or may even catch the exception
# itself! So while we throw the exception in order to interrupt the
# plugin code, we also stash a copy of it so that we can log it after
# the plugin returns.
timed_out.append(TimeoutError(f"Health check {plugin_name} cancelled after after {timeout} sec"))
timed_out.append(TimeoutError(
f"Health check {plugin_name} cancelled after after {timeout} sec"
))
logger.error("--- %s ---", timed_out[0])
traceback.print_stack()
logger.error("--- The following messages were logged by the plugin after it was cancelled. They may not indicate the reason why the plugin took too long to run. ---")
logger.error(
"--- The following messages were logged by the plugin after it"
" was cancelled. They may not indicate the reason why the plugin"
" took too long to run. ---"
)
raise timed_out[0]

# manually calculate duration when we create results of our own
start = datetime.now(tz=timezone.utc)
signal.signal(signal.SIGALRM, signal_handler)
@@ -78,15 +86,22 @@ def signal_handler(signum, frame):
else:
# We've got no way to know whether the plugin's own exception was
# related to our TimeoutError; if it was then we will yield a
# result here based on the plugin's own exception, and _also_ later
# on in the finally block.
logger.log('Exception raised in health check %r: %s', plugin_name, e, level=constants.CRITICAL, exc_info=True)
# result here based on the plugin's own exception, and _also_
# later on in the finally block.
logger.log(
'Exception raised in health check %r: %s',
plugin_name, e, level=constants.CRITICAL, exc_info=True
)
yield Result(plugin, constants.CRITICAL, exception=str(e),
traceback=traceback.format_exc(),
start=start)
finally:
if timed_out:
logger.error("--- Increasing the timeout in /etc/ipahealthcheck/ipahealthcheck.conf may allow this healthcheck to complete before the deadline. ---")
logger.error(
"--- Increasing the timeout in"
" /etc/ipahealthcheck/ipahealthcheck.conf may allow this"
" healthcheck to complete before the deadline. ---"
)
# If the plugin already handled our TimeoutError properly (i.e,
# logged it and raised its own exception with a sensible error
# message) then there is no need to yield our own error result
@@ -97,7 +112,10 @@ def signal_handler(signum, frame):
yield Result(plugin, constants.ERROR, exception=str(timed_out[0]),
start=start)
else:
logging.debug("Plugin %s complete after %s sec", plugin_name, datetime.utcnow() - start)
logging.debug(
"Plugin %s complete after %s sec",
plugin_name, datetime.utcnow() - start
)
signal.alarm(0)