Improved logging when plugins are cancelled for taking too long

Fixes: #322 Signed-off-by: Sam Morris <sam@robots.org.uk>
freeipa · yrro · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024
commit 001e8d97a0c0efc84a39545d4f4d6459aacf758e
diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py
@@ -45,9 +45,19 @@ def find_plugins(name, registry):
 
 
 def run_plugin(plugin, available=(), timeout=constants.DEFAULT_TIMEOUT):
+    plugin_name = F"{plugin.__class__.__module__}:{plugin.__class__.__name__}"
+    timed_out = []
     def signal_handler(signum, frame):
-        raise TimeoutError('Request timed out')
-
+        # Our exception will likely be caught by code called by the plugin,
+        # which may not log the traceback, or may even catch the exception
+        # itself! So while we throw the exception in order to interrupt the
+        # plugin code, we also stash a copy of it so that we can log it after
+        # the plugin returns.
+        timed_out.append(TimeoutError(f"Health check {plugin_name} cancelled after after {timeout} sec"))
+        logger.error("--- %s ---", timed_out[0])
+        traceback.print_stack()
+        logger.error("--- The following messages were logged by the plugin after it was cancelled. They may not indicate the reason why the plugin took too long to run. ---")
+        raise timed_out[0]
     # manually calculate duration when we create results of our own
     start = datetime.now(tz=timezone.utc)
     signal.signal(signal.SIGALRM, signal_handler)
@@ -58,16 +68,36 @@ def signal_handler(signum, frame):
                 # Treat no result as success, fudge start time
                 result = Result(plugin, constants.SUCCESS, start=start)
             yield result
-    except TimeoutError as e:
-        yield Result(plugin, constants.ERROR, exception=str(e),
-                     start=start)
     except Exception as e:
-        logger.debug('Exception raised: %s', e)
-        logger.debug(traceback.format_exc())
-        yield Result(plugin, constants.CRITICAL, exception=str(e),
-                     traceback=traceback.format_exc(),
-                     start=start)
+        if e is TimeoutError:
+            # The plugin code _may_ have raised our original TimeoutError
+            # however we also have to deal with cases where it did not raise
+            # anything! We do that in the finally block below; we don't
+            # therefore need to do anything here.
+            pass
+        else:
+            # We've got no way to know whether the plugin's own exception was
+            # related to our TimeoutError; if it was then we will yield a
+            # result here based on the plugin's own exception, and _also_ later
+            # on in the finally block.
+            logger.log('Exception raised in health check %r: %s', plugin_name, e, level=constants.CRITICAL, exc_info=True)
+            yield Result(plugin, constants.CRITICAL, exception=str(e),
+                         traceback=traceback.format_exc(),
+                         start=start)
     finally:
+        if timed_out:
+            logger.error("--- Increasing the timeout in /etc/ipahealthcheck/ipahealthcheck.conf may allow this healthcheck to complete before the deadline. ---")
+            # If the plugin already handled our TimeoutError properly (i.e,
+            # logged it and raised its own exception with a sensible error
+            # message) then there is no need to yield our own error result
+            # here; however we can't rely on the plugin doing so. It may have
+            # caught our TimeoutError and logged something misleading; in that
+            # case let's err on the side of caution and return an additional
+            # result.
+            yield Result(plugin, constants.ERROR, exception=str(timed_out[0]),
+                         start=start)
+        else:
+            logging.debug("Plugin %s complete after %s sec", plugin_name, datetime.utcnow() - start)
         signal.alarm(0)