diff --git a/celery_spider_cms.py b/celery_spider_cms.py index 89b63c2..fc7c292 100644 --- a/celery_spider_cms.py +++ b/celery_spider_cms.py @@ -62,8 +62,11 @@ def main_driver(args): # for logging pourposes. # The propagate false will prevent it to raise # an exception if any of the schedds query failed. - groups = res.get(propagate=False) - logging.debug(groups) + _query_res = res.get(propagate=False) + logging.debug(_query_res) + if res.failed(): + logging.warning("At least one of the schedd queries failed") + print(time.time() - start_time) diff --git a/src/htcondor_es/celery/tasks.py b/src/htcondor_es/celery/tasks.py index c9eb965..fa948e7 100644 --- a/src/htcondor_es/celery/tasks.py +++ b/src/htcondor_es/celery/tasks.py @@ -20,6 +20,7 @@ from htcondor_es.utils import ( get_schedds, collect_metadata, + send_email_alert, TIMEOUT_MINS, ) from htcondor_es.amq import post_ads @@ -55,6 +56,7 @@ acks_late=True, # Only ack the message when done retry_backoff=5, # Wait between retries (5, 10,15)s reject_on_worker_lost=True, # If the worker is killed (e.g. by k8s) reasign the task + on_failure=log_failure, ) def query_schedd( schedd_ad, @@ -223,6 +225,16 @@ def create_affiliation_dir(days=1): pass +def log_failure(self, exc, task_id, args, kwargs, einfo): + """Send email message and log error. + (this only should be send if all retries failed) + """ + message = f"failed to query {args}, {kwargs}" + logging.error(f"failed to query {args}, {kwargs}") + # TODO: Change email with parameters + send_email_alert("carizapo@cern.ch", "[Spider] Failed to query", message) + + # ---Utils--- def grouper(iterable, n, fillvalue=None): """Collect data into fixed-length chunks or blocks