From d43069c442b674ebc512b0574584b0ef6d28d4d5 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Mon, 3 Feb 2025 18:00:29 -0800 Subject: [PATCH] [jobs] catch jobs that are DONE but non-terminal Usually, a job should always transition to a terminal status before it is set to DONE. However, if a bug makes that fail for some reason (e.g. the issues fixed in #4637), we should still catch this state. --- sky/jobs/state.py | 15 ++++++++++++--- sky/jobs/utils.py | 17 ++++++++++++++++- sky/skylet/constants.py | 2 +- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/sky/jobs/state.py b/sky/jobs/state.py index 07f62c3c82d..b10c287395e 100644 --- a/sky/jobs/state.py +++ b/sky/jobs/state.py @@ -705,8 +705,9 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]: job_id: Optional job ID to check. If None, checks all jobs. Returns a list of job_ids, including the following: - - For jobs with schedule state: jobs that have schedule state not DONE - - For legacy jobs (no schedule state): jobs that are in non-terminal status + - Jobs that have a schedule_state that is not DONE + - Jobs have schedule_state DONE but are in a non-terminal status + - Legacy jobs (that is, no schedule state) that are in non-terminal status """ job_filter = '' if job_id is None else 'AND spot.spot_job_id=(?)' job_value = () if job_id is None else (job_id,) @@ -728,14 +729,22 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]: LEFT OUTER JOIN job_info ON spot.spot_job_id=job_info.spot_job_id WHERE ( + -- non-legacy jobs that are not DONE (job_info.schedule_state IS NOT NULL AND job_info.schedule_state IS NOT ?) OR - (job_info.schedule_state IS NULL AND + -- legacy or DONE jobs that are in non-terminal status + ((-- legacy jobs + job_info.schedule_state IS NULL OR + -- DONE jobs + job_info.schedule_state IS ? + ) AND + -- non-terminal status NOT IN ({status_filter_str})) ) {job_filter} ORDER BY spot.spot_job_id DESC""", [ + ManagedJobScheduleState.DONE.value, ManagedJobScheduleState.DONE.value, *terminal_status_values, *job_value ]).fetchall() diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py index 3ae51b42e26..249cfd22486 100644 --- a/sky/jobs/utils.py +++ b/sky/jobs/utils.py @@ -257,7 +257,22 @@ def _handle_legacy_job(job_id: int): # For jobs with schedule state: pid = tasks[0]['controller_pid'] - if pid is None: + if schedule_state == managed_job_state.ManagedJobScheduleState.DONE: + if all(task['status'].is_terminal() for task in tasks): + # Turns out this job is fine, even though it got pulled by + # get_jobs_to_check_status. Either the job was somehow in an + # inconsistent state that already resolved itself, or we have + # a bug in the query. + logger.warning('We expected to find a nonterminal done job ' + f'{job_id}, but all tasks are terminal.') + continue + + logger.error(f'Job {job_id} has DONE schedule state, but some ' + f'tasks are not terminal. Task statuses: ' + f'{", ".join(task["status"].value for task in tasks)}') + failure_reason = ('Job controller is DONE, but some tasks are not ' + 'terminal.') + elif pid is None: if schedule_state in ( managed_job_state.ManagedJobScheduleState.INACTIVE, managed_job_state.ManagedJobScheduleState.WAITING): diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 8d543322c5c..dfcdb34dfd7 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -86,7 +86,7 @@ # cluster yaml is updated. # # TODO(zongheng,zhanghao): make the upgrading of skylet automatic? -SKYLET_VERSION = '11' +SKYLET_VERSION = '12' # The version of the lib files that skylet/jobs use. Whenever there is an API # change for the job_lib or log_lib, we need to bump this version, so that the # user can be notified to update their SkyPilot version on the remote cluster.