Skip to content

Commit

Permalink
[jobs] catch jobs that are DONE but non-terminal
Browse files Browse the repository at this point in the history
Usually, a job should always transition to a terminal status before it
is set to DONE. However, if a bug makes that fail for some
reason (e.g. the issues fixed in skypilot-org#4637), we should still catch this
state.
  • Loading branch information
cg505 committed Feb 4, 2025
1 parent 25ae5f2 commit d43069c
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 5 deletions.
15 changes: 12 additions & 3 deletions sky/jobs/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,8 +705,9 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
job_id: Optional job ID to check. If None, checks all jobs.
Returns a list of job_ids, including the following:
- For jobs with schedule state: jobs that have schedule state not DONE
- For legacy jobs (no schedule state): jobs that are in non-terminal status
- Jobs that have a schedule_state that is not DONE
- Jobs have schedule_state DONE but are in a non-terminal status
- Legacy jobs (that is, no schedule state) that are in non-terminal status
"""
job_filter = '' if job_id is None else 'AND spot.spot_job_id=(?)'
job_value = () if job_id is None else (job_id,)
Expand All @@ -728,14 +729,22 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
LEFT OUTER JOIN job_info
ON spot.spot_job_id=job_info.spot_job_id
WHERE (
-- non-legacy jobs that are not DONE
(job_info.schedule_state IS NOT NULL AND
job_info.schedule_state IS NOT ?)
OR
(job_info.schedule_state IS NULL AND
-- legacy or DONE jobs that are in non-terminal status
((-- legacy jobs
job_info.schedule_state IS NULL OR
-- DONE jobs
job_info.schedule_state IS ?
) AND
-- non-terminal
status NOT IN ({status_filter_str}))
)
{job_filter}
ORDER BY spot.spot_job_id DESC""", [
ManagedJobScheduleState.DONE.value,
ManagedJobScheduleState.DONE.value, *terminal_status_values,
*job_value
]).fetchall()
Expand Down
17 changes: 16 additions & 1 deletion sky/jobs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,22 @@ def _handle_legacy_job(job_id: int):

# For jobs with schedule state:
pid = tasks[0]['controller_pid']
if pid is None:
if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
if all(task['status'].is_terminal() for task in tasks):
# Turns out this job is fine, even though it got pulled by
# get_jobs_to_check_status. Either the job was somehow in an
# inconsistent state that already resolved itself, or we have
# a bug in the query.
logger.warning('We expected to find a nonterminal done job '
f'{job_id}, but all tasks are terminal.')
continue

logger.error(f'Job {job_id} has DONE schedule state, but some '
f'tasks are not terminal. Task statuses: '
f'{", ".join(task["status"].value for task in tasks)}')
failure_reason = ('Job controller is DONE, but some tasks are not '
'terminal.')
elif pid is None:
if schedule_state in (
managed_job_state.ManagedJobScheduleState.INACTIVE,
managed_job_state.ManagedJobScheduleState.WAITING):
Expand Down
2 changes: 1 addition & 1 deletion sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
# cluster yaml is updated.
#
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
SKYLET_VERSION = '11'
SKYLET_VERSION = '12'
# The version of the lib files that skylet/jobs use. Whenever there is an API
# change for the job_lib or log_lib, we need to bump this version, so that the
# user can be notified to update their SkyPilot version on the remote cluster.
Expand Down

0 comments on commit d43069c

Please sign in to comment.