Skip to content

Commit

Permalink
Do not put removed jobs on hold
Browse files Browse the repository at this point in the history
Without excluding removed jobs, this hold expression can result in a
race condition where jobs are removed and immediately put back on hold,
so that they remain in the queue and oscillate between these two states.
  • Loading branch information
brianhlin committed Jul 1, 2020
1 parent 00558a8 commit bf3e851
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 4 deletions.
4 changes: 2 additions & 2 deletions config/01-ce-router-defaults.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE = $(CONDORCE_MAX_JOBS)
# Only route jobs with a valid, non-expired proxy for either the vanilla or standard universe.
JOB_ROUTER_SOURCE_JOB_CONSTRAINT = (target.x509userproxysubject =!= UNDEFINED) && (target.x509UserProxyExpiration =!= UNDEFINED) && (time() < target.x509UserProxyExpiration) && (target.JobUniverse =?= 5 || target.JobUniverse =?= 1)

# Put jobs on hold if they meet any of the following requirements
# Put jobs on hold if they are not in the removed state and meet any of the following requirements:
# 1. It is missing a proxy subject or proxy expiration date.
# 2. It has an expired proxy and is not running.
# 3. It has not been routed by the CE and is not a standard, vanilla, scheduler, or local job.
# 4. It has not been routed by the CE and has been idle for 30+ min.
SYSTEM_PERIODIC_HOLD = (x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true)
SYSTEM_PERIODIC_HOLD = (JobStatus != 3) && ((x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true))

SYSTEM_PERIODIC_HOLD_REASON = \
strcat("HTCondor-CE held job due to ", \
Expand Down
4 changes: 2 additions & 2 deletions config/01-ce-router.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ CONDORCE_MAX_JOBS = 10000
# Only route jobs with a valid, non-expired proxy for either the vanilla or standard universe.
JOB_ROUTER_SOURCE_JOB_CONSTRAINT = (target.x509userproxysubject =!= UNDEFINED) && (target.x509UserProxyExpiration =!= UNDEFINED) && (time() < target.x509UserProxyExpiration) && (target.JobUniverse =?= 5 || target.JobUniverse =?= 1)

# Put jobs on hold if they meet any of the following requirements
# Put jobs on hold if they are not in the removed state and meet any of the following requirements:
# 1. It is missing a proxy subject or proxy expiration date.
# 2. It has an expired proxy and is not running.
# 3. It has not been routed by the CE and is not a standard, vanilla, scheduler, or local job.
# 4. It has not been routed by the CE and has been idle for 30+ min.
SYSTEM_PERIODIC_HOLD = (x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true)
SYSTEM_PERIODIC_HOLD = (JobStatus != 3) && ((x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true))

SYSTEM_PERIODIC_HOLD_REASON = \
strcat("HTCondor-CE held job due to ", \
Expand Down
2 changes: 2 additions & 0 deletions rpm/htcondor-ce.spec
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,8 @@ install -m 0755 -d -p $RPM_BUILD_ROOT/%{_sysconfdir}/condor-ce/bosco_override
* Tue Jun 23 2020 Brian Lin <[email protected]> - 3.4.3-1
- Fix a stacktrace with the BDII provider when `HTCONDORCE_SPEC` isn't
defined in the local HTCondor configuration
- Fix a race condition that could result in removed jobs being put
on hold

* Mon Jun 15 2020 Brian Lin <[email protected]> - 3.4.2-1
- Replace APEL uploader SchedD cron with init and systemd services
Expand Down

0 comments on commit bf3e851

Please sign in to comment.