From bf3e85114357e27630c0d287f8805f5f13375eba Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 1 Jul 2020 16:04:11 -0500 Subject: [PATCH] Do not put removed jobs on hold Without excluding removed jobs, this hold expression can result in a race condition where jobs are removed and immediately put back on hold so that they remain in the queue and oscillate between these two states. --- config/01-ce-router-defaults.conf.in | 4 ++-- config/01-ce-router.conf.in | 4 ++-- rpm/htcondor-ce.spec | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/config/01-ce-router-defaults.conf.in b/config/01-ce-router-defaults.conf.in index 625031c38..a9a596bb2 100644 --- a/config/01-ce-router-defaults.conf.in +++ b/config/01-ce-router-defaults.conf.in @@ -35,12 +35,12 @@ GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE = $(CONDORCE_MAX_JOBS) # Only route jobs with a valid, non-expired proxy for either the vanilla or standard universe. JOB_ROUTER_SOURCE_JOB_CONSTRAINT = (target.x509userproxysubject =!= UNDEFINED) && (target.x509UserProxyExpiration =!= UNDEFINED) && (time() < target.x509UserProxyExpiration) && (target.JobUniverse =?= 5 || target.JobUniverse =?= 1) -# Put jobs on hold if they meet any of the following requirements +# Put jobs on hold if they are not in the removed state and meet any of the following requirements: # 1. Is missing a proxy subject or proxy expiration date. # 2. Has an expired proxy and is not running # 3. It has not been routed by the CE and is not a standard, vanilla, scheduler, or local job. # 4. 
It has not been routed by the CE and has been idle for 30+ min -SYSTEM_PERIODIC_HOLD = (x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true) +SYSTEM_PERIODIC_HOLD = (JobStatus != 3) && ((x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true)) SYSTEM_PERIODIC_HOLD_REASON = \ strcat("HTCondor-CE held job due to ", \ diff --git a/config/01-ce-router.conf.in b/config/01-ce-router.conf.in index ba113b81b..a725e3c8e 100644 --- a/config/01-ce-router.conf.in +++ b/config/01-ce-router.conf.in @@ -31,12 +31,12 @@ CONDORCE_MAX_JOBS = 10000 # Only route jobs with a valid, non-expired proxy for either the vanilla or standard universe. JOB_ROUTER_SOURCE_JOB_CONSTRAINT = (target.x509userproxysubject =!= UNDEFINED) && (target.x509UserProxyExpiration =!= UNDEFINED) && (time() < target.x509UserProxyExpiration) && (target.JobUniverse =?= 5 || target.JobUniverse =?= 1) -# Put jobs on hold if they meet any of the following requirements +# Put jobs on hold if they are not in the removed state and meet any of the following requirements: # 1. Is missing a proxy subject or proxy expiration date. # 2. Has an expired proxy and is not running # 3. It has not been routed by the CE and is not a standard, vanilla, scheduler, or local job. # 4. 
It has not been routed by the CE and has been idle for 30+ min -SYSTEM_PERIODIC_HOLD = (x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true) +SYSTEM_PERIODIC_HOLD = (JobStatus != 3) && ((x509userproxysubject =?= UNDEFINED) || (x509UserProxyExpiration =?= UNDEFINED) || (time() > x509UserProxyExpiration && JobStatus =!= 2) || (RoutedBy is null && JobUniverse =!= 1 && JobUniverse =!= 5 && JobUniverse =!= 7 && JobUniverse =!= 12) || ((JobStatus =?= 1 && time() - EnteredCurrentStatus > 1800) && RoutedToJobId is null && RoutedJob =!= true)) SYSTEM_PERIODIC_HOLD_REASON = \ strcat("HTCondor-CE held job due to ", \ diff --git a/rpm/htcondor-ce.spec b/rpm/htcondor-ce.spec index 20625279b..e0933b53f 100644 --- a/rpm/htcondor-ce.spec +++ b/rpm/htcondor-ce.spec @@ -554,6 +554,8 @@ install -m 0755 -d -p $RPM_BUILD_ROOT/%{_sysconfdir}/condor-ce/bosco_override * Tue Jun 23 2020 Brian Lin - 3.4.3-1 - Fix a stacktrace with the BDII provider when `HTCONDORCE_SPEC` isn't defined in the local HTCondor configuration +- Fix a race condition that could result in removed jobs being put + on hold * Mon Jun 15 2020 Brian Lin - 3.4.2-1 - Replace APEL uploader SchedD cron with init and systemd services