Skip to content

Commit

Permalink
01.02.rc0
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew-McNab-UK committed May 23, 2024
1 parent 350abf2 commit 7d4a9cd
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 19 deletions.
30 changes: 20 additions & 10 deletions agents/justin-job-factory
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,14 @@ def submitJobs(numberToSubmit,
desiredEntryNames,
requestedProcessors,
requestedRssBytes,
requestedWallSeconds):
requestedWallSeconds,
requestedGPUs):

# If targetting one site, then firstSiteID should be non-zero
# desiredSiteNames is a comma separated list of jobsub style site names
# to match GLIDEIN_Site classads and xc`others used by GlideInWMS

logLine('submitJobs %d %s %s %s %s w%ds%d %d %s %s %s %d %d %d' %
logLine('submitJobs %d %s %s %s %s w%ds%d %d %s %s %s %d %d %d %d' %
(numberToSubmit,
userName,
needsInnerApptainer,
Expand All @@ -106,7 +107,8 @@ def submitJobs(numberToSubmit,
desiredEntryNames,
requestedProcessors,
requestedRssBytes,
requestedWallSeconds))
requestedWallSeconds,
requestedGPUs))
try:
wrapperText = open('/var/lib/justin/justin-wrapper-job','r').read()
except Exception as e:
Expand Down Expand Up @@ -185,6 +187,9 @@ def submitJobs(numberToSubmit,
% (justin.proDev.upper(),
workflowID, stageID, desiredEntryNames))

if requestedGPUs:
submitFile += '+RequestGPUs = 1\n'

submitFile += """
+Desired_Sites = "%s"
+SingularityImage = "/cvmfs/singularity.opensciencegrid.org/fermilab/fnal-wn-sl7:latest"
Expand Down Expand Up @@ -286,6 +291,7 @@ queue %d
'stage_id=%d,'
'site_id=%d,'
'requested_processors=%d,'
'requested_gpus=%d,'
'requested_rss_bytes=%d,'
'requested_wall_seconds=%d,'
'submitted_time=NOW(),'
Expand All @@ -297,6 +303,7 @@ queue %d
stageID,
firstSiteID,
requestedProcessors,
requestedGPUs,
requestedRssBytes,
requestedWallSeconds,
justinJobSecret,
Expand All @@ -319,7 +326,7 @@ queue %d
if firstSiteID:
try:
justin.cur.execute('UPDATE sites SET last_submitted_time=NOW() '
'WHERE site_id=%d' % firstSiteID)
'WHERE site_id=%d' % firstSiteID)

except Exception as e:
# But try to keep going
Expand All @@ -335,7 +342,8 @@ def workflowJobs():
try:
stagesRows = justin.select(
'SELECT workflows.workflow_id,stages.stage_id,max_distance,'
'wlcg_group_name,stages.processors,stages.wall_seconds,stages.rss_bytes,'
'wlcg_group_name,stages.processors,stages.needs_gpu,'
'stages.wall_seconds,stages.rss_bytes,'
'principal_names.principal_name,condor_group_name,jobscript_image '
'FROM workflows '
'LEFT JOIN stages ON stages.workflow_id=workflows.workflow_id '
Expand All @@ -344,9 +352,8 @@ def workflowJobs():
'LEFT JOIN users ON users.user_id=workflows.user_id '
'LEFT JOIN principal_names '
'ON principal_names.user_id=users.main_pn_id '
# GET condor_group_id FROM workflows ONCE ALL ACTIVE WORKFLOWS HAVE THEM
'LEFT JOIN condor_groups '
'ON condor_groups.condor_group_id=scopes.condor_group_id '
'ON condor_groups.condor_group_id=workflows.condor_group_id '
'WHERE workflows.state="running" AND workflows.workflow_id<>%d '
'ORDER BY workflows.workflow_id'
% justin.awtWorkflowID, showQuery = False
Expand Down Expand Up @@ -411,12 +418,13 @@ def workflowJobs():
"AND NOT storages.decommissioned "
"AND sites.site_id IS NOT NULL "
"AND sites.enabled "
"AND NOT entries.entry_has_gpus "
"AND %s entries.entry_has_gpus "
"%s "
"ORDER BY sites_storages.distance,files.file_id "
% (stageRow['max_distance'],
stageRow['workflow_id'],
stageRow['stage_id'],
'' if stageRow['needs_gpu'] else 'NOT',
'' if (stageRow['wlcg_group_name'] == '/dune/production')
else 'AND entries.always_inner_apptainer'))

Expand Down Expand Up @@ -485,7 +493,8 @@ def workflowJobs():
desiredEntryNames = desiredEntryNames,
requestedProcessors = stageRow['processors'],
requestedRssBytes = stageRow['rss_bytes'],
requestedWallSeconds = stageRow['wall_seconds'])
requestedWallSeconds = stageRow['wall_seconds'],
requestedGPUs = int(stageRow['needs_gpu']))

def awtJobsToSites():
# Go through the sites submitting AWT jobs
Expand Down Expand Up @@ -514,7 +523,8 @@ def awtJobsToSites():
desiredEntryNames = '',
requestedProcessors = 1,
requestedRssBytes = 1024 * 1024 * 1024,
requestedWallSeconds = 3600)
requestedWallSeconds = 3600,
requestedGPUs = 0)

def oneCycle():
# Update the database with the states of submitted jobsub jobs
Expand Down
6 changes: 5 additions & 1 deletion commands/justin
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import argparse
import platform

# The make-justin-tag script looks for this and updates it - so format matters
versionNumber = '01.01.rc7'
versionNumber = '01.02.rc0'
sessionFile = '/var/tmp/justin.session.' + str(os.getuid())

def body(buf):
Expand Down Expand Up @@ -138,6 +138,10 @@ parser.add_argument("--processors",
type = int,
help = "Number of processors required")

parser.add_argument("--gpu",
action = "store_true",
help = "Require a GPU")

parser.add_argument("--wall-seconds",
type = int,
help = "Maximum wall seconds")
Expand Down
9 changes: 6 additions & 3 deletions commands/justin.1
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ process.
.HP
.B "create-stage --workflow-id ID --stage-id ID
.B --jobscript FILENAME|--jobscript-git ORG/PATH:TAG
.B [--wall-seconds N] [--rss-mib N] [--processors N] [--max-distance DIST]
.B [--output-pattern PATTERN[:DESTINATION]]
.B [--wall-seconds N] [--rss-mib N] [--processors N] [--gpu]
.B [--max-distance DIST] [--output-pattern PATTERN[:DESTINATION]]
.B [--output-pattern-next-stage PATTERN[:DATASET]] [--output-rse NAME]
.B [--lifetime-days DAYS] [--env NAME=VALUE] [--classad NAME=VALUE]
.br
Expand Down Expand Up @@ -149,6 +149,9 @@ If the script can make use of multiple processors then
.B --processors
can be used to give the number needed, with a default of 1 if not given. The
value used is available to jobscripts as $JUSTIN_PROCESSORS.
If given then
.B --gpu
will require that jobs for this stage have access to a GPU.

By default, input files will only be allocated to a script which are on
storages at the same site (distance=0). This can be changed by setting
Expand Down Expand Up @@ -227,7 +230,7 @@ this stage.
.B [--refind-interval-hours HOURS]
.B --jobscript FILENAME|--jobscript-git ORG/PATH:TAG
.B [--wall-seconds N]
.B [--rss-mib N] [--processors N] [--max-distance DIST]
.B [--rss-mib N] [--processors N] [--gpu] --max-distance DIST]
.B [--output-pattern PATTERN[:DESTINATION]] [--output-rse NAME]
.B [--lifetime-days DAYS] [--env NAME=VALUE] [--classad NAME=VALUE]
.br
Expand Down
4 changes: 4 additions & 0 deletions dashboard/justin-wsgi-dashboard
Original file line number Diff line number Diff line change
Expand Up @@ -1407,6 +1407,7 @@ def showStage(environ, user, cgiValues):
query = ("SELECT "
"stages.stage_priority,"
"stages.processors,"
"stages.gpus,"
"stages.wall_seconds,"
"stages.rss_bytes,"
"stages.max_distance,"
Expand Down Expand Up @@ -1441,6 +1442,9 @@ def showStage(environ, user, cgiValues):
output += ('<tr><td>Processors</td>'
'<td>%s</td></tr>' % stageRow["processors"])

if stageRow["gpus"]:
output += ('<tr><td>GPU required</td><td>Yes</td></tr>')

output += ('<tr><td>Wall seconds</td>'
'<td>%s</td></tr>' % stageRow["wall_seconds"])

Expand Down
6 changes: 4 additions & 2 deletions database/justindb-create-tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ CREATE TABLE IF NOT EXISTS `jobs` (
`rss_bytes` bigint unsigned NOT NULL DEFAULT 0,
`requested_rss_bytes` bigint unsigned NOT NULL DEFAULT 0,
`processors` tinyint unsigned NOT NULL DEFAULT 0,
`requested_processors` tinyint unsigned NOT NULL DEFAULT 0,
`requested_processors` tinyint unsigned NOT NULL DEFAULT 1,
`requested_gpus` tinyint unsigned NOT NULL DEFAULT 1,
`wall_seconds` mediumint unsigned NOT NULL DEFAULT 0,
`requested_wall_seconds` mediumint unsigned NOT NULL DEFAULT 0,
`justin_job_secret` varchar(255) NOT NULL DEFAULT '',
Expand Down Expand Up @@ -284,7 +285,8 @@ CREATE TABLE IF NOT EXISTS `stages` (
`workflow_id` mediumint(8) unsigned NOT NULL,
`stage_id` tinyint(3) unsigned NOT NULL DEFAULT 1,
`stage_priority` tinyint(3) unsigned NOT NULL DEFAULT 0,
`processors` tinyint(3) unsigned NOT NULL,
`processors` tinyint(3) unsigned NOT NULL DEFAULT 1,
`needs_gpu` tinyint(3) unsigned NOT NULL DEFAULT 0,
`jobscript_git` varchar(255) NOT NULL DEFAULT '',
`jobscript_image` varchar(255) NOT NULL DEFAULT '',
`wall_seconds` mediumint(8) unsigned DEFAULT NULL,
Expand Down
2 changes: 1 addition & 1 deletion modules/justin_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# This file must by valid Python AND valid Bash!
justinVersion='01.01.rc7'
justinVersion='01.02.rc0'
8 changes: 7 additions & 1 deletion services/justin-wsgi-ui
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,11 @@ def createStage(jsonDict, user, simple = False, scopeName = None):
except:
processors = 1

try:
needsGPU = jsonDict['gpu']
except:
needsGPU = False

try:
wallSeconds = int(jsonDict['wall_seconds'])
except:
Expand Down Expand Up @@ -850,13 +855,14 @@ def createStage(jsonDict, user, simple = False, scopeName = None):
'jobscript_git="%s",'
'jobscript_image="%s",'
'processors=%d,'
'needs_gpu=%s,'
'wall_seconds=%d,'
'rss_bytes=%d,'
'max_distance=%f'
% (workflowID, stageID,
jobscriptGit,
jobscriptImage,
processors, wallSeconds,
processors, needsGPU, wallSeconds,
rssBytes, maxDistance))

justin.insertUpdate('INSERT INTO stages_jobscripts SET '
Expand Down
2 changes: 1 addition & 1 deletion testing/make-justin-ups
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
# testing/make-justin-ups --default
#

export JUSTIN_VERSION=01.01.rc7
export JUSTIN_VERSION=01.02.rc0

export MJU_GIT_DIR=`mktemp -d /tmp/mju_git_XXXXXX`
( cd $MJU_GIT_DIR
Expand Down

0 comments on commit 7d4a9cd

Please sign in to comment.