From 666d1b743778a0bc1576287798f9707bb6033e22 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Sun, 8 Dec 2024 16:50:44 -0500 Subject: [PATCH 01/14] Experimental commit in special branch Adding gcloud support --- BrainPortal/lib/scir.rb | 2 +- BrainPortal/lib/scir_gcloud_batch.rb | 216 +++++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 BrainPortal/lib/scir_gcloud_batch.rb diff --git a/BrainPortal/lib/scir.rb b/BrainPortal/lib/scir.rb index 9b4d0a74c..5f9904e3d 100644 --- a/BrainPortal/lib/scir.rb +++ b/BrainPortal/lib/scir.rb @@ -214,7 +214,7 @@ class JobTemplate #:nodoc: # We only support a subset of DRMAA's job template attr_accessor :name, :command, :arg, :wd, :stdin, :stdout, :stderr, :join, - :queue, :walltime, :memory, + :queue, :walltime, :memory, # walltime is in seconds, memory in megabytes :tc_extra_qsub_args, :task_id def revision_info #:nodoc: diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb new file mode 100644 index 000000000..0fa002454 --- /dev/null +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -0,0 +1,216 @@ + +# +# CBRAIN Project +# +# Copyright (C) 2008-2024 +# The Royal Institution for the Advancement of Learning +# McGill University +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# This particular subclass of class Scir implements the SLURM interface. +class ScirGcloudBatch < Scir + + Revision_info=CbrainFileRevision[__FILE__] #:nodoc: + + class Session < Scir::Session #:nodoc: + + def update_job_info_cache #:nodoc: + out_text, err_text = bash_this_and_capture_out_err( + # the '%A' format returns the job ID + # the '%t' format returns the status with the one or two letter codes. + "gcloud batch jobs list #{gcloud_location()}" + ) + raise "Cannot get output of 'squeue'" if err_text.present? + out_lines = out_text.split("\n") + @job_info_cache = {} + #NAME LOCATION STATE + #projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/transcode northamerica-northeast1 FAILED + #projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/test3 northamerica-northeast1 SUCCEEDED + #projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/tr1 northamerica-northeast1 FAILED + # In a real deploy, all jobs IDs will be 'cbrain-{task.id}-{task.run_number}' + out_lines.each do |line| + job_path, job_location, job_status = line.split(/\s+/) + next unless job_path.present? && job_status.present? + job_id = Pathname.new(job_path).basename + state = statestring_to_stateconst(job_status) + @job_info_cache[job_id] = { :drmaa_state => state } + end + true + end + + def statestring_to_stateconst(state) #:nodoc: + return Scir::STATE_RUNNING if state =~ /RUNNING/i + return Scir::STATE_QUEUED_ACTIVE if state =~ /SCHEDULED/i + return Scir::STATE_DONE if state =~ /COMPLETED/i + return Scir::STATE_FAILED if state =~ /FAILED/i + return Scir::STATE_UNDETERMINED + end + + def hold(jid) #:nodoc: + raise "There is no 'hold' action available for GCLOUD clusters" + end + + def release(jid) #:nodoc: + raise "There is no 'release' action available for GCLOUD clusters" + end + + def suspend(jid) #:nodoc: + raise "There is no 'suspend' action available for GCLOUD clusters" + end + + def resume(jid) #:nodoc: + raise "There is no 'resume' action available for GCLOUD clusters" + end + + def terminate(jid) #:nodoc: + out = IO.popen("gcloud batch jobs delete #{gcloud_location()} #{shell_escape(jid)} 2>&1","r") { |i| i.read } + #raise "Error deleting: #{out.join("\n")}" if whatever TODO + return + end + + def gcloud_location + #TODO better + "--location northamerica-northeast1" + end + + def queue_tasks_tot_max #:nodoc: + # Not Yet Implemented + [ "unknown", "unknown" ] + end + + private + + def qsubout_to_jid(txt) #:nodoc: + struct = YAML.load(txt) + fullname = struct['name'] # "projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/cbrain-123-1" + Pathname.new(fullname).basename # cbrain-123-1 + end + + end + + class JobTemplate < Scir::JobTemplate #:nodoc: + + def bucket_name + "bianca-9945788255514" + end + + def bucket_mount_point + "/mnt/disks/share" + end + + # Note: CBRAIN's 'queue' name is interpreted as SLURM's 'partition'. + def qsub_command #:nodoc: + raise "Error, this class only handle 'command' as /bin/bash and a single script in 'arg'" unless + self.command == "/bin/bash" && self.arg.size == 1 + raise "Error: stdin not supported" if self.stdin + raise "Error: name is required" if self.name.blank? + raise "Error: name must be made of alphanums and dashes" if self.name !~ /\A[a-zA-Z][\w\-]*\w\z/ + + command = "gcloud batch jobs submit #{self.name} #{gcloud_location} " + command += "#{self.tc_extra_qsub_args} " if self.tc_extra_qsub_args.present? + command += "#{Scir.cbrain_config[:extra_qsub_args]} " if Scir.cbrain_config[:extra_qsub_args].present? + + script_name = self.arg[0] + script_command = "" + script_command += "cd #{shell_escape(self.wd)} && " if self.wd.present? + script_command += "bash #{shell_escape(script_name)} " + script_command += "1> #{shell_escape(self.stdout.sub(/\A:/,""))} " if self.stdout.present? + script_command += "2> #{shell_escape(self.stderr.sub(/\A:/,""))} " if self.stderr.present? + + walltime = self.walltime.presence || 600 # seconds + memory = self.memory.presence || 2000 # mb + + json_config_text = json_cloud_batch_jobs_config( + script_command, + (walltime * 4000), # it's in millisecond, so that's 4 times the walltime in seconds + "#{memory}m", + bucket_name(), + bucket_mount_point(), + ) + + # Write the json config to a file; use a name unique enough for the current submission, + # bu we can crush at a later date too. Maybe use job name?!? + pid_threadid = "#{Process.pid}-#{Thread.current.object_id}" + json_tmp_config_file = "/tmp/job_submit-#{pid_threadid}.json" + File.open(json_tmp_config_file,"w") { |fh| wh.write json_config_text } + + command += "--config #{json_tmp_config_file} 2>/dev/null" # we must ignore the friendly message line in stderr + + return command + end + + def json_cloud_batch_jobs_config(command, maxcpu_ms, maxmem_mb, bucket_name, mount_point, wallime_s) + struct = struct_gcloud_batch_jobs_config_template.dup + task_spec = struct["taskGroups"][0]["taskSpec"] + task_spec["runnables"][0]["script"]["text"] = command + task_spec["computeResource"]["cpuMilli"] = maxcpu_ms + task_spec["computeResource"]["memoryMib"] = maxmem_mb + task_spec["volumes"][0]["gcs"]["remotePath"] = bucket_name + task_spec["volumes"][0]["mountPath"] = mount_point + task_spec["maxRunDuration"] = wallime_s + struct.to_json + end + + def struct_gcloud_batch_jobs_config_template + { + "taskGroups" => [ + { + "taskSpec" => { + "runnables" => [ + { + "script" => { + "text" => "COMMAND_ON_NODE_HERE", + } + } + ], + "computeResource" => { + "cpuMilli" => 2000, + "memoryMib" => 2048, + }, + "volumes" => [ + { + "gcs" => { + "remotePath" => "BUCKET_NAME_HERE", + }, + "mountPath" => "BUCKET_MOUNT_PATH_HERE", + } + ], + "maxRetryCount" => 1, + "maxRunDuration" => "WALLTIME_HERE", + }, + "taskCount" => 1, + "parallelism" => 1 + } + ], + "allocationPolicy" => { + "instances" => [ + { + "policy" => { + "machineType" => "n2d-standard-4", + "provisioningModel" => "SPOT", + } + } + ] + }, + "logsPolicy" => { + "destination" => "CLOUD_LOGGING", + } + }.freeze + end + + end # class JobTemplate + +end # class ScirGcloudBatch From 7ecbb256a540484333f7fcebc88595a4e8fdd48d Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Sun, 8 Dec 2024 16:55:36 -0500 Subject: [PATCH 02/14] Added missing symlink --- Bourreau/lib/scir_gcloud_batch.rb | 1 + 1 file changed, 1 insertion(+) create mode 120000 Bourreau/lib/scir_gcloud_batch.rb diff --git a/Bourreau/lib/scir_gcloud_batch.rb b/Bourreau/lib/scir_gcloud_batch.rb new file mode 120000 index 000000000..8556e4f56 --- /dev/null +++ b/Bourreau/lib/scir_gcloud_batch.rb @@ -0,0 +1 @@ +../../BrainPortal/lib/scir_gcloud_batch.rb \ No newline at end of file From 912a6fa9dc4a3ed802ca538c5ab143af946907cf Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Sun, 8 Dec 2024 17:15:45 -0500 Subject: [PATCH 03/14] Fix --- BrainPortal/lib/scir_gcloud_batch.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 0fa002454..98deb8b61 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -111,6 +111,11 @@ def bucket_mount_point "/mnt/disks/share" end + def gcloud_location + #TODO better + "--location northamerica-northeast1" + end + # Note: CBRAIN's 'queue' name is interpreted as SLURM's 'partition'. def qsub_command #:nodoc: raise "Error, this class only handle 'command' as /bin/bash and a single script in 'arg'" unless From 6bae185b28691a680aca71217ea0c87e4ff83521 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Sun, 8 Dec 2024 17:40:29 -0500 Subject: [PATCH 04/14] More fixes --- BrainPortal/lib/scir_gcloud_batch.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 98deb8b61..615c39a40 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -108,7 +108,7 @@ def bucket_name end def bucket_mount_point - "/mnt/disks/share" + "/mnt/cbrain" end def gcloud_location @@ -124,7 +124,7 @@ def qsub_command #:nodoc: raise "Error: name is required" if self.name.blank? raise "Error: name must be made of alphanums and dashes" if self.name !~ /\A[a-zA-Z][\w\-]*\w\z/ - command = "gcloud batch jobs submit #{self.name} #{gcloud_location} " + command = "gcloud batch jobs submit #{self.name.downcase} #{gcloud_location} " command += "#{self.tc_extra_qsub_args} " if self.tc_extra_qsub_args.present? command += "#{Scir.cbrain_config[:extra_qsub_args]} " if Scir.cbrain_config[:extra_qsub_args].present? @@ -140,10 +140,10 @@ def qsub_command #:nodoc: json_config_text = json_cloud_batch_jobs_config( script_command, - (walltime * 4000), # it's in millisecond, so that's 4 times the walltime in seconds - "#{memory}m", + memory, bucket_name(), bucket_mount_point(), + walltime, ) # Write the json config to a file; use a name unique enough for the current submission, @@ -157,15 +157,15 @@ def qsub_command #:nodoc: return command end - def json_cloud_batch_jobs_config(command, maxcpu_ms, maxmem_mb, bucket_name, mount_point, wallime_s) + def json_cloud_batch_jobs_config(command, maxmem_mb, bucket_name, mount_point, walltime_s) struct = struct_gcloud_batch_jobs_config_template.dup task_spec = struct["taskGroups"][0]["taskSpec"] task_spec["runnables"][0]["script"]["text"] = command - task_spec["computeResource"]["cpuMilli"] = maxcpu_ms + task_spec["computeResource"]["cpuMilli"] = 2000, # 1000 per core task_spec["computeResource"]["memoryMib"] = maxmem_mb task_spec["volumes"][0]["gcs"]["remotePath"] = bucket_name task_spec["volumes"][0]["mountPath"] = mount_point - task_spec["maxRunDuration"] = wallime_s + task_spec["maxRunDuration"] = "#{walltime_s}s" struct.to_json end From ca7e9218f105c44d2acc3418c1081d766a9faab1 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Sun, 8 Dec 2024 17:42:25 -0500 Subject: [PATCH 05/14] More fix --- BrainPortal/lib/scir_gcloud_batch.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 615c39a40..f6726e11e 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -150,7 +150,7 @@ def qsub_command #:nodoc: # bu we can crush at a later date too. Maybe use job name?!? pid_threadid = "#{Process.pid}-#{Thread.current.object_id}" json_tmp_config_file = "/tmp/job_submit-#{pid_threadid}.json" - File.open(json_tmp_config_file,"w") { |fh| wh.write json_config_text } + File.open(json_tmp_config_file,"w") { |fh| fh.write json_config_text } command += "--config #{json_tmp_config_file} 2>/dev/null" # we must ignore the friendly message line in stderr From 040ed3f1af4725659f01b91aae2dbfa010fa7db0 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Mon, 9 Dec 2024 10:49:35 -0500 Subject: [PATCH 06/14] More fix --- BrainPortal/lib/scir_gcloud_batch.rb | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index f6726e11e..1a7fb5969 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -95,8 +95,12 @@ def queue_tasks_tot_max #:nodoc: def qsubout_to_jid(txt) #:nodoc: struct = YAML.load(txt) - fullname = struct['name'] # "projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/cbrain-123-1" - Pathname.new(fullname).basename # cbrain-123-1 + fullname = struct['name'] # "projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/cbrain-123-1-092332" + Pathname.new(fullname).basename # cbrain-123-1-092332 + rescue => ex + raise "Cannot find job ID from 'gcloud batch jobs submit' output. Text was blank" if txt.blank? + File.open("/tmp/debug.submit_error.txt","a") { |fh| fh.write("\n----\n#{txt}") } + raise "Cannot find job ID from 'gcloud batch jobs submit' output." end end @@ -116,7 +120,6 @@ def gcloud_location "--location northamerica-northeast1" end - # Note: CBRAIN's 'queue' name is interpreted as SLURM's 'partition'. def qsub_command #:nodoc: raise "Error, this class only handle 'command' as /bin/bash and a single script in 'arg'" unless self.command == "/bin/bash" && self.arg.size == 1 @@ -124,6 +127,10 @@ def qsub_command #:nodoc: raise "Error: name is required" if self.name.blank? raise "Error: name must be made of alphanums and dashes" if self.name !~ /\A[a-zA-Z][\w\-]*\w\z/ + # The name is the job ID, so we need a distinct suffix even for the same task + name = name[0..50] if name.size > 50 + name = name + DateTime.now.strftime("-%H%M%S") # this should be good enough + command = "gcloud batch jobs submit #{self.name.downcase} #{gcloud_location} " command += "#{self.tc_extra_qsub_args} " if self.tc_extra_qsub_args.present? command += "#{Scir.cbrain_config[:extra_qsub_args]} " if Scir.cbrain_config[:extra_qsub_args].present? @@ -161,7 +168,7 @@ def json_cloud_batch_jobs_config(command, maxmem_mb, bucket_name, mount_point, w struct = struct_gcloud_batch_jobs_config_template.dup task_spec = struct["taskGroups"][0]["taskSpec"] task_spec["runnables"][0]["script"]["text"] = command - task_spec["computeResource"]["cpuMilli"] = 2000, # 1000 per core + task_spec["computeResource"]["cpuMilli"] = 2000 # 1000 per core task_spec["computeResource"]["memoryMib"] = maxmem_mb task_spec["volumes"][0]["gcs"]["remotePath"] = bucket_name task_spec["volumes"][0]["mountPath"] = mount_point From 43e67c02f4ff5692077dbd7344d1903e19d3927c Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Tue, 10 Dec 2024 14:17:36 -0500 Subject: [PATCH 07/14] More adjustments. --- BrainPortal/lib/scir_gcloud_batch.rb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 1a7fb5969..734996491 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -44,7 +44,7 @@ def update_job_info_cache #:nodoc: out_lines.each do |line| job_path, job_location, job_status = line.split(/\s+/) next unless job_path.present? && job_status.present? - job_id = Pathname.new(job_path).basename + job_id = Pathname.new(job_path).basename.to_s state = statestring_to_stateconst(job_status) @job_info_cache[job_id] = { :drmaa_state => state } end @@ -53,6 +53,7 @@ def update_job_info_cache #:nodoc: def statestring_to_stateconst(state) #:nodoc: return Scir::STATE_RUNNING if state =~ /RUNNING/i + return Scir::STATE_QUEUED_ACTIVE if state =~ /QUEUED/i return Scir::STATE_QUEUED_ACTIVE if state =~ /SCHEDULED/i return Scir::STATE_DONE if state =~ /COMPLETED/i return Scir::STATE_FAILED if state =~ /FAILED/i @@ -96,7 +97,7 @@ def queue_tasks_tot_max #:nodoc: def qsubout_to_jid(txt) #:nodoc: struct = YAML.load(txt) fullname = struct['name'] # "projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/cbrain-123-1-092332" - Pathname.new(fullname).basename # cbrain-123-1-092332 + Pathname.new(fullname).basename.to_s # cbrain-123-1-092332 rescue => ex raise "Cannot find job ID from 'gcloud batch jobs submit' output. Text was blank" if txt.blank? File.open("/tmp/debug.submit_error.txt","a") { |fh| fh.write("\n----\n#{txt}") } @@ -128,10 +129,11 @@ def qsub_command #:nodoc: raise "Error: name must be made of alphanums and dashes" if self.name !~ /\A[a-zA-Z][\w\-]*\w\z/ # The name is the job ID, so we need a distinct suffix even for the same task - name = name[0..50] if name.size > 50 - name = name + DateTime.now.strftime("-%H%M%S") # this should be good enough + gname = self.name.downcase + gname = gname[0..50] if gname.size > 50 + gname = gname + DateTime.now.strftime("-%H%M%S") # this should be good enough - command = "gcloud batch jobs submit #{self.name.downcase} #{gcloud_location} " + command = "gcloud batch jobs submit #{gname} #{gcloud_location} " command += "#{self.tc_extra_qsub_args} " if self.tc_extra_qsub_args.present? command += "#{Scir.cbrain_config[:extra_qsub_args]} " if Scir.cbrain_config[:extra_qsub_args].present? From fd87d287ebc838032d0e9a1ad2b2c664b285deff Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Tue, 7 Jan 2025 12:46:53 -0500 Subject: [PATCH 08/14] Adjusted scir_gcloud with hardcoded image name --- BrainPortal/lib/scir_gcloud_batch.rb | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 734996491..82677b44f 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -121,6 +121,10 @@ def gcloud_location "--location northamerica-northeast1" end + def compute_node_image_name + "cbrain-compute-node-2-image" + end + def qsub_command #:nodoc: raise "Error, this class only handle 'command' as /bin/bash and a single script in 'arg'" unless self.command == "/bin/bash" && self.arg.size == 1 @@ -153,6 +157,7 @@ def qsub_command #:nodoc: bucket_name(), bucket_mount_point(), walltime, + compute_node_image_name, ) # Write the json config to a file; use a name unique enough for the current submission, @@ -166,8 +171,9 @@ def qsub_command #:nodoc: return command end - def json_cloud_batch_jobs_config(command, maxmem_mb, bucket_name, mount_point, walltime_s) + def json_cloud_batch_jobs_config(command, maxmem_mb, bucket_name, mount_point, walltime_s, compute_node_image_name) struct = struct_gcloud_batch_jobs_config_template.dup + task_spec = struct["taskGroups"][0]["taskSpec"] task_spec["runnables"][0]["script"]["text"] = command task_spec["computeResource"]["cpuMilli"] = 2000 # 1000 per core @@ -175,10 +181,15 @@ def json_cloud_batch_jobs_config(command, maxmem_mb, bucket_name, mount_point, w task_spec["volumes"][0]["gcs"]["remotePath"] = bucket_name task_spec["volumes"][0]["mountPath"] = mount_point task_spec["maxRunDuration"] = "#{walltime_s}s" + + policy = struct["allocationPolicy"]["instances"][0]["policy"] + policy["bootDisk"]["image"] = compute_node_image_name + struct.to_json end def struct_gcloud_batch_jobs_config_template + @_cached_frozen_struct_ ||= { "taskGroups" => [ { @@ -214,7 +225,9 @@ def struct_gcloud_batch_jobs_config_template { "policy" => { "machineType" => "n2d-standard-4", - "provisioningModel" => "SPOT", + "bootDisk" => { + "image" => "COMPUTE_NODE_IMAGE_NAME_HERE", + } } } ] From 8ef158938ca6be8baad4d750c7ffe2d33c86377f Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Tue, 7 Jan 2025 13:31:00 -0500 Subject: [PATCH 09/14] Adjusted name of image --- BrainPortal/lib/scir_gcloud_batch.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 82677b44f..e6c49f7f9 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -122,7 +122,7 @@ def gcloud_location end def compute_node_image_name - "cbrain-compute-node-2-image" + "projects/tidal-reactor-438920-g4/global/images/cbrain-compute-node-2-image" end def qsub_command #:nodoc: From 7b176af050964a487992b51c8fb1c876af007568 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Tue, 7 Jan 2025 14:52:13 -0500 Subject: [PATCH 10/14] Added UID switching --- BrainPortal/lib/scir_gcloud_batch.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index e6c49f7f9..29ef99b48 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -148,6 +148,10 @@ def qsub_command #:nodoc: script_command += "1> #{shell_escape(self.stdout.sub(/\A:/,""))} " if self.stdout.present? script_command += "2> #{shell_escape(self.stderr.sub(/\A:/,""))} " if self.stderr.present? + # Wrapper around the command to switch UID, as normally + # the stupid GoogleCloud batch engine starts everything as root + script_command = "sudo -u #{CBRAIN::Rails_UserName.bash_escape} bash -c #{script_command.bash_escape}" + walltime = self.walltime.presence || 600 # seconds memory = self.memory.presence || 2000 # mb From 4958a1f1da16ba94bd16c6c5e4820e6c68ac7279 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Fri, 7 Feb 2025 14:06:20 -0500 Subject: [PATCH 11/14] No longer using buckets in GCLOUD scir Adjusted hardcoded image name; TODO make it configurable. --- BrainPortal/lib/scir_gcloud_batch.rb | 30 ++++++---------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 29ef99b48..64df3ffd2 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -2,7 +2,7 @@ # # CBRAIN Project # -# Copyright (C) 2008-2024 +# Copyright (C) 2008-2025 # The Royal Institution for the Advancement of Learning # McGill University # @@ -20,7 +20,8 @@ # along with this program. If not, see . # -# This particular subclass of class Scir implements the SLURM interface. +# This particular subclass of class Scir implements a simulated +# cluster system on the Google Cloud Platform's Batch. class ScirGcloudBatch < Scir Revision_info=CbrainFileRevision[__FILE__] #:nodoc: @@ -108,21 +109,14 @@ def qsubout_to_jid(txt) #:nodoc: class JobTemplate < Scir::JobTemplate #:nodoc: - def bucket_name - "bianca-9945788255514" - end - - def bucket_mount_point - "/mnt/cbrain" - end - def gcloud_location #TODO better "--location northamerica-northeast1" end def compute_node_image_name - "projects/tidal-reactor-438920-g4/global/images/cbrain-compute-node-2-image" + #TODO better + "projects/cbrain-449118/zones/northamerica-northeast1-b/disks/cbrain-compute" end def qsub_command #:nodoc: @@ -158,8 +152,6 @@ def qsub_command #:nodoc: json_config_text = json_cloud_batch_jobs_config( script_command, memory, - bucket_name(), - bucket_mount_point(), walltime, compute_node_image_name, ) @@ -175,15 +167,13 @@ def qsub_command #:nodoc: return command end - def json_cloud_batch_jobs_config(command, maxmem_mb, bucket_name, mount_point, walltime_s, compute_node_image_name) + def json_cloud_batch_jobs_config(command, maxmem_mb, walltime_s, compute_node_image_name) struct = struct_gcloud_batch_jobs_config_template.dup task_spec = struct["taskGroups"][0]["taskSpec"] task_spec["runnables"][0]["script"]["text"] = command task_spec["computeResource"]["cpuMilli"] = 2000 # 1000 per core task_spec["computeResource"]["memoryMib"] = maxmem_mb - task_spec["volumes"][0]["gcs"]["remotePath"] = bucket_name - task_spec["volumes"][0]["mountPath"] = mount_point task_spec["maxRunDuration"] = "#{walltime_s}s" policy = struct["allocationPolicy"]["instances"][0]["policy"] @@ -209,14 +199,6 @@ def struct_gcloud_batch_jobs_config_template "cpuMilli" => 2000, "memoryMib" => 2048, }, - "volumes" => [ - { - "gcs" => { - "remotePath" => "BUCKET_NAME_HERE", - }, - "mountPath" => "BUCKET_MOUNT_PATH_HERE", - } - ], "maxRetryCount" => 1, "maxRunDuration" => "WALLTIME_HERE", }, From dcfe65689ea5b1237429270854030de94ed9eac7 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Thu, 13 Feb 2025 13:20:09 -0500 Subject: [PATCH 12/14] Adjusted image for scir_gcloud --- BrainPortal/lib/scir_gcloud_batch.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 64df3ffd2..8af6a7e2b 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -116,7 +116,7 @@ def gcloud_location def compute_node_image_name #TODO better - "projects/cbrain-449118/zones/northamerica-northeast1-b/disks/cbrain-compute" + "projects/cbrain-449118/global/images/cbrain-compute" end def qsub_command #:nodoc: @@ -210,7 +210,6 @@ def struct_gcloud_batch_jobs_config_template "instances" => [ { "policy" => { - "machineType" => "n2d-standard-4", "bootDisk" => { "image" => "COMPUTE_NODE_IMAGE_NAME_HERE", } From ccb3f969d67f43789144ff87a5ebae7f7708c341 Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Thu, 13 Feb 2025 17:17:09 -0500 Subject: [PATCH 13/14] Made ScirGcloudBatch configurable Three variables need to be configured by the CBRAIN admin: GCLOUD_PROJECT=project_id GCLOUD_LOCATION=location GCLOUD_IMAGE_BASENAME=computeimagename These are normally set either at the bourreau level in the 'extra qsub arguments' field, or at the tool config level (also in the extra qesub arguments). They should be put together separated by spaces, e.g. GCLOUD_PROJECT=abcd GCLOUD_LOCATION=northamerica-northeast1-b --- BrainPortal/lib/scir_gcloud_batch.rb | 61 ++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/BrainPortal/lib/scir_gcloud_batch.rb b/BrainPortal/lib/scir_gcloud_batch.rb index 8af6a7e2b..4fe4c524c 100644 --- a/BrainPortal/lib/scir_gcloud_batch.rb +++ b/BrainPortal/lib/scir_gcloud_batch.rb @@ -32,7 +32,7 @@ def update_job_info_cache #:nodoc: out_text, err_text = bash_this_and_capture_out_err( # the '%A' format returns the job ID # the '%t' format returns the status with the one or two letter codes. - "gcloud batch jobs list #{gcloud_location()}" + "gcloud batch jobs list --location #{gcloud_location()}" ) raise "Cannot get output of 'squeue'" if err_text.present? out_lines = out_text.split("\n") @@ -43,7 +43,7 @@ def update_job_info_cache #:nodoc: #projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/tr1 northamerica-northeast1 FAILED # In a real deploy, all jobs IDs will be 'cbrain-{task.id}-{task.run_number}' out_lines.each do |line| - job_path, job_location, job_status = line.split(/\s+/) + job_path, _, job_status = line.split(/\s+/) next unless job_path.present? && job_status.present? job_id = Pathname.new(job_path).basename.to_s state = statestring_to_stateconst(job_status) @@ -78,7 +78,7 @@ def resume(jid) #:nodoc: end def terminate(jid) #:nodoc: - out = IO.popen("gcloud batch jobs delete #{gcloud_location()} #{shell_escape(jid)} 2>&1","r") { |i| i.read } + IO.popen("gcloud batch jobs delete --location #{gcloud_location} #{shell_escape(jid)} 2>&1","r") { |i| i.read } #raise "Error deleting: #{out.join("\n")}" if whatever TODO return end @@ -99,7 +99,7 @@ def qsubout_to_jid(txt) #:nodoc: struct = YAML.load(txt) fullname = struct['name'] # "projects/tidal-reactor-438920-g4/locations/northamerica-northeast1/jobs/cbrain-123-1-092332" Pathname.new(fullname).basename.to_s # cbrain-123-1-092332 - rescue => ex + rescue raise "Cannot find job ID from 'gcloud batch jobs submit' output. Text was blank" if txt.blank? File.open("/tmp/debug.submit_error.txt","a") { |fh| fh.write("\n----\n#{txt}") } raise "Cannot find job ID from 'gcloud batch jobs submit' output." @@ -110,13 +110,44 @@ def qsubout_to_jid(txt) #:nodoc: class JobTemplate < Scir::JobTemplate #:nodoc: def gcloud_location - #TODO better - "--location northamerica-northeast1" + get_config_value_from_extra_qsubs('GCLOUD_LOCATION') end - def compute_node_image_name - #TODO better - "projects/cbrain-449118/global/images/cbrain-compute" + def gcloud_compute_image_basename + get_config_value_from_extra_qsubs('GCLOUD_IMAGE_BASENAME') + end + + def gcloud_project + get_config_value_from_extra_qsubs('GCLOUD_PROJECT') + end + + # This method should not be overriden + def full_compute_node_image_name + "projects/#{gcloud_project}/global/images/#{gcloud_compute_image_basename}" + end + + # The admin is expected to have configured three values + # GCLOUD_PROJECT, GCLOUD_IMAGE_BASENAME and GCLOUD_LOCATION + # either in the Bourreau or ToolConfig levels, within the + # attribute known as 'extra_qsub_args'. The attributes + # should be set with "NAME=VALUE" substrings, separated by blanks. + # Values found at the ToolConfig level have priority. + def get_config_value_from_extra_qsubs(varname) + value = + extract_config_value(varname, self.tc_extra_qsub_args ) || + extract_config_value(varname, Scir.cbrain_config[:extra_qsub_args]) + raise "Missing Gcloud configuration value for '#{varname}'. Add it in extra_qsub_args at the Bourreau or ToolConfig level as '#{varname}=value'" if value.blank? + value + end + + # Given a from_string like + # "GCLOUD_PROJECT=abcde GCLOUD_IMAGE_BASENAME=baseim GCLOUD_LOCATION=westofhere" + # and a varname like "GCLOUD_PROJECT", this method returns the value, "abcde". + def extract_config_value(varname, from_string) + return nil if from_string.blank? + search_val = Regexp.new('\b' + Regexp.escape(varname) + '\s*=\s*(\w[\.\w-]+)', Regexp::IGNORECASE) + return Regexp.last_match[1] if from_string.match(search_val) + nil end def qsub_command #:nodoc: @@ -131,9 +162,7 @@ def qsub_command #:nodoc: gname = gname[0..50] if gname.size > 50 gname = gname + DateTime.now.strftime("-%H%M%S") # this should be good enough - command = "gcloud batch jobs submit #{gname} #{gcloud_location} " - command += "#{self.tc_extra_qsub_args} " if self.tc_extra_qsub_args.present? - command += "#{Scir.cbrain_config[:extra_qsub_args]} " if Scir.cbrain_config[:extra_qsub_args].present? + command = "gcloud batch jobs submit #{gname} --location #{gcloud_location} " script_name = self.arg[0] script_command = "" @@ -153,7 +182,7 @@ def qsub_command #:nodoc: script_command, memory, walltime, - compute_node_image_name, + full_compute_node_image_name, ) # Write the json config to a file; use a name unique enough for the current submission, @@ -162,12 +191,12 @@ def qsub_command #:nodoc: json_tmp_config_file = "/tmp/job_submit-#{pid_threadid}.json" File.open(json_tmp_config_file,"w") { |fh| fh.write json_config_text } - command += "--config #{json_tmp_config_file} 2>/dev/null" # we must ignore the friendly message line in stderr + command += "--config #{json_tmp_config_file} 2>/dev/null && rm -f #{json_tmp_config_file}" # we must ignore the friendly message line in stderr return command end - def json_cloud_batch_jobs_config(command, maxmem_mb, walltime_s, compute_node_image_name) + def json_cloud_batch_jobs_config(command, maxmem_mb, walltime_s, full_compute_node_image_name) struct = struct_gcloud_batch_jobs_config_template.dup task_spec = struct["taskGroups"][0]["taskSpec"] @@ -177,7 +206,7 @@ def json_cloud_batch_jobs_config(command, maxmem_mb, walltime_s, compute_node_im task_spec["maxRunDuration"] = "#{walltime_s}s" policy = struct["allocationPolicy"]["instances"][0]["policy"] - policy["bootDisk"]["image"] = compute_node_image_name + policy["bootDisk"]["image"] = full_compute_node_image_name struct.to_json end From cc77c06e6af8578939e56f8523df26b9c40a8eaf Mon Sep 17 00:00:00 2001 From: Pierre Rioux Date: Thu, 13 Feb 2025 17:23:52 -0500 Subject: [PATCH 14/14] Added Gcloud option for new Bourreaux --- BrainPortal/app/views/bourreaux/new.html.erb | 3 ++- BrainPortal/app/views/bourreaux/show.html.erb | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/BrainPortal/app/views/bourreaux/new.html.erb b/BrainPortal/app/views/bourreaux/new.html.erb index 89bd66c39..df5b518cb 100644 --- a/BrainPortal/app/views/bourreaux/new.html.erb +++ b/BrainPortal/app/views/bourreaux/new.html.erb @@ -150,7 +150,8 @@ [ "LSF", "ScirLsf" ], [ "Amazon EC2", "ScirAmazon" ], [ "SLURM", "ScirSlurm" ], - [ "UNIX processes", "ScirUnix" ] + [ "Google Cloud", "ScirGcloudBatch" ], + [ "UNIX processes", "ScirUnix" ], ] %> diff --git a/BrainPortal/app/views/bourreaux/show.html.erb b/BrainPortal/app/views/bourreaux/show.html.erb index 1327a9cde..ef678d959 100644 --- a/BrainPortal/app/views/bourreaux/show.html.erb +++ b/BrainPortal/app/views/bourreaux/show.html.erb @@ -298,7 +298,8 @@ [ "LSF", "ScirLsf" ], [ "Amazon EC2", "ScirAmazon" ], [ "SLURM", "ScirSlurm" ], - [ "UNIX processes", "ScirUnix" ] + [ "Google Cloud", "ScirGcloudBatch" ], + [ "UNIX processes", "ScirUnix" ], ] %> <% end %>