Skip to content

Commit

Permalink
Merge pull request #3512 from alyssa-sm/many-ports
Browse files Browse the repository at this point in the history
Python Integration Test - Support tests running concurrently
  • Loading branch information
alyssa-sm authored Jan 9, 2025
2 parents 487c7de + aaf8527 commit 0890bd2
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ tags:

timeout: 14400s # 4hr
steps:
- id: slurm-topology
- id: slurm-reconfig
name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
entrypoint: /bin/bash
args:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ tags:

timeout: 14400s # 4hr
steps:
- id: slurm-topology
- id: slurm-job-completion
name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
entrypoint: /bin/bash
args:
Expand Down
2 changes: 1 addition & 1 deletion tools/cloud-build/daily-tests/validate_tests_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def get_blueprint(build_path: str) -> Optional[str]:
f"{BUILDS_DIR}/chrome-remote-desktop.yaml": "tools/cloud-build/daily-tests/blueprints/crd-default.yaml",
f"{BUILDS_DIR}/chrome-remote-desktop-ubuntu.yaml": "tools/cloud-build/daily-tests/blueprints/crd-ubuntu.yaml",
f"{BUILDS_DIR}/gcluster-dockerfile.yaml": "tools/cloud-build/daily-tests/blueprints/e2e.yaml",
f"{BUILDS_DIR}/slurm-gcp-v6-reconfig-size.yaml": "tools/python-integration-tests/blueprints/slurm-simple-reconfig.yaml",
f"{BUILDS_DIR}/slurm-gcp-v6-reconfig-size.yaml": "tools/python-integration-tests/blueprints/slurm-reconfig-before.yaml",
f"{BUILDS_DIR}/slurm-gcp-v6-simple-job-completion.yaml": "tools/python-integration-tests/blueprints/slurm-simple.yaml",
f"{BUILDS_DIR}/slurm-gcp-v6-topology.yaml": "tools/python-integration-tests/blueprints/topology-test.yaml",
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.

---
blueprint_name: slurm-simple
blueprint_name: slurm-reconfig

vars:
project_id: ## Set GCP Project ID Here ##
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
blueprint_name: slurm-reconfig

vars:
project_id: ## Set GCP Project ID Here ##
deployment_name: ## Set Deployment Name Here ##
region: us-central1
zone: us-central1-a

deployment_groups:
- group: primary
modules:
- id: network
source: modules/network/pre-existing-vpc

- id: nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
bandwidth_tier: gvnic_enabled
machine_type: c2-standard-4
node_count_dynamic_max: 5
allow_automatic_updates: false

- id: partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [nodeset]
settings:
is_default: true
partition_name: compute

- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [network]
settings:
machine_type: n1-standard-4
enable_login_public_ips: true

- id: slurm_controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use: [network, slurm_login, partition]
settings:
machine_type: n1-standard-4
enable_controller_public_ips: true
4 changes: 2 additions & 2 deletions tools/python-integration-tests/slurm_reconfig_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
class SlurmReconfigureSize(SlurmTest):
# Class to test simple reconfiguration
def __init__(self, deployment):
super().__init__(Deployment("tools/python-integration-tests/blueprints/slurm-simple.yaml"))
self.reconfig_blueprint = "tools/python-integration-tests/blueprints/slurm-simple-reconfig.yaml"
super().__init__(Deployment("tools/python-integration-tests/blueprints/slurm-reconfig-before.yaml"))
self.reconfig_blueprint = "tools/python-integration-tests/blueprints/slurm-reconfig-after.yaml"

def runTest(self):
# Check 5 nodes are available
Expand Down
18 changes: 14 additions & 4 deletions tools/python-integration-tests/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import os
import subprocess
import socket
import time
import paramiko

Expand All @@ -31,15 +32,23 @@ def __init__(self):
self.tunnel = None
self.key = None
self.ssh_client = None
self.local_port = None

def run_command(self, cmd: str) -> subprocess.CompletedProcess:
    """Run ``cmd`` to completion and return the finished process.

    Args:
        cmd: The command to execute. It is passed to ``subprocess.run``
            without ``shell=True``, so a sequence of arguments is needed
            to run a program with parameters.

    Returns:
        The ``subprocess.CompletedProcess`` with ``stdout`` and ``stderr``
        captured as text.

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status (``check=True``).
    """
    # Return the result: the signature promises a CompletedProcess, and
    # callers need it to inspect the captured output.
    return subprocess.run(cmd, text=True, check=True, capture_output=True)

def create_tunnel(self, instance_name, port, project_id, zone):
def get_available_port(self):
    """Return a local TCP port number that was unused at call time.

    Binds a temporary socket to port 0 so the operating system assigns a
    free port, reads the assigned port back, and closes the socket.

    Returns:
        The port number (int) the OS assigned to the temporary socket.

    NOTE: the socket is closed before returning, so another process may
    claim the port before the caller binds it.
    """
    # The context manager closes the socket even if bind()/getsockname()
    # raises, so the file descriptor is never leaked.
    with socket.socket() as sock:
        sock.bind(('', 0))
        return sock.getsockname()[1]

def create_tunnel(self, instance_name, project_id, zone):
iap_tunnel_cmd = [
"gcloud", "compute", "start-iap-tunnel", instance_name,
"22", "--project", project_id, "--zone", zone,
f"--local-host-port=localhost:{port}"
f"--local-host-port=localhost:{self.local_port}"
]

self.tunnel = subprocess.Popen(iap_tunnel_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
Expand All @@ -59,11 +68,12 @@ def get_keypath(self):

return key_path

def setup_connection(self, instance_name, port, project_id, zone):
def setup_connection(self, instance_name, project_id, zone):
self.ssh_client = paramiko.SSHClient()
self.ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
self.key = paramiko.RSAKey.from_private_key_file(self.get_keypath())
self.create_tunnel(instance_name, port, project_id, zone)
self.local_port = self.get_available_port()
self.create_tunnel(instance_name, project_id, zone)

def close(self):
# Closes existing SSH connection and tunnel
Expand Down
4 changes: 2 additions & 2 deletions tools/python-integration-tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ class SlurmTest(Test):
# Base class for Slurm-specific tests.
def ssh(self, hostname):
self.ssh_manager = SSHManager()
self.ssh_manager.setup_connection(hostname, 10022, self.deployment.project_id, self.deployment.zone)
self.ssh_manager.setup_connection(hostname, self.deployment.project_id, self.deployment.zone)
self.ssh_client = self.ssh_manager.ssh_client
self.ssh_client.connect("localhost", 10022, username=self.deployment.username, pkey=self.ssh_manager.key)
self.ssh_client.connect("localhost", self.ssh_manager.local_port, username=self.deployment.username, pkey=self.ssh_manager.key)

def close_ssh(self):
self.ssh_manager.close()
Expand Down

0 comments on commit 0890bd2

Please sign in to comment.