Skip to content

Commit

Permalink
add hostname and cluster name to cloud ops
Browse files Browse the repository at this point in the history
  • Loading branch information
abbas1902 committed Nov 15, 2024
1 parent 3f66bf0 commit c7a0673
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import subprocess
import stat
import time
import yaml
from pathlib import Path

import util
Expand Down Expand Up @@ -459,6 +460,50 @@ def setup_compute():

log.info("Done setting up compute")

def setup_cloud_ops() -> None:
    """Add deployment info (cluster name and hostname) to the Cloud Ops Agent config.

    If the google-cloud-ops-agent service is not active, this is a no-op.
    Otherwise the agent's YAML config is loaded, an ``add_cluster_info``
    ``modify_fields`` processor carrying the cluster name and hostname as
    static labels is registered, and that processor is attached to the
    ``slurmlog_pipeline`` and ``slurmlog2_pipeline`` logging pipelines. The
    config is written back and the agent is restarted.

    The edit is idempotent: running this function again against an already
    modified config does not add duplicate processor entries to the pipelines.

    Raises:
        Exception: any error raised while reading, editing or writing the
            config file is logged and re-raised unchanged (for example a
            ``KeyError`` when an expected section is missing from the config).
    """
    cloud_ops_status = run(
        "systemctl is-active --quiet google-cloud-ops-agent.service", check=False
    ).returncode

    # Non-zero return code: the agent is not active, nothing to configure.
    if cloud_ops_status != 0:
        return

    config_path = "/etc/google-cloud-ops-agent/config.yaml"
    processor_name = "add_cluster_info"

    try:
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)

        cluster_info = {
            processor_name: {
                "type": "modify_fields",
                "fields": {
                    'labels."cluster_name"': {
                        "static_value": f"{lookup().cfg.slurm_cluster_name}"
                    },
                    'labels."hostname"': {
                        "static_value": f"{lookup().hostname}"
                    },
                },
            }
        }

        logging_cfg = config["logging"]
        # dict.update overwrites any previous definition of the processor,
        # so re-running refreshes the labels rather than duplicating them.
        logging_cfg["processors"].update(cluster_info)

        pipelines = logging_cfg["service"]["pipelines"]
        for pipeline_name in ("slurmlog_pipeline", "slurmlog2_pipeline"):
            processors = pipelines[pipeline_name]["processors"]
            # Only attach the processor once; a second setup run must not
            # leave duplicate entries in the pipeline's processor list.
            if processor_name not in processors:
                processors.append(processor_name)

        with open(config_path, "w") as f:
            # sort_keys=False keeps the existing ordering of the config file.
            yaml.safe_dump(config, f, sort_keys=False)

    except Exception:
        log.exception(
            "Cloud Ops Agent setup has encountered an exception while trying to edit its configuration"
        )
        # Bare raise preserves the original traceback.
        raise

    # Restart so the agent picks up the edited configuration.
    run("systemctl restart google-cloud-ops-agent.service", timeout=30)

    log.info("Checking status of cloud-ops agent")
    run("systemctl status google-cloud-ops-agent.service")

def main():
start_motd()
Expand All @@ -476,7 +521,7 @@ def main():
log.exception(f"unexpected error while fetching config, sleeping for {sleep_seconds}s")
time.sleep(sleep_seconds)
log.info("Config fetched")

setup_cloud_ops()
configure_dirs()
# call the setup function for the instance type
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,10 @@ output "instructions" {
value = <<-EOT
To SSH to the controller (may need to add '--tunnel-through-iap'):
gcloud compute ssh ${google_compute_instance_from_template.controller.self_link}
If you are using cloud ops agent with this deployment,
you can use the following command to see the logs for the entire cluster or any particular VM host:
gcloud logging read labels.cluster_name=${local.slurm_cluster_name}
gcloud logging read labels.hostname=${local.slurm_cluster_name}-controller
EOT
}

0 comments on commit c7a0673

Please sign in to comment.