diff --git a/annotate.py b/annotate.py index 52bd9f62..969ab207 100755 --- a/annotate.py +++ b/annotate.py @@ -2,7 +2,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/collector_base.py b/collector_base.py index 5600bf7e..9472f2ee 100644 --- a/collector_base.py +++ b/collector_base.py @@ -1,7 +1,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/collector_rocm_smi.py b/collector_rocm_smi.py index c627afd3..b7a46c16 100644 --- a/collector_rocm_smi.py +++ b/collector_rocm_smi.py @@ -1,7 +1,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/collector_slurm.py b/collector_slurm.py index 6485deb1..3fc1e863 100644 --- a/collector_slurm.py +++ b/collector_slurm.py @@ -1,7 +1,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/collector_smi.py b/collector_smi.py index 99198c9a..f3b00d55 100644 --- a/collector_smi.py +++ b/collector_smi.py @@ -1,7 +1,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/monitor.py b/monitor.py index 8d1cf39e..ca975bc1 100644 --- a/monitor.py +++ b/monitor.py @@ -1,7 +1,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -57,6 +57,7 @@ def __init__(self): self.runtimeConfig['collector_enable_rocm_smi'] = config['omniwatch.collectors'].getboolean('enable_rocm_smi',True) self.runtimeConfig['collector_enable_slurm'] = config['omniwatch.collectors'].getboolean('enable_slurm',False) self.runtimeConfig['collector_enable_rocprofiler'] = config['omniwatch.collectors'].getboolean('enable_rocprofiler',False) + self.runtimeConfig['slurm_collector_annotations'] = config['omniwatch.collectors.slurm'].getboolean('enable_annotations',False) self.runtimeConfig['collector_port'] = config['omniwatch.collectors'].get('port',8000) self.runtimeConfig['collector_usermode'] = config['omniwatch.collectors'].getboolean('usermode',False) diff --git a/node_monitoring.py b/node_monitoring.py index 502b0367..1077412c 100755 --- a/node_monitoring.py +++ b/node_monitoring.py @@ -2,7 +2,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/omni_util.py b/omni_util.py index 69ee81a7..9b4d7d0f 100755 --- a/omni_util.py +++ b/omni_util.py @@ -2,7 +2,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -93,7 +93,12 @@ def startPromServer(self): "template", "prometheus.yml.template" ) ps_binary = self.runtimeConfig[section].get("binary") - ps_datadir = self.runtimeConfig[section].get("datadir", "data_prom") + ps_datadir = self.runtimeConfig[section].get("datadir", "data_prom", vars=os.environ) + + # datadir can be overridden by separate env variable + if "OMNIWATCH_PROMSERVER_DATADIR" in os.environ: + ps_datadir = os.getenv("OMNIWATCH_PROMSERVER_DATADIR") + ps_logfile = self.runtimeConfig[section].get("logfile", "prom_server.log") ps_corebinding = self.runtimeConfig[section].get("corebinding","0") @@ -150,31 +155,42 @@ def startExporters(self): utils.runShellCommand(cmd,timeout=35,exit_on_error=True) logging.info("Launching exporters in parallel using pdsh") - tmp = tempfile.NamedTemporaryFile(mode='w',delete=False) - logging.info("--> hosts stored in %s" % tmp.name) - for host in self.slurmHosts: - tmp.write("%s\n" % host) - tmp.close() - - cmd = [ - "numactl", - "--physcpubind=%s" % corebinding, - "nice", - "-n 20", - "gunicorn", - "-D", - "-b 0.0.0.0:%s" % port, - # "--access-logfile %s" % (self.topDir / "access.log"), - # "--capture-output", - # "--log-file %s" % logpath, - "--pythonpath %s" % self.topDir, - "node_monitoring:app", - ] + # tmp = tempfile.NamedTemporaryFile(mode='w',delete=False) + # logging.info("--> hosts stored in %s" % tmp.name) + # for host in self.slurmHosts: + # tmp.write("%s\n" % host) + # tmp.close() + # + # cmd = [ + # "numactl", + # "--physcpubind=%s" % corebinding, + # "nice", + # "-n 20", + # "gunicorn", + # "-D", + # "-b 0.0.0.0:%s" % port, + # # "--access-logfile %s" % (self.topDir / "access.log"), + # # "--capture-output", + # # "--log-file %s" % logpath, + # "--pythonpath %s" % self.topDir, + # "node_monitoring:app", + # ] # base_cmd = ["pdsh","-O","-f 128","-t 180","-w ^%s" % tmp.name] # utils.runShellCommand(base_cmd + cmd,timeout=185,exit_on_error=False) client = ParallelSSHClient(self.slurmHosts,allow_agent=False,timeout=120) - cmd = "gunicorn -D -b 0.0.0.0:%s --error-logfile %s --capture-output --pythonpath %s node_monitoring:app" % (port,self.topDir / "error.log" ,self.topDir) + gunicorn_path = utils.resolvePath("gunicorn",'NONE') + + # cmd = "gunicorn -D -b 0.0.0.0:%s --error-logfile %s --capture-output --pythonpath %s node_monitoring:app" % (port,self.topDir / "error.log" ,self.topDir) + + # build up ssh command, preserving PYTHON environment + if "PYTHONPATH" in os.environ: + cmd = "PYTHONPATH=%s " % (os.getenv("PYTHONPATH")) + else: + cmd = "" + + cmd += "%s -D -b 0.0.0.0:%s" % (gunicorn_path, port) + cmd += " --pythonpath %s node_monitoring:app" % (self.topDir) output = client.run_command(cmd) # verify exporter available on all nodes... diff --git a/omniwatch.config b/omniwatch.default similarity index 100% rename from omniwatch.config rename to omniwatch.default diff --git a/omniwatch.ornl b/omniwatch.ornl new file mode 100644 index 00000000..372c84fb --- /dev/null +++ b/omniwatch.ornl @@ -0,0 +1,33 @@ +#-- +# Configuration for ORNL Crusher/Frontier +#-- + +[omniwatch.collectors] + +port = 8001 +usermode = True +usermode_port = 8001 +enable_rocm_smi = True +enable_slurm = True +corebinding = 31 +logfile = exporter.log + +[omniwatch.collectors.slurm] + +host_skip = "login.*" +enable_annotations = True + +[omniwatch.report] + +[omniwatch.query] +prometheus_url = http://localhost:9090 +system_name = ORNL Frontier/Crusher + +# options for user-spawned promserver +[omniwatch.promserver] +template = prometheus.yml.template +logfile = prom_server.log +corebinding = 95 +binary=/autofs/nccs-svm1_sw/crusher/amdsw/omniwatch/prometheus-2.45.1.linux-amd64/prometheus +#datadir = /lustre/orion/%(SLURM_JOB_ACCOUNT)s/scratch/%(USER)s/omniwatch/%(SLURM_JOB_ID)s +datadir = /lustre/orion/%(SLURM_JOB_ACCOUNT)s/world-shared/omniwatch/%(SLURM_JOB_ID)s diff --git a/query.py b/query.py index 1fa8ff17..33362007 100755 --- a/query.py +++ b/query.py @@ -2,7 +2,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/requirements.txt b/requirements.txt index 078f3699..bfc5b6b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,6 @@ Flask>=2.3.2 flask_prometheus_metrics>=1.0.0 prometheus_client>=0.17.0 gunicorn>=21.2.0 - +parallel-ssh>=2.12.0 diff --git a/slurm_env.py b/slurm_env.py index caa34e56..094bad54 100755 --- a/slurm_env.py +++ b/slurm_env.py @@ -2,7 +2,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/utils.py b/utils.py index add294c6..564c8102 100644 --- a/utils.py +++ b/utils.py @@ -1,7 +1,7 @@ # ------------------------------------------------------------------------------- # MIT License # -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal