Skip to content

Commit

Permalink
Merge branch 'main' into jorda/rocprofiler
Browse files Browse the repository at this point in the history
  • Loading branch information
jordap committed Mar 23, 2024
2 parents 3c59e0f + a7fbbc2 commit d6ae126
Show file tree
Hide file tree
Showing 14 changed files with 84 additions and 34 deletions.
2 changes: 1 addition & 1 deletion annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion collector_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion collector_rocm_smi.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion collector_slurm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion collector_smi.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
3 changes: 2 additions & 1 deletion monitor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -57,6 +57,7 @@ def __init__(self):
self.runtimeConfig['collector_enable_rocm_smi'] = config['omniwatch.collectors'].getboolean('enable_rocm_smi',True)
self.runtimeConfig['collector_enable_slurm'] = config['omniwatch.collectors'].getboolean('enable_slurm',False)
self.runtimeConfig['collector_enable_rocprofiler'] = config['omniwatch.collectors'].getboolean('enable_rocprofiler',False)

self.runtimeConfig['slurm_collector_annotations'] = config['omniwatch.collectors.slurm'].getboolean('enable_annotations',False)
self.runtimeConfig['collector_port'] = config['omniwatch.collectors'].get('port',8000)
self.runtimeConfig['collector_usermode'] = config['omniwatch.collectors'].getboolean('usermode',False)
Expand Down
2 changes: 1 addition & 1 deletion node_monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
62 changes: 39 additions & 23 deletions omni_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -93,7 +93,12 @@ def startPromServer(self):
"template", "prometheus.yml.template"
)
ps_binary = self.runtimeConfig[section].get("binary")
ps_datadir = self.runtimeConfig[section].get("datadir", "data_prom")
ps_datadir = self.runtimeConfig[section].get("datadir", "data_prom", vars=os.environ)

# the configured datadir can be overridden by the OMNIWATCH_PROMSERVER_DATADIR environment variable
if "OMNIWATCH_PROMSERVER_DATADIR" in os.environ:
ps_datadir = os.getenv("OMNIWATCH_PROMSERVER_DATADIR")

ps_logfile = self.runtimeConfig[section].get("logfile", "prom_server.log")
ps_corebinding = self.runtimeConfig[section].get("corebinding","0")

Expand Down Expand Up @@ -150,31 +155,42 @@ def startExporters(self):
utils.runShellCommand(cmd,timeout=35,exit_on_error=True)

logging.info("Launching exporters in parallel using pdsh")
tmp = tempfile.NamedTemporaryFile(mode='w',delete=False)
logging.info("--> hosts stored in %s" % tmp.name)
for host in self.slurmHosts:
tmp.write("%s\n" % host)
tmp.close()

cmd = [
"numactl",
"--physcpubind=%s" % corebinding,
"nice",
"-n 20",
"gunicorn",
"-D",
"-b 0.0.0.0:%s" % port,
# "--access-logfile %s" % (self.topDir / "access.log"),
# "--capture-output",
# "--log-file %s" % logpath,
"--pythonpath %s" % self.topDir,
"node_monitoring:app",
]
# tmp = tempfile.NamedTemporaryFile(mode='w',delete=False)
# logging.info("--> hosts stored in %s" % tmp.name)
# for host in self.slurmHosts:
# tmp.write("%s\n" % host)
# tmp.close()
#
# cmd = [
# "numactl",
# "--physcpubind=%s" % corebinding,
# "nice",
# "-n 20",
# "gunicorn",
# "-D",
# "-b 0.0.0.0:%s" % port,
# # "--access-logfile %s" % (self.topDir / "access.log"),
# # "--capture-output",
# # "--log-file %s" % logpath,
# "--pythonpath %s" % self.topDir,
# "node_monitoring:app",
# ]
# base_cmd = ["pdsh","-O","-f 128","-t 180","-w ^%s" % tmp.name]
# utils.runShellCommand(base_cmd + cmd,timeout=185,exit_on_error=False)

client = ParallelSSHClient(self.slurmHosts,allow_agent=False,timeout=120)
cmd = "gunicorn -D -b 0.0.0.0:%s --error-logfile %s --capture-output --pythonpath %s node_monitoring:app" % (port,self.topDir / "error.log" ,self.topDir)
gunicorn_path = utils.resolvePath("gunicorn",'NONE')

# cmd = "gunicorn -D -b 0.0.0.0:%s --error-logfile %s --capture-output --pythonpath %s node_monitoring:app" % (port,self.topDir / "error.log" ,self.topDir)

# build up the ssh command, forwarding the local PYTHONPATH (if set) to the remote hosts
if "PYTHONPATH" in os.environ:
cmd = "PYTHONPATH=%s " % (os.getenv("PYTHONPATH"))
else:
cmd = ""

cmd += "%s -D -b 0.0.0.0:%s" % (gunicorn_path, port)
cmd += " --pythonpath %s node_monitoring:app" % (self.topDir)
output = client.run_command(cmd)

# verify exporter available on all nodes...
Expand Down
File renamed without changes.
33 changes: 33 additions & 0 deletions omniwatch.ornl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#--
# Configuration for ORNL Crusher/Frontier
#--

[omniwatch.collectors]

port = 8001
usermode = True
usermode_port = 8001
enable_rocm_smi = True
enable_slurm = True
corebinding = 31
logfile = exporter.log

[omniwatch.collectors.slurm]

host_skip = "login.*"
enable_annotations = True

[omniwatch.report]

[omniwatch.query]
prometheus_url = http://localhost:9090
system_name = ORNL Frontier/Crusher

# Options for the user-spawned Prometheus server (promserver)
[omniwatch.promserver]
template = prometheus.yml.template
logfile = prom_server.log
corebinding = 95
binary=/autofs/nccs-svm1_sw/crusher/amdsw/omniwatch/prometheus-2.45.1.linux-amd64/prometheus
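# %(NAME)s placeholders in datadir are filled in from environment variables
# (omni_util.py passes vars=os.environ when reading this option).
# Alternative location under the user's scratch area:
#datadir = /lustre/orion/%(SLURM_JOB_ACCOUNT)s/scratch/%(USER)s/omniwatch/%(SLURM_JOB_ID)s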
datadir = /lustre/orion/%(SLURM_JOB_ACCOUNT)s/world-shared/omniwatch/%(SLURM_JOB_ID)s
2 changes: 1 addition & 1 deletion query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ Flask>=2.3.2
flask_prometheus_metrics>=1.0.0
prometheus_client>=0.17.0
gunicorn>=21.2.0

parallel-ssh>=2.12.0


2 changes: 1 addition & 1 deletion slurm_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
# Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down

0 comments on commit d6ae126

Please sign in to comment.