From 79199711b866485059c5fcbbd5193ee003c98947 Mon Sep 17 00:00:00 2001
From: Sagar Dhawan
Date: Thu, 21 Nov 2019 08:33:02 -0800
Subject: [PATCH] Add gpu resource usage tracking (#7075)

---
Notes (text between the "---" line and the first diff is not part of the
commit message):

  * testnet-monitor.json: adds two raw-query targets (refId C and D) that
    plot mean avg_gpu_usage and mean avg_gpu_mem_usage from "system-stats".
  * system-stats.sh: appends avg_gpu_usage / avg_gpu_mem_usage to the report
    when nvidia-smi is executable. This hunk is revised relative to the
    commit named in the first line: GPU rows are validated and averaged in a
    single awk pass, and the GPU fields are omitted when no valid row is
    returned (no division by zero, no empty field values). The post-image
    blob id on its "index" line is therefore stale.

 .../dashboards/testnet-monitor.json           | 74 +++++++++++++++++++
 scripts/system-stats.sh                       | 26 ++++++-
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json
index f3b30d2f746050..667c1be7dcbbcd 100644
--- a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json
+++ b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json
@@ -9764,6 +9764,80 @@
             ]
           ],
           "tags": []
+        },
+        {
+          "groupBy": [
+            {
+              "params": [
+                "$__interval"
+              ],
+              "type": "time"
+            },
+            {
+              "params": [
+                "null"
+              ],
+              "type": "fill"
+            }
+          ],
+          "orderByTime": "ASC",
+          "policy": "default",
+          "query": "SELECT mean(\"avg_gpu_usage\") as \"gpu_usage\" FROM \"$testnet\".\"autogen\".\"system-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n",
+          "rawQuery": true,
+          "refId": "C",
+          "resultFormat": "time_series",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "field"
+              },
+              {
+                "params": [],
+                "type": "mean"
+              }
+            ]
+          ],
+          "tags": []
+        },
+        {
+          "groupBy": [
+            {
+              "params": [
+                "$__interval"
+              ],
+              "type": "time"
+            },
+            {
+              "params": [
+                "null"
+              ],
+              "type": "fill"
+            }
+          ],
+          "orderByTime": "ASC",
+          "policy": "default",
+          "query": "SELECT mean(\"avg_gpu_mem_usage\") as \"gpu_memory_usage\" FROM \"$testnet\".\"autogen\".\"system-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n",
+          "rawQuery": true,
+          "refId": "D",
+          "resultFormat": "time_series",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "field"
+              },
+              {
+                "params": [],
+                "type": "mean"
+              }
+            ]
+          ],
+          "tags": []
         }
       ],
       "thresholds": [],
diff --git a/scripts/system-stats.sh b/scripts/system-stats.sh
index bd77e6043c0ead..08e27506b26697 100755
--- a/scripts/system-stats.sh
+++ b/scripts/system-stats.sh
@@ -19,8 +19,32 @@ while true; do
   ram_total_and_usage=$(echo "${top_ouput}" | grep '.*B Mem'| tail -1 | sed "s/.*: *\([0-9.]*\)%* total.*, *\([0-9.]*\)%* used.*/\1 \2/")
   read -r total used <<< "$ram_total_and_usage"
   ram_usage=$(awk "BEGIN {print $used / $total * 100}")
+  cpu_report="cpu_usage=$cpu_usage,ram_usage=$ram_usage"
 
-  report="cpu_usage=$cpu_usage,ram_usage=$ram_usage"
+  # If nvidia-smi exists, report GPU stats averaged over all GPUs. Rows that
+  # are not three numeric fields (driver error text, "[N/A]") or that report
+  # zero total memory are skipped, and nothing is reported when no valid row
+  # remains, so a broken GPU query can never divide by zero or emit an empty
+  # field that would invalidate the cpu/ram datapoint.
+  gpu_report=""
+  if [ -x "$(command -v nvidia-smi)" ]; then
+    gpu_report=$(
+      nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,nounits,noheader 2>/dev/null |
+        awk -F', *' '
+          function is_num(s) { return s ~ /^[0-9]+(\.[0-9]+)?$/ }
+          is_num($1) && is_num($2) && is_num($3) && $3 > 0 {
+            usage += $1
+            mem_usage += $2 / $3 * 100
+            num_gpus++
+          }
+          END {
+            if (num_gpus > 0)
+              printf ",avg_gpu_usage=%s,avg_gpu_mem_usage=%s", usage / num_gpus, mem_usage / num_gpus
+          }'
+    ) || gpu_report=""
+  fi
+
+  report="${cpu_report}${gpu_report}"
   ./scripts/metrics-write-datapoint.sh "system-stats,hostname=$HOSTNAME $report"
   sleep 1
 done