From 81bb208a62af84b933959d89867726b4d7666310 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Mon, 26 Aug 2019 15:17:19 -0700 Subject: [PATCH] Add open file descriptor monitoring (#5655) --- .../dashboards/testnet-monitor.json | 182 ++++++++++++++++-- net/net.sh | 2 +- net/remote/remote-client.sh | 6 +- net/remote/remote-node.sh | 2 + scripts/fd-monitor.sh | 20 ++ scripts/metrics-write-datapoint.sh | 2 +- 6 files changed, 196 insertions(+), 18 deletions(-) create mode 100755 scripts/fd-monitor.sh diff --git a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json index 95937efbb0de00..6327f4f7129660 100644 --- a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json +++ b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json @@ -15,8 +15,8 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 851, - "iteration": 1565991401072, + "id": 883, + "iteration": 1566852798488, "links": [ { "asDropdown": true, @@ -2516,7 +2516,7 @@ "x": 12, "y": 24 }, - "id": 23, + "id": 61, "interval": null, "links": [], "mappingType": 1, @@ -2569,7 +2569,7 @@ ], "orderByTime": "ASC", "policy": "default", - "query": "SELECT sum(\"one\") FROM \"$testnet\".\"autogen\".\"panic\" WHERE $timeFilter", + "query": "SELECT SUM(\"points_lost\") FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter\n", "rawQuery": true, "refId": "A", "resultFormat": "table", @@ -2591,7 +2591,7 @@ } ], "thresholds": "", - "title": "Total Panics", + "title": "Lost Datapoints", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2840,7 +2840,7 @@ "datasource": "$datasource", "fill": 1, "gridPos": { - "h": 6, + "h": 3, "w": 8, "x": 0, "y": 26 @@ -2852,7 +2852,7 @@ "current": false, "max": false, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "total": false, "values": false @@ -2888,7 +2888,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - 
"query": "SELECT MEAN(\"points_written\") as \"Mean points written\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n", + "query": "SELECT MEAN(\"points_written\") as \"mean\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -2925,7 +2925,7 @@ ], "orderByTime": "ASC", "policy": "default", - "query": "SELECT MAX(\"points_written\") as \"Max points written\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n", + "query": "SELECT MAX(\"points_written\") as \"max\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -3263,6 +3263,162 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 29 + }, + "id": 62, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MEAN(\"count\") as \"mean\" FROM \"$testnet\".\"autogen\".\"open-files\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + 
}, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MAX(\"count\") as \"max\" FROM \"$testnet\".\"autogen\".\"open-files\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Open Files per node", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0.2", + "show": true + }, + { + "decimals": null, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "columns": [], "datasource": "$datasource", @@ -8173,10 +8329,6 @@ }, { "allValue": ".*", - "current": { - "text": "All", - "value": "$__all" - }, "datasource": "$datasource", "hide": 0, "includeAll": true, @@ -8228,5 +8380,5 @@ "timezone": "", "title": "Testnet Monitor (edge)", "uid": "testnet-edge", - "version": 3 -} + "version": 1 +} \ No newline at end of file diff --git a/net/net.sh b/net/net.sh index 69235e1aaf9c17..f1e17e290457f8 100755 --- a/net/net.sh +++ b/net/net.sh @@ -752,7 +752,7 @@ stopNode() { PS4=\"$PS4\" set -x ! 
tmux list-sessions || tmux kill-session - for pid in solana/{net-stats,oom-monitor}.pid; do + for pid in solana/{net-stats,fd-monitor,oom-monitor}.pid; do pgid=\$(ps opgid= \$(cat \$pid) | tr -d '[:space:]') if [[ -n \$pgid ]]; then sudo kill -- -\$pgid diff --git a/net/remote/remote-client.sh b/net/remote/remote-client.sh index 5c3080ed18c261..559ba8bd8caaba 100755 --- a/net/remote/remote-client.sh +++ b/net/remote/remote-client.sh @@ -50,9 +50,13 @@ skip) esac ( - sudo scripts/oom-monitor.sh + sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh ) > oom-monitor.log 2>&1 & +echo $! > oom-monitor.pid +scripts/fd-monitor.sh > fd-monitor.log 2>&1 & +echo $! > fd-monitor.pid scripts/net-stats.sh > net-stats.log 2>&1 & +echo $! > net-stats.pid ! tmux list-sessions || tmux kill-session diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index d16b29c91949bf..498f068a91c71a 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -93,6 +93,8 @@ local|tar|skip) sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh ) > oom-monitor.log 2>&1 & echo $! > oom-monitor.pid + scripts/fd-monitor.sh > fd-monitor.log 2>&1 & + echo $! > fd-monitor.pid scripts/net-stats.sh > net-stats.log 2>&1 & echo $! 
> net-stats.pid
 
diff --git a/scripts/fd-monitor.sh b/scripts/fd-monitor.sh
new file mode 100755
index 00000000000000..16d24eb6f40832
--- /dev/null
+++ b/scripts/fd-monitor.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Reports how many files the current user has open (as listed by lsof) to the
+# "open-files" metric, tagged with this host's name, once every 10 seconds.
+# Does nothing (exit 0) on non-Linux hosts.
+set -e
+
+[[ $(uname) == Linux ]] || exit 0
+
+cd "$(dirname "$0")"
+
+# shellcheck source=scripts/configure-metrics.sh
+source configure-metrics.sh
+
+while true; do
+  count=$(lsof -u "$UID" | tail -n +2 | wc -l) # tail skips lsof's column-header row
+  ./metrics-write-datapoint.sh "open-files,hostname=$HOSTNAME count=$count"
+  sleep 10
+done
+
+exit 1 # only reached if the loop above ever terminates
diff --git a/scripts/metrics-write-datapoint.sh b/scripts/metrics-write-datapoint.sh
index 083ff4655fdc59..8954ec049002b6 100755
--- a/scripts/metrics-write-datapoint.sh
+++ b/scripts/metrics-write-datapoint.sh
@@ -22,5 +22,5 @@ if [[ -n $INFLUX_HOST ]]; then
 fi
 
 echo "${host}/write?db=${INFLUX_DATABASE}&u=${INFLUX_USERNAME}&p=${INFLUX_PASSWORD}" \
-  | xargs curl --max-time 5 -XPOST --data-binary "$point"
+  | xargs curl --max-time 5 --silent --show-error -XPOST --data-binary "$point"
 exit 0