From d8adc025fd933bef9d3c0d519562a744c2f1c9ec Mon Sep 17 00:00:00 2001
From: Charles Zaffery
Date: Tue, 10 Nov 2020 00:11:58 -0800
Subject: [PATCH] Collect operator autopilot health metrics

Signed-off-by: Charles Zaffery
---
 README.md          | 11 +++++++++
 consul_exporter.go | 58 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 9b8a3182..91e59a13 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ make
 | ----------------------------------- | ---------------------------------------------------------------------------------------------------- | --------------------------------------------- |
 | consul_up | Was the last query of Consul successful | |
 | consul_raft_peers | How many peers (servers) are in the Raft cluster | |
+| consul_operator_autopilot_server_health | The status of each server's health from the cluster's perspective | server_id, server_name, server_address, server_health_metric |
 | consul_serf_lan_members | How many members are in the cluster | |
 | consul_serf_lan_member_status | Status of member in the cluster. 1=Alive, 2=Leaving, 3=Left, 4=Failed. | member |
 | consul_catalog_services | How many services are in the cluster | |
@@ -75,6 +76,16 @@ against the actual value found via monitoring.
 A prefix must be supplied to activate this feature. Pass `/` if you want to
 search the entire keyspace.
 
+#### Operator Autopilot Server Health
+
+This exporter can gather low-level server metrics through the Operator
+API's Autopilot Health endpoint. The endpoint requires the elevated
+`operator:read` ACL permission, so it should only be used with a
+restricted ACL token in a trusted environment.
+
+* __`operator.autopilot-server-health`:__ Collects low-level server metrics
+  from the `/v1/operator/autopilot/health` endpoint.
+
 ### Environment variables
 
 The consul\_exporter supports all environment variables provided by the official
diff --git a/consul_exporter.go b/consul_exporter.go
index e9cc5c52..87d03213 100644
--- a/consul_exporter.go
+++ b/consul_exporter.go
@@ -58,6 +58,11 @@ var (
 		"Does Raft cluster have a leader (according to this node).",
 		nil, nil,
 	)
+	operatorAutopilotServerHealth = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "operator_autopilot_server_health"),
+		"The status of each server's health from the cluster's perspective.",
+		[]string{"server_id", "server_name", "server_address", "server_health_metric"}, nil,
+	)
 	nodeCount = prometheus.NewDesc(
 		prometheus.BuildFQName(namespace, "", "serf_lan_members"),
 		"How many members are in the cluster.",
@@ -121,6 +126,7 @@ type Exporter struct {
 	kvPrefix         string
 	kvFilter         *regexp.Regexp
 	healthSummary    bool
+	operatorHealth   bool
 	logger           log.Logger
 	requestLimitChan chan struct{}
 }
@@ -137,7 +143,7 @@ type consulOpts struct {
 }
 
 // NewExporter returns an initialized Exporter.
-func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, logger log.Logger) (*Exporter, error) {
+func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, operatorHealth bool, logger log.Logger) (*Exporter, error) {
 	uri := opts.uri
 	if !strings.Contains(uri, "://") {
 		uri = "http://" + uri
@@ -188,6 +194,7 @@ func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool,
 		kvPrefix:         kvPrefix,
 		kvFilter:         regexp.MustCompile(kvFilter),
 		healthSummary:    healthSummary,
+		operatorHealth:   operatorHealth,
 		logger:           logger,
 		requestLimitChan: requestLimitChan,
 	}, nil
@@ -199,6 +206,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
 	ch <- up
 	ch <- clusterServers
 	ch <- clusterLeader
+	ch <- operatorAutopilotServerHealth
 	ch <- nodeCount
 	ch <- memberStatus
 	ch <- serviceCount
@@ -215,6 +223,7 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
 	ok := e.collectPeersMetric(ch)
 	ok = e.collectLeaderMetric(ch) && ok
+	ok = e.collectOperatorAutopilotServerHealthMetric(ch) && ok
 	ok = e.collectNodesMetric(ch) && ok
 	ok = e.collectMembersMetric(ch) && ok
 	ok = e.collectServicesMetric(ch) && ok
@@ -262,6 +271,40 @@ func (e *Exporter) collectLeaderMetric(ch chan<- prometheus.Metric) bool {
 	return true
 }
 
+func (e *Exporter) collectOperatorAutopilotServerHealthMetric(ch chan<- prometheus.Metric) bool {
+	if !e.operatorHealth {
+		return true
+	}
+	clusterHealth, err := e.client.Operator().AutopilotServerHealth(&queryOptions)
+	if err != nil {
+		level.Error(e.logger).Log("msg", "Failed to get autopilot server health", "err", err)
+		return false
+	}
+	for _, server := range clusterHealth.Servers {
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, float64(server.LastIndex), server.ID, server.Name, server.Address, "LastIndex",
+		)
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, float64(server.LastTerm), server.ID, server.Name, server.Address, "LastTerm",
+		)
+		serverHealth := 0.0
+		if server.Healthy {
+			serverHealth = 1.0
+		}
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, serverHealth, server.ID, server.Name, server.Address, "Health",
+		)
+		serverVoter := 0.0
+		if server.Voter {
+			serverVoter = 1.0
+		}
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, serverVoter, server.ID, server.Name, server.Address, "Voter",
+		)
+	}
+	return true
+}
+
 func (e *Exporter) collectNodesMetric(ch chan<- prometheus.Metric) bool {
 	nodes, _, err := e.client.Catalog().Nodes(&queryOptions)
 	if err != nil {
@@ -456,11 +499,12 @@ func init() {
 
 func main() {
 	var (
-		listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
-		metricsPath   = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
-		healthSummary = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
-		kvPrefix      = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
-		kvFilter      = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
+		listenAddress  = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
+		metricsPath    = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
+		healthSummary  = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
+		kvPrefix       = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
+		kvFilter       = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
+		operatorHealth = kingpin.Flag("operator.autopilot-server-health", "Collect operator autopilot server health metrics.").Default("true").Bool()
 
 		opts = consulOpts{}
 	)
@@ -486,7 +530,7 @@ func main() {
 	level.Info(logger).Log("msg", "Starting consul_exporter", "version", version.Info())
 	level.Info(logger).Log("build_context", version.BuildContext())
 
-	exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, logger)
+	exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, *operatorHealth, logger)
 	if err != nil {
 		level.Error(logger).Log("msg", "Error creating the exporter", "err", err)
 		os.Exit(1)
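
For reference, with the new collector enabled (it defaults to on via the `operator.autopilot-server-health` flag), the exporter emits one series per server and per `server_health_metric` value. The sample below is purely illustrative: the server ID, name, address, and values are made up, not taken from a real cluster.

    $ curl -s http://localhost:9107/metrics | grep consul_operator_autopilot_server_health
    # HELP consul_operator_autopilot_server_health The status of each server's health from the cluster's perspective.
    # TYPE consul_operator_autopilot_server_health counter
    consul_operator_autopilot_server_health{server_address="10.0.0.1:8300",server_health_metric="Health",server_id="1c3e3278-example",server_name="consul-1"} 1
    consul_operator_autopilot_server_health{server_address="10.0.0.1:8300",server_health_metric="LastIndex",server_id="1c3e3278-example",server_name="consul-1"} 124378
    consul_operator_autopilot_server_health{server_address="10.0.0.1:8300",server_health_metric="LastTerm",server_id="1c3e3278-example",server_name="consul-1"} 2
    consul_operator_autopilot_server_health{server_address="10.0.0.1:8300",server_health_metric="Voter",server_id="1c3e3278-example",server_name="consul-1"} 1

A query such as `sum(consul_operator_autopilot_server_health{server_health_metric="Health"})` would then give the number of servers autopilot currently considers healthy, assuming the metric is scraped unmodified.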