From 96246aa0a827aaf1e1cfe8c64ddc21bf6d8c0589 Mon Sep 17 00:00:00 2001
From: Charles Zaffery
Date: Mon, 23 Nov 2020 11:59:12 -0800
Subject: [PATCH] Collect operator autopilot health metrics

Signed-off-by: Charles Zaffery
---
 README.md               | 18 ++++++++++
 consul_exporter.go      | 76 +++++++++++++++++++++++++++++++++++++----
 consul_exporter_test.go |  4 +--
 3 files changed, 89 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 9b8a3182..a275262c 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,10 @@ make
 | ----------------------------------- | ---------------------------------------------------------------------------------------------------- | --------------------------------------------- |
 | consul_up | Was the last query of Consul successful | |
 | consul_raft_peers | How many peers (servers) are in the Raft cluster | |
+| consul_operator_autopilot_health_voter | Whether a server is a voter in the Raft cluster | server_id, server_name, server_address, server_version |
+| consul_operator_autopilot_health_healthy | Whether a server is healthy according to Autopilot | server_id, server_name, server_address, server_version |
+| consul_operator_autopilot_health_last_index | The last known Raft index a server has applied | server_id, server_name, server_address, server_version |
+| consul_operator_autopilot_health_last_term | The last known Raft leader term a server has seen | server_id, server_name, server_address, server_version |
 | consul_serf_lan_members | How many members are in the cluster | |
 | consul_serf_lan_member_status | Status of member in the cluster. 1=Alive, 2=Leaving, 3=Left, 4=Failed. | member |
 | consul_catalog_services | How many services are in the cluster | |
@@ -75,6 +79,16 @@ against the actual value found via monitoring.
 
 A prefix must be supplied to activate this feature. Pass `/` if you want to search the entire keyspace.
 
+#### Operator Autopilot Server Health
+
+This exporter can gather low-level server metrics through the Operator
+API's Autopilot Health endpoint. The endpoint requires the elevated
+`operator:read` ACL permission, so it should only be enabled with a
+tightly scoped ACL token in a trusted environment.
+
+* __`operator.autopilot-server-health`:__ Collects low-level server metrics
+  from the v1/operator/autopilot/health endpoint.
+
 ### Environment variables
 
 The consul\_exporter supports all environment variables provided by the official
@@ -99,6 +113,10 @@ __What service checks are critical?__
 
 You can query for the following health check states: "maintenance", "critical", "warning" or "passing"
 
+__Which servers are often lagging behind the cluster?__
+
+    avg_over_time(consul_operator_autopilot_health_healthy[1h]) < 1
+
 ## Using Docker
 
 You can deploy this exporter using the [prom/consul-exporter](https://registry.hub.docker.com/r/prom/consul-exporter) Docker image.
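For context on what the new collector calls: the Autopilot data comes from the Consul API client's `Operator().AutopilotServerHealth` method, the same call used in the exporter change below. A minimal standalone sketch, not part of this patch; the client configuration and output format here are only illustrative:

```go
package main

import (
	"fmt"
	"log"

	consul "github.com/hashicorp/consul/api"
)

func main() {
	// DefaultConfig honors CONSUL_HTTP_ADDR, CONSUL_HTTP_TOKEN, etc.,
	// matching the environment variables the exporter already supports.
	client, err := consul.NewClient(consul.DefaultConfig())
	if err != nil {
		log.Fatalf("creating Consul client: %v", err)
	}

	// Requires an ACL token with operator:read. Consul answers this
	// endpoint with a non-200 status while the cluster is unhealthy,
	// which may surface here as an error depending on the client version.
	health, err := client.Operator().AutopilotServerHealth(nil)
	if err != nil {
		log.Fatalf("querying autopilot server health: %v", err)
	}

	for _, s := range health.Servers {
		fmt.Printf("%s (%s) healthy=%t voter=%t last_index=%d last_term=%d\n",
			s.Name, s.Address, s.Healthy, s.Voter, s.LastIndex, s.LastTerm)
	}
}
```

The collector in the patch maps exactly these fields (Healthy, Voter, LastIndex, LastTerm) onto the four new metric families.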
diff --git a/consul_exporter.go b/consul_exporter.go
index e9cc5c52..39d257d8 100644
--- a/consul_exporter.go
+++ b/consul_exporter.go
@@ -58,6 +58,26 @@ var (
 		"Does Raft cluster have a leader (according to this node).",
 		nil, nil,
 	)
+	operatorAutopilotVoter = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "operator_autopilot_health_voter"),
+		"Whether a server is a voter in the Raft cluster.",
+		[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
+	)
+	operatorAutopilotHealthy = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "operator_autopilot_health_healthy"),
+		"Whether a server is healthy according to Autopilot.",
+		[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
+	)
+	operatorAutopilotLastIndex = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "operator_autopilot_health_last_index"),
+		"The last known Raft index a server has applied.",
+		[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
+	)
+	operatorAutopilotLastTerm = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "operator_autopilot_health_last_term"),
+		"The last known Raft leader term a server has seen.",
+		[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
+	)
 	nodeCount = prometheus.NewDesc(
 		prometheus.BuildFQName(namespace, "", "serf_lan_members"),
 		"How many members are in the cluster.",
@@ -121,6 +141,7 @@ type Exporter struct {
 	kvPrefix         string
 	kvFilter         *regexp.Regexp
 	healthSummary    bool
+	operatorHealth   bool
 	logger           log.Logger
 	requestLimitChan chan struct{}
 }
@@ -137,7 +158,7 @@ type consulOpts struct {
 }
 
 // NewExporter returns an initialized Exporter.
-func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, logger log.Logger) (*Exporter, error) {
+func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, operatorHealth bool, logger log.Logger) (*Exporter, error) {
 	uri := opts.uri
 	if !strings.Contains(uri, "://") {
 		uri = "http://" + uri
@@ -188,6 +209,7 @@ func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool,
 		kvPrefix:         kvPrefix,
 		kvFilter:         regexp.MustCompile(kvFilter),
 		healthSummary:    healthSummary,
+		operatorHealth:   operatorHealth,
 		logger:           logger,
 		requestLimitChan: requestLimitChan,
 	}, nil
@@ -199,6 +221,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
 	ch <- up
 	ch <- clusterServers
 	ch <- clusterLeader
+	ch <- operatorAutopilotHealthy
+	ch <- operatorAutopilotLastIndex
+	ch <- operatorAutopilotLastTerm
+	ch <- operatorAutopilotVoter
 	ch <- nodeCount
 	ch <- memberStatus
 	ch <- serviceCount
@@ -215,6 +241,7 @@
 func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
 	ok := e.collectPeersMetric(ch)
 	ok = e.collectLeaderMetric(ch) && ok
+	ok = e.collectOperatorAutopilotServerHealthMetric(ch) && ok
 	ok = e.collectNodesMetric(ch) && ok
 	ok = e.collectMembersMetric(ch) && ok
 	ok = e.collectServicesMetric(ch) && ok
@@ -262,6 +289,40 @@ func (e *Exporter) collectLeaderMetric(ch chan<- prometheus.Metric) bool {
 	return true
 }
 
+func (e *Exporter) collectOperatorAutopilotServerHealthMetric(ch chan<- prometheus.Metric) bool {
+	if !e.operatorHealth {
+		return true
+	}
+	clusterHealth, err := e.client.Operator().AutopilotServerHealth(&queryOptions)
+	if err != nil {
+		level.Error(e.logger).Log("msg", "Failed to get autopilot server health", "err", err)
+		return false
+	}
+	for _, server := range clusterHealth.Servers {
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotLastIndex, prometheus.CounterValue, float64(server.LastIndex), server.ID, server.Name, server.Address, server.Version,
+		)
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotLastTerm, prometheus.CounterValue, float64(server.LastTerm), server.ID, server.Name, server.Address, server.Version,
+		)
+		serverHealth := 0.0
+		if server.Healthy {
+			serverHealth = 1.0
+		}
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotHealthy, prometheus.GaugeValue, serverHealth, server.ID, server.Name, server.Address, server.Version,
+		)
+		serverVoter := 0.0
+		if server.Voter {
+			serverVoter = 1.0
+		}
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotVoter, prometheus.GaugeValue, serverVoter, server.ID, server.Name, server.Address, server.Version,
+		)
+	}
+	return true
+}
+
 func (e *Exporter) collectNodesMetric(ch chan<- prometheus.Metric) bool {
 	nodes, _, err := e.client.Catalog().Nodes(&queryOptions)
 	if err != nil {
@@ -456,11 +517,12 @@ func init() {
 
 func main() {
 	var (
-		listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
-		metricsPath   = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
-		healthSummary = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
-		kvPrefix      = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
-		kvFilter      = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
+		listenAddress  = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
+		metricsPath    = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
+		healthSummary  = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
+		kvPrefix       = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
+		kvFilter       = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
+		operatorHealth = kingpin.Flag("operator.autopilot-server-health", "Collect operator autopilot server health metrics.").Default("false").Bool()
 
 		opts = consulOpts{}
 	)
@@ -486,7 +548,7 @@ func main() {
 	level.Info(logger).Log("msg", "Starting consul_exporter", "version", version.Info())
 	level.Info(logger).Log("build_context", version.BuildContext())
 
-	exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, logger)
+	exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, *operatorHealth, logger)
 	if err != nil {
 		level.Error(logger).Log("msg", "Error creating the exporter", "err", err)
 		os.Exit(1)
diff --git a/consul_exporter_test.go b/consul_exporter_test.go
index bec8bb40..72a1be0a 100644
--- a/consul_exporter_test.go
+++ b/consul_exporter_test.go
@@ -40,7 +40,7 @@ func TestNewExporter(t *testing.T) {
 	}
 
 	for _, test := range cases {
-		_, err := NewExporter(consulOpts{uri: test.uri}, "", ".*", true, log.NewNopLogger())
+		_, err := NewExporter(consulOpts{uri: test.uri}, "", ".*", true, true, log.NewNopLogger())
 		if test.ok && err != nil {
 			t.Errorf("expected no error w/ %q, but got %q", test.uri, err)
 		}
@@ -208,7 +208,7 @@ consul_service_tag{node="{{ .Node }}",service_id="foobar",tag="tag2"} 1
 			uri:          addr,
 			timeout:      time.Duration(time.Second),
 			requestLimit: tc.requestLimit,
-		}, "", "", true, log.NewNopLogger())
+		}, "", "", true, true, log.NewNopLogger())
 		if err != nil {
 			t.Errorf("expected no error but got %q", err)
 		}
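With `--operator.autopilot-server-health` enabled, the exporter exposes one series per Consul server for each of the four new metric families. A rough sketch of the resulting scrape output; server IDs, names, addresses, versions and values are purely illustrative:

```text
consul_operator_autopilot_health_voter{server_address="10.0.0.11:8300",server_id="3c4f7b2a-1db8-4a52-9c2e-8d0f6a1b5e9c",server_name="consul-1",server_version="1.8.5"} 1
consul_operator_autopilot_health_healthy{server_address="10.0.0.11:8300",server_id="3c4f7b2a-1db8-4a52-9c2e-8d0f6a1b5e9c",server_name="consul-1",server_version="1.8.5"} 1
consul_operator_autopilot_health_last_index{server_address="10.0.0.11:8300",server_id="3c4f7b2a-1db8-4a52-9c2e-8d0f6a1b5e9c",server_name="consul-1",server_version="1.8.5"} 829341
consul_operator_autopilot_health_last_term{server_address="10.0.0.11:8300",server_id="3c4f7b2a-1db8-4a52-9c2e-8d0f6a1b5e9c",server_name="consul-1",server_version="1.8.5"} 7
```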