Skip to content

Commit

Permalink
Collect operator autopilot health metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
chuckyz committed Nov 10, 2020
1 parent 409cb07 commit f16696b
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 7 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ make
| ----------------------------------- | ---------------------------------------------------------------------------------------------------- | --------------------------------------------- |
| consul_up | Was the last query of Consul successful | |
| consul_raft_peers | How many peers (servers) are in the Raft cluster | |
| consul_operator_autopilot_server_health | The status of each server's health from a server cluster perspective | server_id, server_name, server_address, server_health_metric |
| consul_serf_lan_members | How many members are in the cluster | |
| consul_serf_lan_member_status | Status of member in the cluster. 1=Alive, 2=Leaving, 3=Left, 4=Failed. | member |
| consul_catalog_services | How many services are in the cluster | |
Expand Down Expand Up @@ -75,6 +76,16 @@ against the actual value found via monitoring.
A prefix must be supplied to activate this feature. Pass `/` if you want to
search the entire keyspace.

#### Operator Autopilot Server Health

This exporter allows gathering low-level server metrics through the
Operator API's Autopilot Health endpoint. This is a highly privileged
endpoint that requires `operator:read`, and so should only be used
with a restricted ACL in a trusted fashion.

* __`operator.autopilot-server-health`:__ Collects low-level server metrics
from the v1/operator/autopilot/health endpoint.

### Environment variables

The consul\_exporter supports all environment variables provided by the official
Expand Down
58 changes: 51 additions & 7 deletions consul_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ var (
"Does Raft cluster have a leader (according to this node).",
nil, nil,
)
operatorAutopilotServerHealth = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "operator_autopilot_server_health"),
"The status of each servers health from a server cluster perspective",
[]string{"server_id", "server_name", "server_address", "server_health_metric"}, nil,
)
nodeCount = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "serf_lan_members"),
"How many members are in the cluster.",
Expand Down Expand Up @@ -121,6 +126,7 @@ type Exporter struct {
kvPrefix string
kvFilter *regexp.Regexp
healthSummary bool
operatorHealth bool
logger log.Logger
requestLimitChan chan struct{}
}
Expand All @@ -137,7 +143,7 @@ type consulOpts struct {
}

// NewExporter returns an initialized Exporter.
func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, logger log.Logger) (*Exporter, error) {
func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, operatorHealth bool, logger log.Logger) (*Exporter, error) {
uri := opts.uri
if !strings.Contains(uri, "://") {
uri = "http://" + uri
Expand Down Expand Up @@ -188,6 +194,7 @@ func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool,
kvPrefix: kvPrefix,
kvFilter: regexp.MustCompile(kvFilter),
healthSummary: healthSummary,
operatorHealth: operatorHealth,
logger: logger,
requestLimitChan: requestLimitChan,
}, nil
Expand All @@ -199,6 +206,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- up
ch <- clusterServers
ch <- clusterLeader
ch <- operatorAutopilotServerHealth
ch <- nodeCount
ch <- memberStatus
ch <- serviceCount
Expand All @@ -215,6 +223,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ok := e.collectPeersMetric(ch)
ok = e.collectLeaderMetric(ch) && ok
ok = e.collectOperatorAutopilotServerHealthMetric(ch) && ok
ok = e.collectNodesMetric(ch) && ok
ok = e.collectMembersMetric(ch) && ok
ok = e.collectServicesMetric(ch) && ok
Expand Down Expand Up @@ -262,6 +271,40 @@ func (e *Exporter) collectLeaderMetric(ch chan<- prometheus.Metric) bool {
return true
}

// collectOperatorAutopilotServerHealthMetric emits one sample per low-level
// health datum (LastIndex, LastTerm, Health, Voter) for each server reported
// by Consul's operator autopilot health endpoint, labeled by server identity
// and the name of the datum in the server_health_metric label.
//
// It is a no-op returning true when the feature is disabled via the
// operator.autopilot-server-health flag; it returns false only when the
// underlying API query fails. The queried endpoint requires operator:read
// ACL privileges (see README).
func (e *Exporter) collectOperatorAutopilotServerHealthMetric(ch chan<- prometheus.Metric) bool {
	if !e.operatorHealth {
		// Feature disabled: not an error, so do not degrade consul_up.
		return true
	}
	clusterHealth, err := e.client.Operator().AutopilotServerHealth(&queryOptions)
	if err != nil {
		level.Error(e.logger).Log("msg", "Failed to get autopilot server health", "err", err)
		return false
	}
	// NOTE(review): the 0/1 Health and Voter samples (and arguably all four)
	// are states, not monotonic counts, so prometheus.GaugeValue would be the
	// semantically correct type; kept as CounterValue to preserve the exposed
	// metric metadata — confirm before changing.
	for _, server := range clusterHealth.Servers {
		ch <- prometheus.MustNewConstMetric(
			operatorAutopilotServerHealth, prometheus.CounterValue, float64(server.LastIndex), server.ID, server.Name, server.Address, "LastIndex",
		)
		ch <- prometheus.MustNewConstMetric(
			operatorAutopilotServerHealth, prometheus.CounterValue, float64(server.LastTerm), server.ID, server.Name, server.Address, "LastTerm",
		)
		serverHealth := 0.0
		if server.Healthy {
			serverHealth = 1.0
		}
		ch <- prometheus.MustNewConstMetric(
			operatorAutopilotServerHealth, prometheus.CounterValue, serverHealth, server.ID, server.Name, server.Address, "Health",
		)
		serverVoter := 0.0
		if server.Voter {
			serverVoter = 1.0
		}
		ch <- prometheus.MustNewConstMetric(
			operatorAutopilotServerHealth, prometheus.CounterValue, serverVoter, server.ID, server.Name, server.Address, "Voter",
		)
	}
	return true
}

func (e *Exporter) collectNodesMetric(ch chan<- prometheus.Metric) bool {
nodes, _, err := e.client.Catalog().Nodes(&queryOptions)
if err != nil {
Expand Down Expand Up @@ -456,11 +499,12 @@ func init() {

func main() {
var (
listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
metricsPath = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
healthSummary = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
kvPrefix = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
kvFilter = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
metricsPath = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
healthSummary = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
kvPrefix = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
kvFilter = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
operatorHealth = kingpin.Flag("operator.autopilot-server-health", "Collect all operator autopilot server health").Default("true").Bool()

opts = consulOpts{}
)
Expand All @@ -486,7 +530,7 @@ func main() {
level.Info(logger).Log("msg", "Starting consul_exporter", "version", version.Info())
level.Info(logger).Log("build_context", version.BuildContext())

exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, logger)
exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, *operatorHealth, logger)
if err != nil {
level.Error(logger).Log("msg", "Error creating the exporter", "err", err)
os.Exit(1)
Expand Down

0 comments on commit f16696b

Please sign in to comment.