Collect operator autopilot health metrics #195

Open · wants to merge 1 commit into master
18 changes: 18 additions & 0 deletions README.md
@@ -19,6 +19,10 @@ make
| ----------------------------------- | ---------------------------------------------------------------------------------------------------- | --------------------------------------------- |
| consul_up | Was the last query of Consul successful | |
| consul_raft_peers | How many peers (servers) are in the Raft cluster | |
| consul_operator_autopilot_health_voter | Whether a server is a Raft voter | server_id, server_name, server_address, server_version |
| consul_operator_autopilot_health_healthy | Whether a server is healthy (according to Raft Autopilot) | server_id, server_name, server_address, server_version |
| consul_operator_autopilot_health_last_index | The last known Raft index a server has replayed | server_id, server_name, server_address, server_version |
| consul_operator_autopilot_health_last_term | The last known Raft leader term a server has seen | server_id, server_name, server_address, server_version |
| consul_serf_lan_members | How many members are in the cluster | |
| consul_serf_lan_member_status | Status of member in the cluster. 1=Alive, 2=Leaving, 3=Left, 4=Failed. | member |
| consul_catalog_services | How many services are in the cluster | |
@@ -75,6 +79,16 @@ against the actual value found via monitoring.
A prefix must be supplied to activate this feature. Pass `/` if you want to
search the entire keyspace.

#### Operator Autopilot Server Health

This exporter can gather low-level server metrics through the
Operator API's Autopilot Health endpoint. This endpoint is privileged
and requires `operator:read`, so it should only be used with a token
whose ACL policy is tightly restricted.
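
For example, a minimal ACL policy for a token used only by this exporter
might look roughly like the following sketch (the `node_prefix` and
`service_prefix` rules cover the exporter's other collectors and may need
adjusting, e.g. adding a `key_prefix` rule when `kv.prefix` is used):

```hcl
# Read-only access to operator endpoints such as
# /v1/operator/autopilot/health.
operator = "read"

# Read-only access for the catalog, health and members collectors.
node_prefix "" {
  policy = "read"
}
service_prefix "" {
  policy = "read"
}
```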

* __`operator.autopilot-server-health`:__ Collects low-level server metrics
  from the `/v1/operator/autopilot/health` endpoint, as shown below.
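
With the flag enabled, the exported series look roughly like the following
(server IDs, names, addresses, versions and values are illustrative):

```
consul_operator_autopilot_health_voter{server_id="b7b2b34a-...",server_name="consul-1",server_address="10.0.0.1:8300",server_version="1.15.2"} 1
consul_operator_autopilot_health_healthy{server_id="b7b2b34a-...",server_name="consul-1",server_address="10.0.0.1:8300",server_version="1.15.2"} 1
consul_operator_autopilot_health_last_index{server_id="b7b2b34a-...",server_name="consul-1",server_address="10.0.0.1:8300",server_version="1.15.2"} 12345
consul_operator_autopilot_health_last_term{server_id="b7b2b34a-...",server_name="consul-1",server_address="10.0.0.1:8300",server_version="1.15.2"} 4
```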

### Environment variables

The consul\_exporter supports all environment variables provided by the official
@@ -99,6 +113,10 @@ __What service checks are critical?__

You can query for the following health check states: "maintenance", "critical", "warning" or "passing"

__Which servers are often lagging behind the cluster?__

A server that frequently fails Autopilot health checks (for example because
its Raft log is lagging behind) will show an average value below 1 over time:

avg(consul_operator_autopilot_health_healthy) by (server_name)
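
To look at replication lag directly, one possible (illustrative) query
subtracts each server's last replayed Raft index from the cluster-wide
maximum:

scalar(max(consul_operator_autopilot_health_last_index)) - consul_operator_autopilot_health_last_index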

## Using Docker

You can deploy this exporter using the [prom/consul-exporter](https://registry.hub.docker.com/r/prom/consul-exporter) Docker image.
76 changes: 69 additions & 7 deletions consul_exporter.go
@@ -58,6 +58,26 @@ var (
"Does Raft cluster have a leader (according to this node).",
nil, nil,
)
operatorAutopilotVoter = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "operator_autopilot_health_voter"),
"If a server is a voter or not.",
[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
)
operatorAutopilotHealthy = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "operator_autopilot_health_healthy"),
"If a server is healthy or not (according to raft autopilot).",
[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
)
operatorAutopilotLastIndex = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "operator_autopilot_health_last_index"),
"The last known raft index a server has replayed.",
[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
)
operatorAutopilotLastTerm = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "operator_autopilot_health_last_term"),
"The last known voting index a server has seen/sent.",
[]string{"server_id", "server_name", "server_address", "server_version"}, nil,
)
nodeCount = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "serf_lan_members"),
"How many members are in the cluster.",
@@ -121,6 +141,7 @@ type Exporter struct {
kvPrefix string
kvFilter *regexp.Regexp
healthSummary bool
operatorHealth bool
logger log.Logger
requestLimitChan chan struct{}
}
@@ -137,7 +158,7 @@ type consulOpts struct {
}

// NewExporter returns an initialized Exporter.
func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, logger log.Logger) (*Exporter, error) {
func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, operatorHealth bool, logger log.Logger) (*Exporter, error) {
uri := opts.uri
if !strings.Contains(uri, "://") {
uri = "http://" + uri
@@ -188,6 +209,7 @@ func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool,
kvPrefix: kvPrefix,
kvFilter: regexp.MustCompile(kvFilter),
healthSummary: healthSummary,
operatorHealth: operatorHealth,
logger: logger,
requestLimitChan: requestLimitChan,
}, nil
@@ -199,6 +221,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- up
ch <- clusterServers
ch <- clusterLeader
ch <- operatorAutopilotHealthy
ch <- operatorAutopilotLastIndex
ch <- operatorAutopilotLastTerm
ch <- operatorAutopilotVoter
ch <- nodeCount
ch <- memberStatus
ch <- serviceCount
@@ -215,6 +241,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ok := e.collectPeersMetric(ch)
ok = e.collectLeaderMetric(ch) && ok
ok = e.collectOperatorAutopilotServerHealthMetric(ch) && ok
ok = e.collectNodesMetric(ch) && ok
ok = e.collectMembersMetric(ch) && ok
ok = e.collectServicesMetric(ch) && ok
@@ -262,6 +289,40 @@ func (e *Exporter) collectLeaderMetric(ch chan<- prometheus.Metric) bool {
return true
}

// collectOperatorAutopilotServerHealthMetric gathers per-server voter, health,
// last-index and last-term metrics from the autopilot health endpoint. It is a
// no-op unless the --operator.autopilot-server-health flag is enabled.
func (e *Exporter) collectOperatorAutopilotServerHealthMetric(ch chan<- prometheus.Metric) bool {
if !e.operatorHealth {
return true
}
clusterHealth, err := e.client.Operator().AutopilotServerHealth(&queryOptions)
if err != nil {
level.Error(e.logger).Log("msg", "Failed to get autopilot server health", "err", err)
return false
}
for _, server := range clusterHealth.Servers {
ch <- prometheus.MustNewConstMetric(
operatorAutopilotLastIndex, prometheus.CounterValue, float64(server.LastIndex), server.ID, server.Name, server.Address, server.Version,
)
ch <- prometheus.MustNewConstMetric(
operatorAutopilotLastTerm, prometheus.CounterValue, float64(server.LastTerm), server.ID, server.Name, server.Address, server.Version,
)
// Healthy and Voter are booleans, so expose them as 0/1 gauge values.
serverHealth := 0.0
if server.Healthy {
serverHealth = 1.0
}
ch <- prometheus.MustNewConstMetric(
operatorAutopilotHealthy, prometheus.GaugeValue, serverHealth, server.ID, server.Name, server.Address, server.Version,
)
serverVoter := 0.0
if server.Voter {
serverVoter = 1.0
}
ch <- prometheus.MustNewConstMetric(
operatorAutopilotVoter, prometheus.GaugeValue, serverVoter, server.ID, server.Name, server.Address, server.Version,
)
}
return true
}

func (e *Exporter) collectNodesMetric(ch chan<- prometheus.Metric) bool {
nodes, _, err := e.client.Catalog().Nodes(&queryOptions)
if err != nil {
@@ -456,11 +517,12 @@ func init() {

func main() {
var (
listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
metricsPath = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
healthSummary = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
kvPrefix = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
kvFilter = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
metricsPath = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
healthSummary = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
kvPrefix = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
kvFilter = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
operatorHealth = kingpin.Flag("operator.autopilot-server-health", "Collect operator autopilot server health metrics.").Default("false").Bool()

opts = consulOpts{}
)
@@ -486,7 +548,7 @@ func main() {
level.Info(logger).Log("msg", "Starting consul_exporter", "version", version.Info())
level.Info(logger).Log("build_context", version.BuildContext())

exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, logger)
exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, *operatorHealth, logger)
if err != nil {
level.Error(logger).Log("msg", "Error creating the exporter", "err", err)
os.Exit(1)
4 changes: 2 additions & 2 deletions consul_exporter_test.go
@@ -40,7 +40,7 @@ func TestNewExporter(t *testing.T) {
}

for _, test := range cases {
_, err := NewExporter(consulOpts{uri: test.uri}, "", ".*", true, log.NewNopLogger())
_, err := NewExporter(consulOpts{uri: test.uri}, "", ".*", true, true, log.NewNopLogger())
if test.ok && err != nil {
t.Errorf("expected no error w/ %q, but got %q", test.uri, err)
}
@@ -208,7 +208,7 @@ consul_service_tag{node="{{ .Node }}",service_id="foobar",tag="tag2"} 1
uri: addr,
timeout: time.Duration(time.Second),
requestLimit: tc.requestLimit,
}, "", "", true, log.NewNopLogger())
}, "", "", true, true, log.NewNopLogger())
if err != nil {
t.Errorf("expected no error but got %q", err)
}