From 5cfdd0142245c9976d839febe81d6ea3b55bbb5a Mon Sep 17 00:00:00 2001
From: Temirlan Zhussupov <45013594+zhussupov-t@users.noreply.github.com>
Date: Wed, 27 Jan 2021 19:02:30 +0300
Subject: [PATCH] Metrics: refactoring, shard state logging (#65)

---
Reviewer notes (text in this area is ignored by `git am`):

  * Label names are centralised in label* constants. On the existing shard
    gauges (critical_level, state) the "master_uri" label is dropped and
    "uuid" / "state" are renamed to "shard_uuid" / "shard_state", so
    queries written against the old label names stop matching.
  * The discovery "errors" metric changes from a CounterVec labelled by
    cluster_name/uri to a single unlabelled Counter; RecordDiscoveryError
    therefore no longer takes arguments.
  * A new orchestrator "shard_state_event" CounterVec is added and is
    incremented from failover.checkAndRecover right after the analysis is
    logged, before the recovery function is looked up.
  * NOTE(review): this copy was re-flowed from a whitespace-collapsed
    paste. Indentation and the position of blank context lines were
    reconstructed from the hunk line counts and gofmt conventions; run
    `git apply --check` against the parent commit before relying on it.

 internal/metrics/metrics.go              | 59 ++++++++++++++++--------
 internal/vshard/cluster.go               | 10 ++--
 internal/vshard/orchestrator/failover.go |  2 +
 internal/vshard/orchestrator/monitor.go  |  2 +-
 4 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index dce83d3..0c03ba9 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -1,12 +1,22 @@
 package metrics
 
-import "github.com/prometheus/client_golang/prometheus"
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
 
 const (
 	discoveryInstanceDurations = "instance_durations"
 	discoveryClusterDurations  = "cluster_durations"
 	shardCriticalLevel         = "critical_level"
 	shardState                 = "state"
+	shardStateEvent            = "shard_state_event"
+)
+
+const (
+	labelClusterName = "cluster_name"
+	labelHostName    = "hostname"
+	labelShardState  = "shard_state"
+	labelShardUUID   = "shard_uuid"
 )
 
 var (
@@ -20,42 +30,49 @@ var (
 		Name:      discoveryInstanceDurations,
 		Help:      "Instance discovery latencies in seconds",
 		Buckets:   discoveryInstanceDurationsBuckets,
-	}, []string{"cluster_name", "hostname"})
+	}, []string{labelClusterName, labelHostName})
 
 	discoveryClusterDurationsSum = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Subsystem: "discovery",
 		Name:      discoveryClusterDurations,
 		Help:      "Cluster discovery latencies in seconds",
 		Buckets:   discoveryClusterDurationsBuckets,
-	}, []string{"cluster_name"})
+	}, []string{labelClusterName})
 
 	shardCriticalLevelGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Subsystem: "shard",
 		Name:      shardCriticalLevel,
 		Help:      "Critical level of the replica set",
-	}, []string{"cluster_name", "uuid", "master_uri"})
+	}, []string{labelClusterName, labelShardUUID})
 
 	shardStateGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Subsystem: "shard",
 		Name:      shardState,
 		Help:      "The state of each shard in the cluster; it will have one line for each possible state of each shard. A value of 1 means the shard is in the state specified by the state label, a value of 0 means it is not.",
-	}, []string{"cluster_name", "uuid", "master_uri", "state"})
+	}, []string{labelClusterName, labelShardUUID, labelShardState})
 
-	discoveryErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
+	discoveryErrors = prometheus.NewCounter(prometheus.CounterOpts{
 		Subsystem: "discovery",
 		Name:      "errors",
 		Help:      "Errors that happen during discovery process",
-	}, []string{"cluster_name", "uri"})
+	})
+
+	shardStateCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Subsystem: "orchestrator",
+		Name:      shardStateEvent,
+		Help:      "Discovered shard state event",
+	}, []string{labelClusterName, labelShardUUID, labelShardState})
 )
 
 func init() {
-	discoveryErrors.With(prometheus.Labels{"cluster_name": "", "uri": ""}).Add(0)
+	discoveryErrors.Add(0)
 	prometheus.MustRegister(
 		discoveryInstanceDurationsSum,
 		discoveryClusterDurationsSum,
 		shardCriticalLevelGauge,
 		shardStateGauge,
 		discoveryErrors,
+		shardStateCounter,
 	)
 }
 
@@ -95,26 +112,30 @@ func StartClusterDiscovery(clusterName string) Transaction {
 	return txn.Start()
 }
 
-func SetShardCriticalLevel(clusterName, uuid, masterURI string, level int) {
-	shardCriticalLevelGauge.WithLabelValues(clusterName, uuid, masterURI).Set(float64(level))
+func SetShardCriticalLevel(clusterName, uuid string, level int) {
+	shardCriticalLevelGauge.WithLabelValues(clusterName, uuid).Set(float64(level))
 }
 
-func SetShardState(clusterName, uuid, masterURI, state string, active bool) {
+func SetShardState(clusterName, uuid, state string, active bool) {
 	v := float64(0)
 	if active {
 		v = 1
 	}
 	shardStateGauge.With(prometheus.Labels{
-		"cluster_name": clusterName,
-		"uuid":         uuid,
-		"master_uri":   masterURI,
-		"state":        state,
+		labelClusterName: clusterName,
+		labelShardUUID:   uuid,
+		labelShardState:  state,
 	}).Set(v)
 }
 
-func RecordDiscoveryError(clusterName, uri string) {
-	discoveryErrors.With(prometheus.Labels{
-		"cluster_name": clusterName,
-		"uri":          uri,
+func RecordDiscoveryError() {
+	discoveryErrors.Inc()
+}
+
+func RecordDiscoveredShardState(clusterName, shardUUID, state string) {
+	shardStateCounter.With(prometheus.Labels{
+		labelClusterName: clusterName,
+		labelShardUUID:   shardUUID,
+		labelShardState:  state,
 	}).Inc()
 }
diff --git a/internal/vshard/cluster.go b/internal/vshard/cluster.go
index 52bef3c..4044b1d 100644
--- a/internal/vshard/cluster.go
+++ b/internal/vshard/cluster.go
@@ -292,7 +292,7 @@ func (c *Cluster) Discover() {
 		conn := c.Connector(router.URI)
 		resp := conn.Exec(ctx, vshardRouterInfoQuery)
 		if resp.Error != nil {
-			metrics.RecordDiscoveryError(c.Name, router.URI)
+			metrics.RecordDiscoveryError()
 			c.logger.
 				Err(resp.Error).
 				Str("URI", router.URI).
@@ -302,7 +302,7 @@ func (c *Cluster) Discover() {
 
 		updatedRI, err := ParseRouterInfo(resp.Data)
 		if err != nil {
-			metrics.RecordDiscoveryError(c.Name, router.URI)
+			metrics.RecordDiscoveryError()
 			c.logger.Err(err).
 				Str("URI", router.URI).
 				Msg("Failed to discover the topology of the cluster using router")
@@ -371,7 +371,7 @@ func (c *Cluster) Discover() {
 		ns.ReplicaSets = append(ns.ReplicaSets, set)
 
 		code, _ := set.HealthStatus()
-		metrics.SetShardCriticalLevel(c.Name, string(set.UUID), set.MasterURI, int(code))
+		metrics.SetShardCriticalLevel(c.Name, string(set.UUID), int(code))
 
 		c.logDiscoveredReplicaSet(set)
 	}
@@ -446,7 +446,7 @@ func (c *Cluster) discoverInstance(ctx context.Context, inst *Instance) {
 	conn := c.Connector(inst.URI)
 	resp := conn.Exec(ctx, vshardInstanceInfoQuery)
 	if resp.Error != nil {
-		metrics.RecordDiscoveryError(c.Name, inst.URI)
+		metrics.RecordDiscoveryError()
 		c.logger.Err(resp.Error).
 			Str("URI", inst.URI).
 			Str("UUID", string(inst.UUID)).
@@ -457,7 +457,7 @@ func (c *Cluster) discoverInstance(ctx context.Context, inst *Instance) {
 
 	info, err := ParseInstanceInfo(resp.Data)
 	if err != nil {
-		metrics.RecordDiscoveryError(c.Name, inst.URI)
+		metrics.RecordDiscoveryError()
 		c.logger.Err(err).
 			Str("URI", inst.URI).
 			Str("UUID", string(inst.UUID)).
diff --git a/internal/vshard/orchestrator/failover.go b/internal/vshard/orchestrator/failover.go
index c67c790..eb35d2f 100644
--- a/internal/vshard/orchestrator/failover.go
+++ b/internal/vshard/orchestrator/failover.go
@@ -11,6 +11,7 @@ import (
 	"github.com/rs/zerolog"
 	"github.com/viciious/go-tarantool"
 
+	"github.com/shmel1k/qumomf/internal/metrics"
 	"github.com/shmel1k/qumomf/internal/quorum"
 	"github.com/shmel1k/qumomf/internal/util"
 	"github.com/shmel1k/qumomf/internal/vshard"
@@ -148,6 +149,7 @@ func (f *failover) checkAndRecover(ctx context.Context, analysis *ReplicationAna
 		Str("master_uri", analysis.Set.MasterURI).
 		Logger()
 	logger.WithLevel(f.sampler.sample(analysis)).Str("analysis", analysis.String()).Msg("checkAndRecover")
+	metrics.RecordDiscoveredShardState(f.cluster.Name, string(analysis.Set.UUID), string(analysis.State))
 
 	recvFunc, desc := f.getCheckAndRecoveryFunc(analysis.State)
 	if recvFunc == nil {
diff --git a/internal/vshard/orchestrator/monitor.go b/internal/vshard/orchestrator/monitor.go
index 051f72a..1db29f1 100644
--- a/internal/vshard/orchestrator/monitor.go
+++ b/internal/vshard/orchestrator/monitor.go
@@ -86,7 +86,7 @@ func (m *storageMonitor) checkCluster(stream AnalysisWriteStream) {
 
 				for _, state := range ReplicaSetStateEnum {
 					active := state == analysis.State
-					metrics.SetShardState(m.cluster.Name, string(set.UUID), set.MasterURI, string(state), active)
+					metrics.SetShardState(m.cluster.Name, string(set.UUID), string(state), active)
 				}
 			}
 		}(set)