diff --git a/README.md b/README.md index 0428b64..e9e206d 100644 --- a/README.md +++ b/README.md @@ -301,16 +301,14 @@ terraform-applier exports Prometheus metrics. The metrics are available on given In addition to the [controller-runtime](https://book.kubebuilder.io/reference/metrics-reference.html) default metrics, the following custom metrics are included: -- `terraform_applier_module_run_count` - (tags: `module`,`namespace`, `success`) A Counter for each module that has had a terraform run attempt over the lifetime of +- `terraform_applier_module_info` - (tags: `module`,`namespace`, `state`, `reason`) A Gauge that captures the current information about the module, including its status +- `terraform_applier_module_run_count` - (tags: `module`,`namespace`, `run_type`, `success`) A Counter for each module that has had a terraform run attempt over the lifetime of the application, incremented with each apply attempt and tagged with the result of the run (`success=true|false`) -- `terraform_applier_module_run_duration_seconds` - (tags: `module`,`namespace`, `success`) A Summary that keeps track of the durations of each terraform run for +- `terraform_applier_module_run_duration_seconds` - (tags: `module`,`namespace`, `run_type`, `success`) A Histogram that keeps track of the durations of each terraform run for each module, tagged with the result of the run (`success=true|false`) -- `terraform_applier_module_last_run_success` - (tags: `module`,`namespace`) A `Gauge` which +- `terraform_applier_module_last_run_success` - (tags: `module`,`namespace`, `run_type`) A `Gauge` which tracks whether the last terraform run for a module was successful. -- `terraform_applier_module_last_run_timestamp` - (tags: `module`,`namespace`) A Gauge that captures the Timestamp of the last successful module run.
-- `terraform_applier_module_terraform_exit_code_count` - (tags: `module`,`namespace`, `command`, `exit_code`) A `Counter` for each exit code returned by executions of - `terraform`, labelled with the command issued (`init`, `plan`,`apply`) and the exit code. It's worth noting that `plan` will - return a code of `2` if there are changes to be made, which is not an error or a failure, so you may wish to account for this in your alerting. +- `terraform_applier_module_last_run_timestamp` - (tags: `module`,`namespace`,`run_type`) A Gauge that captures the Timestamp of the last successful module run. - `terraform_applier_git_last_mirror_timestamp` - (tags: `repo`) A Gauge that captures the Timestamp of the last successful git sync per repo. - `terraform_applier_git_mirror_count` - (tags: `repo`,`success`) A Counter for each repo sync, incremented with each sync attempt and tagged with the result (`success=true|false`) - `terraform_applier_git_mirror_latency_seconds` - (tags: `repo`) A Summary that keeps track of the git sync latency per repo. 
diff --git a/integration_test/module_controller_with_runner_test.go b/integration_test/module_controller_with_runner_test.go index 31d9715..42416ed 100644 --- a/integration_test/module_controller_with_runner_test.go +++ b/integration_test/module_controller_with_runner_test.go @@ -62,7 +62,6 @@ var _ = Describe("Module controller with Runner", func() { testMetrics.EXPECT().UpdateModuleRunDuration(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() testMetrics.EXPECT().UpdateModuleSuccess(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() - testMetrics.EXPECT().UpdateTerraformExitCodeCount(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() testMetrics.EXPECT().SetRunPending(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() // clear state file if exits diff --git a/main.go b/main.go index cd703aa..38f614e 100644 --- a/main.go +++ b/main.go @@ -719,6 +719,25 @@ func run(c *cli.Context) { logger.Info("OIDC authentication configured", "issuer", c.String("oidc-issuer"), "clientID", c.String("oidc-client-id")) } + go func(client client.Client) { + if err := metrics.CollectModuleInfo(ctx, client); err != nil { + logger.Error("unable to collect module info metrics", "error", err) + } + + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + for { + select { + case <-ticker.C: + if err := metrics.CollectModuleInfo(ctx, client); err != nil { + logger.Error("unable to collect module info metrics", "error", err) + } + case <-ctx.Done(): + return + } + } + }(mgr.GetClient()) + webserver := &webserver.WebServer{ Authenticator: oidcAuthenticator, ListenAddress: c.String("webserver-bind-address"), diff --git a/metrics/mock_prometheus.go b/metrics/mock_prometheus.go index 2e85894..b0a7b2a 100644 --- a/metrics/mock_prometheus.go +++ b/metrics/mock_prometheus.go @@ -68,15 +68,3 @@ func (mr *MockPrometheusInterfaceMockRecorder) UpdateModuleSuccess(arg0, arg1, a mr.mock.ctrl.T.Helper() return 
mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateModuleSuccess", reflect.TypeOf((*MockPrometheusInterface)(nil).UpdateModuleSuccess), arg0, arg1, arg2, arg3) } - -// UpdateTerraformExitCodeCount mocks base method. -func (m *MockPrometheusInterface) UpdateTerraformExitCodeCount(arg0, arg1, arg2 string, arg3 int) { - m.ctrl.T.Helper() - m.ctrl.Call(m, "UpdateTerraformExitCodeCount", arg0, arg1, arg2, arg3) -} - -// UpdateTerraformExitCodeCount indicates an expected call of UpdateTerraformExitCodeCount. -func (mr *MockPrometheusInterfaceMockRecorder) UpdateTerraformExitCodeCount(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateTerraformExitCodeCount", reflect.TypeOf((*MockPrometheusInterface)(nil).UpdateTerraformExitCodeCount), arg0, arg1, arg2, arg3) -} diff --git a/metrics/prometheus.go b/metrics/prometheus.go index ef2bdcf..ebc525a 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -1,10 +1,13 @@ package metrics import ( + "context" "strconv" "time" "github.com/prometheus/client_golang/prometheus" + tfaplv1beta1 "github.com/utilitywarehouse/terraform-applier/api/v1beta1" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/metrics" ) @@ -16,7 +19,6 @@ const ( // PrometheusInterface allows for mocking out the functionality of Prometheus when testing the full process of an apply run. type PrometheusInterface interface { - UpdateTerraformExitCodeCount(string, string, string, int) UpdateModuleSuccess(string, string, string, bool) UpdateModuleRunDuration(string, string, string, float64, bool) SetRunPending(string, string, bool) @@ -29,16 +31,32 @@ type PrometheusInterface interface { // moduleRunSuccess is the last run outcome of the module run. // moduleRunning is the number of modules currently in running state. 
type Prometheus struct { - terraformExitCodeCount *prometheus.CounterVec - moduleRunCount *prometheus.CounterVec - moduleRunDuration *prometheus.HistogramVec - moduleRunPending *prometheus.GaugeVec - moduleRunSuccess *prometheus.GaugeVec - moduleRunTimestamp *prometheus.GaugeVec + moduleRunCount *prometheus.CounterVec + moduleRunDuration *prometheus.HistogramVec + moduleRunPending *prometheus.GaugeVec + moduleRunSuccess *prometheus.GaugeVec + moduleRunTimestamp *prometheus.GaugeVec + moduleInfo *prometheus.GaugeVec } // Init creates and registers the custom metrics for terraform-applier. func (p *Prometheus) Init() { + p.moduleInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Name: "module_info", + Help: "Current information about module including status", + }, + []string{ + "module", + // Namespace name of the module that was ran + "namespace", + // state of the module + "state", + // potential reason associated with current state + "reason", + }, + ) + p.moduleRunCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: metricsNamespace, Name: "module_run_count", @@ -112,22 +130,6 @@ func (p *Prometheus) Init() { "run_type", }, ) - p.terraformExitCodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{ - Namespace: metricsNamespace, - Name: "module_terraform_exit_code_count", - Help: "Count of terraform exit codes", - }, - []string{ - // Name of the module that was ran - "module", - // Namespace name of the module that was ran - "namespace", - // plan, apply, init etc - "command", - // Exit code - "exit_code", - }, - ) // Register custom metrics with the global prometheus registry metrics.Registry.MustRegister( @@ -136,21 +138,11 @@ func (p *Prometheus) Init() { p.moduleRunSuccess, p.moduleRunPending, p.moduleRunTimestamp, - p.terraformExitCodeCount, + p.moduleInfo, ) } -// UpdateTerraformExitCodeCount increments for each exit code returned by terraform -func (p *Prometheus) UpdateTerraformExitCodeCount(module, 
namespace string, cmd string, code int) { - p.terraformExitCodeCount.With(prometheus.Labels{ - "module": module, - "namespace": namespace, - "command": cmd, - "exit_code": strconv.Itoa(code), - }).Inc() -} - // UpdateModuleSuccess increments the given module's Counter for either successful or failed run attempts. func (p *Prometheus) UpdateModuleSuccess(module, namespace, runType string, success bool) { if success { @@ -203,3 +195,25 @@ func (p *Prometheus) SetRunPending(module, namespace string, pending bool) { "namespace": namespace, }).Set(as) } + +// CollectModuleInfo when called resets 'module_info' and collect current state of the modules +func (p *Prometheus) CollectModuleInfo(ctx context.Context, kc client.Client) error { + + kubeModuleList := &tfaplv1beta1.ModuleList{} + if err := kc.List(ctx, kubeModuleList); err != nil { + return err + } + + // reset all values and re-set current value + p.moduleInfo.Reset() + + for _, m := range kubeModuleList.Items { + p.moduleInfo.With(prometheus.Labels{ + "module": m.Name, + "namespace": m.Namespace, + "state": m.Status.CurrentState, + "reason": m.Status.StateReason, + }).Set(1) + } + return nil +} diff --git a/runner/runner.go b/runner/runner.go index 4521ddc..b4eedc2 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -132,7 +132,8 @@ func (r *Runner) process(run *tfaplv1beta1.Run, cancelChan <-chan struct{}, envs defer func() { // there are no annotations for schedule and polling runs if run.Request.Type == tfaplv1beta1.ScheduledRun || - run.Request.Type == tfaplv1beta1.PollingRun { + run.Request.Type == tfaplv1beta1.PollingRun || + run.Request.Type == tfaplv1beta1.PRPlan { return } if err := sysutil.RemoveRequest(context.Background(), r.ClusterClt, run.Module, run.Request); err != nil { diff --git a/runner/tfexec.go b/runner/tfexec.go index f9dcd0d..219d2b2 100644 --- a/runner/tfexec.go +++ b/runner/tfexec.go @@ -4,15 +4,12 @@ import ( "bytes" "context" "encoding/json" - "errors" "fmt" "os" - "os/exec" 
"path/filepath" "github.com/hashicorp/terraform-exec/tfexec" tfaplv1beta1 "github.com/utilitywarehouse/terraform-applier/api/v1beta1" - "github.com/utilitywarehouse/terraform-applier/metrics" "github.com/utilitywarehouse/terraform-applier/sysutil" ) @@ -36,8 +33,7 @@ type tfRunner struct { workingDir string planFileName string - metrics metrics.PrometheusInterface - tf *tfexec.Terraform + tf *tfexec.Terraform } func (r *Runner) NewTFRunner( @@ -73,7 +69,6 @@ func (r *Runner) NewTFRunner( tfr := &tfRunner{ moduleName: module.Name, moduleNamespace: module.Namespace, - metrics: r.Metrics, rootDir: tmpRoot, workingDir: filepath.Join(tmpRoot, module.Spec.Path), planFileName: "plan.out", @@ -174,14 +169,8 @@ func (te *tfRunner) init(ctx context.Context, backendConf map[string]string) (st } if err := te.tf.Init(ctx, opts...); err != nil { - if uerr := errors.Unwrap(err); uerr != nil { - if e, ok := uerr.(*exec.ExitError); ok { - te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "init", e.ExitCode()) - } - } return out.String(), err } - te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "init", 0) return out.String(), nil } @@ -195,19 +184,8 @@ func (te *tfRunner) plan(ctx context.Context) (bool, string, error) { changes, err := te.tf.Plan(ctx, tfexec.Out(planOut)) if err != nil { - if uerr := errors.Unwrap(err); uerr != nil { - if e, ok := uerr.(*exec.ExitError); ok { - te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "plan", e.ExitCode()) - } - } return changes, out.String(), err } - if changes { - te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "plan", 2) - } else { - te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "plan", 0) - } - return changes, out.String(), nil } @@ -231,15 +209,8 @@ func (te *tfRunner) apply(ctx context.Context) (string, error) { } if err := te.tf.Apply(ctx, tfexec.DirOrPlan(planOut)); err != nil { - if uerr := 
errors.Unwrap(err); uerr != nil { - if e, ok := uerr.(*exec.ExitError); ok { - te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "apply", e.ExitCode()) - } - } return out.String(), err } - te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "apply", 0) - return out.String(), nil }