Skip to content

Commit

Permalink
Merge pull request #260 from utilitywarehouse/as-exitcode-metrics
Browse files Browse the repository at this point in the history
add module info metrics
  • Loading branch information
asiyani authored Aug 8, 2024
2 parents 17283e3 + 891b2f5 commit 3f58ead
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 85 deletions.
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -301,16 +301,14 @@ terraform-applier exports Prometheus metrics. The metrics are available on given

In addition to the [controller-runtime](https://book.kubebuilder.io/reference/metrics-reference.html) default metrics, the following custom metrics are included:

- `terraform_applier_module_run_count` - (tags: `module`,`namespace`, `success`) A Counter for each module that has had a terraform run attempt over the lifetime of
- `terraform_applier_module_info` - (tags: `module`,`namespace`, `state`, `reason`) A Gauge that captures the current information about a module, including its status
- `terraform_applier_module_run_count` - (tags: `module`,`namespace`, `run_type`, `success`) A Counter for each module that has had a terraform run attempt over the lifetime of
the application, incremented with each apply attempt and tagged with the result of the run (`success=true|false`)
- `terraform_applier_module_run_duration_seconds` - (tags: `module`,`namespace`, `success`) A Summary that keeps track of the durations of each terraform run for
- `terraform_applier_module_run_duration_seconds` - (tags: `module`,`namespace`, `run_type`, `success`) A Summary that keeps track of the durations of each terraform run for
each module, tagged with the result of the run (`success=true|false`)
- `terraform_applier_module_last_run_success` - (tags: `module`,`namespace`) A `Gauge` which
- `terraform_applier_module_last_run_success` - (tags: `module`,`namespace`, `run_type`) A `Gauge` which
tracks whether the last terraform run for a module was successful.
- `terraform_applier_module_last_run_timestamp` - (tags: `module`,`namespace`) A Gauge that captures the Timestamp of the last successful module run.
- `terraform_applier_module_terraform_exit_code_count` - (tags: `module`,`namespace`, `command`, `exit_code`) A `Counter` for each exit code returned by executions of
`terraform`, labelled with the command issued (`init`, `plan`,`apply`) and the exit code. It's worth noting that `plan` will
return a code of `2` if there are changes to be made, which is not an error or a failure, so you may wish to account for this in your alerting.
- `terraform_applier_module_last_run_timestamp` - (tags: `module`,`namespace`,`run_type`) A Gauge that captures the Timestamp of the last successful module run.
- `terraform_applier_git_last_mirror_timestamp` - (tags: `repo`) A Gauge that captures the Timestamp of the last successful git sync per repo.
- `terraform_applier_git_mirror_count` - (tags: `repo`,`success`) A Counter for each repo sync, incremented with each sync attempt and tagged with the result (`success=true|false`)
- `terraform_applier_git_mirror_latency_seconds` - (tags: `repo`) A Summary that keeps track of the git sync latency per repo.
1 change: 0 additions & 1 deletion integration_test/module_controller_with_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ var _ = Describe("Module controller with Runner", func() {

testMetrics.EXPECT().UpdateModuleRunDuration(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
testMetrics.EXPECT().UpdateModuleSuccess(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
testMetrics.EXPECT().UpdateTerraformExitCodeCount(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
testMetrics.EXPECT().SetRunPending(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()

// clear state file if it exists
Expand Down
19 changes: 19 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,25 @@ func run(c *cli.Context) {
logger.Info("OIDC authentication configured", "issuer", c.String("oidc-issuer"), "clientID", c.String("oidc-client-id"))
}

// Keep the module info metrics fresh in the background: collect once
// immediately, then once per minute until ctx is done. A failed collection
// is only logged; the loop keeps running and retries on the next tick.
go func(client client.Client) {
if err := metrics.CollectModuleInfo(ctx, client); err != nil {
logger.Error("unable to collect module info metrics", "error", err)
}

ticker := time.NewTicker(time.Minute)
// release the ticker when the goroutine exits
defer ticker.Stop()
for {
select {
case <-ticker.C:
if err := metrics.CollectModuleInfo(ctx, client); err != nil {
logger.Error("unable to collect module info metrics", "error", err)
}
case <-ctx.Done():
// context finished: stop refreshing
return
}
}
}(mgr.GetClient())

webserver := &webserver.WebServer{
Authenticator: oidcAuthenticator,
ListenAddress: c.String("webserver-bind-address"),
Expand Down
12 changes: 0 additions & 12 deletions metrics/mock_prometheus.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

82 changes: 48 additions & 34 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package metrics

import (
"context"
"strconv"
"time"

"github.com/prometheus/client_golang/prometheus"
tfaplv1beta1 "github.com/utilitywarehouse/terraform-applier/api/v1beta1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

Expand All @@ -16,7 +19,6 @@ const (

// PrometheusInterface allows for mocking out the functionality of Prometheus when testing the full process of an apply run.
type PrometheusInterface interface {
UpdateTerraformExitCodeCount(string, string, string, int)
UpdateModuleSuccess(string, string, string, bool)
UpdateModuleRunDuration(string, string, string, float64, bool)
SetRunPending(string, string, bool)
Expand All @@ -29,16 +31,32 @@ type PrometheusInterface interface {
// moduleRunSuccess is the last run outcome of the module run.
// moduleRunning is the number of modules currently in running state.
type Prometheus struct {
terraformExitCodeCount *prometheus.CounterVec
moduleRunCount *prometheus.CounterVec
moduleRunDuration *prometheus.HistogramVec
moduleRunPending *prometheus.GaugeVec
moduleRunSuccess *prometheus.GaugeVec
moduleRunTimestamp *prometheus.GaugeVec
moduleRunCount *prometheus.CounterVec
moduleRunDuration *prometheus.HistogramVec
moduleRunPending *prometheus.GaugeVec
moduleRunSuccess *prometheus.GaugeVec
moduleRunTimestamp *prometheus.GaugeVec
moduleInfo *prometheus.GaugeVec
}

// Init creates and registers the custom metrics for terraform-applier.
func (p *Prometheus) Init() {
p.moduleInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "module_info",
Help: "Current information about module including status",
},
[]string{
"module",
// Namespace name of the module that was run
"namespace",
// state of the module
"state",
// potential reason associated with current state
"reason",
},
)

p.moduleRunCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Name: "module_run_count",
Expand Down Expand Up @@ -112,22 +130,6 @@ func (p *Prometheus) Init() {
"run_type",
},
)
p.terraformExitCodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Name: "module_terraform_exit_code_count",
Help: "Count of terraform exit codes",
},
[]string{
// Name of the module that was ran
"module",
// Namespace name of the module that was ran
"namespace",
// plan, apply, init etc
"command",
// Exit code
"exit_code",
},
)

// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(
Expand All @@ -136,21 +138,11 @@ func (p *Prometheus) Init() {
p.moduleRunSuccess,
p.moduleRunPending,
p.moduleRunTimestamp,
p.terraformExitCodeCount,
p.moduleInfo,
)

}

// UpdateTerraformExitCodeCount adds one to the exit-code counter series
// identified by the given module, namespace, terraform command and exit code.
// The numeric exit code is rendered in base 10 for the "exit_code" label.
func (p *Prometheus) UpdateTerraformExitCodeCount(module, namespace string, cmd string, code int) {
	labels := prometheus.Labels{
		"module":    module,
		"namespace": namespace,
		"command":   cmd,
		"exit_code": strconv.Itoa(code),
	}
	p.terraformExitCodeCount.With(labels).Inc()
}

// UpdateModuleSuccess increments the given module's Counter for either successful or failed run attempts.
func (p *Prometheus) UpdateModuleSuccess(module, namespace, runType string, success bool) {
if success {
Expand Down Expand Up @@ -203,3 +195,25 @@ func (p *Prometheus) SetRunPending(module, namespace string, pending bool) {
"namespace": namespace,
}).Set(as)
}

// CollectModuleInfo rebuilds the 'module_info' gauge from the modules
// currently returned by the given client.
//
// It lists the modules through kc. If the list call fails, that error is
// returned and the existing series are left untouched. Otherwise all existing
// series are dropped and one series per listed module is set to 1, labelled
// with the module's name, namespace, current state and state reason.
func (p *Prometheus) CollectModuleInfo(ctx context.Context, kc client.Client) error {
	var modules tfaplv1beta1.ModuleList
	if err := kc.List(ctx, &modules); err != nil {
		return err
	}

	// Drop every existing series before re-populating, so only the label
	// combinations of the modules listed above remain afterwards.
	p.moduleInfo.Reset()

	for i := range modules.Items {
		mod := &modules.Items[i]
		labels := prometheus.Labels{
			"module":    mod.Name,
			"namespace": mod.Namespace,
			"state":     mod.Status.CurrentState,
			"reason":    mod.Status.StateReason,
		}
		p.moduleInfo.With(labels).Set(1)
	}

	return nil
}
3 changes: 2 additions & 1 deletion runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ func (r *Runner) process(run *tfaplv1beta1.Run, cancelChan <-chan struct{}, envs
defer func() {
// there are no annotations for schedule and polling runs
if run.Request.Type == tfaplv1beta1.ScheduledRun ||
run.Request.Type == tfaplv1beta1.PollingRun {
run.Request.Type == tfaplv1beta1.PollingRun ||
run.Request.Type == tfaplv1beta1.PRPlan {
return
}
if err := sysutil.RemoveRequest(context.Background(), r.ClusterClt, run.Module, run.Request); err != nil {
Expand Down
31 changes: 1 addition & 30 deletions runner/tfexec.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"

"github.com/hashicorp/terraform-exec/tfexec"
tfaplv1beta1 "github.com/utilitywarehouse/terraform-applier/api/v1beta1"
"github.com/utilitywarehouse/terraform-applier/metrics"
"github.com/utilitywarehouse/terraform-applier/sysutil"
)

Expand All @@ -36,8 +33,7 @@ type tfRunner struct {
workingDir string
planFileName string

metrics metrics.PrometheusInterface
tf *tfexec.Terraform
tf *tfexec.Terraform
}

func (r *Runner) NewTFRunner(
Expand Down Expand Up @@ -73,7 +69,6 @@ func (r *Runner) NewTFRunner(
tfr := &tfRunner{
moduleName: module.Name,
moduleNamespace: module.Namespace,
metrics: r.Metrics,
rootDir: tmpRoot,
workingDir: filepath.Join(tmpRoot, module.Spec.Path),
planFileName: "plan.out",
Expand Down Expand Up @@ -174,14 +169,8 @@ func (te *tfRunner) init(ctx context.Context, backendConf map[string]string) (st
}

if err := te.tf.Init(ctx, opts...); err != nil {
if uerr := errors.Unwrap(err); uerr != nil {
if e, ok := uerr.(*exec.ExitError); ok {
te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "init", e.ExitCode())
}
}
return out.String(), err
}
te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "init", 0)

return out.String(), nil
}
Expand All @@ -195,19 +184,8 @@ func (te *tfRunner) plan(ctx context.Context) (bool, string, error) {

changes, err := te.tf.Plan(ctx, tfexec.Out(planOut))
if err != nil {
if uerr := errors.Unwrap(err); uerr != nil {
if e, ok := uerr.(*exec.ExitError); ok {
te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "plan", e.ExitCode())
}
}
return changes, out.String(), err
}
if changes {
te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "plan", 2)
} else {
te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "plan", 0)
}

return changes, out.String(), nil
}

Expand All @@ -231,15 +209,8 @@ func (te *tfRunner) apply(ctx context.Context) (string, error) {
}

if err := te.tf.Apply(ctx, tfexec.DirOrPlan(planOut)); err != nil {
if uerr := errors.Unwrap(err); uerr != nil {
if e, ok := uerr.(*exec.ExitError); ok {
te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "apply", e.ExitCode())
}
}
return out.String(), err
}

te.metrics.UpdateTerraformExitCodeCount(te.moduleName, te.moduleNamespace, "apply", 0)

return out.String(), nil
}

0 comments on commit 3f58ead

Please sign in to comment.