Skip to content

Commit 583922f

Browse files
author
Rohit Jaiswal
committed
This change improves the observability of Flipt's Git-based feature flag backend by adding detailed synchronization metrics. Currently, failures during Git sync are only logged and are not exposed as metrics, which limits proactive monitoring and alerting.
- Introduce new OpenTelemetry metrics for Git sync operations:
  - Last sync time as an observable gauge (Unix timestamp).
  - Sync duration histogram.
  - Counter for the number of flags fetched.
  - Success and failure counts, with failure reason attributes.
- Instrument the `SnapshotStore.update` method, the core sync loop, to record these metrics accurately on every sync attempt, including partial failures and cleanups.
- Extend the `Snapshot` type with `TotalFlagsCount()` to count all flags across namespaces for metric reporting.
- Integrate metrics initialization in app startup, ensuring consistent telemetry setup.
- Improve test coverage by suggesting strategies to verify metric emission and sync behavior.

These metric additions enable operators to monitor Git sync health, detect failures promptly, and troubleshoot issues efficiently, significantly improving runtime observability and system reliability.
1 parent e571d18 commit 583922f

File tree

7 files changed

+615
-125
lines changed

7 files changed

+615
-125
lines changed

DEVELOPMENT.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ If you run into errors such as:
2828
undefined: sqlite3.Error
2929
```
3030

31-
Then you need to enable CGO.
31+
Then you need to enable CGO.
3232

3333
### Windows
3434

@@ -43,10 +43,10 @@ Then you need to enable CGO.
4343
## Setup
4444

4545
1. Clone this repo: `git clone https://github.com/flipt-io/flipt`.
46-
1. Run `mage bootstrap` to install required development tools. See [#bootstrap](#bootstrap) below.
47-
1. Run `mage go:test` to execute the Go test suite. For more information on tests, see also [here](build/README.md)
48-
1. Run `mage` to build the binary with embedded assets.
49-
1. Run `mage -l` to see a full list of possible commands.
46+
2. Run `mage bootstrap` to install required development tools. See [#bootstrap](#bootstrap) below.
47+
3. Run `mage go:test` to execute the Go test suite. For more information on tests, see also [here](build/README.md)
48+
4. Run `mage` to build the binary with embedded assets.
49+
5. Run `mage -l` to see a full list of possible commands.
5050

5151
## Conventional Commits
5252

@@ -112,7 +112,7 @@ These ports will be forwarded to your local machine automatically if you are dev
112112

113113
## Docker Compose
114114

115-
If you want to develop Flipt using Docker Compose, you can use the `docker-compose.yml` file in the root of this repository.
115+
If you want to develop Flipt using Docker Compose, you can use the `docker-compose.yml` file in the root of this repository.
116116

117117
This will start two Docker containers:
118118

cmd/flipt/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"go.flipt.io/flipt/internal/cmd"
2424
"go.flipt.io/flipt/internal/config"
2525
"go.flipt.io/flipt/internal/info"
26+
"go.flipt.io/flipt/internal/metrics"
2627
"go.flipt.io/flipt/internal/release"
2728
"go.flipt.io/flipt/internal/telemetry"
2829
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
@@ -105,6 +106,8 @@ func exec() error {
105106
return err
106107
}
107108

109+
metrics.InitMetrics()
110+
108111
defer func() {
109112
_ = logger.Sync()
110113
}()

go.work.sum

Lines changed: 417 additions & 0 deletions
Large diffs are not rendered by default.

internal/metrics/metrics.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,76 @@ import (
1919
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
2020
)
2121

22+
var (
23+
GitSyncLastTime metric.Int64ObservableGauge
24+
GitSyncDuration metric.Float64Histogram
25+
GitSyncFlagsFetched metric.Int64Counter
26+
GitSyncSuccess metric.Int64Counter
27+
GitSyncFailure metric.Int64Counter
28+
29+
// internal storage for last sync time value
30+
lastSyncTimeValue int64
31+
lastSyncTimeMu sync.RWMutex
32+
)
33+
2234
func init() {
2335
if otel.GetMeterProvider() == nil {
2436
otel.SetMeterProvider(metricnoop.NewMeterProvider())
2537
}
2638
}
2739

40+
func InitMetrics() {
41+
InitGitSyncMetrics(meter())
42+
}
43+
44+
func InitGitSyncMetrics(meter metric.Meter) {
45+
var err error
46+
47+
// Create counter and histogram instruments
48+
GitSyncDuration, err = meter.Float64Histogram("git_sync_duration_seconds")
49+
if err != nil {
50+
panic(fmt.Errorf("creating git_sync_duration_seconds histogram: %w", err))
51+
}
52+
GitSyncFlagsFetched, err = meter.Int64Counter("git_sync_flags_fetched")
53+
if err != nil {
54+
panic(fmt.Errorf("creating git_sync_flags_fetched counter: %w", err))
55+
}
56+
GitSyncSuccess, err = meter.Int64Counter("git_sync_success_count")
57+
if err != nil {
58+
panic(fmt.Errorf("creating git_sync_success_count counter: %w", err))
59+
}
60+
GitSyncFailure, err = meter.Int64Counter("git_sync_failure_count")
61+
if err != nil {
62+
panic(fmt.Errorf("creating git_sync_failure_count counter: %w", err))
63+
}
64+
65+
// Create ObservableGauge for last sync time and register callback
66+
GitSyncLastTime, err = meter.Int64ObservableGauge("git_sync_last_time_unix")
67+
if err != nil {
68+
panic(fmt.Errorf("creating git_sync_last_time_unix observable gauge: %w", err))
69+
}
70+
71+
_, err = meter.RegisterCallback(
72+
func(ctx context.Context, observer metric.Observer) error {
73+
lastSyncTimeMu.RLock()
74+
value := lastSyncTimeValue
75+
lastSyncTimeMu.RUnlock()
76+
observer.ObserveInt64(GitSyncLastTime, value)
77+
return nil
78+
},
79+
GitSyncLastTime,
80+
)
81+
if err != nil {
82+
panic(fmt.Errorf("registering callback for git_sync_last_time_unix: %w", err))
83+
}
84+
}
85+
86+
func SetGitSyncLastTime(ts int64) {
87+
lastSyncTimeMu.Lock()
88+
lastSyncTimeValue = ts
89+
lastSyncTimeMu.Unlock()
90+
}
91+
2892
// This is memoized in the OTEL library to avoid creating multiple instances of the same exporter.
2993
func meter() metric.Meter {
3094
return otel.Meter("github.com/flipt-io/flipt")

internal/storage/fs/git/store.go

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@ import (
44
"context"
55
"errors"
66
"fmt"
7+
"go.opentelemetry.io/otel/attribute"
8+
"go.opentelemetry.io/otel/metric"
79
"io/fs"
810
"os"
911
"slices"
1012
"sync"
13+
"time"
1114

1215
"github.com/go-git/go-billy/v5/osfs"
1316
"github.com/go-git/go-git/v5"
@@ -20,6 +23,7 @@ import (
2023
"github.com/go-git/go-git/v5/storage/memory"
2124
"go.flipt.io/flipt/internal/containers"
2225
"go.flipt.io/flipt/internal/gitfs"
26+
"go.flipt.io/flipt/internal/metrics"
2327
"go.flipt.io/flipt/internal/storage"
2428
storagefs "go.flipt.io/flipt/internal/storage/fs"
2529
"go.uber.org/zap"
@@ -294,6 +298,10 @@ func (s *SnapshotStore) View(ctx context.Context, storeRef storage.Reference, fn
294298
return fn(snap)
295299
}
296300

301+
func (s *SnapshotStore) Resolve(ref string) (plumbing.Hash, error) {
302+
return s.resolve(ref)
303+
}
304+
297305
// listRemoteRefs returns a set of branch and tag names present on the remote.
298306
func (s *SnapshotStore) listRemoteRefs(ctx context.Context) (map[string]struct{}, error) {
299307
remotes, err := s.repo.Remotes()
@@ -335,14 +343,27 @@ func (s *SnapshotStore) listRemoteRefs(ctx context.Context) (map[string]struct{}
335343
// HEAD updates to a new revision, it builds a snapshot and updates it
336344
// on the store.
337345
func (s *SnapshotStore) update(ctx context.Context) (bool, error) {
346+
syncStart := time.Now()
338347
updated, fetchErr := s.fetch(ctx, s.snaps.References())
339348

340349
if !updated && fetchErr == nil {
350+
// No update and no error: record metrics for a successful no-change sync
351+
duration := time.Since(syncStart).Seconds()
352+
metrics.SetGitSyncLastTime(time.Now().Unix())
353+
metrics.GitSyncDuration.Record(ctx, duration)
354+
metrics.GitSyncSuccess.Add(ctx, 1)
341355
return false, nil
342356
}
343357

344-
// If we can't fetch, we need to check if the remote refs have changed
345-
// and remove any references that are no longer present
358+
if fetchErr != nil {
359+
// Record failure early to capture fetch errors
360+
duration := time.Since(syncStart).Seconds()
361+
metrics.SetGitSyncLastTime(time.Now().Unix())
362+
metrics.GitSyncDuration.Record(ctx, duration)
363+
metrics.GitSyncFailure.Add(ctx, 1, metric.WithAttributes(attribute.String("reason", fetchErr.Error())))
364+
}
365+
366+
// If fetchErr exists, try cleaning up refs but do not declare full failure yet
346367
if fetchErr != nil {
347368
remoteRefs, listErr := s.listRemoteRefs(ctx)
348369
if listErr != nil {
@@ -364,6 +385,8 @@ func (s *SnapshotStore) update(ctx context.Context) (bool, error) {
364385
}
365386

366387
var errs []error
388+
flagsFetched := 0
389+
367390
if fetchErr != nil {
368391
errs = append(errs, fetchErr)
369392
}
@@ -373,11 +396,29 @@ func (s *SnapshotStore) update(ctx context.Context) (bool, error) {
373396
errs = append(errs, err)
374397
continue
375398
}
376-
if _, err := s.snaps.AddOrBuild(ctx, ref, hash, s.buildSnapshot); err != nil {
399+
400+
snap, err := s.snaps.AddOrBuild(ctx, ref, hash, s.buildSnapshot)
401+
if err != nil {
377402
errs = append(errs, err)
403+
continue
404+
}
405+
if snap != nil {
406+
flagsFetched += snap.TotalFlagsCount()
378407
}
379408
}
380-
return true, errors.Join(errs...)
409+
410+
duration := time.Since(syncStart).Seconds()
411+
metrics.SetGitSyncLastTime(time.Now().Unix())
412+
metrics.GitSyncDuration.Record(ctx, duration)
413+
metrics.GitSyncFlagsFetched.Add(ctx, int64(flagsFetched))
414+
415+
if fetchErr != nil || len(errs) > 0 {
416+
metrics.GitSyncFailure.Add(ctx, 1, metric.WithAttributes(attribute.String("reason", fmt.Sprintf("fetchErr: %v, buildErrors: %v", fetchErr, errs))))
417+
return true, errors.Join(append(errs, fetchErr)...)
418+
}
419+
420+
metrics.GitSyncSuccess.Add(ctx, 1)
421+
return true, nil
381422
}
382423

383424
func (s *SnapshotStore) fetch(ctx context.Context, heads []string) (bool, error) {

0 commit comments

Comments
 (0)