Enhancement: persist commit index in LogStore to accelerate recovery #613
@@ -217,6 +217,10 @@ type Raft struct {
 	// preVoteDisabled control if the pre-vote feature is activated,
 	// prevote feature is disabled if set to true.
 	preVoteDisabled bool
 
+	// fastRecovery is used to enable fast recovery mode
+	// fast recovery mode is disabled if set to false.
+	fastRecovery bool
 }
 
 // BootstrapCluster initializes a server's storage with the given cluster
@@ -566,6 +570,7 @@ func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps Sna
 		followerNotifyCh:     make(chan struct{}, 1),
 		mainThreadSaturation: newSaturationMetric([]string{"raft", "thread", "main", "saturation"}, 1*time.Second),
 		preVoteDisabled:      conf.PreVoteDisabled || !transportSupportPreVote,
+		fastRecovery:         conf.FastRecovery,
 	}
 	if !transportSupportPreVote && !conf.PreVoteDisabled {
 		r.logger.Warn("pre-vote is disabled because it is not supported by the Transport")
@@ -585,9 +590,12 @@ func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps Sna
 		return nil, err
 	}
 
+	r.recoverFromCommittedLogs()
+
 	// Scan through the log for any configuration change entries.
 	snapshotIndex, _ := r.getLastSnapshot()
-	for index := snapshotIndex + 1; index <= lastLog.Index; index++ {
+	lastappliedIndex := r.getLastApplied()
+	for index := max(snapshotIndex, lastappliedIndex) + 1; index <= lastLog.Index; index++ {
 		var entry Log
 		if err := r.logs.GetLog(index, &entry); err != nil {
 			r.logger.Error("failed to get log", "index", index, "error", err)
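The new loop bound is the crux of this hunk: with fast recovery enabled, lastApplied can already be ahead of the snapshot index by the time this scan runs, so the configuration scan must start after whichever of the two is greater. A minimal sketch of that computation (configScanStart is an illustrative name, not a function in this PR):

```go
package main

import "fmt"

// configScanStart returns the first log index that still needs to be
// scanned for configuration entries. Without fast recovery, lastApplied
// equals the snapshot index at this point; with fast recovery it may be
// ahead, so starting at max(snapshotIndex, lastApplied)+1 avoids
// re-scanning logs that were already applied during recovery.
func configScanStart(snapshotIndex, lastApplied uint64) uint64 {
	if lastApplied > snapshotIndex {
		return lastApplied + 1
	}
	return snapshotIndex + 1
}

func main() {
	fmt.Println(configScanStart(100, 0))   // no fast recovery: scan from 101
	fmt.Println(configScanStart(100, 180)) // recovery applied ahead: scan from 181
}
```

This mirrors the `max(snapshotIndex, lastappliedIndex) + 1` expression in the diff above.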
@@ -697,6 +705,39 @@ func (r *Raft) tryRestoreSingleSnapshot(snapshot *SnapshotMeta) bool {
 	return true
 }
 
+// recoverFromCommittedLogs recovers the Raft node from committed logs.
+func (r *Raft) recoverFromCommittedLogs() {
+	if !r.fastRecovery {
+		return
+	}
+
+	// If the store implements CommitTrackingLogStore, we can read the commit index from the store.
+	// This is useful when the store is able to track the commit index and we can avoid replaying logs.
+	store, ok := r.logs.(CommitTrackingLogStore)
+	if !ok {
+		r.logger.Warn("fast recovery enabled but log store does not support it", "log_store", fmt.Sprintf("%T", r.logs))
+		return
+	}
+
+	commitIndex, err := store.GetCommitIndex()
+	if err != nil {
+		r.logger.Error("failed to get commit index from store", "error", err)
+		panic(err)
+	}
+
+	lastIndex, err := r.logs.LastIndex()
+	if err != nil {
+		r.logger.Error("failed to get last log index from store", "error", err)
+		panic(err)
+	}
Review comment: Shouldn't we fall back to the non-fast-recovery path instead of panicking?

Review comment (reply): 🤔 Great question. For transient errors it's probably better to hard stop than to silently weaken the expected guarantees. Someone who has enabled this feature (and provided a supported log store) should be able to assume that by the time raft has started (without error), their FSM is at least as up to date as it was before a restart. Silently falling back makes it impossible to really trust that guarantee, and may mean similar bugs to the ones we are trying to prevent are still possible with no way to mitigate them (even if they are rarer). In most use cases where each server uses this library as the core of its functionality (i.e. all our products) and only runs a single raft group initialised during startup, a panic is probably reasonable too: we crash, the supervisor restarts the process, and if the error was transient then great; if not, it's no different from what will happen later when we are unable to read logs. That said, in writing this I realised it's probably too strong a decision to make in a library: a server process could manage multiple raft instances, for example, and a fatal error in one of them shouldn't terminate the whole process. So I think I'd vote for making these cases return a hard error from …

Review comment (reply): I guess returning an error from … I think we should implement this as a specific error type that we publish on the API, to allow calling …
+
+	if commitIndex > lastIndex {
+		commitIndex = lastIndex
+	}
+
+	r.setCommitIndex(commitIndex)
+	r.processLogs(commitIndex, nil)
+}
+
 func (r *Raft) config() Config {
 	return r.conf.Load().(Config)
 }
@@ -235,6 +235,16 @@ type Config struct {
 	// PreVoteDisabled deactivate the pre-vote feature when set to true
 	PreVoteDisabled bool
 
+	// FastRecovery controls if the Raft server should use the fast recovery
+	// mechanism. Fast recovery requires a LogStore implementation that
+	// supports commit tracking. When such a store is used and this config is
+	// enabled, raft nodes will replay all known-committed logs on disk
+	// before completing NewRaft on startup. This is mainly useful where
+	// the application allows relaxed-consistency reads from followers, as it
+	// will reduce how far behind the follower's FSM is when it starts. If all
+	// reads are forwarded to the leader then there won't be an observable
+	// benefit from this feature.
+	FastRecovery bool
+
 	// skipStartup allows NewRaft() to bypass all background work goroutines
 	skipStartup bool
 }

Review comment: Not sure if this is a naming nitpick or just a question: from the perspective of a caller of NewRaft, IIUC the "fast" aspect is due to more logs being replayed locally instead of streamed from a peer. Logs committed while this member was down will still need to be streamed, but presumably that's often a fraction of the total log size. If my understanding is accurate, an alternative name might be …

Review comment (reply): This is a great point actually. I'm not sure "Fast" captures the semantics in any case really: mostly startup will take marginally to a lot longer, but on the plus side the FSM will actually start up in the same state it was in before the node restarted, which is probably what most users of the library assumed was the case already!

Review comment (reply): Yeah, I think you perfectly described the tradeoff. I think …

Review comment (reply): @schmichael @banks How about …?

Review comment (reply): Naming is the worst. 😅 Looking around a bit, we're not totally consistent, but I think generally: … So I think switching … After that I think I prefer …
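Since FastRecovery only takes effect when the supplied LogStore also implements CommitTrackingLogStore (otherwise the library just logs a warning and skips recovery), a caller may want to verify support up front before enabling the flag. A toy sketch of that check, with simplified stand-ins for the library's interfaces:

```go
package main

import "fmt"

// Simplified stand-ins for the library's interfaces (illustrative only).
type LogStore interface{ LastIndex() (uint64, error) }

type CommitTrackingLogStore interface {
	LogStore
	StageCommitIndex(idx uint64) error
	GetCommitIndex() (uint64, error)
}

// supportsFastRecovery mirrors the type assertion the library performs in
// recoverFromCommittedLogs: FastRecovery is only honoured when the store
// also implements CommitTrackingLogStore.
func supportsFastRecovery(s LogStore) bool {
	_, ok := s.(CommitTrackingLogStore)
	return ok
}

// plainStore implements only LogStore.
type plainStore struct{}

func (plainStore) LastIndex() (uint64, error) { return 0, nil }

// trackingStore additionally implements the commit-tracking methods.
type trackingStore struct{ plainStore }

func (trackingStore) StageCommitIndex(uint64) error   { return nil }
func (trackingStore) GetCommitIndex() (uint64, error) { return 0, nil }

func main() {
	fmt.Println(supportsFastRecovery(plainStore{}))    // false
	fmt.Println(supportsFastRecovery(trackingStore{})) // true
}
```

Checking this before setting Config.FastRecovery avoids the surprising silent-warning path discussed later in the review.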
@@ -190,3 +190,24 @@ func emitLogStoreMetrics(s LogStore, prefix []string, interval time.Duration, st
 		}
 	}
 }
+
+type CommitTrackingLogStore interface {
+	LogStore
+
+	// StageCommitIndex stages a new commit index to be persisted.
+	// The staged commit index MUST only be persisted in a manner that is atomic
+	// with the following StoreLogs call in the face of a crash.
+	// This allows the Raft implementation to optimize commit index updates
+	// without risking inconsistency between the commit index and the log entries.
+	//
+	// The implementation MUST NOT persist this value separately from the log entries.
+	// Instead, it should stage the value to be written atomically with the next
+	// StoreLogs call.
+	//
+	// GetCommitIndex MUST never return a value higher than the last index in the log,
+	// even if a higher value has been staged with this method.
+	//
+	// idx is the new commit index to stage.
+	StageCommitIndex(idx uint64) error
+
+	GetCommitIndex() (uint64, error)
+}

Review comment: How would a …?

Review comment (reply): For BoltDB, I imagine the commit index would be a single KV in a separate bucket from the logs, so it would just read that and return it. For WAL I anticipated extending the format slightly so that each commit entry in the log stores the most recently staged commit index, and then re-populating that into memory when we open the log and scan it, like we do with indexes. If there is no commit index stored, we should just return …

Review comment (reply): I agree! I think that should be documented, though, because the API allows erroring on …
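The staging contract above can be illustrated with a toy in-memory store: a staged index becomes visible through GetCommitIndex only after the next StoreLogs call, and is never reported past the end of the log. This is an illustrative sketch, not the library's InmemStore; StoreLogs here takes bare indexes instead of *Log entries for brevity:

```go
package main

import "fmt"

// memStore is a toy illustration of the staging contract: the staged commit
// index is only "persisted" together with the next StoreLogs call, and
// GetCommitIndex never reports beyond the last stored log index.
type memStore struct {
	lastIndex   uint64 // highest log index stored
	staged      uint64 // staged but not yet persisted commit index
	hasStaged   bool
	commitIndex uint64 // persisted commit index
}

func (s *memStore) StageCommitIndex(idx uint64) error {
	s.staged, s.hasStaged = idx, true
	return nil
}

// StoreLogs records the given log indexes and, atomically with them,
// persists any staged commit index.
func (s *memStore) StoreLogs(indexes []uint64) error {
	for _, idx := range indexes {
		if idx > s.lastIndex {
			s.lastIndex = idx
		}
	}
	if s.hasStaged {
		s.commitIndex, s.hasStaged = s.staged, false
	}
	return nil
}

func (s *memStore) GetCommitIndex() (uint64, error) {
	// Never report past the end of the log, even if a higher value was staged.
	if s.commitIndex > s.lastIndex {
		return s.lastIndex, nil
	}
	return s.commitIndex, nil
}

func main() {
	s := &memStore{}
	s.StageCommitIndex(5)
	fmt.Println(s.commitIndex) // still 0: staging alone persists nothing
	s.StoreLogs([]uint64{1, 2, 3, 4, 5})
	idx, _ := s.GetCommitIndex()
	fmt.Println(idx) // 5: persisted atomically with the StoreLogs call
}
```

A real implementation would make the log append and the commit index update a single crash-atomic write (e.g. one storage transaction), which this in-memory version only approximates.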
@@ -1095,6 +1095,184 @@ func TestRaft_RestoreSnapshotOnStartup_Monotonic(t *testing.T) {
 	assert.Equal(t, lastIdx, last)
 }
 
+func TestRaft_RestoreSnapshotOnStartup_CommitTrackingLogs(t *testing.T) {
+	// Make the cluster
+	conf := inmemConfig(t)
+	conf.TrailingLogs = 10
+	opts := &MakeClusterOpts{
+		Peers:              1,
+		Bootstrap:          true,
+		Conf:               conf,
+		CommitTrackingLogs: true,
+	}
+	c := MakeClusterCustom(t, opts)
+	defer c.Close()
+
+	leader := c.Leader()
+
+	// Commit a lot of things
+	var future Future
+	for i := 0; i < 100; i++ {
+		future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
+	}
+
+	// Wait for the last future to apply
+	if err := future.Error(); err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Take a snapshot
+	snapFuture := leader.Snapshot()
+	if err := snapFuture.Error(); err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Check for snapshot
+	snaps, _ := leader.snapshots.List()
+	if len(snaps) != 1 {
+		t.Fatalf("should have a snapshot")
+	}
+	snap := snaps[0]
+
+	// Logs should be trimmed
+	firstIdx, err := leader.logs.FirstIndex()
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	lastIdx, err := leader.logs.LastIndex()
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	if firstIdx != snap.Index-conf.TrailingLogs+1 {
+		t.Fatalf("should trim logs to %d: but is %d", snap.Index-conf.TrailingLogs+1, firstIdx)
+	}
+
+	// Shutdown
+	shutdown := leader.Shutdown()
+	if err := shutdown.Error(); err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Restart the Raft
+	r := leader
+	// Can't just reuse the old transport as it will be closed
+	_, trans2 := NewInmemTransport(r.trans.LocalAddr())
+	cfg := r.config()
+	r, err = NewRaft(&cfg, r.fsm, r.logs, r.stable, r.snapshots, trans2)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	c.rafts[0] = r
+
+	// We should have restored from the snapshot!
+	if last := r.getLastApplied(); last != snap.Index {
+		t.Fatalf("bad last index: %d, expecting %d", last, snap.Index)
+	}
+
+	// Verify that logs have not been reset
+	first, _ := r.logs.FirstIndex()
+	last, _ := r.logs.LastIndex()
+	assert.Equal(t, firstIdx, first)
+	assert.Equal(t, lastIdx, last)
+}
|
||
func TestRaft_FastRecovery(t *testing.T) { | ||
// Make the cluster | ||
conf := inmemConfig(t) | ||
conf.TrailingLogs = 10 | ||
conf.FastRecovery = true | ||
opts := &MakeClusterOpts{ | ||
Peers: 1, | ||
Bootstrap: true, | ||
Conf: conf, | ||
CommitTrackingLogs: true, | ||
} | ||
c := MakeClusterCustom(t, opts) | ||
defer c.Close() | ||
|
||
leader := c.Leader() | ||
|
||
// Commit a lot of things | ||
var future Future | ||
for i := 0; i < 100; i++ { | ||
future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) | ||
} | ||
|
||
// Wait for the last future to apply | ||
if err := future.Error(); err != nil { | ||
t.Fatalf("err: %v", err) | ||
} | ||
|
||
// Take a snapshot | ||
snapFuture := leader.Snapshot() | ||
if err := snapFuture.Error(); err != nil { | ||
t.Fatalf("err: %v", err) | ||
} | ||
|
||
// Check for snapshot | ||
snaps, _ := leader.snapshots.List() | ||
if len(snaps) != 1 { | ||
t.Fatalf("should have a snapshot") | ||
} | ||
snap := snaps[0] | ||
|
||
// Logs should be trimmed | ||
firstIdx, err := leader.logs.FirstIndex() | ||
if err != nil { | ||
t.Fatalf("err: %v", err) | ||
} | ||
|
||
if firstIdx != snap.Index-conf.TrailingLogs+1 { | ||
t.Fatalf("should trim logs to %d: but is %d", snap.Index-conf.TrailingLogs+1, firstIdx) | ||
} | ||
|
||
// Commit a lot of things (for fast recovery test) | ||
for i := 0; i < 100; i++ { | ||
future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) | ||
} | ||
|
||
// Wait for the last future to apply | ||
if err := future.Error(); err != nil { | ||
t.Fatalf("err: %v", err) | ||
} | ||
|
||
// Shutdown | ||
shutdown := leader.Shutdown() | ||
if err := shutdown.Error(); err != nil { | ||
t.Fatalf("err: %v", err) | ||
} | ||
|
||
// Restart the Raft | ||
r := leader | ||
// Can't just reuse the old transport as it will be closed | ||
_, trans2 := NewInmemTransport(r.trans.LocalAddr()) | ||
cfg := r.config() | ||
r, err = NewRaft(&cfg, r.fsm, r.logs, r.stable, r.snapshots, trans2) | ||
if err != nil { | ||
t.Fatalf("err: %v", err) | ||
} | ||
c.rafts[0] = r | ||
|
||
store, ok := r.logs.(CommitTrackingLogStore) | ||
if !ok { | ||
t.Fatal("err: raft log store does not implement CommitTrackingLogStore interface") | ||
} | ||
Review comment on lines +1260 to +1263: 🤔 This is fine, but I'd probably have left it like it was before. The behaviour of an unchecked type assert like we had before (i.e. no …) … We can leave it like this for now though; at least it doesn't stop the entire test run, which is arguably nicer, although something that was a programming mistake that would never work is fine to panic on in tests IMO.
+
+	commitIdx, err := store.GetCommitIndex()
+	// We should have applied all committed logs
+	if last := r.getLastApplied(); last != commitIdx {
+		t.Fatalf("bad last index: %d, expecting %d", last, commitIdx)
+	}
+
+	// Expect: snap.Index --- commitIdx --- lastIdx
+	lastIdx, err := r.logs.LastIndex()
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	assert.LessOrEqual(t, snap.Index, commitIdx)
+	assert.LessOrEqual(t, commitIdx, lastIdx)
+}
Review comment: I'm assuming these tests are failing right now because the InmemStore wasn't updated to match the new interface, right? If they are passing for you then it might be worth a look!

Review comment (reply): It passes 😱

Review comment (reply): Hi @banks, back to this topic: I think this test case is OK 🤔, because basically the commit index stored in the store will always be lower than the last log index in the store (there is always one …). As the comment "Expect: snap.Index --- commitIdx --- lastIdx" says, I think we can't be sure what the exact position of the … is. What do you think?

Review comment (reply): 🤔 I think the assertion is fine as it is; the real issue seems to be the line above: I'm not sure how that didn't just panic if the log store actually didn't implement the interface 🤷. I don't think we need to change the assertions.
+
 func TestRaft_SnapshotRestore_Progress(t *testing.T) {
 	// Make the cluster
 	conf := inmemConfig(t)
Review comment: As long as we're considering returning an error below instead of panicking, I think we should consider doing so here as well. This is going to be a "programmer error" rather than a runtime error: the consumer of the library should be ensuring they're passing a compatible combination of log store and FastRecovery configuration.