Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding locking mechanism for snapshots to avoid race conditions #67

Merged
merged 1 commit into from
Jul 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cfg/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
type SnapshotStoreType string

const NodeNamePrefix = "marmot-node"
const EmbeddedClusterName = "e-marmot"
const (
Nats SnapshotStoreType = "nats"
S3 = "s3"
Expand Down
27 changes: 27 additions & 0 deletions logstream/replicator.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (
const maxReplicateRetries = 7
const SnapshotShardID = uint64(1)

var SnapshotLeaseTTL = 10 * time.Second

type Replicator struct {
nodeID uint64
shards uint64
Expand All @@ -25,6 +27,7 @@ type Replicator struct {

client *nats.Conn
repState *replicationState
metaStore *replicatorMetaStore
snapshot snapshot.NatsSnapshot
streamMap map[uint64]nats.JetStreamContext
}
Expand Down Expand Up @@ -85,6 +88,11 @@ func NewReplicator(
return nil, err
}

metaStore, err := newReplicatorMetaStore(cfg.EmbeddedClusterName, nc)
if err != nil {
return nil, err
}

return &Replicator{
client: nc,
nodeID: nodeID,
Expand All @@ -95,6 +103,7 @@ func NewReplicator(
streamMap: streamMap,
snapshot: snapshot,
repState: repState,
metaStore: metaStore,
}, nil
}

Expand Down Expand Up @@ -221,6 +230,24 @@ func (r *Replicator) LastSaveSnapshotTime() time.Time {
}

// SaveSnapshot persists a snapshot only when this node wins the
// cluster-wide "snapshot" lease; otherwise it logs and returns, so
// concurrent nodes never snapshot at the same time. The lease keeps
// refreshing until the snapshot completes and cancel fires.
func (r *Replicator) SaveSnapshot() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	switch acquired, err := r.metaStore.ContextRefreshingLease("snapshot", SnapshotLeaseTTL, ctx); {
	case err != nil:
		log.Warn().Err(err).Msg("Error acquiring snapshot lock")
	case !acquired:
		log.Info().Msg("Snapshot saving already locked, skipping")
	default:
		r.ForceSaveSnapshot()
	}
}

func (r *Replicator) ForceSaveSnapshot() {
if r.snapshot == nil {
return
}
Expand Down
126 changes: 126 additions & 0 deletions logstream/replicator_meta_store.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package logstream

import (
	"context"
	"errors"
	"time"

	"github.com/fxamacker/cbor/v2"
	"github.com/maxpert/marmot/cfg"
	"github.com/nats-io/nats.go"
	"github.com/rs/zerolog/log"
)

// replicatorMetaStore is a thin wrapper around a JetStream key-value
// bucket used to store cluster-coordination metadata such as lease
// entries; it embeds nats.KeyValue so bucket operations are available
// directly on the store.
type replicatorMetaStore struct {
	nats.KeyValue
}

// replicatorLockInfo is the CBOR-encoded value stored under a lease key:
// the ID of the node holding the lease and the UnixMilli timestamp at
// which it was (re)acquired.
type replicatorLockInfo struct {
	NodeID    uint64
	Timestamp int64
}

// newReplicatorMetaStore opens the named JetStream key-value bucket,
// creating it on first use. The bucket is file-backed and replicated
// with the same replica count as the replication log.
func newReplicatorMetaStore(name string, nc *nats.Conn) (*replicatorMetaStore, error) {
	jsx, err := nc.JetStream()
	if err != nil {
		return nil, err
	}

	kv, err := jsx.KeyValue(name)
	// Use errors.Is rather than ==: nats may return wrapped API errors,
	// and errors.Is still matches the ErrBucketNotFound sentinel.
	if errors.Is(err, nats.ErrBucketNotFound) {
		kv, err = jsx.CreateKeyValue(&nats.KeyValueConfig{
			Storage:  nats.FileStorage,
			Bucket:   name,
			Replicas: cfg.Config.ReplicationLog.Replicas,
		})
	}

	if err != nil {
		return nil, err
	}

	return &replicatorMetaStore{KeyValue: kv}, nil
}

// AcquireLease attempts to take or refresh the distributed lease named
// name for duration. It reports true when this node now holds the lease,
// and false (with a nil error) when another node holds a still-valid
// lease. Races are resolved optimistically via the KV entry revision.
func (m *replicatorMetaStore) AcquireLease(name string, duration time.Duration) (bool, error) {
	now := time.Now().UnixMilli()
	info := &replicatorLockInfo{
		NodeID:    cfg.Config.NodeID,
		Timestamp: now,
	}
	payload, err := info.Serialize()
	if err != nil {
		return false, err
	}

	entry, err := m.Get(name)
	if errors.Is(err, nats.ErrKeyNotFound) {
		// No lease exists yet; try to create it atomically.
		if _, cerr := m.Create(name, payload); cerr == nil {
			return true, nil
		} else if errors.Is(cerr, nats.ErrKeyExists) {
			// Lost the creation race: another node just took the
			// lease. That is contention, not a failure.
			return false, nil
		} else {
			return false, cerr
		}
	}
	if err != nil {
		return false, err
	}

	// Decode the current holder's lock info.
	if err := info.DeserializeFrom(entry.Value()); err != nil {
		return false, err
	}

	// Another node holds a lease that has not yet expired: back off.
	if info.NodeID != cfg.Config.NodeID && info.Timestamp+duration.Milliseconds() > now {
		return false, nil
	}

	// Either we already own the lease (refresh) or it has expired
	// (take over). The revision check rejects concurrent updates.
	if _, err := m.Update(name, payload, entry.Revision()); err != nil {
		return false, err
	}

	return true, nil
}

// ContextRefreshingLease acquires the named lease and, on success,
// starts a background goroutine that refreshes it every duration/2
// until ctx is cancelled or a refresh attempt fails. The returned bool
// reports whether the initial acquisition succeeded.
//
// NOTE(review): by Go convention ctx should be the first parameter;
// it is kept last here to preserve existing call sites.
func (m *replicatorMetaStore) ContextRefreshingLease(
	name string,
	duration time.Duration,
	ctx context.Context,
) (bool, error) {
	locked, err := m.AcquireLease(name, duration)
	if !locked || err != nil {
		// Nothing to refresh; no goroutine is started.
		return locked, err
	}

	go func() {
		refresh := time.NewTicker(duration / 2)
		// Stop the ticker when the refresh loop exits so its
		// resources are released (the original leaked it).
		defer refresh.Stop()

		for {
			select {
			case <-refresh.C:
			case <-ctx.Done():
				return
			}

			ok, rerr := m.AcquireLease(name, duration)
			if rerr != nil {
				log.Warn().Err(rerr).Str("name", name).Msg("Error acquiring lease")
				return
			}
			if !ok {
				log.Warn().Str("name", name).Msg("Unable to acquire lease")
				return
			}
		}
	}()

	return locked, err
}

// Serialize encodes the lock info to CBOR bytes.
func (r *replicatorLockInfo) Serialize() ([]byte, error) {
	return cbor.Marshal(r)
}

// DeserializeFrom overwrites r with the CBOR-decoded contents of data.
func (r *replicatorLockInfo) DeserializeFrom(data []byte) error {
	return cbor.Unmarshal(data, r)
}
6 changes: 3 additions & 3 deletions marmot.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func main() {
}

if *cfg.SaveSnapshotFlag {
replicator.SaveSnapshot()
replicator.ForceSaveSnapshot()
return
}

Expand Down Expand Up @@ -144,7 +144,7 @@ func main() {
log.Info().
Time("last_snapshot", lastSnapshotTime).
Dur("duration", now.Sub(lastSnapshotTime)).
Msg("Saving timer based snapshot")
Msg("Triggering timer based snapshot save")
replicator.SaveSnapshot()
}
}
Expand All @@ -153,7 +153,7 @@ func main() {
ctxSt.Cancel()
if cfg.Config.Snapshot.Enable && cfg.Config.Publish {
log.Info().Msg("Saving snapshot before going to sleep")
replicator.SaveSnapshot()
replicator.ForceSaveSnapshot()
}

os.Exit(0)
Expand Down
2 changes: 1 addition & 1 deletion stream/embedded_nats.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func startEmbeddedServer(nodeName string) (*embeddedNats, error) {
JetStreamMaxMemory: -1,
JetStreamMaxStore: -1,
Cluster: server.ClusterOpts{
Name: "e-marmot",
Name: cfg.EmbeddedClusterName,
},
LeafNode: server.LeafNodeOpts{},
}
Expand Down
Loading