From 8a0fd66db3291bd6397a1341dc07ad41294a3caf Mon Sep 17 00:00:00 2001 From: Shyam Jeedigunta Date: Thu, 17 Oct 2024 15:55:06 -0700 Subject: [PATCH] Fix risk of a partial write txn being applied Signed-off-by: Shyam Jeedigunta --- server/etcdserver/txn/txn.go | 9 +++++---- server/etcdserver/txn/txn_test.go | 32 ++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/server/etcdserver/txn/txn.go b/server/etcdserver/txn/txn.go index 8f0e6c4b4a7..220b7d31580 100644 --- a/server/etcdserver/txn/txn.go +++ b/server/etcdserver/txn/txn.go @@ -307,10 +307,11 @@ func txn(ctx context.Context, lg *zap.Logger, txnWrite mvcc.TxnWrite, rt *pb.Txn _, err := executeTxn(ctx, lg, txnWrite, rt, txnPath, txnResp) if err != nil { if isWrite { - // end txn to release locks before panic - txnWrite.End() - // When txn with write operations starts it has to be successful - // We don't have a way to recover state in case of write failure + // CAUTION: When a txn performing write operations starts, we always expect it to be successful. + // If a write failure is seen we SHOULD NOT try to recover the server, but crash with a panic to make the failure explicit. + // Trying to silently recover (e.g by ignoring the failed txn or calling txn.End() early) poses serious risks: + // - violation of transaction atomicity if some write operations have been partially executed + // - data inconsistency across different etcd members if they applied the txn asymmetrically lg.Panic("unexpected error during txn with writes", zap.Error(err)) } else { lg.Error("unexpected error during readonly txn", zap.Error(err)) diff --git a/server/etcdserver/txn/txn_test.go b/server/etcdserver/txn/txn_test.go index 0ff15c4570b..850c8a95b9b 100644 --- a/server/etcdserver/txn/txn_test.go +++ b/server/etcdserver/txn/txn_test.go @@ -16,6 +16,9 @@ package txn import ( "context" + "crypto/sha256" + "io" + "os" "strings" "testing" "time" @@ -336,9 +339,8 @@ func TestReadonlyTxnError(t *testing.T) { } } -func TestWriteTxnPanic(t *testing.T) { - b, _ := betesting.NewDefaultTmpBackend(t) - defer betesting.Close(t, b) +func TestWriteTxnPanicWithoutApply(t *testing.T) { + b, bePath := betesting.NewDefaultTmpBackend(t) s := mvcc.NewStore(zaptest.NewLogger(t), b, &lease.FakeLessor{}, mvcc.StoreConfig{}) defer s.Close() @@ -367,7 +369,17 @@ func TestWriteTxnPanic(t *testing.T) { }, } + // compute DB file hash before applying the txn + dbHashBefore, err := computeFileHash(bePath) + require.NoErrorf(t, err, "failed to compute DB file hash before txn") + + // we verify the following properties below: + // 1. server panics after a write txn aply fails (invariant: server should never try to move on from a failed write) + // 2. no writes from the txn are applied to the backend (invariant: failed write should have no side-effect on DB state besides panic) assert.Panicsf(t, func() { Txn(ctx, zaptest.NewLogger(t), txn, false, s, &lease.FakeLessor{}) }, "Expected panic in Txn with writes") + dbHashAfter, err := computeFileHash(bePath) + require.NoErrorf(t, err, "failed to compute DB file hash after txn") + require.Equalf(t, dbHashBefore, dbHashAfter, "mismatch in DB hash before and after failed write txn") } func TestCheckTxnAuth(t *testing.T) { @@ -566,6 +578,20 @@ func setupAuth(t *testing.T, be backend.Backend) auth.AuthStore { return as } +func computeFileHash(filePath string) (string, error) { + file, err := os.Open(filePath) + if err != nil { + return "", err + } + defer file.Close() + + h := sha256.New() + if _, err := io.Copy(h, file); err != nil { + return "", err + } + return string(h.Sum(nil)), nil +} + // CheckTxnAuth variables setup. var ( inRangeCompare = &pb.Compare{