diff --git a/app/keeper/cmd/main.go b/app/keeper/cmd/main.go index d7a17cc..b88ae6e 100644 --- a/app/keeper/cmd/main.go +++ b/app/keeper/cmd/main.go @@ -12,9 +12,7 @@ import ( "github.com/spiffe/spike-sdk-go/spiffe" "github.com/spiffe/spike/app/keeper/internal/env" - api "github.com/spiffe/spike/app/keeper/internal/net" "github.com/spiffe/spike/app/keeper/internal/route/handle" - "github.com/spiffe/spike/app/keeper/internal/state" "github.com/spiffe/spike/app/keeper/internal/trust" "github.com/spiffe/spike/internal/auth" "github.com/spiffe/spike/internal/config" @@ -37,34 +35,6 @@ func main() { trust.Authenticate(spiffeid) - keeperState := state.ReadAppState() - - if keeperState == state.AppStateError { - log.FatalLn( - "SPIKE Keeper is in ERROR state. Manual intervention required.", - ) - } - - if keeperState == state.AppStateNotReady { - log.Log().Info(appName, - "msg", "SPIKE Keeper is not ready. Will send shards") - - go api.Contribute(source) - - go state.WaitForShards() - } - - if keeperState == state.AppStateReady || - keeperState == state.AppStateRecovering { - // TODO: implement this case - // 1. Transition to a RECOVERING state, if not done already - // 2. Contact peers to recompute shard. - // 3. Try forever. - // 4. If something is irrevocably irrecoverable transition to ERROR state. - // 5. When everything is back to normal, transition to READY state. - panic("I started, but I don't know what to do.") - } - log.Log().Info(appName, "msg", fmt.Sprintf("Started service: %s v%s", appName, config.KeeperVersion)) if err := net.ServeWithPredicate( diff --git a/app/keeper/internal/net/contribute.go b/app/keeper/internal/net/contribute.go deleted file mode 100644 index 0f9b74b..0000000 --- a/app/keeper/internal/net/contribute.go +++ /dev/null @@ -1,83 +0,0 @@ -// \\ SPIKE: Secure your secrets with SPIFFE. -// \\\\\ Copyright 2024-present SPIKE contributors. -// \\\\\\\ SPDX-License-Identifier: Apache-2.0 - -package net - -import ( - "encoding/base64" - "encoding/json" - "net/url" - "time" - - "github.com/spiffe/go-spiffe/v2/workloadapi" - "github.com/spiffe/spike-sdk-go/api/entity/v1/reqres" - "github.com/spiffe/spike-sdk-go/net" - - "github.com/spiffe/spike/app/keeper/internal/env" - "github.com/spiffe/spike/app/keeper/internal/state" - "github.com/spiffe/spike/internal/auth" - "github.com/spiffe/spike/internal/log" -) - -func Contribute(source *workloadapi.X509Source) { - peers := env.Peers() - myId := env.KeeperId() - - for id, peer := range peers { - if id == myId { - continue - } - - contributeUrl, err := url.JoinPath(peer, "v1/store/contribute") - if err != nil { - log.FatalLn("Failed to join path: " + err.Error()) - } - - if source == nil { - log.FatalLn("contribute: source is nil") - } - - client, err := net.CreateMtlsClientWithPredicate( - source, - auth.IsKeeper, - ) - if err != nil { - panic(err) - } - - contribution := state.RandomContribution() - state.Shards.Store(myId, contribution) - - log.Log().Info( - "contribute", - "msg", "Sending contribution to peer", - "peer", peer, - ) - - md, err := json.Marshal( - reqres.ShardContributionRequest{ - KeeperId: myId, - Shard: base64.StdEncoding.EncodeToString(contribution), - }, - ) - if err != nil { - log.FatalLn( - "Failed to marshal shard contribution request: " + err.Error(), - ) - } - - _, err = net.Post(client, contributeUrl, md) - for err != nil { - time.Sleep(5 * time.Second) - _, err = net.Post(client, contributeUrl, md) - if err != nil { - log.Log().Info("contribute", - "msg", "Error sending contribution. 
Will retry", - "err", err, - ) - time.Sleep(5 * time.Second) - } - } - } -} diff --git a/app/keeper/internal/route/base/route.go b/app/keeper/internal/route/base/route.go index 384bdc9..e4f7028 100644 --- a/app/keeper/internal/route/base/route.go +++ b/app/keeper/internal/route/base/route.go @@ -27,12 +27,12 @@ func Route(w http.ResponseWriter, r *http.Request, a *log.AuditEntry) error { r.Method, func(a net.SpikeKeeperApiAction, p net.ApiUrl) net.Handler { switch { + // Get contribution from SPIKE Nexus case a == net.ActionKeeperDefault && p == net.SpikeKeeperUrlContribute: return store.RouteContribute + // Provide your shard to SPIKE Nexus case a == net.ActionKeeperDefault && p == net.SpikeKeeperUrlShard: return store.RouteShard - case a == net.ActionKeeperDefault && p == net.SpikeKeeperUrlStatus: - return store.RouteStatus default: return net.Fallback } diff --git a/app/keeper/internal/route/store/contribute.go b/app/keeper/internal/route/store/contribute.go index 64a1424..8cc56f9 100644 --- a/app/keeper/internal/route/store/contribute.go +++ b/app/keeper/internal/route/store/contribute.go @@ -53,8 +53,7 @@ func RouteContribute( // Store decoded shard in the map. state.Shards.Store(id, decodedShard) - - log.Log().Info(fName, "msg", "Shard stored", "id", id, "shard", decodedShard) + log.Log().Info(fName, "msg", "Shard stored", "id", id) responseBody := net.MarshalBody(reqres.ShardContributionResponse{}, w) diff --git a/app/keeper/internal/state/join.go b/app/keeper/internal/state/join.go deleted file mode 100644 index 5d63c08..0000000 --- a/app/keeper/internal/state/join.go +++ /dev/null @@ -1,115 +0,0 @@ -// \\ SPIKE: Secure your secrets with SPIFFE. -// \\\\\ Copyright 2024-present SPIKE contributors. -// \\\\\\\ SPDX-License-Identifier: Apache-2.0 - -package state - -import ( - "sort" - - "github.com/cloudflare/circl/group" - "github.com/cloudflare/circl/secretsharing" - - "github.com/spiffe/spike/app/keeper/internal/env" - "github.com/spiffe/spike/internal/log" - "github.com/spiffe/spike/pkg/crypto" -) - -func setInternalShard(shares []secretsharing.Share) { - // Sort the keys of env.Peers() alphabetically for deterministic - // shard indexing. 
- peers := env.Peers() - peerKeys := make([]string, 0, len(peers)) - for id := range peers { - peerKeys = append(peerKeys, id) - } - sort.Strings(peerKeys) - - myId := env.KeeperId() - - // Find the index of the current Keeper's ID - var myShard []byte - for index, id := range peerKeys { - if id == myId { - // Save the shard corresponding to this Keeper - if val, ok := Shards.Load(myId); ok { - myShard = val.([]byte) - - log.Log().Info("setInternalShard", "id", myId, "index", index) - - shareVal, _ := shares[index].Value.MarshalBinary() - - SetShard(shareVal) - EraseIntermediateShards() - - break - } - } - } - - // Ensure myShard is stored correctly in the state namespace - if myShard == nil { - log.FatalLn( - "setInternalShard: Shard for Keeper ID", myId, "could not be found", - ) - } -} - -func computeFinalKey() []byte { - finalKey := make([]byte, 32) - - counter := 0 - Shards.Range(func(key, value any) bool { - counter++ - shard := value.([]byte) - for i := 0; i < 32; i++ { - finalKey[i] ^= shard[i] - } - return true - }) - - if counter != 3 { - log.FatalLn("computeFinalKey: Not all shards received") - } - - if len(finalKey) != 32 { - log.FatalLn("computeFinalKey: FinalKey must be 32 bytes long") - } - - return finalKey -} - -func computeShares(finalKey []byte) (group.Scalar, []secretsharing.Share) { - // Initialize parameters - g := group.P256 - t := uint(1) // Need t+1 shares to reconstruct - n := uint(3) // Total number of shares - - // Create secret from your 32 byte key - secret := g.NewScalar() - if err := secret.UnmarshalBinary(finalKey); err != nil { - log.FatalLn("computeShares: Failed to unmarshal key: %v" + err.Error()) - } - - // TODO: give the end user to seed their own key if they want to via an environment variable. - - // Create deterministic random source using the key itself as seed - // You could use any other seed value for consistency - deterministicRand := crypto.NewDeterministicReader(finalKey) - - // Create shares - ss := secretsharing.New(deterministicRand, t, secret) - return secret, ss.Share(n) -} - -func sanityCheck(secret group.Scalar, shares []secretsharing.Share) { - t := uint(1) // Need t+1 shares to reconstruct - - reconstructed, err := secretsharing.Recover(t, shares[:2]) - if err != nil { - log.FatalLn("computeShares: Failed to recover: " + err.Error()) - } - if !secret.IsEqual(reconstructed) { - log.FatalLn("computeShares: Recovered secret does not match original") - } -} diff --git a/app/keeper/internal/state/shard.go b/app/keeper/internal/state/shard.go index 947797e..d9deb69 100644 --- a/app/keeper/internal/state/shard.go +++ b/app/keeper/internal/state/shard.go @@ -6,79 +6,8 @@ package state import ( "sync" - "time" - - "github.com/spiffe/spike/internal/log" - "github.com/spiffe/spike/pkg/crypto" ) -// WaitForShards blocks until exactly 3 shards are collected in the global -// Shards map. Once collected, it computes the final key, generates shares, -// sets the internal shard, and performs validation checks. 
-// -// The function: -// - Polls the Shards map every 2 seconds until 3 shards are present -// - Panics if more than 3 shards are received -// - Processes the shards to generate the final distributed secret -// -// Panics: -// - If more than 3 shards are received -func WaitForShards() { - for { - shardCount := 0 - Shards.Range(func(key, value any) bool { - shardCount++ - return true - }) - - log.Log().Info( - "waitForShards", "msg", "Current shard count", "count", shardCount, - ) - - if shardCount < 3 { - time.Sleep(2 * time.Second) - continue - } - - if shardCount > 3 { - // TODO: add an audit log, because this is a security incident likely. - log.FatalLn("waitForShards: Too many shards received") - } - - finalKey := computeFinalKey() - secret, shares := computeShares(finalKey) - setInternalShard(shares) - sanityCheck(secret, shares) - - break - } -} - -var myContribution []byte -var myContributionLock sync.Mutex - -// RandomContribution generates and caches a random contribution for the -// distributed secret. The contribution is generated only once and reused for -// subsequent calls. -// -// Returns: -// - []byte: Random contribution bytes from AES-256 seed -// -// Thread-safe through myContributionLock mutex. -func RandomContribution() []byte { - myContributionLock.Lock() - defer myContributionLock.Unlock() - - if len(myContribution) == 0 { - mySeed, _ := crypto.Aes256Seed() - myContribution = []byte(mySeed) - - return myContribution - } - - return myContribution -} - var Shards sync.Map var shard []byte @@ -107,14 +36,3 @@ func Shard() []byte { defer shardMutex.RUnlock() return shard } - -// EraseIntermediateShards removes all entries from the global Shards map, -// cleaning up intermediate secret sharing data. -// -// Thread-safe through sync.Map operations. -func EraseIntermediateShards() { - Shards.Range(func(key, value interface{}) bool { - Shards.Delete(key) - return true - }) -} diff --git a/app/nexus/internal/poll/poll.go b/app/nexus/internal/poll/poll.go index b89c785..9e7cafa 100644 --- a/app/nexus/internal/poll/poll.go +++ b/app/nexus/internal/poll/poll.go @@ -5,7 +5,6 @@ package poll import ( - "bytes" "context" "encoding/base64" "encoding/hex" @@ -13,17 +12,16 @@ import ( "net/url" "time" - "github.com/cloudflare/circl/group" - "github.com/cloudflare/circl/secretsharing" + "github.com/spiffe/go-spiffe/v2/workloadapi" "github.com/spiffe/spike-sdk-go/api/entity/v1/reqres" - "github.com/spiffe/spike-sdk-go/net" + network "github.com/spiffe/spike-sdk-go/net" + "github.com/spiffe/spike/app/nexus/internal/env" state "github.com/spiffe/spike/app/nexus/internal/state/base" "github.com/spiffe/spike/internal/auth" - - "github.com/spiffe/go-spiffe/v2/workloadapi" - "github.com/spiffe/spike/internal/log" + "github.com/spiffe/spike/internal/net" + "github.com/spiffe/spike/pkg/crypto" ) func Tick( @@ -31,56 +29,106 @@ func Tick( source *workloadapi.X509Source, ticker *time.Ticker, ) { - // Talk to all keeper endpoints until we get the minimum number of shards - // to reconstruct the root key. Once the root key is reconstructed, - // initialize the backing store with the root key and exit the ticker. + // Talk to all SPIKE Keeper endpoints and send their shards and get + // acknowledgement that they received the shard. - for { - if source == nil { - log.Log().Info("tick", "msg", "source is nil") - time.Sleep(time.Second * 5) - continue - } + if source == nil { + // If source is nil, nobody is going to recreate the source, + // it's better to log and crash. 
+ log.FatalLn("Tick: source is nil. this should not happen.") + } + // Create the root key and create shards out of the root key. + rootKey, err := crypto.Aes256Seed() + if err != nil { + log.FatalLn("Tick: failed to create root key: " + err.Error()) + } + decodedRootKey, err := hex.DecodeString(rootKey) + if err != nil { + log.FatalLn("Tick: failed to decode root key: " + err.Error()) + } + rootSecret, rootShares := computeShares(decodedRootKey) + sanityCheck(rootSecret, rootShares) + + // Initialize the backend store before sending shards to the keepers. + // Keepers is our backup system, and they are not critical for system + // operations. Initializing early allows SPIKE Nexus to serve before + // keepers are hydrated. + state.Initialize(rootKey) + log.Log().Info("tick", "msg", "Initialized the backing store") + + successfulKeepers := make(map[string]bool) + + for { select { case <-ticker.C: keepers := env.Keepers() + if len(keepers) < 3 { + log.FatalLn("Tick: not enough keepers") + } - shardsNeeded := 2 - var shardsCollected [][]byte + // Ensure to get a success response from ALL keepers eventually. + for keeperId, keeperApiRoot := range keepers { + u, err := url.JoinPath( + keeperApiRoot, + string(net.SpikeKeeperUrlContribute), + ) - for _, keeperApiRoot := range keepers { - u, _ := url.JoinPath(keeperApiRoot, "/v1/store/shard") + if err != nil { + log.Log().Warn( + "tick", + "msg", "Failed to join path", + "url", keeperApiRoot, + ) + continue + } - client, err := net.CreateMtlsClientWithPredicate( + client, err := network.CreateMtlsClientWithPredicate( source, auth.IsKeeper, ) + if err != nil { - log.Log().Info("tick", "msg", - "Failed to create mTLS client", "err", err) + log.Log().Warn("tick", + "msg", "Failed to create mTLS client", + "err", err) continue } - md, err := json.Marshal(reqres.ShardRequest{}) + share := findShare(keeperId, keepers, rootShares) + + contribution, err := share.Value.MarshalBinary() if err != nil { - log.Log().Info("tick", "msg", - "Failed to marshal request", "err", err) + log.Log().Warn("tick", + "msg", "Failed to marshal share", + "err", err, "keeper_id", keeperId) continue } - data, err := net.Post(client, u, md) + scr := reqres.ShardContributionRequest{ + KeeperId: keeperId, + Shard: base64.StdEncoding.EncodeToString(contribution), + } + md, err := json.Marshal(scr) if err != nil { - log.Log().Info("tick", "msg", - "Failed to post request", "err", err) + log.Log().Warn("tick", + "msg", "Failed to marshal request", + "err", err, "keeper_id", keeperId) continue } - var res reqres.ShardResponse + + data, err := net.Post(client, u, md) + if err != nil { + log.Log().Warn("tick", "msg", + "Failed to post", + "err", err, "keeper_id", keeperId) + } if len(data) == 0 { log.Log().Info("tick", "msg", "No data") continue } + var res reqres.ShardContributionResponse err = json.Unmarshal(data, &res) if err != nil { log.Log().Info("tick", "msg", @@ -88,84 +136,20 @@ func Tick( continue } - if len(shardsCollected) < shardsNeeded { - decodedShard, err := base64.StdEncoding.DecodeString(res.Shard) - if err != nil { - log.Log().Info("tick", "msg", "Failed to decode shard") - continue - } - - // Check if the shard already exists in shardsCollected - shardExists := false - for _, existingShard := range shardsCollected { - if bytes.Equal(existingShard, decodedShard) { - shardExists = true - break - } - } - if shardExists { - continue - } - - shardsCollected = append(shardsCollected, decodedShard) - } + successfulKeepers[keeperId] = true + log.Log().Info("tick", "msg", 
"Success", "keeper_id", keeperId) - if len(shardsCollected) >= shardsNeeded { - log.Log().Info("tick", - "msg", "Collected required shards", - "shards_collected", len(shardsCollected)) - - g := group.P256 - - firstShard := shardsCollected[0] - firstShare := secretsharing.Share{ - ID: g.NewScalar(), - Value: g.NewScalar(), - } - firstShare.ID.SetUint64(1) - err := firstShare.Value.UnmarshalBinary(firstShard) - if err != nil { - log.FatalLn("Failed to unmarshal share: " + err.Error()) - } - - secondShard := shardsCollected[1] - secondShare := secretsharing.Share{ - ID: g.NewScalar(), - Value: g.NewScalar(), - } - secondShare.ID.SetUint64(2) - err = secondShare.Value.UnmarshalBinary(secondShard) - if err != nil { - log.FatalLn("Failed to unmarshal share: " + err.Error()) - } - - var shares []secretsharing.Share - shares = append(shares, firstShare) - shares = append(shares, secondShare) - - reconstructed, err := secretsharing.Recover(1, shares) - if err != nil { - log.FatalLn("Failed to recover: " + err.Error()) - } - - // TODO: check for errors. - binaryRec, _ := reconstructed.MarshalBinary() - - // TODO: check size 32bytes. - - encoded := hex.EncodeToString(binaryRec) - state.Initialize(encoded) - - log.Log().Info("tick", "msg", "Initialized backing store") + if len(successfulKeepers) == 3 { + log.Log().Info("tick", "msg", "All keepers initialized") return } } - - log.Log().Info("tick", - "msg", "Failed to collect shards... will retry", - ) case <-ctx.Done(): + log.Log().Info("tick", "msg", "Context done") return } + + log.Log().Info("tick", "msg", "Waiting for keepers to initialize") + time.Sleep(5 * time.Second) } } diff --git a/app/nexus/internal/poll/shamir.go b/app/nexus/internal/poll/shamir.go new file mode 100644 index 0000000..051bedf --- /dev/null +++ b/app/nexus/internal/poll/shamir.go @@ -0,0 +1,81 @@ +// \\ SPIKE: Secure your secrets with SPIFFE. +// \\\\\ Copyright 2024-present SPIKE contributors. +// \\\\\\\ SPDX-License-Identifier: Apache-2.0 + +package poll + +import ( + "sort" + + "github.com/cloudflare/circl/group" + shamir "github.com/cloudflare/circl/secretsharing" + + "github.com/spiffe/spike/internal/log" + "github.com/spiffe/spike/pkg/crypto" +) + +func sanityCheck(secret group.Scalar, shares []shamir.Share) { + t := uint(1) // Need t+1 shares to reconstruct + + reconstructed, err := shamir.Recover(t, shares[:2]) + if err != nil { + log.FatalLn("sanityCheck: Failed to recover: " + err.Error()) + } + if !secret.IsEqual(reconstructed) { + log.FatalLn("sanityCheck: Recovered secret does not match original") + } +} + +func computeShares(finalKey []byte) (group.Scalar, []shamir.Share) { + // Initialize parameters + g := group.P256 + t := uint(1) // Need t+1 shares to reconstruct + n := uint(3) // Total number of shares + + // Create secret from your 32 byte key + secret := g.NewScalar() + if err := secret.UnmarshalBinary(finalKey); err != nil { + log.FatalLn("computeShares: Failed to unmarshal key: %v" + err.Error()) + } + + // To compute identical shares, we need an identical seed for the random + // reader. Using `finalKey` for seed is secure because Shamir Secret Sharing + // algorithm's security does not depend on the random seed; it depends on + // the shards being securely kept secret. + // If we use `random.Read` instead, then synchronizing shards after Nexus + // crashes will be cumbersome and prone to edge-case failures. 
+ reader := crypto.NewDeterministicReader(finalKey) + ss := shamir.New(reader, t, secret) + return secret, ss.Share(n) +} + +func findShare(id string, keepers map[string]string, + shares []shamir.Share, +) *shamir.Share { + // Each keeper needs to be mapped to a unique shard. + // We sort the keeper ids; so same-indexed shards will be sent + // to their appropriate keeper instances. + sortedKeys := make([]string, 0, len(keepers)) + for k := range keepers { + sortedKeys = append(sortedKeys, k) + } + sort.Strings(sortedKeys) + + matchingIndex := -1 + for i, key := range sortedKeys { + if key == id { + matchingIndex = i + break + } + } + + if matchingIndex == -1 { + return nil + } + + if matchingIndex < 0 || matchingIndex >= len(shares) { + return nil + } + + return &shares[matchingIndex] +} diff --git a/drafts.txt b/drafts.txt index 284a9b9..93d4896 100644 --- a/drafts.txt +++ b/drafts.txt @@ -2,6 +2,210 @@ // \\\\\ Copyright 2024-present SPIKE contributors. // \\\\\\\ SPDX-License-Identifier: Apache-2.0 + +-------------------------------------------------------------------------------- + +// +//select { +//case <-ticker.C: +// keepers := env.Keepers() +// +// shardsNeeded := 2 +// var shardsCollected [][]byte +// +// for _, keeperApiRoot := range keepers { +// u, _ := url.JoinPath(keeperApiRoot, "/v1/store/shard") +// +// client, err := net.CreateMtlsClientWithPredicate( +// source, auth.IsKeeper, +// ) +// if err != nil { +// log.Log().Info("tick", "msg", +// "Failed to create mTLS client", "err", err) +// continue +// } +// +// md, err := json.Marshal(reqres.ShardRequest{}) +// if err != nil { +// log.Log().Info("tick", "msg", +// "Failed to marshal request", "err", err) +// continue +// } +// +// data, err := net.Post(client, u, md) +// var res reqres.ShardResponse +// +// if len(data) == 0 { +// log.Log().Info("tick", "msg", "No data") +// continue +// } +// +// err = json.Unmarshal(data, &res) +// if err != nil { +// log.Log().Info("tick", "msg", +// "Failed to unmarshal response", "err", err) +// continue +// } +// +// if len(shardsCollected) < shardsNeeded { +// decodedShard, err := base64.StdEncoding.DecodeString(res.Shard) +// if err != nil { +// log.Log().Info("tick", "msg", "Failed to decode shard") +// continue +// } +// +// // Check if the shard already exists in shardsCollected +// shardExists := false +// for _, existingShard := range shardsCollected { +// if bytes.Equal(existingShard, decodedShard) { +// shardExists = true +// break +// } +// } +// if shardExists { +// continue +// } +// +// shardsCollected = append(shardsCollected, decodedShard) +// } +// +// if len(shardsCollected) >= shardsNeeded { +// log.Log().Info("tick", +// "msg", "Collected required shards", +// "shards_collected", len(shardsCollected)) +// +// g := group.P256 +// +// firstShard := shardsCollected[0] +// firstShare := secretsharing.Share{ +// ID: g.NewScalar(), +// Value: g.NewScalar(), +// } +// firstShare.ID.SetUint64(1) +// err := firstShare.Value.UnmarshalBinary(firstShard) +// if err != nil { +// log.FatalLn("Failed to unmarshal share: " + err.Error()) +// } +// +// secondShard := shardsCollected[1] secondShare := secretsharing.Share{ +// ID: g.NewScalar(), +// Value: g.NewScalar(), +// } +// secondShare.ID.SetUint64(2) +// err = secondShare.Value.UnmarshalBinary(secondShard) +// if err != nil { +// log.FatalLn("Failed to unmarshal share: " + err.Error()) +// } +// +// var shares []secretsharing.Share +// shares = append(shares, firstShare) +// shares = append(shares, secondShare) +// +// 
reconstructed, err := secretsharing.Recover(1, shares) +// if err != nil { +// log.FatalLn("Failed to recover: " + err.Error()) +// } +// +// // TODO: check for errors. +// binaryRec, _ := reconstructed.MarshalBinary() +// +// // TODO: check size 32bytes. +// +// encoded := hex.EncodeToString(binaryRec) +// state.Initialize(encoded) +// +// log.Log().Info("tick", "msg", "Initialized backing store") +// return +// } +// +// log.Log().Info("tick", +// "msg", "Failed to collect shards... will retry", +// ) +// case <-ctx.Done(): +// return +// } +//} + + +-------------------------------------------------------------------------------- + +From chat logs: + +> In a pinch, a spire-agent with unix attestor can identify individual users. +> This may just be enough for admin identification for initial +> bootstrapping/recovery purposes. + +That's a good idea. The admin user should already have an SVID for +"their" user anyway. + +For example the following entry identifies the user: + +```text +# [exhibit 1:] + +# Register SPIKE Pilot +spire-server entry create \ + -spiffeID spiffe://spike.ist/spike/pilot \ + -parentID "spiffe://spike.ist/spire-agent" \ + -selector unix:uid:"$(id -u)" \ + -selector unix:path:"$PILOT_PATH" \ + -selector unix:sha256:"$PILOT_SHA" +``` + +So the fact that the user can use the SPIKE Pilot (the `spike` binary) +indeed means that they are authenticated. They don't need a password to further +authenticate themselves. (reasoning: they cannot be that user unless they +log in to the unix box -- the trust boundary is the box itself) + + +> named admins. so, for most things you know who is doing what. +> JWT will handle this well, using something like Keycloak, +> Entra, github, gitlab, facebook, etc, etc. + +^ that needs to be a user story on its own. We can start experimenting with +keycloak and see how it goes from there. An OIDC is an OIDC is an +OIDC anyway -- how different can they be :) + +> traditional admin, needed when things go horribly wrong to +> reenable named admins. I'm thinking, SPIRE issued jwt/svid for +> a certain uid, on a certain machine + +I think `[exhibit 1:]` is good enough to secure the traditional admin. + +We can have a SPIFFE ID like `spiffe://spike.ist/spike/pilot/role/superuser`. +It does not even have to be a JWT SVID. Someone who can talk to SPIKE can +assign/unassign it. + +So. If hell broke lose, I'll assign myself a superadmin SVID; +fix stuff, and then unregister that SVID. + +For named admins, we'd need OIDC, which can wait for a while. +For now, one superadmin is good enough. + +> A token isn't useful without someone vetting it.... + +Yes, and I agree that it's not worth introducing to complexity, unless either of us +want to found a TPM/human-id startup (which is not a bad idea ideed :)) + +> Thats the problem with passwords. Your trusting a human, with limited memory, +> with a string a machine cares about and hope it stays secure. Thats proven hard. + +LOL, but agreed. + +-- + +So in short, the above approach I think... + +1. will eliminate need for password. +2. will push identifying the superadmin to their unix credentials + (one who owns the box owns spike, provided someone who owns spire + let them own spike -- I like the multi-level access approach. + So if I have login access to the box, but not to SPIRE, then the SPIRE admin can + remove my access if I turn out to be a bad superadmin :) -- but in reality + I will be the alpha and the omega (both SPIRE admin and also linux user)) +3. 
root key backup and rotation is figureoutable and FFS. +4. named admins are figureoutable and FFS. + -------------------------------------------------------------------------------- Idea: Inverting the root key flow diff --git a/examples/consume-secrets/demo-create-policy.sh b/examples/consume-secrets/demo-create-policy.sh index a3f6fa4..56f34e6 100755 --- a/examples/consume-secrets/demo-create-policy.sh +++ b/examples/consume-secrets/demo-create-policy.sh @@ -6,15 +6,17 @@ ./spike policy create --name=workload-can-read \ --path="/tenants/demo/db/*" \ - --spiffeid="^spiffe://spike.ist/workload/*" \ + --spiffeid="^spiffe://spike\\.ist/workload/*" \ --permissions="read" +# TODO: it should be spike\\.ist for proper escaping + ./spike policy create --name=workload-can-write \ --path="/tenants/demo/db/*" \ - --spiffeid="^spiffe://spike.ist/workload/*" \ + --spiffeid="^spiffe://spike\\.ist/workload/*" \ --permissions="write" #./spike policy create --name=workload-can-rw \ # --path="/tenants/demo/db/*" \ -# --spiffeid="^spiffe://spike.ist/workload/*" \ -# --permissions="read,write" \ No newline at end of file +# --spiffeid="^spiffe://spike\\.ist/workload/*" \ +# --permissions="read,write" diff --git a/go.mod b/go.mod index c7abd80..68e8d87 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/mattn/go-sqlite3 v1.14.24 github.com/spf13/cobra v1.8.1 github.com/spiffe/go-spiffe/v2 v2.4.0 - github.com/spiffe/spike-sdk-go v0.1.34 + github.com/spiffe/spike-sdk-go v0.1.36 github.com/stretchr/testify v1.9.0 ) diff --git a/go.sum b/go.sum index f05e1a9..610c60a 100644 --- a/go.sum +++ b/go.sum @@ -55,8 +55,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spiffe/go-spiffe/v2 v2.4.0 h1:j/FynG7hi2azrBG5cvjRcnQ4sux/VNj8FAVc99Fl66c= github.com/spiffe/go-spiffe/v2 v2.4.0/go.mod h1:m5qJ1hGzjxjtrkGHZupoXHo/FDWwCB1MdSyBzfHugx0= -github.com/spiffe/spike-sdk-go v0.1.34 h1:1SURYNaVhutTTVCN2NC/Vj9CHRerEZP1oQOgHVuOQu8= -github.com/spiffe/spike-sdk-go v0.1.34/go.mod h1:WIserWbShAkDVoj+GYdcXKFmKp2IBIsZHKKDcStHrHw= +github.com/spiffe/spike-sdk-go v0.1.36 h1:+q2bhCxe5oj2VLRUCN0vs5EUlc+bqS65MzaJ91UKRtg= +github.com/spiffe/spike-sdk-go v0.1.36/go.mod h1:WIserWbShAkDVoj+GYdcXKFmKp2IBIsZHKKDcStHrHw= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/zeebo/errs v1.3.0 h1:hmiaKqgYZzcVgRL1Vkc1Mn2914BbzB0IBxs+ebeutGs= diff --git a/jira.xml b/jira.xml index 53b7ae0..0ddfac5 100644 --- a/jira.xml +++ b/jira.xml @@ -14,9 +14,175 @@ + + we don't need state.ReadAppState() + and other state enums for keepers anymore + keepers are just dummy stateless keepers. + + + this is for policy creation: + + allowed := state.CheckAccess( + spiffeid.String(), "*", + []data.PolicyPermission{data.PermissionSuper}, + ) + + instead of a wildcard, maybe have a predefined path + for access check like "/spike/system/acl" + + also disallow people creating secrets etc under + /spike/system + Invert shard generation flow. + + control these with flags. + i.e. the starter script can optionally NOT automatically + start nexus or keepers. + + #echo "" + #echo "Waiting before SPIKE Keeper 1..." + #sleep 5 + #run_background "./hack/start-keeper-1.sh" + #echo "" + #echo "Waiting before SPIKE Keeper 2..." 
+ #sleep 5 + #run_background "./hack/start-keeper-2.sh" + #echo "" + #echo "Waiting before SPIKE Keeper 3..." + #sleep 5 + #run_background "./hack/start-keeper-3.sh" + + #echo "" + #echo "Waiting before SPIKE Nexus..." + #sleep 5 + #run_background "./hack/start-nexus.sh" + + + + validations: + + along with the error code, also return some explanatory message + + instead of this for example + + err = validation.ValidateSpiffeIdPattern(spiffeIdPattern) + if err != nil { + responseBody := net.MarshalBody(reqres.PolicyCreateResponse{ + Err: data.ErrBadInput, + }, w) + net.Respond(http.StatusBadRequest, responseBody, w) + return err + } + + do this + + err = validation.ValidateSpiffeIdPattern(spiffeIdPattern) + if err != nil { + responseBody := net.MarshalBody(reqres.PolicyCreateResponse{ + Err: data.ErrBadInput, + Reason: "Invalid spiffe id pattern. Matcher should be a regex that can match a spiffe id" + }, w) + net.Respond(http.StatusBadRequest, responseBody, w) + return err + } + + + implement keeper crash recovery: + i.e. ask shards from nexus + + + implement nexus crash recovery + i.e. ask shards from keepers + + + keeper does not need to store multiple shards; + each keeper should keep its own shard. + + // Store decoded shard in the map. + state.Shards.Store(id, decodedShard) + log.Log().Info(fName, "msg", "Shard stored", "id", id) + + + implement doomsday recovery + i.e. operator saves shards in a secure enclave. + + + admin should not be able to create two policies with the same name. + + + exponentially back off here + + log.Log().Info("tick", "msg", "Waiting for keepers to initialize") + time.Sleep(5 * time.Second) + + + Store root key in Nexus' memory (will be required for recovery later) + We can also keep shards in nexus' memory for convenience too + (reasoning: if we are keeping the root key, securely erasing shards + do not increase the security posture that much) + + + consider db backend as untrusted + i.e. encrypt everything you store there; including policies. + (that might already be the case actually) -- if so, document it + in the website. + + + // TODO: this check will change once we make #keepers configurable. + if len(keepers) < 3 { + log.FatalLn("Tick: not enough keepers") + } + + + For in-memory store, bypass initialization, shard creation etc. + + + SPIKE defaults to sqlite backing store + + + nexus tracks its keeper-initialization state. + (time shards created, number of shards etc) + + + 1. Nexus maintaining its initialization state + (i.e. if it successfully initialized the keepers it should recompute the + root key from the keepers instead of auto-generating itself. + for that; it will set a tombstone indicating it has initialized the keepers + the tombstone will be in SQLite; since shards do not make sense in in-memory + backing store) + + 2. Keepers advertising their status to Nexus regularly + IF Nexus has initialized keepers already Nexus will recompute and + provide the shard to the keeper. + (Nexus will keep the root key in its memory. The threat model of SPIKE + does not protect Nexus against memory-based attacks and it's up to the + user to harden and ensure that nexus runs with non-root privileges + (this threat model is the same for Vault and other secret stores too)) + + 3. Nexus crashes; it figures out it already initialized keepers; asks for + shards and rekeys itself. + + 4. Nexus crashes; but quorum of keepers that they know their shards cannot + be reached. 
+ Nexus transitions to "locked" state and a manual unlock will be required + (this will be a separate user story) + + + // 3. spike policy list gives `null` for no policies instead of a message + // also the response is json rather than a more human readable output. + // also `createdBy` is emppy. + // we can create "good first issue"s for these. + + + func computeShares(finalKey []byte) (group.Scalar, []secretsharing.Share) { + // Initialize parameters + g := group.P256 + // TODO: these will be configurable + t := uint(1) // Need t+1 shares to reconstruct + n := uint(3) // Total number of shares + dr: keeper crash waiting-for: shard generation inversion. @@ -46,6 +212,28 @@ + + something similar for SPIKE too: + Dev mode + The Helm chart may run a OpenBao server in development. This installs a + single OpenBao server with a memory storage backend. + + For dev mode: + - no keepers + - no backing store (everything is in memory) + + + Consider using google kms, azure keyvault, and other providers + (including an external SPIKE deployment) for root key recovery. + question to consider is whether it's really needed + second question to consider is what to link kms to (keepers or nexus?) + keepers would be better because we'll back up the shards only then. + or google kms can be used as an alternative to keepers + (i.e., store encrypted dek, with the encrypted root key on nexus; + only kms can decrypt it -- but, to me, it does not provide any + additional advantage since if you are on the machine, you can talk to + google kms anyway) + enable SQLlite by default and test it (ie. crash nexus and ensure both secrets and policies can be recovered) @@ -86,6 +274,14 @@ + + ability to lock nexus programmatically. + when locked, nexus will deny almost all operations + locking is done by executing nexus binary with a certain command line flag. + (i.e. there is no API access, you'll need to physically exec the ./nexus + binary -- regular svid verifications are still required) + only a superadmin can lock or unlock nexus. + consider using NATS for cross trust boundary (or nor) secret federation @@ -284,6 +480,221 @@ Assigning secrets to SPIFFE IDs or SPIFFE ID prefixes. + + SPIKE CSI Driver + + the CSI Secrets Store driver enables users to create + `SecretProviderClass` objects. These objects define which secret provider + to use and what secrets to retrieve. When pods requesting CSI volumes are + made, the CSI Secrets Store driver sends the request to the OpenBao CSI + provider if the provider is `vault`. The CSI provider then uses the + specified `SecretProviderClass` and the pod’s service account to retrieve + the secrets from OpenBao and mount them into the pod’s CSI volume. Note + that the secret is retrieved from SPIKE Nexus and populated to the CSI + secrets store volume during the `ContainerCreation` phase. Therefore, pods + are blocked from starting until the secrets are read from SPIKE and + written to the volume. + + + shall we implement rate limiting; or should that be out of scope + (i.e. to be implemented by the user. + + + to docs: the backing store is considered untrusted and it stores + encrypted information + todo: if it's "really" untrusted then maybe it's better to encrypt everything + (including metadata) -- check how other secrets managers does this. + + + more fine grained policy management + + 1. an explicit deny will override allows + 2. have allowed/disallowed/required parameters + 3. etc. + + # This section grants all access on "secret/*". 
further restrictions can be + # applied to this broad policy, as shown below. + path "secret/*" { + capabilities = ["create", "read", "update", "patch", "delete", "list", "scan"] + } + + # Even though we allowed secret/*, this line explicitly denies + # secret/super-secret. this takes precedence. + path "secret/super-secret" { + capabilities = ["deny"] + } + + # Policies can also specify allowed, disallowed, and required parameters. here + # the key "secret/restricted" can only contain "foo" (any value) and "bar" (one + # of "zip" or "zap"). + path "secret/restricted" { + capabilities = ["create"] + allowed_parameters = { + "foo" = [] + "bar" = ["zip", "zap"] + } + + but also, instead of going deep down into the policy rabbit hole, maybe + it's better to rely on well-established policy engines like OPA. + + A rego-based evaluation will give allow/deny decisions, which SPIKE Nexus + can then honor. + + Think about pros/cons of each approach. -- SPIKE can have a good-enough + default policy engine, and for more sophisticated functionality we can + leverage OPA. + + + If nexus has not started SPIKE Pilot should give a more informative + error message (i.e. Nexus is not ready, or not initialized, or + unreachable, please check yadda yadda yadda) + + + key rotation + + NIST rotation guidance + + Periodic rotation of the encryption keys is recommended, even in the + absence of compromise. Due to the nature of the AES-256-GCM encryption + used, keys should be rotated before approximately 232 + encryptions have been performed, following the guidelines of NIST + publication 800-38D. + + SPIKE will automatically rotate the backend encryption key prior to reaching + 232 encryption operations by default. + + also support manual key rotation + + + Do an internal security analysis / threat model for spike. + + + TODO in-memory "dev mode" for SPIKE #spike (i.e. in memory mode will not be default) + nexus --dev or something similar (maybe an env var) + + + Use SPIKE in lieu of encryption as a service (similar to transit secrets) + + + dynamic secrets + + + document how to do checksum verification to ensure that the binaries + you download is authentic. + + + docs: + Since the storage backend resides outside the barrier, it’s considered + untrusted so SPIKE will encrypt the data before it sends them to the + storage backend. This mechanism ensures that if a malicious attacker + attempts to gain access to the storage backend, the data cannot be + compromised since it remains encrypted, until OpenBao decrypts the data. + The storage backend provides a durable data persistent layer where data + is secured and available across server restarts. + + + use case: + one time access to an extremely limited subset of secrets + (maybe using a one time, or time-bound token) + but also consider if SPIKE needs tokens at all; I think we can piggyback + most of the authentication to SPIFFE and/or JWT -- having to convert + various kinds of tokens into internal secrets store tokens is not that much needed. 
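+
+    For the key rotation item above: a minimal sketch of an encryption
+    counter (names are illustrative only, not SPIKE's actual store API);
+    the 2^32-invocation bound for a single AES-256-GCM key comes from
+    NIST SP 800-38D.
+
+    package store
+
+    import "sync/atomic"
+
+    // rotateAfter stays well below the 2^32 AES-256-GCM invocations that
+    // NIST SP 800-38D allows for a single key.
+    const rotateAfter = uint64(1) << 31
+
+    type cipherState struct {
+        encryptions atomic.Uint64
+    }
+
+    // needsRotation is consulted after each encryption; when it returns
+    // true, the caller re-keys the backing store and resets the counter.
+    func (c *cipherState) needsRotation() bool {
+        return c.encryptions.Add(1) >= rotateAfter
+    }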
+ + + - TODO Telemetry + - core system metrics + - audit log metrics + - authentication metrics + - database metrics + - policy metrics + - secrets metrics + + + "token" secret type + - will be secure random + - will have expiration + + + spike dev mode + + + document limits and maximums of SPIKE (such as key length, path lenght, policy size etc) + + + double encryption when passing secrets around + (can be optional for client-nexus interaction; and can be mandatory for + tools that transcend trust boundaries (as in a relay / message queue that + may be used for secrets federation) + + + active/standby HA mode + + + document the built-in spiife ids used by the system. + + + pattern-based random secret generation + + + admin ui + + + guidelines about how to backup and restore + + + - AWS KMS support for keepers + - Azure keyvault support for keepers + - GCP kms support for keepers + - HSM support for keepers + - OCI kms support for keepers + - keepers storing their shards in a separate SPIKE deployment + (i.e. SPIKE using another SPIKE to restore root token) + + + postgresql backend + + + audit targets: + - file + - syslog + - socket + (if audit targets are enabled then command will not execute unless an + audit trail is started) + + + OIDC authentication for named admins. + + + SPIKE Dynamic secret sidecar injector + + + To docs: Why not kubernetes? + ps: also inspire a bit from VSecM docs too. + + - Kubernetes is not a secrets management solution. It does have native + support for secrets, but that is quite different from a dedicated + secrets management solution. Kubernetes secrets are scoped to the cluster + only, and many applications will have some services running outside + Kubernetes or in other Kubernetes clusters. Having these applications + use Kubernetes secrets from outside a Kubernetes environment will be + cumbersome and introduce authentication and authorization challenges. + Therefore, considering the secret scope as part of the design process + is critical. + - Kubernetes secrets are static in nature. You can define secrets by using + kubectl or the Kubernetes API, but once they are defined, they are stored + in etcd and presented to pods only during pod creation. Defining secrets + in this manner may create scenarios where secrets get stale, outdated, + or expired, requiring additional workflows to update and rotate the + secrets, and then re-deploy the application to use the new version, which + can add complexity and become quite time-consuming. Ensure consideration + is given to all requirements for secret freshness, updates, and rotation + as part of your design process. + - The secret access management security model is tied to the Kubernetes + RBAC model. This model can be challenging for users who are not familiar + with Kubernetes. Adopting a platform-agnostic security governance model + can enable you to adapt workflows for applications regardless of how and + where they are running. +
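+
+    For the "token" secret type item above: a rough sketch (illustrative
+    names only, not an existing SPIKE API) of a secure-random value paired
+    with an expiry, matching "will be secure random / will have expiration".
+
+    package secret
+
+    import (
+        "crypto/rand"
+        "encoding/base64"
+        "time"
+    )
+
+    // Token is a random secret value that becomes invalid after ExpiresAt.
+    type Token struct {
+        Value     string
+        ExpiresAt time.Time
+    }
+
+    // NewToken returns a 256-bit random token valid for the given duration.
+    func NewToken(ttl time.Duration) (Token, error) {
+        raw := make([]byte, 32)
+        if _, err := rand.Read(raw); err != nil {
+            return Token{}, err
+        }
+        return Token{
+            Value:     base64.RawURLEncoding.EncodeToString(raw),
+            ExpiresAt: time.Now().Add(ttl),
+        }, nil
+    }
+
+    // Expired reports whether the token is past its expiration.
+    func (t Token) Expired() bool {
+        return time.Now().After(t.ExpiresAt)
+    }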