diff --git a/app/keeper/cmd/main.go b/app/keeper/cmd/main.go
index d7a17cc..b88ae6e 100644
--- a/app/keeper/cmd/main.go
+++ b/app/keeper/cmd/main.go
@@ -12,9 +12,7 @@ import (
"github.com/spiffe/spike-sdk-go/spiffe"
"github.com/spiffe/spike/app/keeper/internal/env"
- api "github.com/spiffe/spike/app/keeper/internal/net"
"github.com/spiffe/spike/app/keeper/internal/route/handle"
- "github.com/spiffe/spike/app/keeper/internal/state"
"github.com/spiffe/spike/app/keeper/internal/trust"
"github.com/spiffe/spike/internal/auth"
"github.com/spiffe/spike/internal/config"
@@ -37,34 +35,6 @@ func main() {
trust.Authenticate(spiffeid)
- keeperState := state.ReadAppState()
-
- if keeperState == state.AppStateError {
- log.FatalLn(
- "SPIKE Keeper is in ERROR state. Manual intervention required.",
- )
- }
-
- if keeperState == state.AppStateNotReady {
- log.Log().Info(appName,
- "msg", "SPIKE Keeper is not ready. Will send shards")
-
- go api.Contribute(source)
-
- go state.WaitForShards()
- }
-
- if keeperState == state.AppStateReady ||
- keeperState == state.AppStateRecovering {
- // TODO: implement this case
- // 1. Transition to a RECOVERING state, if not done already
- // 2. Contact peers to recompute shard.
- // 3. Try forever.
- // 4. If something is irrevocably irrecoverable transition to ERROR state.
- // 5. When everything is back to normal, transition to READY state.
- panic("I started, but I don't know what to do.")
- }
-
log.Log().Info(appName, "msg", fmt.Sprintf("Started service: %s v%s",
appName, config.KeeperVersion))
if err := net.ServeWithPredicate(
diff --git a/app/keeper/internal/net/contribute.go b/app/keeper/internal/net/contribute.go
deleted file mode 100644
index 0f9b74b..0000000
--- a/app/keeper/internal/net/contribute.go
+++ /dev/null
@@ -1,83 +0,0 @@
-// \\ SPIKE: Secure your secrets with SPIFFE.
-// \\\\\ Copyright 2024-present SPIKE contributors.
-// \\\\\\\ SPDX-License-Identifier: Apache-2.0
-
-package net
-
-import (
- "encoding/base64"
- "encoding/json"
- "net/url"
- "time"
-
- "github.com/spiffe/go-spiffe/v2/workloadapi"
- "github.com/spiffe/spike-sdk-go/api/entity/v1/reqres"
- "github.com/spiffe/spike-sdk-go/net"
-
- "github.com/spiffe/spike/app/keeper/internal/env"
- "github.com/spiffe/spike/app/keeper/internal/state"
- "github.com/spiffe/spike/internal/auth"
- "github.com/spiffe/spike/internal/log"
-)
-
-func Contribute(source *workloadapi.X509Source) {
- peers := env.Peers()
- myId := env.KeeperId()
-
- for id, peer := range peers {
- if id == myId {
- continue
- }
-
- contributeUrl, err := url.JoinPath(peer, "v1/store/contribute")
- if err != nil {
- log.FatalLn("Failed to join path: " + err.Error())
- }
-
- if source == nil {
- log.FatalLn("contribute: source is nil")
- }
-
- client, err := net.CreateMtlsClientWithPredicate(
- source,
- auth.IsKeeper,
- )
- if err != nil {
- panic(err)
- }
-
- contribution := state.RandomContribution()
- state.Shards.Store(myId, contribution)
-
- log.Log().Info(
- "contribute",
- "msg", "Sending contribution to peer",
- "peer", peer,
- )
-
- md, err := json.Marshal(
- reqres.ShardContributionRequest{
- KeeperId: myId,
- Shard: base64.StdEncoding.EncodeToString(contribution),
- },
- )
- if err != nil {
- log.FatalLn(
- "Failed to marshal shard contribution request: " + err.Error(),
- )
- }
-
- _, err = net.Post(client, contributeUrl, md)
- for err != nil {
- time.Sleep(5 * time.Second)
- _, err = net.Post(client, contributeUrl, md)
- if err != nil {
- log.Log().Info("contribute",
- "msg", "Error sending contribution. Will retry",
- "err", err,
- )
- time.Sleep(5 * time.Second)
- }
- }
- }
-}
diff --git a/app/keeper/internal/route/base/route.go b/app/keeper/internal/route/base/route.go
index 384bdc9..e4f7028 100644
--- a/app/keeper/internal/route/base/route.go
+++ b/app/keeper/internal/route/base/route.go
@@ -27,12 +27,12 @@ func Route(w http.ResponseWriter, r *http.Request, a *log.AuditEntry) error {
r.Method,
func(a net.SpikeKeeperApiAction, p net.ApiUrl) net.Handler {
switch {
+ // Get contribution from SPIKE Nexus
case a == net.ActionKeeperDefault && p == net.SpikeKeeperUrlContribute:
return store.RouteContribute
+ // Provide your shard to SPIKE Nexus
case a == net.ActionKeeperDefault && p == net.SpikeKeeperUrlShard:
return store.RouteShard
- case a == net.ActionKeeperDefault && p == net.SpikeKeeperUrlStatus:
- return store.RouteStatus
default:
return net.Fallback
}
diff --git a/app/keeper/internal/route/store/contribute.go b/app/keeper/internal/route/store/contribute.go
index 64a1424..8cc56f9 100644
--- a/app/keeper/internal/route/store/contribute.go
+++ b/app/keeper/internal/route/store/contribute.go
@@ -53,8 +53,7 @@ func RouteContribute(
// Store decoded shard in the map.
state.Shards.Store(id, decodedShard)
-
- log.Log().Info(fName, "msg", "Shard stored", "id", id, "shard", decodedShard)
+ log.Log().Info(fName, "msg", "Shard stored", "id", id)
responseBody := net.MarshalBody(reqres.ShardContributionResponse{}, w)
diff --git a/app/keeper/internal/state/join.go b/app/keeper/internal/state/join.go
deleted file mode 100644
index 5d63c08..0000000
--- a/app/keeper/internal/state/join.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// \\ SPIKE: Secure your secrets with SPIFFE.
-// \\\\\ Copyright 2024-present SPIKE contributors.
-// \\\\\\\ SPDX-License-Identifier: Apache-2.0
-
-package state
-
-import (
- "sort"
-
- "github.com/cloudflare/circl/group"
- "github.com/cloudflare/circl/secretsharing"
-
- "github.com/spiffe/spike/app/keeper/internal/env"
- "github.com/spiffe/spike/internal/log"
- "github.com/spiffe/spike/pkg/crypto"
-)
-
-func setInternalShard(shares []secretsharing.Share) {
- // Sort the keys of env.Peers() alphabetically for deterministic
- // shard indexing.
- peers := env.Peers()
- peerKeys := make([]string, 0, len(peers))
- for id := range peers {
- peerKeys = append(peerKeys, id)
- }
- sort.Strings(peerKeys)
-
- myId := env.KeeperId()
-
- // Find the index of the current Keeper's ID
- var myShard []byte
- for index, id := range peerKeys {
- if id == myId {
- // Save the shard corresponding to this Keeper
- if val, ok := Shards.Load(myId); ok {
- myShard = val.([]byte)
-
- log.Log().Info("setInternalShard", "id", myId, "index", index)
-
- shareVal, _ := shares[index].Value.MarshalBinary()
-
- SetShard(shareVal)
- EraseIntermediateShards()
-
- break
- }
- }
- }
-
- // Ensure myShard is stored correctly in the state namespace
- if myShard == nil {
- log.FatalLn(
- "setInternalShard: Shard for Keeper ID", myId, "could not be found",
- )
- }
-}
-
-func computeFinalKey() []byte {
- finalKey := make([]byte, 32)
-
- counter := 0
- Shards.Range(func(key, value any) bool {
- counter++
- shard := value.([]byte)
- for i := 0; i < 32; i++ {
- finalKey[i] ^= shard[i]
- }
- return true
- })
-
- if counter != 3 {
- log.FatalLn("computeFinalKey: Not all shards received")
- }
-
- if len(finalKey) != 32 {
- log.FatalLn("computeFinalKey: FinalKey must be 32 bytes long")
- }
-
- return finalKey
-}
-
-func computeShares(finalKey []byte) (group.Scalar, []secretsharing.Share) {
- // Initialize parameters
- g := group.P256
- t := uint(1) // Need t+1 shares to reconstruct
- n := uint(3) // Total number of shares
-
- // Create secret from your 32 byte key
- secret := g.NewScalar()
- if err := secret.UnmarshalBinary(finalKey); err != nil {
- log.FatalLn("computeShares: Failed to unmarshal key: %v" + err.Error())
- }
-
- // TODO: give the end user to seed their own key if they want to via an environment variable.
-
- // Create deterministic random source using the key itself as seed
- // You could use any other seed value for consistency
- deterministicRand := crypto.NewDeterministicReader(finalKey)
-
- // Create shares
- ss := secretsharing.New(deterministicRand, t, secret)
- return secret, ss.Share(n)
-}
-
-func sanityCheck(secret group.Scalar, shares []secretsharing.Share) {
- t := uint(1) // Need t+1 shares to reconstruct
-
- reconstructed, err := secretsharing.Recover(t, shares[:2])
- if err != nil {
- log.FatalLn("computeShares: Failed to recover: " + err.Error())
- }
- if !secret.IsEqual(reconstructed) {
- log.FatalLn("computeShares: Recovered secret does not match original")
- }
-}
diff --git a/app/keeper/internal/state/shard.go b/app/keeper/internal/state/shard.go
index 947797e..d9deb69 100644
--- a/app/keeper/internal/state/shard.go
+++ b/app/keeper/internal/state/shard.go
@@ -6,79 +6,8 @@ package state
import (
"sync"
- "time"
-
- "github.com/spiffe/spike/internal/log"
- "github.com/spiffe/spike/pkg/crypto"
)
-// WaitForShards blocks until exactly 3 shards are collected in the global
-// Shards map. Once collected, it computes the final key, generates shares,
-// sets the internal shard, and performs validation checks.
-//
-// The function:
-// - Polls the Shards map every 2 seconds until 3 shards are present
-// - Panics if more than 3 shards are received
-// - Processes the shards to generate the final distributed secret
-//
-// Panics:
-// - If more than 3 shards are received
-func WaitForShards() {
- for {
- shardCount := 0
- Shards.Range(func(key, value any) bool {
- shardCount++
- return true
- })
-
- log.Log().Info(
- "waitForShards", "msg", "Current shard count", "count", shardCount,
- )
-
- if shardCount < 3 {
- time.Sleep(2 * time.Second)
- continue
- }
-
- if shardCount > 3 {
- // TODO: add an audit log, because this is a security incident likely.
- log.FatalLn("waitForShards: Too many shards received")
- }
-
- finalKey := computeFinalKey()
- secret, shares := computeShares(finalKey)
- setInternalShard(shares)
- sanityCheck(secret, shares)
-
- break
- }
-}
-
-var myContribution []byte
-var myContributionLock sync.Mutex
-
-// RandomContribution generates and caches a random contribution for the
-// distributed secret. The contribution is generated only once and reused for
-// subsequent calls.
-//
-// Returns:
-// - []byte: Random contribution bytes from AES-256 seed
-//
-// Thread-safe through myContributionLock mutex.
-func RandomContribution() []byte {
- myContributionLock.Lock()
- defer myContributionLock.Unlock()
-
- if len(myContribution) == 0 {
- mySeed, _ := crypto.Aes256Seed()
- myContribution = []byte(mySeed)
-
- return myContribution
- }
-
- return myContribution
-}
-
var Shards sync.Map
var shard []byte
@@ -107,14 +36,3 @@ func Shard() []byte {
defer shardMutex.RUnlock()
return shard
}
-
-// EraseIntermediateShards removes all entries from the global Shards map,
-// cleaning up intermediate secret sharing data.
-//
-// Thread-safe through sync.Map operations.
-func EraseIntermediateShards() {
- Shards.Range(func(key, value interface{}) bool {
- Shards.Delete(key)
- return true
- })
-}
diff --git a/app/nexus/internal/poll/poll.go b/app/nexus/internal/poll/poll.go
index b89c785..9e7cafa 100644
--- a/app/nexus/internal/poll/poll.go
+++ b/app/nexus/internal/poll/poll.go
@@ -5,7 +5,6 @@
package poll
import (
- "bytes"
"context"
"encoding/base64"
"encoding/hex"
@@ -13,17 +12,16 @@ import (
"net/url"
"time"
- "github.com/cloudflare/circl/group"
- "github.com/cloudflare/circl/secretsharing"
+ "github.com/spiffe/go-spiffe/v2/workloadapi"
"github.com/spiffe/spike-sdk-go/api/entity/v1/reqres"
- "github.com/spiffe/spike-sdk-go/net"
+ network "github.com/spiffe/spike-sdk-go/net"
+
"github.com/spiffe/spike/app/nexus/internal/env"
state "github.com/spiffe/spike/app/nexus/internal/state/base"
"github.com/spiffe/spike/internal/auth"
-
- "github.com/spiffe/go-spiffe/v2/workloadapi"
-
"github.com/spiffe/spike/internal/log"
+ "github.com/spiffe/spike/internal/net"
+ "github.com/spiffe/spike/pkg/crypto"
)
func Tick(
@@ -31,56 +29,106 @@ func Tick(
source *workloadapi.X509Source,
ticker *time.Ticker,
) {
- // Talk to all keeper endpoints until we get the minimum number of shards
- // to reconstruct the root key. Once the root key is reconstructed,
- // initialize the backing store with the root key and exit the ticker.
+ // Talk to all SPIKE Keeper endpoints, send each one its shard, and get
+ // an acknowledgement that it received the shard.
- for {
- if source == nil {
- log.Log().Info("tick", "msg", "source is nil")
- time.Sleep(time.Second * 5)
- continue
- }
+ if source == nil {
+ // If source is nil, nobody is going to recreate the source,
+ // it's better to log and crash.
+ log.FatalLn("Tick: source is nil. this should not happen.")
+ }
+ // Create the root key and split it into shards.
+ rootKey, err := crypto.Aes256Seed()
+ if err != nil {
+ log.FatalLn("Tick: failed to create root key: " + err.Error())
+ }
+ decodedRootKey, err := hex.DecodeString(rootKey)
+ if err != nil {
+ log.FatalLn("Tick: failed to decode root key: " + err.Error())
+ }
+ rootSecret, rootShares := computeShares(decodedRootKey)
+ sanityCheck(rootSecret, rootShares)
+
+ // Initialize the backend store before sending shards to the keepers.
+ // Keepers are our backup system, and they are not critical for system
+ // operations. Initializing early allows SPIKE Nexus to serve before
+ // keepers are hydrated.
+ state.Initialize(rootKey)
+ log.Log().Info("tick", "msg", "Initialized the backing store")
+
+ successfulKeepers := make(map[string]bool)
+
+ for {
select {
case <-ticker.C:
keepers := env.Keepers()
+ if len(keepers) < 3 {
+ log.FatalLn("Tick: not enough keepers")
+ }
- shardsNeeded := 2
- var shardsCollected [][]byte
+ // Ensure we eventually get a success response from ALL keepers.
+ for keeperId, keeperApiRoot := range keepers {
+ u, err := url.JoinPath(
+ keeperApiRoot,
+ string(net.SpikeKeeperUrlContribute),
+ )
- for _, keeperApiRoot := range keepers {
- u, _ := url.JoinPath(keeperApiRoot, "/v1/store/shard")
+ if err != nil {
+ log.Log().Warn(
+ "tick",
+ "msg", "Failed to join path",
+ "url", keeperApiRoot,
+ )
+ continue
+ }
- client, err := net.CreateMtlsClientWithPredicate(
+ client, err := network.CreateMtlsClientWithPredicate(
source, auth.IsKeeper,
)
+
if err != nil {
- log.Log().Info("tick", "msg",
- "Failed to create mTLS client", "err", err)
+ log.Log().Warn("tick",
+ "msg", "Failed to create mTLS client",
+ "err", err)
continue
}
- md, err := json.Marshal(reqres.ShardRequest{})
+ share := findShare(keeperId, keepers, rootShares)
+ if share == nil {
+ log.Log().Warn("tick", "msg", "No share found", "keeper_id", keeperId)
+ continue
+ }
+ contribution, err := share.Value.MarshalBinary()
if err != nil {
- log.Log().Info("tick", "msg",
- "Failed to marshal request", "err", err)
+ log.Log().Warn("tick",
+ "msg", "Failed to marshal share",
+ "err", err, "keeper_id", keeperId)
continue
}
- data, err := net.Post(client, u, md)
+ scr := reqres.ShardContributionRequest{
+ KeeperId: keeperId,
+ Shard: base64.StdEncoding.EncodeToString(contribution),
+ }
+ md, err := json.Marshal(scr)
if err != nil {
- log.Log().Info("tick", "msg",
- "Failed to post request", "err", err)
+ log.Log().Warn("tick",
+ "msg", "Failed to marshal request",
+ "err", err, "keeper_id", keeperId)
continue
}
- var res reqres.ShardResponse
+
+ data, err := net.Post(client, u, md)
+ if err != nil {
+ log.Log().Warn("tick", "msg",
+ "Failed to post",
+ "err", err, "keeper_id", keeperId)
+ }
if len(data) == 0 {
log.Log().Info("tick", "msg", "No data")
continue
}
+ var res reqres.ShardContributionResponse
err = json.Unmarshal(data, &res)
if err != nil {
log.Log().Info("tick", "msg",
@@ -88,84 +136,20 @@ func Tick(
continue
}
- if len(shardsCollected) < shardsNeeded {
- decodedShard, err := base64.StdEncoding.DecodeString(res.Shard)
- if err != nil {
- log.Log().Info("tick", "msg", "Failed to decode shard")
- continue
- }
-
- // Check if the shard already exists in shardsCollected
- shardExists := false
- for _, existingShard := range shardsCollected {
- if bytes.Equal(existingShard, decodedShard) {
- shardExists = true
- break
- }
- }
- if shardExists {
- continue
- }
-
- shardsCollected = append(shardsCollected, decodedShard)
- }
+ successfulKeepers[keeperId] = true
+ log.Log().Info("tick", "msg", "Success", "keeper_id", keeperId)
- if len(shardsCollected) >= shardsNeeded {
- log.Log().Info("tick",
- "msg", "Collected required shards",
- "shards_collected", len(shardsCollected))
-
- g := group.P256
-
- firstShard := shardsCollected[0]
- firstShare := secretsharing.Share{
- ID: g.NewScalar(),
- Value: g.NewScalar(),
- }
- firstShare.ID.SetUint64(1)
- err := firstShare.Value.UnmarshalBinary(firstShard)
- if err != nil {
- log.FatalLn("Failed to unmarshal share: " + err.Error())
- }
-
- secondShard := shardsCollected[1]
- secondShare := secretsharing.Share{
- ID: g.NewScalar(),
- Value: g.NewScalar(),
- }
- secondShare.ID.SetUint64(2)
- err = secondShare.Value.UnmarshalBinary(secondShard)
- if err != nil {
- log.FatalLn("Failed to unmarshal share: " + err.Error())
- }
-
- var shares []secretsharing.Share
- shares = append(shares, firstShare)
- shares = append(shares, secondShare)
-
- reconstructed, err := secretsharing.Recover(1, shares)
- if err != nil {
- log.FatalLn("Failed to recover: " + err.Error())
- }
-
- // TODO: check for errors.
- binaryRec, _ := reconstructed.MarshalBinary()
-
- // TODO: check size 32bytes.
-
- encoded := hex.EncodeToString(binaryRec)
- state.Initialize(encoded)
-
- log.Log().Info("tick", "msg", "Initialized backing store")
+ if len(successfulKeepers) == 3 {
+ log.Log().Info("tick", "msg", "All keepers initialized")
return
}
}
-
- log.Log().Info("tick",
- "msg", "Failed to collect shards... will retry",
- )
case <-ctx.Done():
+ log.Log().Info("tick", "msg", "Context done")
return
}
+
+ log.Log().Info("tick", "msg", "Waiting for keepers to initialize")
+ time.Sleep(5 * time.Second)
}
}
diff --git a/app/nexus/internal/poll/shamir.go b/app/nexus/internal/poll/shamir.go
new file mode 100644
index 0000000..051bedf
--- /dev/null
+++ b/app/nexus/internal/poll/shamir.go
@@ -0,0 +1,81 @@
+// \\ SPIKE: Secure your secrets with SPIFFE.
+// \\\\\ Copyright 2024-present SPIKE contributors.
+// \\\\\\\ SPDX-License-Identifier: Apache-2.0
+
+package poll
+
+import (
+ "sort"
+
+ "github.com/cloudflare/circl/group"
+ shamir "github.com/cloudflare/circl/secretsharing"
+
+ "github.com/spiffe/spike/internal/log"
+ "github.com/spiffe/spike/pkg/crypto"
+)
+
+func sanityCheck(secret group.Scalar, shares []shamir.Share) {
+ t := uint(1) // Need t+1 shares to reconstruct
+
+ reconstructed, err := shamir.Recover(t, shares[:2])
+ if err != nil {
+ log.FatalLn("sanityCheck: Failed to recover: " + err.Error())
+ }
+ if !secret.IsEqual(reconstructed) {
+ log.FatalLn("sanityCheck: Recovered secret does not match original")
+ }
+}
+
+func computeShares(finalKey []byte) (group.Scalar, []shamir.Share) {
+ // Initialize parameters
+ g := group.P256
+ t := uint(1) // Need t+1 shares to reconstruct
+ n := uint(3) // Total number of shares
+
+ // Create secret from your 32 byte key
+ secret := g.NewScalar()
+ if err := secret.UnmarshalBinary(finalKey); err != nil {
+ log.FatalLn("computeShares: Failed to unmarshal key: %v" + err.Error())
+ }
+
+ // To compute identical shares, we need an identical seed for the random
+ // reader. Using `finalKey` for seed is secure because Shamir Secret Sharing
+ // algorithm's security does not depend on the random seed; it depends on
+ // the shards being securely kept secret.
+ // If we use `random.Read` instead, then synchronizing shards after Nexus
+ // crashes will be cumbersome and prone to edge-case failures.
+ reader := crypto.NewDeterministicReader(finalKey)
+ ss := shamir.New(reader, t, secret)
+ return secret, ss.Share(n)
+}
+
+func findShare(id string, keepers map[string]string,
+ shares []shamir.Share,
+) *shamir.Share {
+ // Each keeper needs to be mapped to a unique shard.
+ // We sort the keeper ids; so same-indexed shards will be sent
+ // to their appropriate keeper instances.
+ sortedKeys := make([]string, 0, len(keepers))
+ for k := range keepers {
+ sortedKeys = append(sortedKeys, k)
+ }
+ sort.Strings(sortedKeys)
+
+ matchingIndex := -1
+ for i, key := range sortedKeys {
+ if key == id {
+ matchingIndex = i
+ break
+ }
+ }
+
+ if matchingIndex < 0 || matchingIndex >= len(shares) {
+ return nil
+ }
+
+ return &shares[matchingIndex]
+}
diff --git a/drafts.txt b/drafts.txt
index 284a9b9..93d4896 100644
--- a/drafts.txt
+++ b/drafts.txt
@@ -2,6 +2,210 @@
// \\\\\ Copyright 2024-present SPIKE contributors.
// \\\\\\\ SPDX-License-Identifier: Apache-2.0
+
+--------------------------------------------------------------------------------
+
+//
+//select {
+//case <-ticker.C:
+// keepers := env.Keepers()
+//
+// shardsNeeded := 2
+// var shardsCollected [][]byte
+//
+// for _, keeperApiRoot := range keepers {
+// u, _ := url.JoinPath(keeperApiRoot, "/v1/store/shard")
+//
+// client, err := net.CreateMtlsClientWithPredicate(
+// source, auth.IsKeeper,
+// )
+// if err != nil {
+// log.Log().Info("tick", "msg",
+// "Failed to create mTLS client", "err", err)
+// continue
+// }
+//
+// md, err := json.Marshal(reqres.ShardRequest{})
+// if err != nil {
+// log.Log().Info("tick", "msg",
+// "Failed to marshal request", "err", err)
+// continue
+// }
+//
+// data, err := net.Post(client, u, md)
+// var res reqres.ShardResponse
+//
+// if len(data) == 0 {
+// log.Log().Info("tick", "msg", "No data")
+// continue
+// }
+//
+// err = json.Unmarshal(data, &res)
+// if err != nil {
+// log.Log().Info("tick", "msg",
+// "Failed to unmarshal response", "err", err)
+// continue
+// }
+//
+// if len(shardsCollected) < shardsNeeded {
+// decodedShard, err := base64.StdEncoding.DecodeString(res.Shard)
+// if err != nil {
+// log.Log().Info("tick", "msg", "Failed to decode shard")
+// continue
+// }
+//
+// // Check if the shard already exists in shardsCollected
+// shardExists := false
+// for _, existingShard := range shardsCollected {
+// if bytes.Equal(existingShard, decodedShard) {
+// shardExists = true
+// break
+// }
+// }
+// if shardExists {
+// continue
+// }
+//
+// shardsCollected = append(shardsCollected, decodedShard)
+// }
+//
+// if len(shardsCollected) >= shardsNeeded {
+// log.Log().Info("tick",
+// "msg", "Collected required shards",
+// "shards_collected", len(shardsCollected))
+//
+// g := group.P256
+//
+// firstShard := shardsCollected[0]
+// firstShare := secretsharing.Share{
+// ID: g.NewScalar(),
+// Value: g.NewScalar(),
+// }
+// firstShare.ID.SetUint64(1)
+// err := firstShare.Value.UnmarshalBinary(firstShard)
+// if err != nil {
+// log.FatalLn("Failed to unmarshal share: " + err.Error())
+// }
+//
+// secondShard := shardsCollected[1]
+// secondShare := secretsharing.Share{
+// ID: g.NewScalar(),
+// Value: g.NewScalar(),
+// }
+// secondShare.ID.SetUint64(2)
+// err = secondShare.Value.UnmarshalBinary(secondShard)
+// if err != nil {
+// log.FatalLn("Failed to unmarshal share: " + err.Error())
+// }
+//
+// var shares []secretsharing.Share
+// shares = append(shares, firstShare)
+// shares = append(shares, secondShare)
+//
+// reconstructed, err := secretsharing.Recover(1, shares)
+// if err != nil {
+// log.FatalLn("Failed to recover: " + err.Error())
+// }
+//
+// // TODO: check for errors.
+// binaryRec, _ := reconstructed.MarshalBinary()
+//
+// // TODO: check size 32bytes.
+//
+// encoded := hex.EncodeToString(binaryRec)
+// state.Initialize(encoded)
+//
+// log.Log().Info("tick", "msg", "Initialized backing store")
+// return
+// }
+//
+// log.Log().Info("tick",
+// "msg", "Failed to collect shards... will retry",
+// )
+// case <-ctx.Done():
+// return
+// }
+//}
+
+
+--------------------------------------------------------------------------------
+
+From chat logs:
+
+> In a pinch, a spire-agent with unix attestor can identify individual users.
+> This may just be enough for admin identification for initial
+> bootstrapping/recovery purposes.
+
+That's a good idea. The admin user should already have an SVID for
+"their" user anyway.
+
+For example the following entry identifies the user:
+
+```text
+# [exhibit 1:]
+
+# Register SPIKE Pilot
+spire-server entry create \
+ -spiffeID spiffe://spike.ist/spike/pilot \
+ -parentID "spiffe://spike.ist/spire-agent" \
+ -selector unix:uid:"$(id -u)" \
+ -selector unix:path:"$PILOT_PATH" \
+ -selector unix:sha256:"$PILOT_SHA"
+```
+
+So the fact that the user can use the SPIKE Pilot (the `spike` binary)
+indeed means that they are authenticated. They don't need a password to further
+authenticate themselves. (reasoning: they cannot be that user unless they
+log in to the unix box -- the trust boundary is the box itself)
+
+
+> named admins. so, for most things you know who is doing what.
+> JWT will handle this well, using something like Keycloak,
+> Entra, github, gitlab, facebook, etc, etc.
+
+^ that needs to be a user story on its own. We can start experimenting with
+keycloak and see how it goes from there. An OIDC is an OIDC is an
+OIDC anyway -- how different can they be :)
+
+> traditional admin, needed when things go horribly wrong to
+> reenable named admins. I'm thinking, SPIRE issued jwt/svid for
+> a certain uid, on a certain machine
+
+I think `[exhibit 1:]` is good enough to secure the traditional admin.
+
+We can have a SPIFFE ID like `spiffe://spike.ist/spike/pilot/role/superuser`.
+It does not even have to be a JWT SVID. Someone who can talk to SPIKE can
+assign/unassign it.
+
+So. If hell broke loose, I'll assign myself a superadmin SVID;
+fix stuff, and then unregister that SVID.
+
+For named admins, we'd need OIDC, which can wait for a while.
+For now, one superadmin is good enough.
+
+> A token isn't useful without someone vetting it....
+
+Yes, and I agree that it's not worth introducing that complexity, unless either of us
+wants to found a TPM/human-id startup (which is not a bad idea indeed :))
+
+> Thats the problem with passwords. Your trusting a human, with limited memory,
+> with a string a machine cares about and hope it stays secure. Thats proven hard.
+
+LOL, but agreed.
+
+--
+
+So in short, I think the above approach...
+
+1. will eliminate the need for a password.
+2. will push identifying the superadmin to their unix credentials
+ (one who owns the box owns spike, provided someone who owns spire
+ lets them own spike -- I like the multi-level access approach.
+ So if I have login access to the box, but not to SPIRE, then the SPIRE admin can
+ remove my access if I turn out to be a bad superadmin :) -- but in reality
+ I will be the alpha and the omega (both SPIRE admin and also linux user))
+3. root key backup and rotation is figureoutable and FFS.
+4. named admins are figureoutable and FFS.
+
--------------------------------------------------------------------------------
Idea: Inverting the root key flow
diff --git a/examples/consume-secrets/demo-create-policy.sh b/examples/consume-secrets/demo-create-policy.sh
index a3f6fa4..56f34e6 100755
--- a/examples/consume-secrets/demo-create-policy.sh
+++ b/examples/consume-secrets/demo-create-policy.sh
@@ -6,15 +6,17 @@
./spike policy create --name=workload-can-read \
--path="/tenants/demo/db/*" \
- --spiffeid="^spiffe://spike.ist/workload/*" \
+ --spiffeid="^spiffe://spike\\.ist/workload/*" \
--permissions="read"
+# TODO: it should be spike\\.ist for proper escaping
+
./spike policy create --name=workload-can-write \
--path="/tenants/demo/db/*" \
- --spiffeid="^spiffe://spike.ist/workload/*" \
+ --spiffeid="^spiffe://spike\\.ist/workload/*" \
--permissions="write"
#./spike policy create --name=workload-can-rw \
# --path="/tenants/demo/db/*" \
-# --spiffeid="^spiffe://spike.ist/workload/*" \
-# --permissions="read,write"
\ No newline at end of file
+# --spiffeid="^spiffe://spike\\.ist/workload/*" \
+# --permissions="read,write"
diff --git a/go.mod b/go.mod
index c7abd80..68e8d87 100644
--- a/go.mod
+++ b/go.mod
@@ -11,7 +11,7 @@ require (
github.com/mattn/go-sqlite3 v1.14.24
github.com/spf13/cobra v1.8.1
github.com/spiffe/go-spiffe/v2 v2.4.0
- github.com/spiffe/spike-sdk-go v0.1.34
+ github.com/spiffe/spike-sdk-go v0.1.36
github.com/stretchr/testify v1.9.0
)
diff --git a/go.sum b/go.sum
index f05e1a9..610c60a 100644
--- a/go.sum
+++ b/go.sum
@@ -55,8 +55,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spiffe/go-spiffe/v2 v2.4.0 h1:j/FynG7hi2azrBG5cvjRcnQ4sux/VNj8FAVc99Fl66c=
github.com/spiffe/go-spiffe/v2 v2.4.0/go.mod h1:m5qJ1hGzjxjtrkGHZupoXHo/FDWwCB1MdSyBzfHugx0=
-github.com/spiffe/spike-sdk-go v0.1.34 h1:1SURYNaVhutTTVCN2NC/Vj9CHRerEZP1oQOgHVuOQu8=
-github.com/spiffe/spike-sdk-go v0.1.34/go.mod h1:WIserWbShAkDVoj+GYdcXKFmKp2IBIsZHKKDcStHrHw=
+github.com/spiffe/spike-sdk-go v0.1.36 h1:+q2bhCxe5oj2VLRUCN0vs5EUlc+bqS65MzaJ91UKRtg=
+github.com/spiffe/spike-sdk-go v0.1.36/go.mod h1:WIserWbShAkDVoj+GYdcXKFmKp2IBIsZHKKDcStHrHw=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/zeebo/errs v1.3.0 h1:hmiaKqgYZzcVgRL1Vkc1Mn2914BbzB0IBxs+ebeutGs=
diff --git a/jira.xml b/jira.xml
index 53b7ae0..0ddfac5 100644
--- a/jira.xml
+++ b/jira.xml
@@ -14,9 +14,175 @@
+
+ we don't need state.ReadAppState()
+ or the other state enums for keepers anymore;
+ keepers are just dummy stateless keepers.
+
+
+ this is for policy creation:
+
+ allowed := state.CheckAccess(
+ spiffeid.String(), "*",
+ []data.PolicyPermission{data.PermissionSuper},
+ )
+
+ instead of a wildcard, maybe have a predefined path
+ for the access check, like "/spike/system/acl"
+
+ also disallow people creating secrets etc under
+ /spike/system
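+
+ a minimal sketch of that shape (assuming the existing state.CheckAccess
+ signature; "/spike/system/acl" and the reserved-prefix check below are
+ proposals, not existing code):
+
+ const aclCheckPath = "/spike/system/acl"
+
+ allowed := state.CheckAccess(
+     spiffeid.String(), aclCheckPath,
+     []data.PolicyPermission{data.PermissionSuper},
+ )
+
+ // and reject secret writes under the reserved prefix:
+ if strings.HasPrefix(secretPath, "/spike/system") {
+     return errors.New("paths under /spike/system are reserved")
+ }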
+
Invert shard generation flow.
+
+ control these with flags.
+ i.e. the starter script can optionally NOT automatically
+ start nexus or keepers.
+
+ #echo ""
+ #echo "Waiting before SPIKE Keeper 1..."
+ #sleep 5
+ #run_background "./hack/start-keeper-1.sh"
+ #echo ""
+ #echo "Waiting before SPIKE Keeper 2..."
+ #sleep 5
+ #run_background "./hack/start-keeper-2.sh"
+ #echo ""
+ #echo "Waiting before SPIKE Keeper 3..."
+ #sleep 5
+ #run_background "./hack/start-keeper-3.sh"
+
+ #echo ""
+ #echo "Waiting before SPIKE Nexus..."
+ #sleep 5
+ #run_background "./hack/start-nexus.sh"
+
+
+
+ validations:
+
+ along with the error code, also return some explanatory message
+
+ instead of this for example
+
+ err = validation.ValidateSpiffeIdPattern(spiffeIdPattern)
+ if err != nil {
+ responseBody := net.MarshalBody(reqres.PolicyCreateResponse{
+ Err: data.ErrBadInput,
+ }, w)
+ net.Respond(http.StatusBadRequest, responseBody, w)
+ return err
+ }
+
+ do this
+
+ err = validation.ValidateSpiffeIdPattern(spiffeIdPattern)
+ if err != nil {
+ responseBody := net.MarshalBody(reqres.PolicyCreateResponse{
+ Err: data.ErrBadInput,
+ Reason: "Invalid spiffe id pattern. Matcher should be a regex that can match a spiffe id"
+ }, w)
+ net.Respond(http.StatusBadRequest, responseBody, w)
+ return err
+ }
+
+
+ implement keeper crash recovery:
+ i.e. ask shards from nexus
+
+
+ implement nexus crash recovery
+ i.e. ask shards from keepers
+
+
+ keeper does not need to store multiple shards;
+ each keeper should keep its own shard.
+
+ // Store decoded shard in the map.
+ state.Shards.Store(id, decodedShard)
+ log.Log().Info(fName, "msg", "Shard stored", "id", id)
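+
+ a minimal sketch of the suggested simplification (assuming the existing
+ state.SetShard helper in app/keeper/internal/state):
+
+ // keep only this keeper's own shard; no per-keeper map needed.
+ state.SetShard(decodedShard)
+ log.Log().Info(fName, "msg", "Shard stored")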
+
+
+ implement doomsday recovery
+ i.e. operator saves shards in a secure enclave.
+
+
+ admin should not be able to create two policies with the same name.
+
+
+ exponentially back off here
+
+ log.Log().Info("tick", "msg", "Waiting for keepers to initialize")
+ time.Sleep(5 * time.Second)
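+
+ a small self-contained sketch of that (the interval values are
+ placeholders, not decided yet):
+
+ // nextBackoff doubles the wait interval up to a cap.
+ func nextBackoff(current time.Duration) time.Duration {
+     const maxBackoff = 5 * time.Minute
+     next := current * 2
+     if next > maxBackoff {
+         return maxBackoff
+     }
+     return next
+ }
+
+ // in Tick, instead of a fixed 5-second sleep:
+ //   time.Sleep(backoff)
+ //   backoff = nextBackoff(backoff)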
+
+
+ Store root key in Nexus' memory (will be required for recovery later)
+ We can also keep shards in nexus' memory for convenience too
+ (reasoning: if we are keeping the root key, securely erasing the shards
+ does not increase the security posture that much)
+
+
+ consider db backend as untrusted
+ i.e. encrypt everything you store there; including policies.
+ (that might already be the case actually) -- if so, document it
+ in the website.
+
+
+ // TODO: this check will change once we make #keepers configurable.
+ if len(keepers) < 3 {
+ log.FatalLn("Tick: not enough keepers")
+ }
+
+
+ For in-memory store, bypass initialization, shard creation etc.
+
+
+ SPIKE defaults to sqlite backing store
+
+
+ nexus tracks its keeper-initialization state.
+ (time shards created, number of shards etc)
+
+
+ 1. Nexus maintaining its initialization state
+ (i.e. if it successfully initialized the keepers, it should recompute the
+ root key from the keepers instead of auto-generating it itself.
+ For that, it will set a tombstone indicating it has initialized the keepers;
+ the tombstone will be in SQLite, since shards do not make sense for the
+ in-memory backing store) -- see the sketch after this list.
+
+ 2. Keepers advertising their status to Nexus regularly.
+ If Nexus has already initialized the keepers, Nexus will recompute and
+ provide the shard to the keeper.
+ (Nexus will keep the root key in its memory. The threat model of SPIKE
+ does not protect Nexus against memory-based attacks and it's up to the
+ user to harden and ensure that nexus runs with non-root privileges
+ (this threat model is the same for Vault and other secret stores too))
+
+ 3. Nexus crashes; it figures out it already initialized keepers; asks for
+ shards and rekeys itself.
+
+ 4. Nexus crashes, but a quorum of keepers that know their shards cannot
+ be reached.
+ Nexus transitions to "locked" state and a manual unlock will be required
+ (this will be a separate user story)
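+
+ a rough sketch of the startup decision items 1 and 3 imply (the tombstone
+ helpers and the recovery functions are hypothetical, assumed to be backed
+ by SQLite):
+
+ if store.KeepersInitialized() { // tombstone was set on a previous run
+     rootKey = recoverRootKeyFromKeepers() // ask keepers for their shards
+ } else {
+     rootKey = generateRootKey()
+     sendShardsToKeepers(rootKey)
+     store.MarkKeepersInitialized() // set the tombstone in SQLite
+ }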
+
+
+ // 3. spike policy list gives `null` for no policies instead of a message
+ // also the response is json rather than a more human readable output.
+ // also `createdBy` is empty.
+ // we can create "good first issue"s for these.
+
+
+ func computeShares(finalKey []byte) (group.Scalar, []secretsharing.Share) {
+ // Initialize parameters
+ g := group.P256
+ // TODO: these will be configurable
+ t := uint(1) // Need t+1 shares to reconstruct
+ n := uint(3) // Total number of shares
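+
+ a possible shape for making these configurable, with hypothetical
+ environment variable names:
+
+ func shamirParams() (t, n uint) {
+     t, n = 1, 3 // current defaults: t+1 shares to reconstruct, n total
+     if v, err := strconv.Atoi(os.Getenv("SPIKE_NEXUS_SHAMIR_THRESHOLD")); err == nil && v > 0 {
+         t = uint(v)
+     }
+     if v, err := strconv.Atoi(os.Getenv("SPIKE_NEXUS_SHAMIR_SHARES")); err == nil && v > 0 {
+         n = uint(v)
+     }
+     return t, n
+ }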
+
dr: keeper crash
waiting-for: shard generation inversion.
@@ -46,6 +212,28 @@
+
+ something similar for SPIKE too:
+ Dev mode
+ The Helm chart may run an OpenBao server in development. This installs a
+ single OpenBao server with a memory storage backend.
+
+ For dev mode:
+ - no keepers
+ - no backing store (everything is in memory)
+
+
+ Consider using google kms, azure keyvault, and other providers
+ (including an external SPIKE deployment) for root key recovery.
+ The first question to consider is whether it's really needed;
+ the second is what to link the KMS to (keepers or Nexus?).
+ Keepers would be better because only then will we back up the shards.
+ or google kms can be used as an alternative to keepers
+ (i.e., store encrypted dek, with the encrypted root key on nexus;
+ only kms can decrypt it -- but, to me, it does not provide any
+ additional advantage since if you are on the machine, you can talk to
+ google kms anyway)
+
enable SQLite by default
and test it (ie. crash nexus and ensure both secrets and policies can be recovered)
@@ -86,6 +274,14 @@
+
+ ability to lock nexus programmatically.
+ when locked, nexus will deny almost all operations.
+ locking is done by executing the nexus binary with a certain command line flag.
+ (i.e. there is no API access, you'll need to physically exec the ./nexus
+ binary -- regular svid verifications are still required)
+ only a superadmin can lock or unlock nexus.
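+
+ a rough sketch of the command-line entry point this implies (the flag
+ names are hypothetical):
+
+ lock := flag.Bool("lock", false, "lock SPIKE Nexus (deny almost all operations)")
+ unlock := flag.Bool("unlock", false, "unlock SPIKE Nexus")
+ flag.Parse()
+
+ if *lock {
+     // verify the caller's SVID maps to the superadmin role, then persist
+     // the locked state and exit.
+ }
+ if *unlock {
+     // same verification, then clear the locked state and exit.
+ }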
+
consider using NATS for cross trust boundary (or not) secret federation
@@ -284,6 +480,221 @@
Assigning secrets to SPIFFE IDs or SPIFFE ID prefixes.
+
+ SPIKE CSI Driver
+
+ the CSI Secrets Store driver enables users to create
+ `SecretProviderClass` objects. These objects define which secret provider
+ to use and what secrets to retrieve. When pods requesting CSI volumes are
+ created, the CSI Secrets Store driver sends the request to the SPIKE CSI
+ provider. The CSI provider then uses the
+ specified `SecretProviderClass` and the pod’s service account to retrieve
+ the secrets from SPIKE Nexus and mount them into the pod’s CSI volume. Note
+ that the secret is retrieved from SPIKE Nexus and populated to the CSI
+ secrets store volume during the `ContainerCreation` phase. Therefore, pods
+ are blocked from starting until the secrets are read from SPIKE and
+ written to the volume.
+
+
+ shall we implement rate limiting, or should that be out of scope
+ (i.e. to be implemented by the user)?
+
+
+ to docs: the backing store is considered untrusted and it stores
+ encrypted information
+ todo: if it's "really" untrusted then maybe it's better to encrypt everything
+ (including metadata) -- check how other secrets managers do this.
+
+
+ more fine grained policy management
+
+ 1. an explicit deny will override allows
+ 2. have allowed/disallowed/required parameters
+ 3. etc.
+
+ # This section grants all access on "secret/*". further restrictions can be
+ # applied to this broad policy, as shown below.
+ path "secret/*" {
+ capabilities = ["create", "read", "update", "patch", "delete", "list", "scan"]
+ }
+
+ # Even though we allowed secret/*, this line explicitly denies
+ # secret/super-secret. this takes precedence.
+ path "secret/super-secret" {
+ capabilities = ["deny"]
+ }
+
+ # Policies can also specify allowed, disallowed, and required parameters. here
+ # the key "secret/restricted" can only contain "foo" (any value) and "bar" (one
+ # of "zip" or "zap").
+ path "secret/restricted" {
+ capabilities = ["create"]
+ allowed_parameters = {
+ "foo" = []
+ "bar" = ["zip", "zap"]
+ }
+ }
+
+ but also, instead of going deep down into the policy rabbit hole, maybe
+ it's better to rely on well-established policy engines like OPA.
+
+ A rego-based evaluation will give allow/deny decisions, which SPIKE Nexus
+ can then honor.
+
+ Think about pros/cons of each approach. -- SPIKE can have a good-enough
+ default policy engine, and for more sophisticated functionality we can
+ leverage OPA.
+
+
+ If Nexus has not started, SPIKE Pilot should give a more informative
+ error message (i.e. Nexus is not ready, or not initialized, or
+ unreachable, please check yadda yadda yadda)
+
+
+ key rotation
+
+ NIST rotation guidance
+
+ Periodic rotation of the encryption keys is recommended, even in the
+ absence of compromise. Due to the nature of the AES-256-GCM encryption
+ used, keys should be rotated before approximately 2^32
+ encryptions have been performed, following the guidelines of NIST
+ publication 800-38D.
+
+ SPIKE will automatically rotate the backend encryption key prior to reaching
+ 2^32 encryption operations by default.
+
+ also support manual key rotation
+
+
+ Do an internal security analysis / threat model for spike.
+
+
+ TODO in-memory "dev mode" for SPIKE #spike (i.e. in memory mode will not be default)
+ nexus --dev or something similar (maybe an env var)
+
+
+ Use SPIKE in lieu of encryption as a service (similar to transit secrets)
+
+
+ dynamic secrets
+
+
+ document how to do checksum verification to ensure that the binaries
+ you download are authentic.
+
+
+ docs:
+ Since the storage backend resides outside the barrier, it’s considered
+ untrusted, so SPIKE encrypts the data before sending it to the
+ storage backend. This mechanism ensures that if a malicious attacker
+ attempts to gain access to the storage backend, the data cannot be
+ compromised since it remains encrypted until SPIKE Nexus decrypts it.
+ The storage backend provides a durable persistence layer where data
+ is secured and available across server restarts.
+
+
+ use case:
+ one time access to an extremely limited subset of secrets
+ (maybe using a one time, or time-bound token)
+ but also consider if SPIKE needs tokens at all; I think we can piggyback
+ most of the authentication onto SPIFFE and/or JWT -- having to convert
+ various kinds of tokens into internal secrets store tokens is not really necessary.
+
+
+ - TODO Telemetry
+ - core system metrics
+ - audit log metrics
+ - authentication metrics
+ - database metrics
+ - policy metrics
+ - secrets metrics
+
+
+ "token" secret type
+ - will be secure random
+ - will have expiration
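+
+ a minimal sketch of what that could look like (the names are hypothetical):
+
+ type TokenSecret struct {
+     Value     string    // generated with crypto/rand
+     ExpiresAt time.Time // the token is rejected after this instant
+ }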
+
+
+ spike dev mode
+
+
+ document limits and maximums of SPIKE (such as key length, path length, policy size, etc.)
+
+
+ double encryption when passing secrets around
+ (can be optional for client-nexus interaction; and can be mandatory for
+ tools that transcend trust boundaries (as in a relay / message queue that
+ may be used for secrets federation))
+
+
+ active/standby HA mode
+
+
+ document the built-in SPIFFE IDs used by the system.
+
+
+ pattern-based random secret generation
+
+
+ admin ui
+
+
+ guidelines about how to backup and restore
+
+
+ - AWS KMS support for keepers
+ - Azure keyvault support for keepers
+ - GCP kms support for keepers
+ - HSM support for keepers
+ - OCI kms support for keepers
+ - keepers storing their shards in a separate SPIKE deployment
+ (i.e. SPIKE using another SPIKE to restore root token)
+
+
+ postgresql backend
+
+
+ audit targets:
+ - file
+ - syslog
+ - socket
+ (if audit targets are enabled, then a command will not execute unless an
+ audit trail is started)
+
+
+ OIDC authentication for named admins.
+
+
+ SPIKE Dynamic secret sidecar injector
+
+
+ To docs: Why not Kubernetes?
+ ps: also draw some inspiration from the VSecM docs.
+
+ - Kubernetes is not a secrets management solution. It does have native
+ support for secrets, but that is quite different from a dedicated
+ secrets management solution. Kubernetes secrets are scoped to the cluster
+ only, and many applications will have some services running outside
+ Kubernetes or in other Kubernetes clusters. Having these applications
+ use Kubernetes secrets from outside a Kubernetes environment will be
+ cumbersome and introduce authentication and authorization challenges.
+ Therefore, considering the secret scope as part of the design process
+ is critical.
+ - Kubernetes secrets are static in nature. You can define secrets by using
+ kubectl or the Kubernetes API, but once they are defined, they are stored
+ in etcd and presented to pods only during pod creation. Defining secrets
+ in this manner may create scenarios where secrets get stale, outdated,
+ or expired, requiring additional workflows to update and rotate the
+ secrets, and then re-deploy the application to use the new version, which
+ can add complexity and become quite time-consuming. Ensure consideration
+ is given to all requirements for secret freshness, updates, and rotation
+ as part of your design process.
+ - The secret access management security model is tied to the Kubernetes
+ RBAC model. This model can be challenging for users who are not familiar
+ with Kubernetes. Adopting a platform-agnostic security governance model
+ can enable you to adapt workflows for applications regardless of how and
+ where they are running.
+