Skip to content

Commit

Permalink
fix panic due to nil execution store (#4501)
Browse files Browse the repository at this point in the history
The problem is that we don't fail the node properly when we fail to open
execution store's boltdb, which can happen if we already have a node
running using the same repo. This can be easily reproduced by running
two nodes on separate ports but using the same repo, or just trying to
run a node while devstack is running:

```
bacalhau serve --compute --orchestrator --api-port 1122

bacalhau serve --compute --orchestrator --api-port 2233
```

Now we get a clearer error of why we failed to start. It is not the best
error as it exposes implementation details and doesn't tell exactly what
is happening and how to fix things, but it is better than a panic at least
and time is tight to improve the propagation of this error for now

```
failed to configure compute node: failed to create execution store: timed out while opening database, file "/Users/walid/.bacalhau/compute/state_boltdb.db" might be in use
```

Closes #4491
  • Loading branch information
wdbaruni authored Sep 24, 2024
1 parent e4ccb07 commit c2440c1
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 1 deletion.
3 changes: 3 additions & 0 deletions cmd/cli/serve/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ func GetComputeConfig(
return node.ComputeConfig{}, err
}
executionStore, err = boltdb.NewStore(ctx, executionStoreDBPath)
if err != nil {
return node.ComputeConfig{}, pkgerrors.Wrapf(err, "failed to create execution store")
}
}

executionsPath, err := cfg.ExecutionDir()
Expand Down
4 changes: 4 additions & 0 deletions pkg/compute/store/boltdb/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ func NewStore(ctx context.Context, dbPath string) (*Store, error) {
boltdb_watcher.WithEventSerializer(eventObjectSerializer),
)

if err != nil {
return nil, fmt.Errorf("failed to create event store: %w", err)
}

return &Store{
database: database,
marshaller: marshaller.NewJSONMarshaller(),
Expand Down
8 changes: 8 additions & 0 deletions pkg/node/config_compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/bacalhau-project/bacalhau/pkg/compute/store"
"github.com/bacalhau-project/bacalhau/pkg/config/types"
legacy_types "github.com/bacalhau-project/bacalhau/pkg/config_legacy/types"
"github.com/bacalhau-project/bacalhau/pkg/lib/validate"
"github.com/bacalhau-project/bacalhau/pkg/models"
)

Expand Down Expand Up @@ -244,3 +245,10 @@ func validateConfig(config ComputeConfig, physicalResources models.Resources) er

return err
}

// Validate checks the compute configuration and reports every problem found
// as a single joined error. At present the only check is that ExecutionStore
// passes validate.NotNil; a nil return means no check reported an error.
func (c *ComputeConfig) Validate() error {
	// TODO: add more validations
	// errors.Join drops nil arguments, so this yields nil when the check passes.
	return errors.Join(
		validate.NotNil(c.ExecutionStore, "execution store is required"),
	)
}
6 changes: 5 additions & 1 deletion pkg/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
legacy_types "github.com/bacalhau-project/bacalhau/pkg/config_legacy/types"
baccrypto "github.com/bacalhau-project/bacalhau/pkg/lib/crypto"
"github.com/bacalhau-project/bacalhau/pkg/lib/policy"
"github.com/bacalhau-project/bacalhau/pkg/lib/validate"
"github.com/bacalhau-project/bacalhau/pkg/models"
nats_transport "github.com/bacalhau-project/bacalhau/pkg/nats/transport"
"github.com/bacalhau-project/bacalhau/pkg/node/metrics"
Expand Down Expand Up @@ -52,7 +53,6 @@ type NodeConfig struct {
RequesterNodeConfig RequesterConfig
APIServerConfig publicapi.Config
AuthConfig types.AuthConfig
NodeType models.NodeType
IsRequesterNode bool
IsComputeNode bool
Labels map[string]string
Expand All @@ -64,7 +64,11 @@ type NodeConfig struct {
// Validate checks the node configuration and accumulates every failure into a
// single joined error. It checks NodeID with validate.NotBlank, delegates to
// NetworkConfig.Validate, and, only when IsComputeNode is set, also delegates
// to ComputeConfig.Validate. A nil return means no check reported an error.
func (c *NodeConfig) Validate() error {
	// TODO: add more validations
	result := errors.Join(validate.NotBlank(c.NodeID, "node id is required"))
	result = errors.Join(result, c.NetworkConfig.Validate())

	// Compute-specific settings are only validated for compute nodes.
	if !c.IsComputeNode {
		return result
	}
	return errors.Join(result, c.ComputeConfig.Validate())
}

Expand Down

0 comments on commit c2440c1

Please sign in to comment.