From a02d156be9022416992f3200af3f3fd5f559ee66 Mon Sep 17 00:00:00 2001
From: Walid Baruni
Date: Mon, 22 Jan 2024 10:29:41 +0300
Subject: [PATCH] introduce nats transport (#3188)

This PR introduces a new transport layer for inter-node connectivity based on NATS, in addition to the existing libp2p transport.

## Integration Architecture

NATS is embedded into Bacalhau nodes. Each Orchestrator node (aka Requester node) also runs a NATS server and connects with other Orchestrator nodes to form a NATS cluster. Compute nodes, on the other hand, only run a NATS client and connect to any of the Orchestrator nodes to listen for job requests and publish their node info to the network.

With this approach, only the Orchestrator nodes need to be reachable, which simplifies the network requirements for the Compute nodes. Also, a Compute node only needs to know the address of a single Orchestrator node, and will learn about the other Orchestrators at runtime. This not only simplifies bootstrapping, but also enables the Compute node to fail over and reconnect to other Orchestrators at runtime. A minimal sketch of this embedded-server setup is included at the end of this description.

For completeness, the Orchestrator nodes also run a NATS client to subscribe to Compute node info, publish job requests, and listen for responses.

### NATS Subjects

NATS follows subject-based addressing, where clients publish and subscribe to subjects and the servers take care of routing those messages. These are the current subject patterns used to orchestrate jobs (a client-side sketch is included at the end of this description):

- `node.compute.<nodeID>.*`: Every compute node subscribes to a subject of the form `node.compute.<nodeID>.*` to handle job requests from the orchestrator, such as `AskForBid` and `BidAccepted`. For example, if the Orchestrator selects node `QmUg1MoAUMEbpzg6jY45zpfNo2dmTBJS6CzZCTMbniYsvC` to run a job, it will send a message to `node.compute.QmUg1MoAUMEbpzg6jY45zpfNo2dmTBJS6CzZCTMbniYsvC.AskForBid/1`, which will land on the correct compute node. The suffix `/1` is just for versioning in case we change the `AskForBid` message format.
- `node.orchestrator.<nodeID>.*`: Similarly, Orchestrator nodes listen to a dedicated subject to handle compute callbacks, such as `OnRunComplete` and `OnComputeFailure`.
- `node.info.<nodeID>`: Compute nodes periodically publish their node info to the `node.info.<nodeID>` subject, whereas Orchestrator nodes listen to `node.info.*` so they can handle node info from all compute nodes.

## Running with NATS

NATS is currently opt-in, and libp2p is still the default transport layer. To run a network with NATS, use the following commands:

```
# Run Orchestrator node
bacalhau serve --node-type=requester --use-nats

# Run a second Orchestrator node
bacalhau serve --node-type=requester --use-nats --cluster-peers=<first-orchestrator-address>:6222

# Run a compute node
bacalhau serve --node-type=compute --use-nats --orchestrators=<orchestrator-address>:4222

# Run devstack with NATS
bacalhau devstack --use-nats
```

## Testing Done

- Deployed and tested in the `development` environment
- All existing tests pass when using the NATS transport

## Remaining Work

1. Logstream support for the NATS transport
1. Auth, as currently any node can join the network
1. Devstack tests based on NATS, deployed as part of the CircleCI checks
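For readers unfamiliar with embedding NATS, the following is a minimal, self-contained sketch of the pattern described in the architecture section above: an Orchestrator runs a NATS server in-process, listens for compute clients on one port, and joins other Orchestrators over a separate cluster port. The ports match the examples in this PR (4222 for clients, 6222 for clustering), while the cluster name and route address are illustrative assumptions; the actual wiring added by this PR lives in `pkg/nats/server.go` and `pkg/nats/transport/nats.go`.

```go
package main

import (
	"log"
	"net/url"
	"time"

	"github.com/nats-io/nats-server/v2/server"
)

func main() {
	// Illustrative route to an already-running Orchestrator's cluster port.
	route, err := url.Parse("nats://10.0.0.1:6222")
	if err != nil {
		log.Fatal(err)
	}

	opts := &server.Options{
		Port: 4222, // client connections from compute nodes
		Cluster: server.ClusterOpts{
			Name: "bacalhau", // assumed cluster name
			Port: 6222,       // orchestrator-to-orchestrator connections
		},
		Routes: []*url.URL{route},
	}

	ns, err := server.NewServer(opts)
	if err != nil {
		log.Fatal(err)
	}

	// Start() blocks, so run the embedded server in its own goroutine and
	// wait until it is ready to accept client connections.
	go ns.Start()
	if !ns.ReadyForConnections(10 * time.Second) {
		log.Fatal("embedded NATS server did not become ready")
	}
	log.Printf("embedded NATS server listening at %s", ns.ClientURL())

	ns.WaitForShutdown()
}
```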
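To make the subject scheme concrete, here is a small `nats.go` sketch showing both sides of the addressing described above in a single process: the compute side subscribes to its `node.compute.<nodeID>.*` subject, and the orchestrator side addresses that specific node by sending a request to `node.compute.<nodeID>.AskForBid/1`. The node ID, payloads, and the use of request/reply here are illustrative assumptions; the real handlers and proxies added by this PR are under `pkg/nats/proxy/`.

```go
package main

import (
	"fmt"
	"log"
	"time"

	"github.com/nats-io/nats.go"
)

func main() {
	// Example compute node ID taken from the description above.
	const nodeID = "QmUg1MoAUMEbpzg6jY45zpfNo2dmTBJS6CzZCTMbniYsvC"

	// Connect to any Orchestrator's embedded NATS server.
	nc, err := nats.Connect("nats://127.0.0.1:4222")
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Close()

	// Compute side: handle every message addressed to this node. The final
	// subject token carries the message type and version, e.g. "AskForBid/1".
	_, err = nc.Subscribe(fmt.Sprintf("node.compute.%s.*", nodeID), func(m *nats.Msg) {
		log.Printf("compute node received %s", m.Subject)
		_ = m.Respond([]byte(`{"bid":"accepted"}`)) // hypothetical response payload
	})
	if err != nil {
		log.Fatal(err)
	}
	if err := nc.Flush(); err != nil { // ensure the subscription is registered
		log.Fatal(err)
	}

	// Orchestrator side: ask this specific node for a bid and wait for the reply.
	reply, err := nc.Request(
		fmt.Sprintf("node.compute.%s.AskForBid/1", nodeID),
		[]byte(`{"job":"..."}`), // hypothetical request payload
		5*time.Second,
	)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("orchestrator received reply: %s", reply.Data)
}
```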
---
 cmd/cli/agent/node_test.go | 4 +-
 cmd/cli/devstack/devstack.go | 4 +-
 cmd/cli/get/get_test.go | 12 +-
 cmd/cli/list/list_test.go | 18 +-
 cmd/cli/serve/serve.go | 297 ++++++++++++------
 cmd/cli/serve/util.go | 32 +-
 cmd/util/flags/configflags/network.go | 54 ++++
 cmd/util/flags/types.go | 1 -
 go.mod | 8 +-
 go.sum | 17 +-
 go.work.sum | 16 +-
 ops/terraform/dev.tfvars | 7 +-
 ops/terraform/main.tf | 8 +
 .../remote_files/scripts/install-node.sh | 6 +-
 .../remote_files/scripts/start-bacalhau.sh | 124 ++++----
 ops/terraform/variables.tf | 6 +
 pkg/authn/challenge/authenticator_test.go | 8 +-
 pkg/authn/challenge/marshaller.go | 20 ++
 pkg/authn/challenge/marshaller_test.go | 29 ++
 pkg/compute/endpoint.go | 8 +-
 ...nfo_provider.go => node_info_decorator.go} | 16 +-
 pkg/compute/store/boltdb/store.go | 2 +-
 pkg/config/configenv/dev.go | 8 +
 pkg/config/configenv/local.go | 8 +
 pkg/config/configenv/production.go | 8 +
 pkg/config/configenv/staging.go | 8 +
 pkg/config/configenv/test.go | 8 +
 pkg/config/types/generated_constants.go | 9 +
 pkg/config/types/generated_viper_defaults.go | 19 +-
 pkg/config/types/node.go | 17 +
 pkg/devstack/devstack.go | 188 ++++++-----
 pkg/devstack/option.go | 10 +-
 pkg/lib/validate/general.go | 18 ++
 pkg/lib/validate/general_test.go | 38 +++
 pkg/lib/validate/numbers.go | 15 +
 pkg/lib/validate/numbers_test.go | 34 ++
 pkg/lib/validate/util.go | 14 +
 pkg/lib/validate/util_test.go | 29 ++
 pkg/libp2p/host.go | 3 +
 pkg/libp2p/info_decorator.go | 38 +++
 pkg/libp2p/transport/libp2p.go | 217 +++++++++++++
 pkg/models/constants.go | 12 +-
 pkg/models/migration/legacy/from.go | 1 -
 pkg/models/migration/legacy/to.go | 9 +-
 pkg/models/node_info.go | 27 +-
 pkg/nats/client.go | 61 ++++
 pkg/nats/logger.go | 51 +++
 pkg/nats/proxy/callback_handler.go | 82 +++++
 pkg/nats/proxy/callback_proxy.go | 74 +++++
 pkg/nats/proxy/compute_handler.go | 118 +++++++
 pkg/nats/proxy/compute_proxy.go | 94 ++++++
 pkg/nats/proxy/constants.go | 35 +++
 pkg/nats/proxy/types.go | 30 ++
 pkg/nats/pubsub/pubsub.go | 117 +++++++
 pkg/nats/pubsub/pubsub_test.go | 131 ++++++++
 pkg/nats/server.go | 80 +++++
 pkg/nats/transport/nats.go | 229 ++++++++++++++
 pkg/nats/util.go | 41 +++
 pkg/node/compute.go | 116 ++++---
 pkg/node/config_network.go | 45 +++
 pkg/node/factories.go | 6 +-
 pkg/node/node.go | 239 +++++++-------
 pkg/node/requester.go | 105 ++-----
 pkg/orchestrator/scheduler/batch_job_test.go | 21 +-
 pkg/orchestrator/scheduler/daemon_job_test.go | 11 +-
 pkg/orchestrator/scheduler/ops_job_test.go | 11 +-
 .../scheduler/service_job_test.go | 21 +-
 pkg/orchestrator/scheduler/utils_test.go | 14 +-
 .../selection/discovery/chained.go | 7 +-
 .../selection/discovery/chained_test.go | 7 +-
 .../selection/discovery/store_test.go | 9 +-
 pkg/orchestrator/selection/ranking/chain.go | 13 +-
 .../selection/ranking/chain_test.go | 7 +-
 .../selection/ranking/features_test.go | 13 +-
 .../selection/ranking/max_usage_test.go | 7 +-
 .../selection/ranking/min_version_test.go | 5 +-
 .../selection/ranking/random_test.go | 3 +-
 .../selection/ranking/utils_test.go | 3 +-
 pkg/orchestrator/transformer/job.go | 6 +-
 pkg/orchestrator/types.go | 2 +-
 pkg/publicapi/endpoint/agent/endpoint.go | 40 ++-
 pkg/publicapi/endpoint/orchestrator/node.go | 2 +-
 pkg/publicapi/endpoint/shared/endpoint.go | 23 --
 pkg/publicapi/test/agent_test.go | 9 +-
 pkg/publicapi/test/requester_server_test.go | 4 +-
 pkg/publicapi/test/util_test.go | 25 +-
 pkg/pubsub/libp2p/pubsub.go | 4 +-
 pkg/repo/fs.go | 2 +-
pkg/repo/version.go | 5 + pkg/requester/endpoint.go | 3 +- pkg/requester/jobtransform/requester_info.go | 5 +- pkg/routing/inmemory/inmemory.go | 4 +- pkg/routing/inmemory/inmemory_test.go | 10 +- pkg/routing/node_info_provider.go | 49 ++- pkg/test/compute/resourcelimits_test.go | 4 +- pkg/test/compute/setup_test.go | 25 +- pkg/test/logstream/stream_address_test.go | 6 +- pkg/test/requester/node_selection_test.go | 6 +- pkg/test/teststack/stack.go | 2 +- pkg/transport/interfaces.go | 34 ++ 100 files changed, 2725 insertions(+), 786 deletions(-) create mode 100644 cmd/util/flags/configflags/network.go create mode 100644 pkg/authn/challenge/marshaller.go create mode 100644 pkg/authn/challenge/marshaller_test.go rename pkg/compute/{node_info_provider.go => node_info_decorator.go} (77%) create mode 100644 pkg/lib/validate/general.go create mode 100644 pkg/lib/validate/general_test.go create mode 100644 pkg/lib/validate/numbers.go create mode 100644 pkg/lib/validate/numbers_test.go create mode 100644 pkg/lib/validate/util.go create mode 100644 pkg/lib/validate/util_test.go create mode 100644 pkg/libp2p/info_decorator.go create mode 100644 pkg/libp2p/transport/libp2p.go create mode 100644 pkg/nats/client.go create mode 100644 pkg/nats/logger.go create mode 100644 pkg/nats/proxy/callback_handler.go create mode 100644 pkg/nats/proxy/callback_proxy.go create mode 100644 pkg/nats/proxy/compute_handler.go create mode 100644 pkg/nats/proxy/compute_proxy.go create mode 100644 pkg/nats/proxy/constants.go create mode 100644 pkg/nats/proxy/types.go create mode 100644 pkg/nats/pubsub/pubsub.go create mode 100644 pkg/nats/pubsub/pubsub_test.go create mode 100644 pkg/nats/server.go create mode 100644 pkg/nats/transport/nats.go create mode 100644 pkg/nats/util.go create mode 100644 pkg/node/config_network.go create mode 100644 pkg/transport/interfaces.go diff --git a/cmd/cli/agent/node_test.go b/cmd/cli/agent/node_test.go index b6250fa663..78f1d7c1d9 100644 --- a/cmd/cli/agent/node_test.go +++ b/cmd/cli/agent/node_test.go @@ -33,7 +33,7 @@ func (s *NodeSuite) TestNodeJSONOutput() { nodeInfo := &models.NodeInfo{} err = marshaller.JSONUnmarshalWithMax([]byte(out), &nodeInfo) s.Require().NoError(err, "Could not unmarshall the output into json - %+v", err) - s.Require().Equal(s.Node.Host.ID(), nodeInfo.PeerInfo.ID, "Node ID does not match in json.") + s.Require().Equal(s.Node.ID, nodeInfo.ID(), "Node ID does not match in json.") } func (s *NodeSuite) TestNodeYAMLOutput() { @@ -46,5 +46,5 @@ func (s *NodeSuite) TestNodeYAMLOutput() { nodeInfo := &models.NodeInfo{} err = marshaller.YAMLUnmarshalWithMax([]byte(out), &nodeInfo) s.Require().NoError(err, "Could not unmarshall the output into yaml - %+v", err) - s.Require().Equal(s.Node.Host.ID(), nodeInfo.PeerInfo.ID, "Node ID does not match in yaml.") + s.Require().Equal(s.Node.ID, nodeInfo.ID(), "Node ID does not match in yaml.") } diff --git a/cmd/cli/devstack/devstack.go b/cmd/cli/devstack/devstack.go index cb198205cf..2ad4254e2a 100644 --- a/cmd/cli/devstack/devstack.go +++ b/cmd/cli/devstack/devstack.go @@ -147,7 +147,9 @@ func NewCmd() *cobra.Command { &ODs.ConfigurationRepo, "stack-repo", ODs.ConfigurationRepo, "Folder to act as the devstack configuration repo", ) - + devstackCmd.PersistentFlags().StringVar( + &ODs.NetworkType, "network", ODs.NetworkType, + "Type of inter-node network layer. e.g. 
nats and libp2p") return devstackCmd } diff --git a/cmd/cli/get/get_test.go b/cmd/cli/get/get_test.go index 3dded99b3b..9f1c2a4531 100644 --- a/cmd/cli/get/get_test.go +++ b/cmd/cli/get/get_test.go @@ -136,7 +136,7 @@ func (s *GetSuite) TestDockerRunWriteToJobFolderAutoDownload() { _, runOutput, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(runOutput) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID outputFolder := filepath.Join(tempDir, util.GetDefaultJobFolder(jobID)) testDownloadOutput(s.T(), runOutput, jobID, tempDir) testResultsFolderStructure(s.T(), outputFolder, hostID, nil) @@ -157,7 +157,7 @@ func (s *GetSuite) TestDockerRunWriteToJobFolderNamedDownload() { _, runOutput, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(runOutput) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID testDownloadOutput(s.T(), runOutput, jobID, tempDir) testResultsFolderStructure(s.T(), tempDir, hostID, nil) } @@ -177,7 +177,7 @@ func (s *GetSuite) TestGetWriteToJobFolderAutoDownload() { _, out, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, @@ -224,7 +224,7 @@ func (s *GetSuite) TestGetSingleFileFromOutput() { _, out, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, @@ -250,7 +250,7 @@ func (s *GetSuite) TestGetSingleNestedFileFromOutput() { _, out, err := cmdtesting.ExecuteTestCobraCommand(args...) 
require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, @@ -288,7 +288,7 @@ func (s *GetSuite) TestGetWriteToJobFolderNamedDownload() { require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, diff --git a/cmd/cli/list/list_test.go b/cmd/cli/list/list_test.go index a3ca33b8e7..0b22115d46 100644 --- a/cmd/cli/list/list_test.go +++ b/cmd/cli/list/list_test.go @@ -38,6 +38,15 @@ func TestListSuite(t *testing.T) { suite.Run(t, new(ListSuite)) } +func (suite *ListSuite) setupRun() { + // have to create a fresh node for each test case to avoid jobs of different runs to be mixed up + suite.TearDownTest() + // Clear the repo that was created by the previous run so a fresh one is created + // TODO: find a better solution to set the repo path for tests in pkg/setup/setup.go:49 instead of env vars to avoid such hacks + suite.T().Setenv("BACALHAU_DIR", "") + suite.SetupTest() +} + func (suite *ListSuite) TestList_NumberOfJobs() { tests := []struct { numberOfJobs int @@ -167,9 +176,7 @@ func (suite *ListSuite) TestList_AnnotationFilter() { for _, tc := range testCases { suite.Run(tc.Name, func() { ctx := context.Background() - // have to create a fresh node for each test case to avoid jobs of different runs to be mixed up - suite.TearDownTest() - suite.SetupTest() + suite.setupRun() testJob := testutils.MakeJobWithOpts(suite.T(), jobutils.WithAnnotations(tc.JobLabels...), @@ -257,10 +264,7 @@ func (suite *ListSuite) TestList_SortFlags() { for _, sortFlags := range sortFlagsToTest { suite.Run(fmt.Sprintf("%+v/%+v", tc, sortFlags), func() { ctx := context.Background() - - // have to create a fresh node for each test case to avoid jobs of different runs to be mixed up - suite.TearDownTest() - suite.SetupTest() + suite.setupRun() var jobIDs []string for i := 0; i < tc.numberOfJobs; i++ { diff --git a/cmd/cli/serve/serve.go b/cmd/cli/serve/serve.go index 9e0ffbc18a..943a3c31c7 100644 --- a/cmd/cli/serve/serve.go +++ b/cmd/cli/serve/serve.go @@ -1,25 +1,29 @@ package serve import ( + "context" "fmt" "os" "sort" "strings" "time" - "github.com/multiformats/go-multiaddr" - "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/cmd/util/flags/configflags" "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/config/types" bac_libp2p "github.com/bacalhau-project/bacalhau/pkg/libp2p" + "github.com/bacalhau-project/bacalhau/pkg/libp2p/rcmgr" "github.com/bacalhau-project/bacalhau/pkg/logger" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/node" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/system" "github.com/bacalhau-project/bacalhau/pkg/util/templates" "github.com/bacalhau-project/bacalhau/webui" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/multiformats/go-multiaddr" "github.com/spf13/cobra" "k8s.io/kubectl/pkg/util/i18n" @@ -92,6 +96,7 @@ func NewCmd() *cobra.Command { serveFlags := map[string][]configflags.Definition{ "requester-tls": configflags.RequesterTLSFlags, "server-api": 
configflags.ServerAPIFlags, + "network": configflags.NetworkFlags, "libp2p": configflags.Libp2pFlags, "ipfs": configflags.IPFSFlags, "capacity": configflags.CapacityFlags, @@ -173,40 +178,41 @@ func serve(cmd *cobra.Command) error { return err } - // configure node type - isRequesterNode, isComputeNode, err := getNodeType() + nodeID, err := getNodeID() if err != nil { return err } + ctx = logger.ContextWithNodeIDLogger(ctx, nodeID) - libp2pCfg, err := config.GetLibp2pConfig() + // configure node type + isRequesterNode, isComputeNode, err := getNodeType() if err != nil { return err } - peers, err := GetPeers(libp2pCfg.PeerConnect) + // Establishing IPFS connection + ipfsConfig, err := getIPFSConfig() if err != nil { return err } - // configure libp2p - libp2pHost, err := setupLibp2pHost(libp2pCfg) + ipfsClient, err := SetupIPFSClient(ctx, cm, ipfsConfig) if err != nil { return err } - cm.RegisterCallback(libp2pHost.Close) - // add nodeID to logging context - ctx = logger.ContextWithNodeIDLogger(ctx, libp2pHost.ID().String()) - // Establishing IPFS connection - ipfsConfig, err := getIPFSConfig() + networkConfig, err := getNetworkConfig() if err != nil { return err } - ipfsClient, err := SetupIPFSClient(ctx, cm, ipfsConfig) - if err != nil { - return err + if networkConfig.Type == models.NetworkTypeLibp2p { + libp2pHost, peers, err := setupLibp2p() + if err != nil { + return err + } + networkConfig.Libp2pHost = libp2pHost + networkConfig.ClusterPeers = peers } computeConfig, err := GetComputeConfig() @@ -233,9 +239,9 @@ func serve(cmd *cobra.Command) error { // Create node config from cmd arguments nodeConfig := node.NodeConfig{ + NodeID: nodeID, CleanupManager: cm, IPFSClient: ipfsClient, - Host: libp2pHost, DisabledFeatures: featureConfig, HostAddress: config.ServerAPIHost(), APIPort: config.ServerAPIPort(), @@ -247,6 +253,7 @@ func serve(cmd *cobra.Command) error { AllowListedLocalPaths: allowedListLocalPaths, FsRepo: fsRepo, NodeInfoStoreTTL: nodeInfoStoreTTL, + NetworkConfig: networkConfig, } if isRequesterNode { @@ -266,12 +273,6 @@ func serve(cmd *cobra.Command) error { return fmt.Errorf("error creating node: %w", err) } - // Start transport layer - err = bac_libp2p.ConnectToPeersContinuously(ctx, cm, libp2pHost, peers) - if err != nil { - return err - } - // Start node if err := standardNode.Start(ctx); err != nil { return fmt.Errorf("error starting node: %w", err) @@ -308,95 +309,211 @@ func serve(cmd *cobra.Command) error { cmd.Printf("API: %s\n", standardNode.APIServer.GetURI().JoinPath("/api/v1/compute/debug")) } - if ipfsConfig.PrivateInternal && libp2pCfg.PeerConnect == DefaultPeerConnect { - if isComputeNode && !isRequesterNode { - cmd.Println("Make sure there's at least one requester node in your network.") - } + connectCmd, err := buildConnectCommand(ctx, &nodeConfig, ipfsConfig) + if err != nil { + return err + } + cmd.Println() + cmd.Println(connectCmd) - ipfsAddresses, err := ipfsClient.SwarmMultiAddresses(ctx) - if err != nil { - return fmt.Errorf("error looking up IPFS addresses: %w", err) - } + envVars, err := buildEnvVariables(ctx, &nodeConfig, ipfsConfig) + if err != nil { + return err + } + cmd.Println() + cmd.Println("To connect to this node from the client, run the following commands in your shell:") + cmd.Println(envVars) + + ripath, err := fsRepo.WriteRunInfo(ctx, envVars) + if err != nil { + return fmt.Errorf("writing run info to repo: %w", err) + } else { + cmd.Printf("A copy of these variables have been written to: %s\n", ripath) + } + if err != nil { + return 
err + } - p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + libp2pHost.ID().String()) + cm.RegisterCallback(func() error { + return os.Remove(ripath) + }) + + <-ctx.Done() // block until killed + return nil +} + +func setupLibp2p() (libp2pHost host.Host, peers []string, err error) { + defer func() { if err != nil { - return err + err = fmt.Errorf("failed to setup libp2p node. %w", err) } + }() + libp2pCfg, err := config.GetLibp2pConfig() + if err != nil { + return + } + + privKey, err := config.GetLibp2pPrivKey() + if err != nil { + return + } - peerAddress := pickP2pAddress(libp2pHost.Addrs()).Encapsulate(p2pAddr).String() - ipfsSwarmAddress := pickP2pAddress(ipfsAddresses).String() + libp2pHost, err = bac_libp2p.NewHost(libp2pCfg.SwarmPort, privKey, rcmgr.DefaultResourceManager) + if err != nil { + return + } - sb := strings.Builder{} - sb.WriteString("\n") - sb.WriteString("To connect another node to this private one, run the following command in your shell:\n") + peersAddrs, err := GetPeers(libp2pCfg.PeerConnect) + if err != nil { + return + } + peers = make([]string, len(peersAddrs)) + for i, p := range peersAddrs { + peers[i] = p.String() + } + return +} - sb.WriteString(fmt.Sprintf("%s serve ", os.Args[0])) +func getNodeID() (string, error) { + // for now, use libp2p host ID as node ID, regardless of using NATS or Libp2p + // TODO: allow users to specify node ID + privKey, err := config.GetLibp2pPrivKey() + if err != nil { + return "", err + } + peerID, err := peer.IDFromPrivateKey(privKey) + if err != nil { + return "", err + } + return peerID.String(), nil +} + +func buildConnectCommand(ctx context.Context, nodeConfig *node.NodeConfig, ipfsConfig types.IpfsConfig) (string, error) { + headerB := strings.Builder{} + cmdB := strings.Builder{} + if nodeConfig.IsRequesterNode { + cmdB.WriteString(fmt.Sprintf("%s serve ", os.Args[0])) // other nodes can be just compute nodes // no need to spawn 1+ requester nodes - sb.WriteString(fmt.Sprintf("%s=compute ", + cmdB.WriteString(fmt.Sprintf("%s=compute ", configflags.FlagNameForKey(types.NodeType, configflags.NodeTypeFlags...))) - sb.WriteString(fmt.Sprintf("%s ", - configflags.FlagNameForKey(types.NodeIPFSPrivateInternal, configflags.IPFSFlags...))) + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeNetworkType, configflags.NetworkFlags...), + nodeConfig.NetworkConfig.Type)) - sb.WriteString(fmt.Sprintf("%s=%s ", - configflags.FlagNameForKey(types.NodeLibp2pPeerConnect, configflags.Libp2pFlags...), - peerAddress, - )) - sb.WriteString(fmt.Sprintf("%s=%s ", - configflags.FlagNameForKey(types.NodeIPFSSwarmAddresses, configflags.IPFSFlags...), - ipfsSwarmAddress, - )) - cmd.Println(sb.String()) + switch nodeConfig.NetworkConfig.Type { + case models.NetworkTypeNATS: + advertisedAddr := nodeConfig.NetworkConfig.AdvertisedAddress + if advertisedAddr == "" { + advertisedAddr = fmt.Sprintf("127.0.0.1:%d", nodeConfig.NetworkConfig.Port) + } - summaryBuilder := strings.Builder{} - summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%s\n", - config.KeyAsEnvVar(types.NodeIPFSSwarmAddresses), - ipfsSwarmAddress, - )) - summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%s\n", - config.KeyAsEnvVar(types.NodeClientAPIHost), - config.ServerAPIHost(), - )) - summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%d\n", - config.KeyAsEnvVar(types.NodeClientAPIPort), - config.ServerAPIPort(), - )) - summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%s\n", - config.KeyAsEnvVar(types.NodeLibp2pPeerConnect), - peerAddress, - )) + 
headerB.WriteString("To connect a compute node to this orchestrator, run the following command in your shell:\n") + + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeNetworkOrchestrators, configflags.NetworkFlags...), + advertisedAddr, + )) - // Just convenience below - print out the last of the nodes information as the global variable - summaryShellVariablesString := summaryBuilder.String() + case models.NetworkTypeLibp2p: + headerB.WriteString("To connect another node to this one, run the following command in your shell:\n") - if isRequesterNode { - cmd.Println() - cmd.Println("To use this requester node from the client, run the following commands in your shell:") - cmd.Println(summaryShellVariablesString) + p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodeConfig.NetworkConfig.Libp2pHost.ID().String()) + if err != nil { + return "", err + } + peerAddress := pickP2pAddress(nodeConfig.NetworkConfig.Libp2pHost.Addrs()).Encapsulate(p2pAddr).String() + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeLibp2pPeerConnect, configflags.Libp2pFlags...), + peerAddress, + )) } - ripath, err := fsRepo.WriteRunInfo(ctx, summaryShellVariablesString) - if err != nil { - return fmt.Errorf("writing run info to repo: %w", err) - } else { - cmd.Printf("A copy of these variables have been written to: %s\n", ripath) + if ipfsConfig.PrivateInternal { + ipfsAddresses, err := nodeConfig.IPFSClient.SwarmMultiAddresses(ctx) + if err != nil { + return "", fmt.Errorf("error looking up IPFS addresses: %w", err) + } + + cmdB.WriteString(fmt.Sprintf("%s ", + configflags.FlagNameForKey(types.NodeIPFSPrivateInternal, configflags.IPFSFlags...))) + + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeIPFSSwarmAddresses, configflags.IPFSFlags...), + pickP2pAddress(ipfsAddresses).String(), + )) } - if err != nil { - return err + } else { + if nodeConfig.NetworkConfig.Type == models.NetworkTypeLibp2p { + headerB.WriteString("Make sure there's at least one requester node in your network.") + } + } + + return headerB.String() + cmdB.String(), nil +} + +func buildEnvVariables(ctx context.Context, nodeConfig *node.NodeConfig, ipfsConfig types.IpfsConfig) (string, error) { + // build shell variables to connect to this node + envVarBuilder := strings.Builder{} + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeClientAPIHost), + config.ServerAPIHost(), + )) + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%d\n", + config.KeyAsEnvVar(types.NodeClientAPIPort), + config.ServerAPIPort(), + )) + + if nodeConfig.IsRequesterNode { + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeNetworkType), nodeConfig.NetworkConfig.Type, + )) + + switch nodeConfig.NetworkConfig.Type { + case models.NetworkTypeNATS: + advertisedAddr := nodeConfig.NetworkConfig.AdvertisedAddress + if advertisedAddr == "" { + advertisedAddr = fmt.Sprintf("127.0.0.1:%d", nodeConfig.NetworkConfig.Port) + } + + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeNetworkOrchestrators), + advertisedAddr, + )) + case models.NetworkTypeLibp2p: + p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodeConfig.NetworkConfig.Libp2pHost.ID().String()) + if err != nil { + return "", err + } + peerAddress := pickP2pAddress(nodeConfig.NetworkConfig.Libp2pHost.Addrs()).Encapsulate(p2pAddr).String() + + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + 
config.KeyAsEnvVar(types.NodeLibp2pPeerConnect), + peerAddress, + )) } - cm.RegisterCallback(func() error { - return os.Remove(ripath) - }) + if ipfsConfig.PrivateInternal { + ipfsAddresses, err := nodeConfig.IPFSClient.SwarmMultiAddresses(ctx) + if err != nil { + return "", fmt.Errorf("error looking up IPFS addresses: %w", err) + } + + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeIPFSSwarmAddresses), + pickP2pAddress(ipfsAddresses).String(), + )) + } } - <-ctx.Done() // block until killed - return nil + return envVarBuilder.String(), nil } // pickP2pAddress will aim to select a non-localhost IPv4 TCP address, or at least a non-localhost IPv6 one, from a list diff --git a/cmd/cli/serve/util.go b/cmd/cli/serve/util.go index e6f213aec0..ae63b933e1 100644 --- a/cmd/cli/serve/util.go +++ b/cmd/cli/serve/util.go @@ -5,7 +5,6 @@ import ( "fmt" "time" - "github.com/libp2p/go-libp2p/core/host" "github.com/rs/zerolog/log" "github.com/spf13/viper" "go.uber.org/multierr" @@ -17,8 +16,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/config/types" "github.com/bacalhau-project/bacalhau/pkg/ipfs" - bac_libp2p "github.com/bacalhau-project/bacalhau/pkg/libp2p" - "github.com/bacalhau-project/bacalhau/pkg/libp2p/rcmgr" "github.com/bacalhau-project/bacalhau/pkg/node" "github.com/bacalhau-project/bacalhau/pkg/system" ) @@ -112,18 +109,6 @@ func getNodeType() (requester, compute bool, err error) { return } -func setupLibp2pHost(cfg types.Libp2pConfig) (host.Host, error) { - privKey, err := config.GetLibp2pPrivKey() - if err != nil { - return nil, err - } - libp2pHost, err := bac_libp2p.NewHost(cfg.SwarmPort, privKey, rcmgr.DefaultResourceManager) - if err != nil { - return nil, fmt.Errorf("error creating libp2p host: %w", err) - } - return libp2pHost, nil -} - func getIPFSConfig() (types.IpfsConfig, error) { var ipfsConfig types.IpfsConfig if err := config.ForKey(types.NodeIPFS, &ipfsConfig); err != nil { @@ -188,3 +173,20 @@ func getDisabledFeatures() (node.FeatureConfig, error) { func getAllowListedLocalPathsConfig() []string { return viper.GetStringSlice(types.NodeAllowListedLocalPaths) } + +func getNetworkConfig() (node.NetworkConfig, error) { + var networkCfg types.NetworkConfig + if err := config.ForKey(types.NodeNetwork, &networkCfg); err != nil { + return node.NetworkConfig{}, err + } + return node.NetworkConfig{ + Type: networkCfg.Type, + Port: networkCfg.Port, + AdvertisedAddress: networkCfg.AdvertisedAddress, + Orchestrators: networkCfg.Orchestrators, + ClusterName: networkCfg.Cluster.Name, + ClusterPort: networkCfg.Cluster.Port, + ClusterAdvertisedAddress: networkCfg.Cluster.AdvertisedAddress, + ClusterPeers: networkCfg.Cluster.Peers, + }, nil +} diff --git a/cmd/util/flags/configflags/network.go b/cmd/util/flags/configflags/network.go new file mode 100644 index 0000000000..1d3c2a68b6 --- /dev/null +++ b/cmd/util/flags/configflags/network.go @@ -0,0 +1,54 @@ +package configflags + +import "github.com/bacalhau-project/bacalhau/pkg/config/types" + +var NetworkFlags = []Definition{ + { + FlagName: "network", + ConfigPath: types.NodeNetworkType, + DefaultValue: Default.Node.Network.Type, + Description: `Inter-node network layer type (e.g. nats, libp2p).`, + }, + { + FlagName: "network-port", + ConfigPath: types.NodeNetworkPort, + DefaultValue: Default.Node.Network.Port, + Description: `Port to listen for connections from other nodes. 
Applies to orchestrator nodes.`, + }, + { + FlagName: "orchestrators", + ConfigPath: types.NodeNetworkOrchestrators, + DefaultValue: Default.Node.Network.Orchestrators, + Description: `Comma-separated list of orchestrators to connect to. Applies to compute nodes.`, + }, + { + FlagName: "advertised-address", + ConfigPath: types.NodeNetworkAdvertisedAddress, + DefaultValue: Default.Node.Network.AdvertisedAddress, + Description: `Address to advertise to compute nodes to connect to.`, + }, + { + FlagName: "cluster-name", + ConfigPath: types.NodeNetworkClusterName, + DefaultValue: Default.Node.Network.Cluster.Name, + Description: `Name of the cluster to join.`, + }, + { + FlagName: "cluster-port", + ConfigPath: types.NodeNetworkClusterPort, + DefaultValue: Default.Node.Network.Cluster.Port, + Description: `Port to listen for connections from other orchestrators to form a cluster.`, + }, + { + FlagName: "cluster-advertised-address", + ConfigPath: types.NodeNetworkClusterAdvertisedAddress, + DefaultValue: Default.Node.Network.Cluster.AdvertisedAddress, + Description: `Address to advertise to other orchestrators to connect to.`, + }, + { + FlagName: "cluster-peers", + ConfigPath: types.NodeNetworkClusterPeers, + DefaultValue: Default.Node.Network.Cluster.Peers, + Description: `Comma-separated list of other orchestrators to connect to to form a cluster.`, + }, +} diff --git a/cmd/util/flags/types.go b/cmd/util/flags/types.go index f52d52604d..4408470c73 100644 --- a/cmd/util/flags/types.go +++ b/cmd/util/flags/types.go @@ -7,7 +7,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/bidstrategy/semantic" "github.com/bacalhau-project/bacalhau/pkg/config/types" - "github.com/spf13/pflag" "golang.org/x/exp/slices" diff --git a/go.mod b/go.mod index b0b4895f0b..3ac505c7e5 100644 --- a/go.mod +++ b/go.mod @@ -48,6 +48,8 @@ require ( github.com/multiformats/go-multiaddr v0.9.0 github.com/multiformats/go-multicodec v0.9.0 github.com/multiformats/go-multihash v0.2.3 + github.com/nats-io/nats-server/v2 v2.10.7 + github.com/nats-io/nats.go v1.31.0 github.com/open-policy-agent/opa v0.60.0 github.com/opencontainers/image-spec v1.1.0-rc5 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 @@ -148,8 +150,12 @@ require ( github.com/lestrrat-go/iter v1.0.2 // indirect github.com/lestrrat-go/option v1.0.1 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect + github.com/minio/highwayhash v1.0.2 // indirect github.com/mitchellh/go-testing-interface v1.0.0 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + github.com/nats-io/jwt/v2 v2.5.3 // indirect + github.com/nats-io/nkeys v0.4.6 // indirect + github.com/nats-io/nuid v1.0.1 // indirect github.com/oklog/run v1.0.0 // indirect github.com/pelletier/go-toml/v2 v2.0.8 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect @@ -262,7 +268,7 @@ require ( github.com/jbenet/go-temp-err-catcher v0.1.0 // indirect github.com/jbenet/goprocess v0.1.4 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.16.5 // indirect + github.com/klauspost/compress v1.17.4 // indirect github.com/klauspost/cpuid/v2 v2.2.5 // indirect github.com/koron/go-ssdp v0.0.4 // indirect github.com/libp2p/go-buffer-pool v0.1.0 // indirect diff --git a/go.sum b/go.sum index 82b4919d35..7efaabd9da 100644 --- a/go.sum +++ b/go.sum @@ -700,8 +700,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod 
h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kkdai/bstream v0.0.0-20161212061736-f391b8402d23/go.mod h1:J+Gs4SYgM6CZQHDETBtE9HaSEkGmuNXF86RwHhHUvq4= github.com/klauspost/compress v1.10.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= -github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= -github.com/klauspost/compress v1.16.5/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4= +github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= @@ -855,6 +855,8 @@ github.com/mikioh/tcpinfo v0.0.0-20190314235526-30a79bb1804b/go.mod h1:lxPUiZwKo github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc h1:PTfri+PuQmWDqERdnNMiD9ZejrlswWrCpBEZgWOiTrc= github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc/go.mod h1:cGKTAVKx4SxOuR/czcZ/E2RSJ3sfHs8FpHhQ5CWMf9s= github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1/go.mod h1:pD8RvIylQ358TN4wwqatJ8rNavkEINozVn9DtGI3dfQ= +github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= +github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16/go.mod h1:2FMWW+8GMoPweT6+pI63m9YE3Lmw4J71hV56Chs1E/U= github.com/minio/sha256-simd v0.1.1-0.20190913151208-6de447530771/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM= github.com/minio/sha256-simd v0.1.1/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM= @@ -933,6 +935,16 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/nats-io/jwt/v2 v2.5.3 h1:/9SWvzc6hTfamcgXJ3uYRpgj+QuY2aLNqRiqrKcrpEo= +github.com/nats-io/jwt/v2 v2.5.3/go.mod h1:iysuPemFcc7p4IoYots3IuELSI4EDe9Y0bQMe+I3Bf4= +github.com/nats-io/nats-server/v2 v2.10.7 h1:f5VDy+GMu7JyuFA0Fef+6TfulfCs5nBTgq7MMkFJx5Y= +github.com/nats-io/nats-server/v2 v2.10.7/go.mod h1:V2JHOvPiPdtfDXTuEUsthUnCvSDeFrK4Xn9hRo6du7c= +github.com/nats-io/nats.go v1.31.0 h1:/WFBHEc/dOKBF6qf1TZhrdEfTmOZ5JzdJ+Y3m6Y/p7E= +github.com/nats-io/nats.go v1.31.0/go.mod h1:di3Bm5MLsoB4Bx61CBTsxuarI36WbhAwOm8QrW39+i8= +github.com/nats-io/nkeys v0.4.6 h1:IzVe95ru2CT6ta874rt9saQRkWfe2nFj1NtvYSLqMzY= +github.com/nats-io/nkeys v0.4.6/go.mod h1:4DxZNzenSVd1cYQoAa8948QY3QDjrHfcfVADymtkpts= +github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= +github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJEU3ofeGjhHklVoIGuVj85JJwZ6kWPaJwCIxgnFmo= github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab/go.mod h1:Qr6/a/Q4r9LP1IltGz7tA7iOK1WonHEYhu1HRBA7ZiM= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= @@ -1471,6 +1483,7 @@ 
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20181029174526-d69651ed3497/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190219092855-153ac476189d/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= diff --git a/go.work.sum b/go.work.sum index ee29128933..f8a4445a9b 100644 --- a/go.work.sum +++ b/go.work.sum @@ -1075,7 +1075,6 @@ github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec h1:EdRZT3IeKQmfCSrgo8SZ8V3MEnskuJP0wCYNpe+aiXo= github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec/go.mod h1:jMjuTZXRI4dUb/I5gc9Hdhagfvm9+RyrPryS/auMzxE= github.com/client9/misspell v0.3.4 h1:ta993UF76GwbvJcIo3Y68y/M3WxlpEHPWIGDkJYwzJI= -github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 h1:F1EaeKL/ta07PY/k9Os/UFtwERei2/XzGemhpGnBKNg= github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58/go.mod h1:EOBUe0h4xcZ5GoxqC5SDxFQ8gwyZPKQoEzownBlhI80= github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4 h1:hzAQntlaYRkVSFEfj9OTWlVV1H155FMD8BTKktLv0QI= @@ -1158,7 +1157,6 @@ github.com/danieljoos/wincred v1.1.2/go.mod h1:GijpziifJoIBfYh+S7BbkdUTU4LfM+QnG github.com/daviddengcn/go-colortext v1.0.0 h1:ANqDyC0ys6qCSvuEK7l3g5RaehL/Xck9EX8ATG8oKsE= github.com/daviddengcn/go-colortext v1.0.0/go.mod h1:zDqEI5NVUop5QPpVJUxE9UO10hRnmkD5G4Pmri9+m4c= github.com/davidlazar/go-crypto v0.0.0-20170701192655-dcfb0a7ac018/go.mod h1:rQYf4tfk5sSwFsnDg3qYaBxSjsD9S8+59vW0dKUgme4= -github.com/decred/dcrd/crypto/blake256 v1.0.1/go.mod h1:2OfgNZ5wDpcsFmHmCK5gZTPcCXqlm2ArzUIkw9czNJo= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0/go.mod h1:DZGJHZMqrU4JJqFAWUS2UO1+lbSKsdiOoYi9Zzey7Fc= github.com/decred/dcrd/lru v1.0.0 h1:Kbsb1SFDsIlaupWPwsPp+dkxiBY1frcS07PCPgotKz8= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= @@ -1291,7 +1289,6 @@ github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07 h1:OTlfMvwR1rLyf9go github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9 h1:NxXI5pTAtpEaU49bpLpQoDsu1zrteW/vxzTz8Cd2UAs= github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9/go.mod h1:gWuR/CrFDDeVRFQwHPvsv9soJVB/iqymhuZQuJ3a9OM= github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs= github.com/go-openapi/jsonreference v0.19.6/go.mod h1:diGHMEHg2IqXZGKxqyvWdfWU/aim5Dprw5bqpKkTvns= github.com/go-openapi/spec v0.20.4/go.mod h1:faYFR1CvsJZ0mNsmsphTMSoRrNV3TEDoAM7FOEWeq8I= github.com/go-openapi/spec v0.20.6/go.mod h1:2OpW+JddWPrpXSCIX8eOx7lZ5iyuWj3RYR6VaaBKcWA= @@ -2149,8 +2146,6 @@ github.com/nats-io/nats.go v1.9.1/go.mod h1:ZjDU1L/7fJ09jvUSRVBR2e7+RnLiiIQyqyzE github.com/nats-io/nkeys v0.1.0/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= 
github.com/nats-io/nkeys v0.1.3 h1:6JrEfig+HzTH85yxzhSVbjHRJv9cn0p6n3IngIcM5/k= github.com/nats-io/nkeys v0.1.3/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= -github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= -github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/ncw/swift v1.0.47 h1:4DQRPj35Y41WogBxyhOXlrI37nzGlyEcsforeudyYPQ= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86 h1:D6paGObi5Wud7xg83MaEFyjxQB1W5bz5d0IFppr+ymk= github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab h1:eFXv9Nu1lGbrNbj619aWwZfVF5HBrm9Plte8aNptuTI= @@ -2189,6 +2184,7 @@ github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3ev github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/op/go-logging v0.0.0-20160315200505-970db520ece7 h1:lDH9UUVJtmYCjyT0CI4q8xvlXPxeZ0gYCVvWbmPlp88= github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk= +github.com/opencontainers/image-spec v1.0.2/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= github.com/opencontainers/runc v1.1.0 h1:O9+X96OcDjkmmZyfaG996kV7yq8HsoU2h1XRRQcefG8= github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39 h1:H7DMc6FAjgwZZi8BRqjrAAHWoqEr5e5L6pS4V0ezet4= github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= @@ -2241,6 +2237,8 @@ github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndr github.com/posener/complete v1.2.3 h1:NP0eAhjcjImqslEwo/1hq7gpajME0fTLTezBKDqfXqo= github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s= github.com/pquerna/cachecontrol v0.0.0-20171018203845-0dec1b30a021 h1:0XM1XL/OFFJjXsYXlG30spTkV/E9+gmd5GD1w2HE8xM= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= github.com/prometheus/client_golang v1.3.0/go.mod h1:hJaj2vgQTGQmVCsAACORcieXFeDPbaTKGT+JTgUa3og= @@ -2365,7 +2363,6 @@ github.com/streadway/handy v0.0.0-20190108123426-d5acb3125c2a h1:AhmOdSHeswKHBjh github.com/streadway/handy v0.0.0-20190108123426-d5acb3125c2a/go.mod h1:qNTQ5P5JnDBl6z3cMAg/SywNDC5ABu5ApDIw6lUbRmI= github.com/stvp/go-udp-testing v0.0.0-20201019212854-469649b16807 h1:LUsDduamlucuNnWcaTbXQ6aLILFcLXADpOzeEH3U+OI= github.com/swaggo/swag v1.8.1/go.mod h1:ugemnJsPZm/kRwFUnzBlbHRd0JY9zE1M4F+uy2pAaPQ= -github.com/swaggo/swag v1.16.2/go.mod h1:6YzXnDcpr0767iOejs318CwYkCQqyGer6BizOg03f+E= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07 h1:UyzmZLoiDWMRywV4DUYb9Fbt8uiOSooupjTq10vpvnU= github.com/tchap/go-patricia v2.2.6+incompatible h1:JvoDL7JSoIP2HDE8AbDH3zC8QBPxmzYe32HHy5yQ+Ck= @@ -2445,7 +2442,6 @@ gitlab.com/nyarla/go-crypt v0.0.0-20160106005555-d9a5dc2b789b/go.mod h1:T3BPAOm2 go.etcd.io/bbolt v1.3.3 h1:MUGmc65QhB3pIlaQ5bB4LwqSj6GIonVJXpZiaKNyaKk= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.6 h1:/ecaJf0sk1l4l6V4awd65v2C3ILy7MSj+s/x1ADCIMU= -go.etcd.io/bbolt v1.3.8/go.mod 
h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw= go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738 h1:VcrIfasaLFkyjk6KNlXQSzO+B0fZcnECiDrKJsfxka0= go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= go.etcd.io/etcd v0.5.0-alpha.5.0.20200910180754-dd1b699fc489 h1:1JFLBqwIgdyHN1ZtgjTBwO+blA6gVOmZurpiMEsETKo= @@ -2596,7 +2592,6 @@ golang.org/x/net v0.0.0-20220923203811-8be639271d50/go.mod h1:YDH+HFinaLZZlnHAfS golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= golang.org/x/net v0.13.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= @@ -2613,7 +2608,7 @@ golang.org/x/perf v0.0.0-20180704124530-6e6d33e29852 h1:xYq6+9AtI+xP3M4r0N1hCkHr golang.org/x/sync v0.2.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.4.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -2638,17 +2633,14 @@ golang.org/x/sys v0.0.0-20220825204002-c680a09ffe64/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220919091848-fb04ddd9f9c8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20220722155259-a9ba230a4035/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= diff --git a/ops/terraform/dev.tfvars b/ops/terraform/dev.tfvars index 3858f58fb0..54275dc638 100644 --- a/ops/terraform/dev.tfvars +++ b/ops/terraform/dev.tfvars @@ -1,5 +1,5 @@ 
-bacalhau_version = "v1.1.3" -bacalhau_branch = "" +bacalhau_version = "" +bacalhau_branch = "nats" bacalhau_port = "1235" bacalhau_node_id_0 = "QmfYBQ3HouX9zKcANNXbgJnpyLpTYS9nKBANw6RUQKZffu" bacalhau_node_id_1 = "QmNjEQByyK8GiMTvnZqGyURuwXDCtzp9X6gJRKkpWfai7S" @@ -28,4 +28,5 @@ public_ip_addresses = ["34.86.177.175", "35.245.221.171"] num_gpu_machines = 0 log_level = "debug" otel_collector_version = "0.70.0" -otel_collector_endpoint = "http://localhost:4318" \ No newline at end of file +otel_collector_endpoint = "http://localhost:4318" +network_type = "nats" \ No newline at end of file diff --git a/ops/terraform/main.tf b/ops/terraform/main.tf index ea40f96230..ac170b5ae9 100644 --- a/ops/terraform/main.tf +++ b/ops/terraform/main.tf @@ -71,6 +71,10 @@ export GRAFANA_CLOUD_TEMPO_ENDPOINT="${var.grafana_cloud_tempo_endpoint}" export OTEL_COLLECTOR_VERSION="${var.otel_collector_version}" export OTEL_EXPORTER_OTLP_ENDPOINT="${var.otel_collector_endpoint}" export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=${terraform.workspace}" +export BACALHAU_NODE_NETWORK_TYPE=${var.network_type} +export BACALHAU_NODE_NETWORK_ORCHESTRATORS="${var.internal_ip_addresses[0]}:4222" +export BACALHAU_NODE_NETWORK_ADVERTISEDADDRESS="${var.public_ip_addresses[count.index]}:4222" +export BACALHAU_NODE_NETWORK_CLUSTER_PEERS="${var.internal_ip_addresses[0]}:6222" ### secrets are installed in the install-node.sh script export SECRETS_GRAFANA_CLOUD_PROMETHEUS_API_KEY="${var.grafana_cloud_prometheus_api_key}" @@ -295,6 +299,8 @@ resource "google_compute_firewall" "bacalhau_ingress_firewall" { "55679", // otel collector zpages extension "44443", // nginx is healthy - for running health check scripts "44444", // nginx node health check scripts + "4222", // nats + "6222", // nats cluster ] } @@ -320,6 +326,8 @@ resource "google_compute_firewall" "bacalhau_egress_firewall" { ports = [ "4001", // ipfs swarm "1235", // bacalhau swarm + "4222", // nats + "6222", // nats cluster ] } diff --git a/ops/terraform/remote_files/scripts/install-node.sh b/ops/terraform/remote_files/scripts/install-node.sh index fec079b7f3..8f6deab21f 100644 --- a/ops/terraform/remote_files/scripts/install-node.sh +++ b/ops/terraform/remote_files/scripts/install-node.sh @@ -117,12 +117,14 @@ function install-bacalhau-from-release() { function install-bacalhau-from-source() { echo "Installing Bacalhau from branch ${BACALHAU_BRANCH}" - sudo apt-get -y install --no-install-recommends jq nodejs npm make + # make sure we have the desired version of nodejs to build webui + curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - + sudo apt-get -y install --no-install-recommends jq nodejs make git clone --branch ${BACALHAU_BRANCH} https://github.com/bacalhau-project/bacalhau.git pushd bacalhau pushd webui && npm install && popd make build-bacalhau - sudo mv ./bin/*/bacalhau /usr/local/bin/bacalhau + sudo mv ./bin/*/*/bacalhau /usr/local/bin/bacalhau popd } diff --git a/ops/terraform/remote_files/scripts/start-bacalhau.sh b/ops/terraform/remote_files/scripts/start-bacalhau.sh index 512304ec8a..53a5a1ad8e 100644 --- a/ops/terraform/remote_files/scripts/start-bacalhau.sh +++ b/ops/terraform/remote_files/scripts/start-bacalhau.sh @@ -20,61 +20,79 @@ mount /dev/sdb /data || true # import the secrets source /data/secrets.sh -function getMultiaddress() { - echo -n "/ip4/${1}/tcp/${BACALHAU_PORT}/p2p/${2}" -} - -# we start with none as the default ("none" prevents the node connecting to our default bootstrap list) -export CONNECT_PEER="none" - -# use 
the BACALHAU_CONNECT_PEER env var if it is set -if [[ -n "${BACALHAU_CONNECT_PEER}" ]]; then - export CONNECT_PEER=$BACALHAU_CONNECT_PEER -# if we are node0 then we do not connect to anything -elif [[ "${TERRAFORM_NODE_INDEX}" != "0" ]]; then - # if we are in unsafe mode - then we connect to a single node and it's ID - # is pre-determined by the $BACALHAU_NODE0_UNSAFE_ID variable - if [[ -n "${BACALHAU_UNSAFE_CLUSTER}" ]]; then - export UNSAFE_NODE0_ID="$BACALHAU_NODE_ID_0" - if [[ -z "$UNSAFE_NODE0_ID" ]]; then - export UNSAFE_NODE0_ID="$BACALHAU_NODE0_UNSAFE_ID" - fi - export CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$UNSAFE_NODE0_ID") - # otherwise we will construct our connect string based on - # what node index we are - else - # we are > node0 so we can connect to node0 - export CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$BACALHAU_NODE_ID_0") - # we are > node1 so we can also connect to node1 - if [[ "${TERRAFORM_NODE_INDEX}" -ge "2" ]]; then - export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE1_IP" "$BACALHAU_NODE_ID_1")" - fi - # we are > node2 so we can also connect to node2 - if [[ "${TERRAFORM_NODE_INDEX}" -ge "3" ]]; then - export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE2_IP" "$BACALHAU_NODE_ID_2")" - fi - fi -fi - BACALHAU_PROBE_EXEC='/terraform_node/apply-http-allowlist.sh' - TRUSTED_CLIENT_IDS="\ 1df7b01ed77ca81bb6d6f06f6cbcd76a6a9e450d175dfac1e4ba70494fddd576,\ b43517b5449d383ab00ca1d2b1c558d710ba79f51c800fbf4c35ed4d0198aec5" -bacalhau serve \ - --node-type "${BACALHAU_NODE_TYPE}" \ - --job-selection-data-locality anywhere \ - --job-selection-accept-networked \ - --job-selection-probe-exec "${BACALHAU_PROBE_EXEC}" \ - --max-job-execution-timeout '60m' \ - --job-execution-timeout-bypass-client-id="${TRUSTED_CLIENT_IDS}" \ - --ipfs-swarm-addrs "" \ - --ipfs-connect /ip4/127.0.0.1/tcp/5001 \ - --swarm-port "${BACALHAU_PORT}" \ - --api-port 1234 \ - --peer "${CONNECT_PEER}" \ - --private-internal-ipfs=false \ - --web-ui "${BACALHAU_NODE_WEBUI}" \ - --labels owner=bacalhau \ - --requester-job-translation-enabled +# Check if using NATS +if [[ "${BACALHAU_NODE_NETWORK_TYPE}" == "nats" ]]; then + # nats related config as set as env vars in main.tf and no need to pass them to serve command + bacalhau serve \ + --node-type "${BACALHAU_NODE_TYPE}" \ + --job-selection-data-locality anywhere \ + --job-selection-accept-networked \ + --job-selection-probe-exec "${BACALHAU_PROBE_EXEC}" \ + --max-job-execution-timeout '60m' \ + --job-execution-timeout-bypass-client-id="${TRUSTED_CLIENT_IDS}" \ + --ipfs-swarm-addrs "" \ + --ipfs-connect /ip4/127.0.0.1/tcp/5001 \ + --api-port 1234 \ + --private-internal-ipfs=false \ + --web-ui "${BACALHAU_NODE_WEBUI}" \ + --web-ui-port 80 \ + --labels owner=bacalhau \ + --requester-job-translation-enabled + +else + function getMultiaddress() { + echo -n "/ip4/${1}/tcp/${BACALHAU_PORT}/p2p/${2}" + } + + # use the BACALHAU_CONNECT_PEER env var if it is set + if [[ -n "${BACALHAU_CONNECT_PEER}" ]]; then + export CONNECT_PEER=$BACALHAU_CONNECT_PEER + # if we are node0 then we do not connect to anything + elif [[ "${TERRAFORM_NODE_INDEX}" != "0" ]]; then + # if we are in unsafe mode - then we connect to a single node and it's ID + # is pre-determined by the $BACALHAU_NODE0_UNSAFE_ID variable + if [[ -n "${BACALHAU_UNSAFE_CLUSTER}" ]]; then + export UNSAFE_NODE0_ID="$BACALHAU_NODE_ID_0" + if [[ -z "$UNSAFE_NODE0_ID" ]]; then + export UNSAFE_NODE0_ID="$BACALHAU_NODE0_UNSAFE_ID" + fi + export 
CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$UNSAFE_NODE0_ID") + # otherwise we will construct our connect string based on + # what node index we are + else + # we are > node0 so we can connect to node0 + export CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$BACALHAU_NODE_ID_0") + # we are > node1 so we can also connect to node1 + if [[ "${TERRAFORM_NODE_INDEX}" -ge "2" ]]; then + export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE1_IP" "$BACALHAU_NODE_ID_1")" + fi + # we are > node2 so we can also connect to node2 + if [[ "${TERRAFORM_NODE_INDEX}" -ge "3" ]]; then + export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE2_IP" "$BACALHAU_NODE_ID_2")" + fi + fi + fi + + bacalhau serve \ + --node-type "${BACALHAU_NODE_TYPE}" \ + --job-selection-data-locality anywhere \ + --job-selection-accept-networked \ + --job-selection-probe-exec "${BACALHAU_PROBE_EXEC}" \ + --max-job-execution-timeout '60m' \ + --job-execution-timeout-bypass-client-id="${TRUSTED_CLIENT_IDS}" \ + --ipfs-swarm-addrs "" \ + --ipfs-connect /ip4/127.0.0.1/tcp/5001 \ + --swarm-port "${BACALHAU_PORT}" \ + --api-port 1234 \ + --peer "${CONNECT_PEER}" \ + --private-internal-ipfs=false \ + --web-ui "${BACALHAU_NODE_WEBUI}" \ + --web-ui-port 80 \ + --labels owner=bacalhau \ + --requester-job-translation-enabled +fi \ No newline at end of file diff --git a/ops/terraform/variables.tf b/ops/terraform/variables.tf index 9839690d78..5b24fd1459 100644 --- a/ops/terraform/variables.tf +++ b/ops/terraform/variables.tf @@ -230,3 +230,9 @@ variable "docker_password" { default = "" sensitive = true } + +// Use NATs for transport instead of libp2p +variable "network_type" { + type = string + default = "libp2p" +} \ No newline at end of file diff --git a/pkg/authn/challenge/authenticator_test.go b/pkg/authn/challenge/authenticator_test.go index a82c0d6380..e91c5a879c 100644 --- a/pkg/authn/challenge/authenticator_test.go +++ b/pkg/authn/challenge/authenticator_test.go @@ -18,12 +18,6 @@ import ( "github.com/stretchr/testify/require" ) -type testData []byte - -func (t testData) MarshalBinary() ([]byte, error) { - return t, nil -} - func setup(t *testing.T) authn.Authenticator { logger.ConfigureTestLogging(t) @@ -33,7 +27,7 @@ func setup(t *testing.T) authn.Authenticator { rsaKey, err := rsa.GenerateKey(rand.Reader, 2048) require.NoError(t, err) - return NewAuthenticator(anonPolicy, testData([]byte("test")), rsaKey, "node") + return NewAuthenticator(anonPolicy, NewStringMarshaller("test"), rsaKey, "node") } func try(t *testing.T, authenticator authn.Authenticator, r any) authn.Authentication { diff --git a/pkg/authn/challenge/marshaller.go b/pkg/authn/challenge/marshaller.go new file mode 100644 index 0000000000..1f1e9a318d --- /dev/null +++ b/pkg/authn/challenge/marshaller.go @@ -0,0 +1,20 @@ +package challenge + +// StringMarshaller is a struct that implements the encoding.BinaryMarshaler interface for strings. +// It holds a string value that can be marshaled into a byte slice. +type StringMarshaller struct { + Input string +} + +// NewStringMarshaller returns a pointer to a new StringMarshaller initialized with the given input string. +// This function is typically used to prepare a string for binary marshaling. +func NewStringMarshaller(input string) *StringMarshaller { + return &StringMarshaller{Input: input} +} + +// MarshalBinary implements the encoding.BinaryMarshaler interface. +// It converts the string held by StringMarshaller into a slice of bytes. 
+// As string to byte conversion in Go is straightforward and error-free, this method returns nil for the error. +func (m *StringMarshaller) MarshalBinary() ([]byte, error) { + return []byte(m.Input), nil +} diff --git a/pkg/authn/challenge/marshaller_test.go b/pkg/authn/challenge/marshaller_test.go new file mode 100644 index 0000000000..4e252219f0 --- /dev/null +++ b/pkg/authn/challenge/marshaller_test.go @@ -0,0 +1,29 @@ +//go:build unit || !integration + +package challenge + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +// TestStringMarshaller_MarshalBinary tests the MarshalBinary method of StringMarshaller. +func TestStringMarshaller_MarshalBinary(t *testing.T) { + testCases := []struct { + input string + }{ + {"hello"}, + {""}, + {"12345"}, + } + + for _, tc := range testCases { + m := NewStringMarshaller(tc.input) + marshaled, err := m.MarshalBinary() + require.NoError(t, err, "MarshalBinary() with input %s returned an unexpected error", tc.input) + + // Manually unmarshal and compare with the original input + require.Equal(t, []byte(tc.input), marshaled, "MarshalBinary() with input %s returned an unexpected byte slice", tc.input) + } +} diff --git a/pkg/compute/endpoint.go b/pkg/compute/endpoint.go index 3134b3c979..4085bee950 100644 --- a/pkg/compute/endpoint.go +++ b/pkg/compute/endpoint.go @@ -19,7 +19,7 @@ type BaseEndpointParams struct { UsageCalculator capacity.UsageCalculator Bidder Bidder Executor Executor - LogServer logstream.LogStreamServer + LogServer *logstream.LogStreamServer } // Base implementation of Endpoint @@ -29,7 +29,7 @@ type BaseEndpoint struct { usageCalculator capacity.UsageCalculator bidder Bidder executor Executor - logServer logstream.LogStreamServer + logServer *logstream.LogStreamServer } func NewBaseEndpoint(params BaseEndpointParams) BaseEndpoint { @@ -142,6 +142,10 @@ func (s BaseEndpoint) CancelExecution(ctx context.Context, request CancelExecuti func (s BaseEndpoint) ExecutionLogs(ctx context.Context, request ExecutionLogsRequest) (ExecutionLogsResponse, error) { log.Ctx(ctx).Debug().Msgf("processing log request for %s", request.ExecutionID) + // TODO: remove this once we support log streaming with nats + if s.logServer == nil { + return ExecutionLogsResponse{}, fmt.Errorf("log server not configured") + } execution, err := s.executionStore.GetExecution(ctx, request.ExecutionID) if err != nil { return ExecutionLogsResponse{}, err diff --git a/pkg/compute/node_info_provider.go b/pkg/compute/node_info_decorator.go similarity index 77% rename from pkg/compute/node_info_provider.go rename to pkg/compute/node_info_decorator.go index c24abc9e3a..fe848c482b 100644 --- a/pkg/compute/node_info_provider.go +++ b/pkg/compute/node_info_decorator.go @@ -10,7 +10,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/storage" ) -type NodeInfoProviderParams struct { +type NodeInfoDecoratorParams struct { Executors executor.ExecutorProvider Publisher publisher.PublisherProvider Storages storage.StorageProvider @@ -19,7 +19,7 @@ type NodeInfoProviderParams struct { MaxJobRequirements models.Resources } -type NodeInfoProvider struct { +type NodeInfoDecorator struct { executors executor.ExecutorProvider publishers publisher.PublisherProvider storages storage.StorageProvider @@ -28,8 +28,8 @@ type NodeInfoProvider struct { maxJobRequirements models.Resources } -func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider { - return &NodeInfoProvider{ +func NewNodeInfoDecorator(params NodeInfoDecoratorParams) *NodeInfoDecorator { + 
return &NodeInfoDecorator{ executors: params.Executors, publishers: params.Publisher, storages: params.Storages, @@ -39,8 +39,9 @@ func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider { } } -func (n *NodeInfoProvider) GetComputeInfo(ctx context.Context) models.ComputeNodeInfo { - return models.ComputeNodeInfo{ +func (n *NodeInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo models.NodeInfo) models.NodeInfo { + nodeInfo.NodeType = models.NodeTypeCompute + nodeInfo.ComputeNodeInfo = &models.ComputeNodeInfo{ ExecutionEngines: n.executors.Keys(ctx), Publishers: n.publishers.Keys(ctx), StorageSources: n.storages.Keys(ctx), @@ -50,7 +51,8 @@ func (n *NodeInfoProvider) GetComputeInfo(ctx context.Context) models.ComputeNod RunningExecutions: len(n.executorBuffer.RunningExecutions()), EnqueuedExecutions: n.executorBuffer.EnqueuedExecutionsCount(), } + return nodeInfo } // compile-time interface check -var _ models.ComputeNodeInfoProvider = &NodeInfoProvider{} +var _ models.NodeInfoDecorator = &NodeInfoDecorator{} diff --git a/pkg/compute/store/boltdb/store.go b/pkg/compute/store/boltdb/store.go index bc49781f7c..3c3e4d82c1 100644 --- a/pkg/compute/store/boltdb/store.go +++ b/pkg/compute/store/boltdb/store.go @@ -83,7 +83,7 @@ func NewStore(ctx context.Context, dbPath string) (*Store, error) { starting: sync.WaitGroup{}, stateCounter: NewStateCounter(), } - log.Ctx(ctx).Info().Msg("creating new bbolt database") + log.Ctx(ctx).Info().Msgf("creating new bbolt database at %s", dbPath) database, err := GetDatabase(dbPath) if err != nil { diff --git a/pkg/config/configenv/dev.go b/pkg/config/configenv/dev.go index ebca3f2824..f9ad62089c 100644 --- a/pkg/config/configenv/dev.go +++ b/pkg/config/configenv/dev.go @@ -30,6 +30,14 @@ var Development = types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/34.86.177.175/tcp/1235/p2p/QmfYBQ3HouX9zKcANNXbgJnpyLpTYS9nKBANw6RUQKZffu", "/ip4/35.245.221.171/tcp/1235/p2p/QmNjEQByyK8GiMTvnZqGyURuwXDCtzp9X6gJRKkpWfai7S", diff --git a/pkg/config/configenv/local.go b/pkg/config/configenv/local.go index 8b8f50ad83..54617bd9b9 100644 --- a/pkg/config/configenv/local.go +++ b/pkg/config/configenv/local.go @@ -30,6 +30,14 @@ var Local = types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{}, DownloadURLRequestTimeout: types.Duration(300 * time.Second), VolumeSizeRequestTimeout: types.Duration(2 * time.Minute), diff --git a/pkg/config/configenv/production.go b/pkg/config/configenv/production.go index 604f22b1bc..77ed8d4b19 100644 --- a/pkg/config/configenv/production.go +++ b/pkg/config/configenv/production.go @@ -31,6 +31,14 @@ var Production = types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/35.245.161.250/tcp/1235/p2p/QmbxGSsM6saCTyKkiWSxhJCt6Fgj7M9cns1vzYtfDbB5Ws", "/ip4/34.86.254.26/tcp/1235/p2p/QmeXjeQDinxm7zRiEo8ekrJdbs7585BM6j7ZeLVFrA7GPe", diff --git a/pkg/config/configenv/staging.go b/pkg/config/configenv/staging.go 
index 3a2d05b020..8ae1e7a56e 100644 --- a/pkg/config/configenv/staging.go +++ b/pkg/config/configenv/staging.go @@ -31,6 +31,14 @@ var Staging = types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/34.85.228.65/tcp/1235/p2p/QmafZ9oCXCJZX9Wt1nhrGS9FVVq41qhcBRSNWCkVhz3Nvv", "/ip4/34.86.73.105/tcp/1235/p2p/QmVHCeiLzhFJPCyCj5S1RTAk1vBEvxd8r5A6E4HyJGQtbJ", diff --git a/pkg/config/configenv/test.go b/pkg/config/configenv/test.go index b5611f3831..8fa4ccc7de 100644 --- a/pkg/config/configenv/test.go +++ b/pkg/config/configenv/test.go @@ -30,6 +30,14 @@ var Testing = types.BacalhauConfig{ Port: 9999, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/0.0.0.0/tcp/1235/p2p/QmcWJnVXJ82DKJq8ED79LADR4ZBTnwgTK7yn6JQbNVMbbC", }, diff --git a/pkg/config/types/generated_constants.go b/pkg/config/types/generated_constants.go index 89d75afd81..55b23a3896 100644 --- a/pkg/config/types/generated_constants.go +++ b/pkg/config/types/generated_constants.go @@ -27,6 +27,15 @@ const NodeServerAPITLSAutoCert = "Node.ServerAPI.TLS.AutoCert" const NodeServerAPITLSAutoCertCachePath = "Node.ServerAPI.TLS.AutoCertCachePath" const NodeServerAPITLSServerCertificate = "Node.ServerAPI.TLS.ServerCertificate" const NodeServerAPITLSServerKey = "Node.ServerAPI.TLS.ServerKey" +const NodeNetwork = "Node.Network" +const NodeNetworkType = "Node.Network.Type" +const NodeNetworkPort = "Node.Network.Port" +const NodeNetworkAdvertisedAddress = "Node.Network.AdvertisedAddress" +const NodeNetworkOrchestrators = "Node.Network.Orchestrators" +const NodeNetworkClusterName = "Node.Network.Cluster.Name" +const NodeNetworkClusterPort = "Node.Network.Cluster.Port" +const NodeNetworkClusterAdvertisedAddress = "Node.Network.Cluster.AdvertisedAddress" +const NodeNetworkClusterPeers = "Node.Network.Cluster.Peers" const NodeLibp2p = "Node.Libp2p" const NodeLibp2pSwarmPort = "Node.Libp2p.SwarmPort" const NodeLibp2pPeerConnect = "Node.Libp2p.PeerConnect" diff --git a/pkg/config/types/generated_viper_defaults.go b/pkg/config/types/generated_viper_defaults.go index cf96b1b3c8..c529e8295f 100644 --- a/pkg/config/types/generated_viper_defaults.go +++ b/pkg/config/types/generated_viper_defaults.go @@ -1,4 +1,3 @@ - // CODE GENERATED BY pkg/config/types/gen_viper DO NOT EDIT package types @@ -50,6 +49,15 @@ func SetDefaults(cfg BacalhauConfig, opts ...SetOption) { p.Viper.SetDefault(NodeServerAPITLSAutoCertCachePath, cfg.Node.ServerAPI.TLS.AutoCertCachePath) p.Viper.SetDefault(NodeServerAPITLSServerCertificate, cfg.Node.ServerAPI.TLS.ServerCertificate) p.Viper.SetDefault(NodeServerAPITLSServerKey, cfg.Node.ServerAPI.TLS.ServerKey) + p.Viper.SetDefault(NodeNetwork, cfg.Node.Network) + p.Viper.SetDefault(NodeNetworkType, cfg.Node.Network.Type) + p.Viper.SetDefault(NodeNetworkPort, cfg.Node.Network.Port) + p.Viper.SetDefault(NodeNetworkAdvertisedAddress, cfg.Node.Network.AdvertisedAddress) + p.Viper.SetDefault(NodeNetworkOrchestrators, cfg.Node.Network.Orchestrators) + p.Viper.SetDefault(NodeNetworkClusterName, cfg.Node.Network.Cluster.Name) + p.Viper.SetDefault(NodeNetworkClusterPort, cfg.Node.Network.Cluster.Port) + p.Viper.SetDefault(NodeNetworkClusterAdvertisedAddress, 
cfg.Node.Network.Cluster.AdvertisedAddress) + p.Viper.SetDefault(NodeNetworkClusterPeers, cfg.Node.Network.Cluster.Peers) p.Viper.SetDefault(NodeLibp2p, cfg.Node.Libp2p) p.Viper.SetDefault(NodeLibp2pSwarmPort, cfg.Node.Libp2p.SwarmPort) p.Viper.SetDefault(NodeLibp2pPeerConnect, cfg.Node.Libp2p.PeerConnect) @@ -205,6 +213,15 @@ func Set(cfg BacalhauConfig, opts ...SetOption) { p.Viper.Set(NodeServerAPITLSAutoCertCachePath, cfg.Node.ServerAPI.TLS.AutoCertCachePath) p.Viper.Set(NodeServerAPITLSServerCertificate, cfg.Node.ServerAPI.TLS.ServerCertificate) p.Viper.Set(NodeServerAPITLSServerKey, cfg.Node.ServerAPI.TLS.ServerKey) + p.Viper.Set(NodeNetwork, cfg.Node.Network) + p.Viper.Set(NodeNetworkType, cfg.Node.Network.Type) + p.Viper.Set(NodeNetworkPort, cfg.Node.Network.Port) + p.Viper.Set(NodeNetworkAdvertisedAddress, cfg.Node.Network.AdvertisedAddress) + p.Viper.Set(NodeNetworkOrchestrators, cfg.Node.Network.Orchestrators) + p.Viper.Set(NodeNetworkClusterName, cfg.Node.Network.Cluster.Name) + p.Viper.Set(NodeNetworkClusterPort, cfg.Node.Network.Cluster.Port) + p.Viper.Set(NodeNetworkClusterAdvertisedAddress, cfg.Node.Network.Cluster.AdvertisedAddress) + p.Viper.Set(NodeNetworkClusterPeers, cfg.Node.Network.Cluster.Peers) p.Viper.Set(NodeLibp2p, cfg.Node.Libp2p) p.Viper.Set(NodeLibp2pSwarmPort, cfg.Node.Libp2p.SwarmPort) p.Viper.Set(NodeLibp2pPeerConnect, cfg.Node.Libp2p.PeerConnect) diff --git a/pkg/config/types/node.go b/pkg/config/types/node.go index b06071beb4..6bc6d28728 100644 --- a/pkg/config/types/node.go +++ b/pkg/config/types/node.go @@ -42,6 +42,8 @@ type NodeConfig struct { // Configuration for the web UI WebUI WebUIConfig `yaml:"WebUI"` + Network NetworkConfig `yaml:"Network"` + StrictVersionMatch bool `yaml:"StrictVersionMatch"` } @@ -138,3 +140,18 @@ type FeatureConfig struct { Publishers []string `yaml:"Publishers"` Storages []string `yaml:"Storages"` } + +type NetworkConfig struct { + Type string `yaml:"Type"` + Port int `yaml:"Port"` + AdvertisedAddress string `yaml:"AdvertisedAddress"` + Orchestrators []string `yaml:"Orchestrators"` + Cluster NetworkClusterConfig `yaml:"Cluster"` +} + +type NetworkClusterConfig struct { + Name string `yaml:"Name"` + Port int `yaml:"Port"` + AdvertisedAddress string `yaml:"AdvertisedAddress"` + Peers []string `yaml:"Peers"` +} diff --git a/pkg/devstack/devstack.go b/pkg/devstack/devstack.go index 8746987758..9b7078a1f9 100644 --- a/pkg/devstack/devstack.go +++ b/pkg/devstack/devstack.go @@ -5,9 +5,11 @@ import ( "fmt" "os" "strings" - "time" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/util/multiaddresses" "github.com/imdario/mergo" + "github.com/libp2p/go-libp2p/core/host" "github.com/multiformats/go-multiaddr" "github.com/phayes/freeport" "github.com/rs/zerolog/log" @@ -17,12 +19,10 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/ipfs" bac_libp2p "github.com/bacalhau-project/bacalhau/pkg/libp2p" "github.com/bacalhau-project/bacalhau/pkg/logger" + "github.com/bacalhau-project/bacalhau/pkg/node" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/system" - "github.com/bacalhau-project/bacalhau/pkg/util/multiaddresses" - - "github.com/bacalhau-project/bacalhau/pkg/node" ) const ( @@ -44,10 +44,11 @@ type DevStackOptions struct { NodeInfoPublisherInterval routing.NodeInfoPublisherIntervalConfig ExecutorPlugins bool // when true pluggable executors will be used. 
ConfigurationRepo string // A custom config repo + NetworkType string } func (o *DevStackOptions) Options() []ConfigOption { - return []ConfigOption{ + opts := []ConfigOption{ WithNumberOfHybridNodes(o.NumberOfHybridNodes), WithNumberOfRequesterOnlyNodes(o.NumberOfRequesterOnlyNodes), WithNumberOfComputeOnlyNodes(o.NumberOfComputeOnlyNodes), @@ -61,7 +62,9 @@ func (o *DevStackOptions) Options() []ConfigOption { WithAllowListedLocalPaths(o.AllowListedLocalPaths), WithNodeInfoPublisherInterval(o.NodeInfoPublisherInterval), WithExecutorPlugins(o.ExecutorPlugins), + WithNetworkType(o.NetworkType), } + return opts } type DevStack struct { @@ -93,6 +96,8 @@ func Setup( defer span.End() var nodes []*node.Node + orchestratorAddrs := make([]string, 0) + clusterPeersAddrs := make([]string, 0) totalNodeCount := stackConfig.NumberOfHybridNodes + stackConfig.NumberOfRequesterOnlyNodes + stackConfig.NumberOfComputeOnlyNodes requesterNodeCount := stackConfig.NumberOfHybridNodes + stackConfig.NumberOfRequesterOnlyNodes @@ -101,7 +106,20 @@ func Setup( if requesterNodeCount == 0 { return nil, fmt.Errorf("at least one requester node is required") } + + // Enable testing using different network stacks by setting env variable + if stackConfig.NetworkType == "" { + networkType, ok := os.LookupEnv("BACALHAU_NODE_NETWORK_TYPE") + if !ok { + networkType = models.NetworkTypeLibp2p + } + stackConfig.NetworkType = networkType + } + for i := 0; i < totalNodeCount; i++ { + nodeID := fmt.Sprintf("node-%d", i) + ctx = logger.ContextWithNodeIDLogger(ctx, nodeID) + isRequesterNode := i < requesterNodeCount isComputeNode := (totalNodeCount - i) <= computeNodeCount log.Ctx(ctx).Debug().Msgf(`Creating Node #%d as {RequesterNode: %t, ComputeNode: %t}`, i+1, isRequesterNode, isComputeNode) @@ -128,57 +146,66 @@ func Setup( } // //////////////////////////////////// - // libp2p + // Transport layer (NATS or Libp2p) // //////////////////////////////////// - var libp2pPeer []multiaddr.Multiaddr - libp2pPort, err := freeport.GetFreePort() - if err != nil { - return nil, err + var swarmPort int + if os.Getenv("PREDICTABLE_API_PORT") != "" { + const startSwarmPort = 4222 // 4222 is the default NATS port + swarmPort = startSwarmPort + i + } else { + swarmPort, err = freeport.GetFreePort() + if err != nil { + return nil, err + } + } + clusterConfig := node.NetworkConfig{ + Type: stackConfig.NetworkType, + Orchestrators: orchestratorAddrs, + Port: swarmPort, + ClusterPeers: clusterPeersAddrs, } - if i == 0 { - if stackConfig.Peer != "" { - // connect 0'th node to external peer if specified - log.Ctx(ctx).Debug().Msgf("Connecting 0'th node to remote peer: %s", stackConfig.Peer) - peerAddr, addrErr := multiaddr.NewMultiaddr(stackConfig.Peer) - if addrErr != nil { - return nil, fmt.Errorf("failed to parse peer address: %w", addrErr) + if stackConfig.NetworkType == models.NetworkTypeNATS { + var clusterPort int + if os.Getenv("PREDICTABLE_API_PORT") != "" { + const startClusterPort = 6222 + clusterPort = startClusterPort + i + } else { + clusterPort, err = freeport.GetFreePort() + if err != nil { + return nil, err } - libp2pPeer = append(libp2pPeer, peerAddr) + } + + if isRequesterNode { + clusterConfig.ClusterName = "devstack" + clusterConfig.ClusterPort = clusterPort + orchestratorAddrs = append(orchestratorAddrs, fmt.Sprintf("127.0.0.1:%d", swarmPort)) + clusterPeersAddrs = append(clusterPeersAddrs, fmt.Sprintf("127.0.0.1:%d", clusterPort)) } } else { - p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodes[0].Host.ID().String()) + if i 
== 0 { + if stackConfig.Peer != "" { + clusterConfig.ClusterPeers = append(clusterConfig.ClusterPeers, stackConfig.Peer) + } + } else { + p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodes[0].Libp2pHost.ID().String()) + if err != nil { + return nil, err + } + addresses := multiaddresses.SortLocalhostFirst(nodes[0].Libp2pHost.Addrs()) + clusterConfig.ClusterPeers = append(clusterConfig.ClusterPeers, addresses[0].Encapsulate(p2pAddr).String()) + } + + clusterConfig.Libp2pHost, err = createLibp2pHost(ctx, cm, swarmPort) if err != nil { return nil, err } - addresses := multiaddresses.SortLocalhostFirst(nodes[0].Host.Addrs()) - // Only use a single address as libp2p seems to have concurrency issues, like two nodes not able to finish - // connecting/joining topics, when using multiple addresses for a single host. - libp2pPeer = append(libp2pPeer, addresses[0].Encapsulate(p2pAddr)) - log.Ctx(ctx).Debug().Msgf("Connecting to first libp2p requester node: %s", libp2pPeer) - } - - // TODO(forrest): [devstack] Refactor the devstack s.t. each node has its own repo and config. - // previously the config would generate a key using the host port as the postfix - // this is not longer the case as a node should have a single libp2p key, but since - // all devstack nodes share a repo we will get a self dial error if we use the same - // key from the config for each devstack node. The solution here is to refactor the - // the devstack such that all nodes in the stack have their own repos and configuration - // rather than rely on global values and one off key gen via the config. - // Creates a new RSA key pair for this host. - privKey, err := bac_libp2p.GeneratePrivateKey(DefaultLibp2pKeySize) - if err != nil { - return nil, err + // nodeID must match the libp2p host ID + nodeID = clusterConfig.Libp2pHost.ID().String() + ctx = logger.ContextWithNodeIDLogger(ctx, nodeID) } - libp2pHost, err := bac_libp2p.NewHost(libp2pPort, privKey) - if err != nil { - return nil, err - } - cm.RegisterCallback(libp2pHost.Close) - - // add NodeID to logging context - ctx = logger.ContextWithNodeIDLogger(ctx, libp2pHost.ID().String()) // //////////////////////////////////// // port for API @@ -212,9 +239,9 @@ func Setup( } nodeConfig := node.NodeConfig{ + NodeID: nodeID, IPFSClient: ipfsNode.Client(), CleanupManager: cm, - Host: libp2pHost, HostAddress: "0.0.0.0", APIPort: apiPort, ComputeConfig: stackConfig.ComputeConfig, @@ -222,8 +249,8 @@ func Setup( IsComputeNode: isComputeNode, IsRequesterNode: isRequesterNode, Labels: map[string]string{ + "id": nodeID, "name": fmt.Sprintf("node-%d", i), - "id": libp2pHost.ID().String(), "env": "devstack", }, DependencyInjector: stackConfig.NodeDependencyInjector, @@ -232,6 +259,7 @@ func Setup( NodeInfoPublisherInterval: nodeInfoPublisherInterval, FsRepo: fsRepo, NodeInfoStoreTTL: stackConfig.NodeInfoStoreTTL, + NetworkConfig: clusterConfig, } if isRequesterNode && stackConfig.TLS.Certificate != "" && stackConfig.TLS.Key != "" { @@ -257,12 +285,6 @@ func Setup( return nil, err } - // Start transport layer - err = bac_libp2p.ConnectToPeersContinuouslyWithRetryDuration(ctx, cm, libp2pHost, libp2pPeer, 2*time.Second) - if err != nil { - return nil, err - } - // start the node err = n.Start(ctx) if err != nil { @@ -284,6 +306,30 @@ func Setup( }, nil } +func createLibp2pHost(ctx context.Context, cm *system.CleanupManager, port int) (host.Host, error) { + var err error + + // TODO(forrest): [devstack] Refactor the devstack s.t. each node has its own repo and config. 
+ // previously the config would generate a key using the host port as the postfix + // this is not longer the case as a node should have a single libp2p key, but since + // all devstack nodes share a repo we will get a self dial error if we use the same + // key from the config for each devstack node. The solution here is to refactor the + // the devstack such that all nodes in the stack have their own repos and configuration + // rather than rely on global values and one off key gen via the config. + + privKey, err := bac_libp2p.GeneratePrivateKey(DefaultLibp2pKeySize) + if err != nil { + return nil, err + } + + libp2pHost, err := bac_libp2p.NewHost(port, privKey) + if err != nil { + return nil, fmt.Errorf("error creating libp2p host: %w", err) + } + + return libp2pHost, nil +} + func createIPFSNode(ctx context.Context, cm *system.CleanupManager, publicIPFSMode bool, @@ -326,34 +372,8 @@ func (stack *DevStack) PrintNodeInfo(ctx context.Context, fsRepo *repo.FsRepo, c swarmAddrrs = strings.Join(swarmAddresses, ",") } - var libp2pPeer []string - for _, addrs := range node.Host.Addrs() { - p2pAddr, p2pAddrErr := multiaddr.NewMultiaddr("/p2p/" + node.Host.ID().String()) - if p2pAddrErr != nil { - return "", p2pAddrErr - } - libp2pPeer = append(libp2pPeer, addrs.Encapsulate(p2pAddr).String()) - } - devstackPeerAddr := strings.Join(libp2pPeer, ",") - if len(libp2pPeer) > 0 { - chosen := false - preferredAddress := config.PreferredAddress() - if preferredAddress != "" { - for _, addr := range libp2pPeer { - if strings.Contains(addr, preferredAddress) { - devstackPeerAddrs = append(devstackPeerAddrs, addr) - chosen = true - break - } - } - } - - if !chosen { - // only add one of the addrs for this peer and we will choose the first - // in the absence of a preference - devstackPeerAddrs = append(devstackPeerAddrs, libp2pPeer[0]) - } - } + peerConnect := fmt.Sprintf("/ip4/%s/tcp/%d/http", node.APIServer.Address, node.APIServer.Port) + devstackPeerAddrs = append(devstackPeerAddrs, peerConnect) logString += fmt.Sprintf(` export BACALHAU_IPFS_%d=%s @@ -366,7 +386,7 @@ export BACALHAU_API_PORT_%d=%d`, nodeIndex, swarmAddrrs, nodeIndex, - devstackPeerAddr, + peerConnect, nodeIndex, stack.Nodes[nodeIndex].APIServer.Address, nodeIndex, @@ -449,7 +469,7 @@ The above variables were also written to this file (will be deleted when devstac func (stack *DevStack) GetNode(_ context.Context, nodeID string) ( *node.Node, error) { for _, node := range stack.Nodes { - if node.Host.ID().String() == nodeID { + if node.ID == nodeID { return node, nil } } @@ -467,7 +487,7 @@ func (stack *DevStack) IPFSClients() []ipfs.Client { func (stack *DevStack) GetNodeIds() []string { var ids []string for _, node := range stack.Nodes { - ids = append(ids, node.Host.ID().String()) + ids = append(ids, node.ID) } return ids } diff --git a/pkg/devstack/option.go b/pkg/devstack/option.go index 73d2570253..b2a7213028 100644 --- a/pkg/devstack/option.go +++ b/pkg/devstack/option.go @@ -75,6 +75,7 @@ type DevStackConfig struct { ExecutorPlugins bool // when true pluggable executors will be used. NodeInfoStoreTTL time.Duration TLS DevstackTLSSettings + NetworkType string } func (o *DevStackConfig) MarshalZerologObject(e *zerolog.Event) { @@ -90,7 +91,8 @@ func (o *DevStackConfig) MarshalZerologObject(e *zerolog.Event) { Strs("AllowListedLocalPaths", o.AllowListedLocalPaths). Str("NodeInfoPublisherInterval", fmt.Sprintf("%v", o.NodeInfoPublisherInterval)). Bool("PublicIPFSMode", o.PublicIPFSMode). 
- Bool("ExecutorPlugins", o.ExecutorPlugins) + Bool("ExecutorPlugins", o.ExecutorPlugins). + Str("NetworkType", o.NetworkType) } func (o *DevStackConfig) Validate() error { @@ -220,6 +222,12 @@ func WithExecutorPlugins(enabled bool) ConfigOption { } } +func WithNetworkType(typ string) ConfigOption { + return func(cfg *DevStackConfig) { + cfg.NetworkType = typ + } +} + func WithSelfSignedCertificate(cert string, key string) ConfigOption { return func(cfg *DevStackConfig) { cfg.TLS = DevstackTLSSettings{ diff --git a/pkg/lib/validate/general.go b/pkg/lib/validate/general.go new file mode 100644 index 0000000000..9c1266e4eb --- /dev/null +++ b/pkg/lib/validate/general.go @@ -0,0 +1,18 @@ +package validate + +import "reflect" + +// IsNotNil checks if the provided value is not nil. +// Returns an error if the value is nil, using the provided message and arguments. +func IsNotNil(value any, msg string, args ...any) error { + if value == nil { + return createError(msg, args...) + } + + // Use reflection to handle cases where value is a nil pointer wrapped in an interface + val := reflect.ValueOf(value) + if val.Kind() == reflect.Ptr && val.IsNil() { + return createError(msg, args...) + } + return nil +} diff --git a/pkg/lib/validate/general_test.go b/pkg/lib/validate/general_test.go new file mode 100644 index 0000000000..bc9a4e351d --- /dev/null +++ b/pkg/lib/validate/general_test.go @@ -0,0 +1,38 @@ +//go:build unit || !integration + +package validate + +import "testing" + +// TestIsNotNil tests the IsNotNil function for various scenarios. +func TestIsNotNil(t *testing.T) { + t.Run("NilValue", func(t *testing.T) { + err := IsNotNil(nil, "value should not be nil") + if err == nil { + t.Errorf("IsNotNil failed: expected error for nil value") + } + }) + + t.Run("NonNilValue", func(t *testing.T) { + err := IsNotNil(42, "value should not be nil") + if err != nil { + t.Errorf("IsNotNil failed: unexpected error for non-nil value") + } + }) + + t.Run("NilPointer", func(t *testing.T) { + var nilPointer *int + err := IsNotNil(nilPointer, "value should not be nil") + if err == nil { + t.Errorf("IsNotNil failed: expected error for nil pointer") + } + }) + + t.Run("NonNilPointer", func(t *testing.T) { + nonNilPointer := new(int) + err := IsNotNil(nonNilPointer, "value should not be nil") + if err != nil { + t.Errorf("IsNotNil failed: unexpected error for non-nil pointer") + } + }) +} diff --git a/pkg/lib/validate/numbers.go b/pkg/lib/validate/numbers.go new file mode 100644 index 0000000000..67f3d7cd76 --- /dev/null +++ b/pkg/lib/validate/numbers.go @@ -0,0 +1,15 @@ +package validate + +import ( + "github.com/bacalhau-project/bacalhau/pkg/lib/math" +) + +// IsGreaterThanZero checks if the provided numeric value (of type T) is greater than zero. +// It returns an error if the value is not greater than zero, using the provided message and arguments. +// T is a generic type constrained to math.Number, allowing the function to work with various numeric types. +func IsGreaterThanZero[T math.Number](value T, msg string, args ...any) error { + if value <= 0 { + return createError(msg, args...) 
+ } + return nil +} diff --git a/pkg/lib/validate/numbers_test.go b/pkg/lib/validate/numbers_test.go new file mode 100644 index 0000000000..47d8722cc7 --- /dev/null +++ b/pkg/lib/validate/numbers_test.go @@ -0,0 +1,34 @@ +//go:build unit || !integration + +package validate + +import ( + "testing" +) + +func TestIsGreaterThanZero(t *testing.T) { + // Test with value less than zero + err := IsGreaterThanZero(-1, "value should be greater than zero") + if err == nil || err.Error() != "value should be greater than zero" { + t.Errorf("IsGreaterThanZero failed: expected error for value -1") + } + + // Test with zero + err = IsGreaterThanZero(0, "value should be greater than zero") + if err == nil || err.Error() != "value should be greater than zero" { + t.Errorf("IsGreaterThanZero failed: expected error for value 0") + } + + // Test with value greater than zero + err = IsGreaterThanZero(1, "value should be greater than zero") + if err != nil { + t.Errorf("IsGreaterThanZero failed: unexpected error for value 1") + } + + // Test with different numeric types + var floatValue float64 = 1.5 + err = IsGreaterThanZero(floatValue, "value should be greater than zero") + if err != nil { + t.Errorf("IsGreaterThanZero failed: unexpected error for float value %v", floatValue) + } +} diff --git a/pkg/lib/validate/util.go b/pkg/lib/validate/util.go new file mode 100644 index 0000000000..cc445e4d47 --- /dev/null +++ b/pkg/lib/validate/util.go @@ -0,0 +1,14 @@ +package validate + +import "fmt" + +// createError constructs an error with a formatted message. +// 'msg' is a format string and 'args' are the values to be formatted into the message. +func createError(msg string, args ...any) error { + if len(args) == 0 { + // If no arguments, return the message as-is. + return fmt.Errorf(msg) + } + // If arguments are provided, format the message. + return fmt.Errorf(msg, args...) 
+} diff --git a/pkg/lib/validate/util_test.go b/pkg/lib/validate/util_test.go new file mode 100644 index 0000000000..eb9d21b60e --- /dev/null +++ b/pkg/lib/validate/util_test.go @@ -0,0 +1,29 @@ +//go:build unit || !integration + +package validate + +import ( + "strings" + "testing" +) + +func TestCreateError(t *testing.T) { + // Test with no arguments + err := createError("simple error") + if err == nil || err.Error() != "simple error" { + t.Errorf("createError failed: expected 'simple error', got '%v'", err) + } + + // Test with arguments + err = createError("error with argument: %v", 42) + if err == nil || !strings.Contains(err.Error(), "42") { + t.Errorf("createError failed: expected string containing '42', got '%v'", err) + } + + // Test with multiple arguments + err = createError("error with multiple arguments: %v %s", 42, "test") + expectedMsg := "error with multiple arguments: 42 test" + if err == nil || err.Error() != expectedMsg { + t.Errorf("createError failed: expected '%s', got '%v'", expectedMsg, err) + } +} diff --git a/pkg/libp2p/host.go b/pkg/libp2p/host.go index ab9990be00..1e2dcdd9fa 100644 --- a/pkg/libp2p/host.go +++ b/pkg/libp2p/host.go @@ -78,6 +78,9 @@ func ConnectToPeersContinuouslyWithRetryDuration( peers []multiaddr.Multiaddr, tickDuration time.Duration, ) error { + if tickDuration == 0 { + tickDuration = continuouslyConnectPeersLoopDelay + } if err := connectToPeers(ctx, h, peers); err != nil { return err } diff --git a/pkg/libp2p/info_decorator.go b/pkg/libp2p/info_decorator.go new file mode 100644 index 0000000000..7c796c4960 --- /dev/null +++ b/pkg/libp2p/info_decorator.go @@ -0,0 +1,38 @@ +package libp2p + +import ( + "context" + + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/p2p/protocol/identify" +) + +type PeerInfoDecoratorParams struct { + Host host.Host + IdentityService identify.IDService +} + +type PeerInfoDecorator struct { + host host.Host + identityService identify.IDService +} + +func NewPeerInfoDecorator(params PeerInfoDecoratorParams) *PeerInfoDecorator { + return &PeerInfoDecorator{ + host: params.Host, + identityService: params.IdentityService, + } +} + +func (l *PeerInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo models.NodeInfo) models.NodeInfo { + nodeInfo.PeerInfo = &peer.AddrInfo{ + ID: l.host.ID(), + Addrs: l.identityService.OwnObservedAddrs(), + } + return nodeInfo +} + +// compile-time check whether the PeerInfoDecorator implements the PeerInfoDecorator interface. 
+var _ models.NodeInfoDecorator = (*PeerInfoDecorator)(nil) diff --git a/pkg/libp2p/transport/libp2p.go b/pkg/libp2p/transport/libp2p.go new file mode 100644 index 0000000000..b69d8d34c3 --- /dev/null +++ b/pkg/libp2p/transport/libp2p.go @@ -0,0 +1,217 @@ +package transport + +import ( + "context" + "fmt" + "time" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" + libp2p_host "github.com/bacalhau-project/bacalhau/pkg/libp2p" + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/pubsub/libp2p" + "github.com/bacalhau-project/bacalhau/pkg/routing" + "github.com/bacalhau-project/bacalhau/pkg/system" + core_transport "github.com/bacalhau-project/bacalhau/pkg/transport" + "github.com/bacalhau-project/bacalhau/pkg/transport/bprotocol" + "github.com/hashicorp/go-multierror" + libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" + "github.com/libp2p/go-libp2p/core/host" + basichost "github.com/libp2p/go-libp2p/p2p/host/basic" + routedhost "github.com/libp2p/go-libp2p/p2p/host/routed" + "github.com/libp2p/go-libp2p/p2p/protocol/identify" + "github.com/multiformats/go-multiaddr" +) + +const NodeInfoTopic = "bacalhau-node-info" + +type Libp2pTransportConfig struct { + Host host.Host + Peers []string + ReconnectDelay time.Duration + CleanupManager *system.CleanupManager +} + +func (c *Libp2pTransportConfig) Validate() error { + var mErr *multierror.Error + mErr = multierror.Append(mErr, validate.IsNotNil(c.Host, "libp2p host cannot be nil")) + mErr = multierror.Append(mErr, validate.IsNotNil(c.CleanupManager, "cleanupManager cannot be nil")) + return mErr.ErrorOrNil() +} + +type Libp2pTransport struct { + Host host.Host + computeProxy *bprotocol.ComputeProxy + callbackProxy *bprotocol.CallbackProxy + nodeInfoPubSub pubsub.PubSub[models.NodeInfo] + nodeInfoDecorator models.NodeInfoDecorator +} + +func NewLibp2pTransport(ctx context.Context, + config Libp2pTransportConfig, + nodeInfoStore routing.NodeInfoStore) (*Libp2pTransport, error) { + if err := config.Validate(); err != nil { + return nil, fmt.Errorf("error validating libp2p transport config. %w", err) + } + + // Monkey patch the identify protocol to allow discovering advertised addresses of networks of 3 or more nodes, instead of 5. + // Setting the value to 2 means two other nodes must see the same addr for a node to discover its observed addr, which enables a network + // of at least 3 nodes. 
+ identify.ActivationThresh = 2 + + libp2pHost := config.Host + + // A single gossipSub instance that will be used by all topics + gossipSub, err := newLibp2pPubSub(ctx, libp2pHost) + if err != nil { + return nil, err + } + + // PubSub to publish node info to the network + nodeInfoPubSub, err := libp2p.NewPubSub[models.NodeInfo](libp2p.PubSubParams{ + Host: libp2pHost, + TopicName: NodeInfoTopic, + PubSub: gossipSub, + }) + if err != nil { + return nil, err + } + + // node info provider + basicHost, ok := libp2pHost.(*basichost.BasicHost) + if !ok { + return nil, fmt.Errorf("host is not a basic host") + } + + peerInfoDecorator := libp2p_host.NewPeerInfoDecorator(libp2p_host.PeerInfoDecoratorParams{ + Host: basicHost, + IdentityService: basicHost.IDService(), + }) + + libp2pHost = routedhost.Wrap(libp2pHost, nodeInfoStore) + + // register consumers of node info published over gossipSub + nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) + nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](nodeInfoStore.Add)) + err = nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) + if err != nil { + return nil, err + } + + // compute proxy + computeProxy := bprotocol.NewComputeProxy(bprotocol.ComputeProxyParams{ + Host: libp2pHost, + }) + + // Callback to send compute events (i.e. requester endpoint) + computeCallback := bprotocol.NewCallbackProxy(bprotocol.CallbackProxyParams{ + Host: libp2pHost, + }) + + var libp2pPeer []multiaddr.Multiaddr + for _, addr := range config.Peers { + maddr, err := multiaddr.NewMultiaddr(addr) + if err != nil { + return nil, err + } + libp2pPeer = append(libp2pPeer, maddr) + } + + err = libp2p_host.ConnectToPeersContinuouslyWithRetryDuration( + ctx, config.CleanupManager, libp2pHost, libp2pPeer, config.ReconnectDelay) + if err != nil { + return nil, err + } + + return &Libp2pTransport{ + Host: libp2pHost, + computeProxy: computeProxy, + callbackProxy: computeCallback, + nodeInfoPubSub: nodeInfoPubSub, + nodeInfoDecorator: peerInfoDecorator, + }, nil +} + +// RegisterComputeCallback registers a compute callback with the transport layer. +func (t *Libp2pTransport) RegisterComputeCallback(callback compute.Callback) error { + bprotocol.NewCallbackHandler(bprotocol.CallbackHandlerParams{ + Host: t.Host, + Callback: callback, + }) + // To enable nodes self-dialing themselves as libp2p doesn't support it. + t.callbackProxy.RegisterLocalComputeCallback(callback) + + return nil +} + +// RegisterComputeEndpoint registers a compute endpoint with the transport layer. +func (t *Libp2pTransport) RegisterComputeEndpoint(endpoint compute.Endpoint) error { + bprotocol.NewComputeHandler(bprotocol.ComputeHandlerParams{ + Host: t.Host, + ComputeEndpoint: endpoint, + }) + // To enable nodes self-dialing themselves as libp2p doesn't support it. + t.computeProxy.RegisterLocalComputeEndpoint(endpoint) + + return nil +} + +// ComputeProxy returns the compute proxy. +func (t *Libp2pTransport) ComputeProxy() compute.Endpoint { + return t.computeProxy +} + +// CallbackProxy returns the callback proxy. +func (t *Libp2pTransport) CallbackProxy() compute.Callback { + return t.callbackProxy +} + +// NodeInfoPubSub returns the node info pubsub. +func (t *Libp2pTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] { + return t.nodeInfoPubSub +} + +// NodeInfoDecorator returns the node info decorator. +func (t *Libp2pTransport) NodeInfoDecorator() models.NodeInfoDecorator { + return t.nodeInfoDecorator +} + +// DebugInfoProviders returns the debug info. 
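+// The libp2p transport currently exposes no transport-level debug info providers, so this returns an empty slice.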
+func (t *Libp2pTransport) DebugInfoProviders() []model.DebugInfoProvider { + return []model.DebugInfoProvider{} +} + +// Close closes the transport layer. +func (t *Libp2pTransport) Close(ctx context.Context) error { + var errors *multierror.Error + errors = multierror.Append(errors, t.nodeInfoPubSub.Close(ctx)) + errors = multierror.Append(errors, t.Host.Close()) + return errors.ErrorOrNil() +} + +func newLibp2pPubSub(ctx context.Context, host host.Host) (*libp2p_pubsub.PubSub, error) { + tracer, err := libp2p_pubsub.NewJSONTracer(pkgconfig.GetLibp2pTracerPath()) + if err != nil { + return nil, err + } + + pgParams := libp2p_pubsub.NewPeerGaterParams( + 0.33, //nolint:gomnd + libp2p_pubsub.ScoreParameterDecay(2*time.Minute), //nolint:gomnd + libp2p_pubsub.ScoreParameterDecay(10*time.Minute), //nolint:gomnd + ) + + return libp2p_pubsub.NewGossipSub( + ctx, + host, + libp2p_pubsub.WithPeerExchange(true), + libp2p_pubsub.WithPeerGater(pgParams), + libp2p_pubsub.WithEventTracer(tracer), + ) +} + +// compile-time interface check +var _ core_transport.TransportLayer = (*Libp2pTransport)(nil) diff --git a/pkg/models/constants.go b/pkg/models/constants.go index fac87da4cc..58822f604e 100644 --- a/pkg/models/constants.go +++ b/pkg/models/constants.go @@ -47,6 +47,11 @@ const ( PublisherS3 = "s3" ) +const ( + NetworkTypeNATS = "nats" + NetworkTypeLibp2p = "libp2p" +) + const ( DownloadFilenameStdout = "stdout" DownloadFilenameStderr = "stderr" @@ -57,10 +62,9 @@ const ( ) const ( - MetaReservedPrefix = "bacalhau.org/" - MetaRequesterID = "bacalhau.org/requester.id" - MetaRequesterPublicKey = "bacalhau.org/requester.publicKey" - MetaClientID = "bacalhau.org/client.id" + MetaReservedPrefix = "bacalhau.org/" + MetaRequesterID = "bacalhau.org/requester.id" + MetaClientID = "bacalhau.org/client.id" // Job provenance metadata used to track the origin of a job where // it may have been translated from another job. 
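The `NetworkTypeNATS` / `NetworkTypeLibp2p` constants added above are the values the devstack falls back through when no transport is configured: explicit config first, then the `BACALHAU_NODE_NETWORK_TYPE` environment variable, then the libp2p default (see the devstack `Setup` changes earlier in this patch). A minimal, self-contained sketch of that selection order, with the constants inlined here so it runs on its own rather than importing `pkg/models`:

```
package main

import (
	"fmt"
	"os"
)

// Inlined copies of the constants added in pkg/models/constants.go,
// so this sketch compiles standalone.
const (
	NetworkTypeNATS   = "nats"
	NetworkTypeLibp2p = "libp2p"
)

// resolveNetworkType mirrors the fallback order used by devstack Setup:
// explicit config first, then the BACALHAU_NODE_NETWORK_TYPE env var,
// and finally the libp2p default.
func resolveNetworkType(configured string) string {
	if configured != "" {
		return configured
	}
	if fromEnv, ok := os.LookupEnv("BACALHAU_NODE_NETWORK_TYPE"); ok {
		return fromEnv
	}
	return NetworkTypeLibp2p
}

func main() {
	fmt.Println(resolveNetworkType(""))              // "libp2p" unless the env var is set
	fmt.Println(resolveNetworkType(NetworkTypeNATS)) // "nats"
}
```

The same `BACALHAU_NODE_NETWORK_TYPE` variable is what the updated start-bacalhau.sh script checks to decide between the NATS and libp2p `serve` invocations.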
diff --git a/pkg/models/migration/legacy/from.go b/pkg/models/migration/legacy/from.go index 99e1549465..dbe5f9f16b 100644 --- a/pkg/models/migration/legacy/from.go +++ b/pkg/models/migration/legacy/from.go @@ -28,7 +28,6 @@ func FromLegacyJob(legacy *model.Job) (*models.Job, error) { metadata := make(map[string]string) metadata[models.MetaRequesterID] = legacy.Metadata.Requester.RequesterNodeID - metadata[models.MetaRequesterPublicKey] = legacy.Metadata.Requester.RequesterPublicKey.String() metadata[models.MetaClientID] = legacy.Metadata.ClientID labels := make(map[string]string) diff --git a/pkg/models/migration/legacy/to.go b/pkg/models/migration/legacy/to.go index 9dc1247d5b..ff250aa87d 100644 --- a/pkg/models/migration/legacy/to.go +++ b/pkg/models/migration/legacy/to.go @@ -10,12 +10,6 @@ import ( ) func ToLegacyJob(job *models.Job) (*model.Job, error) { - pk := new(model.PublicKey) - err := pk.UnmarshalText([]byte(job.Meta[models.MetaRequesterPublicKey])) - if err != nil { - return nil, err - } - spec, err := ToLegacyJobSpec(job) if err != nil { return nil, err @@ -28,8 +22,7 @@ func ToLegacyJob(job *models.Job) (*model.Job, error) { CreatedAt: time.Unix(0, job.CreateTime), ClientID: job.Meta[models.MetaClientID], Requester: model.JobRequester{ - RequesterNodeID: job.Meta[models.MetaRequesterID], - RequesterPublicKey: *pk, + RequesterNodeID: job.Meta[models.MetaRequesterID], }, }, Spec: *spec, diff --git a/pkg/models/node_info.go b/pkg/models/node_info.go index 7d7a72aac0..778bcc199e 100644 --- a/pkg/models/node_info.go +++ b/pkg/models/node_info.go @@ -42,10 +42,6 @@ type NodeInfoProvider interface { GetNodeInfo(ctx context.Context) NodeInfo } -type ComputeNodeInfoProvider interface { - GetComputeInfo(ctx context.Context) ComputeNodeInfo -} - type LabelsProvider interface { GetLabels(ctx context.Context) map[string]string } @@ -67,8 +63,22 @@ func MergeLabelsInOrder(providers ...LabelsProvider) LabelsProvider { return mergeProvider{providers: providers} } +type NodeInfoDecorator interface { + DecorateNodeInfo(ctx context.Context, nodeInfo NodeInfo) NodeInfo +} + +// NoopNodeInfoDecorator is a decorator that does nothing +type NoopNodeInfoDecorator struct{} + +func (n NoopNodeInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo NodeInfo) NodeInfo { + return nodeInfo +} + +// NodeInfo +// TODO: add Validate() method to NodeInfo and make sure it is called in all the places where it is initialized type NodeInfo struct { - PeerInfo peer.AddrInfo `json:"PeerInfo"` + NodeID string `json:"NodeID"` + PeerInfo *peer.AddrInfo `json:"PeerInfo,omitempty" yaml:",omitempty"` NodeType NodeType `json:"NodeType"` Labels map[string]string `json:"Labels"` ComputeNodeInfo *ComputeNodeInfo `json:"ComputeNodeInfo,omitempty" yaml:",omitempty"` @@ -77,7 +87,12 @@ type NodeInfo struct { // ID returns the node ID func (n NodeInfo) ID() string { - return n.PeerInfo.ID.String() + if n.NodeID != "" { + return n.NodeID + } else if n.PeerInfo != nil { + return n.PeerInfo.ID.String() + } + return "" } // IsComputeNode returns true if the node is a compute node diff --git a/pkg/nats/client.go b/pkg/nats/client.go new file mode 100644 index 0000000000..9cc837e334 --- /dev/null +++ b/pkg/nats/client.go @@ -0,0 +1,61 @@ +package nats + +import ( + "context" + + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/nats-io/nats.go" +) + +type ClientManagerParams struct { + Name string + Servers string +} + +type ClientManager struct { + Client *nats.Conn +} + +// NewClientManager is a helper function 
to create a NATS client connection with a given name and servers string +func NewClientManager(ctx context.Context, params ClientManagerParams) (*ClientManager, error) { + nc, err := nats.Connect(params.Servers, nats.Name(params.Name)) + if err != nil { + return nil, err + } + return &ClientManager{ + Client: nc, + }, nil +} + +// Stop stops the NATS client +func (cm *ClientManager) Stop() { + cm.Client.Close() +} + +// DebugInfo returns the debug info of the NATS client +func (cm *ClientManager) GetDebugInfo(ctx context.Context) (model.DebugInfo, error) { + stats := cm.Client.Stats() + servers := cm.Client.Servers() + buffered, err := cm.Client.Buffered() + if err != nil { + return model.DebugInfo{}, err + } + + return model.DebugInfo{ + Component: "NATSClient", + Info: map[string]interface{}{ + "Name": cm.Client.Opts.Name, + "Stats": stats, + "Servers": servers, + "Buffered": buffered, + "Connection": map[string]interface{}{ + "IsConnected": cm.Client.IsConnected(), + "Addr": cm.Client.ConnectedAddr(), + "Url": cm.Client.ConnectedUrl(), + "ServerId": cm.Client.ConnectedServerId(), + "ServerName": cm.Client.ConnectedServerName(), + "ClusterName": cm.Client.ConnectedClusterName(), + }, + }, + }, nil +} diff --git a/pkg/nats/logger.go b/pkg/nats/logger.go new file mode 100644 index 0000000000..98adc24e37 --- /dev/null +++ b/pkg/nats/logger.go @@ -0,0 +1,51 @@ +package nats + +import ( + "github.com/nats-io/nats-server/v2/server" + "github.com/rs/zerolog" +) + +// ZeroLogger is a wrapper around zerolog.Logger to implement the NATS Logger interface +type ZeroLogger struct { + logger zerolog.Logger + serverID string +} + +// NewZeroLogger creates a new ZeroLogger +func NewZeroLogger(logger zerolog.Logger, serverID string) ZeroLogger { + return ZeroLogger{ + logger: logger, + serverID: serverID, + } +} + +func (l ZeroLogger) Noticef(format string, v ...interface{}) { + l.logWithLevel(zerolog.InfoLevel, format, v) +} + +func (l ZeroLogger) Warnf(format string, v ...interface{}) { + l.logWithLevel(zerolog.WarnLevel, format, v) +} + +func (l ZeroLogger) Fatalf(format string, v ...interface{}) { + l.logWithLevel(zerolog.FatalLevel, format, v) +} + +func (l ZeroLogger) Errorf(format string, v ...interface{}) { + l.logWithLevel(zerolog.ErrorLevel, format, v) +} + +func (l ZeroLogger) Debugf(format string, v ...interface{}) { + l.logWithLevel(zerolog.DebugLevel, format, v) +} + +func (l ZeroLogger) Tracef(format string, v ...interface{}) { + l.logWithLevel(zerolog.TraceLevel, format, v) +} + +func (l ZeroLogger) logWithLevel(level zerolog.Level, format string, v []interface{}) { + l.logger.WithLevel(level).Str("Server", l.serverID).Msgf(format, v...) +} + +// compile-time check whether the ZeroLogger implements the Logger interface +var _ server.Logger = (*ZeroLogger)(nil) diff --git a/pkg/nats/proxy/callback_handler.go b/pkg/nats/proxy/callback_handler.go new file mode 100644 index 0000000000..e739854342 --- /dev/null +++ b/pkg/nats/proxy/callback_handler.go @@ -0,0 +1,82 @@ +package proxy + +import ( + "context" + "encoding/json" + "reflect" + "strings" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +type CallbackHandlerParams struct { + Name string + Conn *nats.Conn + Callback compute.Callback +} + +// CallbackHandler is a handler for callback events that registers for incoming nats requests to Bacalhau callback +// protocol, and delegates the handling of the request to the provided callback. 
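+// The handler subscribes to the node.orchestrator.<nodeID>.> wildcard subject and dispatches each
+// message by the last token of its subject (e.g. OnRunComplete/v1) to the matching callback method.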
+type CallbackHandler struct { + name string + conn *nats.Conn + callback compute.Callback +} + +type callbackHandler[Request any] func(context.Context, Request) + +func NewCallbackHandler(params CallbackHandlerParams) (*CallbackHandler, error) { + handler := &CallbackHandler{ + name: params.Name, + conn: params.Conn, + callback: params.Callback, + } + + subject := callbackSubscribeSubject(handler.name) + _, err := handler.conn.Subscribe(subject, func(m *nats.Msg) { + handler.handle(m) + }) + if err != nil { + return nil, err + } + log.Debug().Msgf("ComputeHandler %s subscribed to %s", handler.name, subject) + return handler, nil +} + +// handle handles incoming NATS messages. +func (h *CallbackHandler) handle(msg *nats.Msg) { + ctx := context.Background() + + subjectParts := strings.Split(msg.Subject, ".") + method := subjectParts[len(subjectParts)-1] + + switch method { + case OnBidComplete: + processCallback(ctx, msg, h.callback.OnBidComplete) + case OnRunComplete: + processCallback(ctx, msg, h.callback.OnRunComplete) + case OnCancelComplete: + processCallback(ctx, msg, h.callback.OnCancelComplete) + case OnComputeFailure: + processCallback(ctx, msg, h.callback.OnComputeFailure) + default: + // Noop, not subscribed to this method + return + } +} + +func processCallback[Request any]( + ctx context.Context, + msg *nats.Msg, + f callbackHandler[Request]) { + request := new(Request) + err := json.Unmarshal(msg.Data, request) + if err != nil { + log.Ctx(ctx).Error().Msgf("error decoding %s: %s", reflect.TypeOf(request), err) + return + } + + go f(ctx, *request) +} diff --git a/pkg/nats/proxy/callback_proxy.go b/pkg/nats/proxy/callback_proxy.go new file mode 100644 index 0000000000..8d34e05a9a --- /dev/null +++ b/pkg/nats/proxy/callback_proxy.go @@ -0,0 +1,74 @@ +package proxy + +import ( + "context" + "encoding/json" + "reflect" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/pkg/errors" + "github.com/rs/zerolog/log" +) + +type CallbackProxyParams struct { + Conn *nats.Conn +} + +// CallbackProxy is a proxy for a compute.Callback that can be used to send compute callbacks to the requester node, +// such as when the execution is completed or when a failure occurs. +// The proxy can forward callbacks to a remote requester node, or locally if the node is the requester and a +// LocalCallback is provided. 
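+// Callbacks are serialized to JSON and published to node.orchestrator.<requesterID>.<method> without
+// awaiting a reply, since orchestrator callbacks do not return a response.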
+type CallbackProxy struct { + conn *nats.Conn +} + +func NewCallbackProxy(params CallbackProxyParams) *CallbackProxy { + proxy := &CallbackProxy{ + conn: params.Conn, + } + return proxy +} + +func (p *CallbackProxy) OnBidComplete(ctx context.Context, result compute.BidResult) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnBidComplete, result) +} + +func (p *CallbackProxy) OnRunComplete(ctx context.Context, result compute.RunResult) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnRunComplete, result) +} + +func (p *CallbackProxy) OnCancelComplete(ctx context.Context, result compute.CancelResult) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnCancelComplete, result) +} + +func (p *CallbackProxy) OnComputeFailure(ctx context.Context, result compute.ComputeError) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnComputeFailure, result) +} + +func proxyCallbackRequest( + ctx context.Context, + conn *nats.Conn, + destNodeID string, + method string, + request interface{}) { + // deserialize the request object + data, err := json.Marshal(request) + if err != nil { + log.Ctx(ctx).Error().Err(errors.WithStack(err)).Msgf("%s: failed to marshal request", reflect.TypeOf(request)) + return + } + + subject := callbackPublishSubject(destNodeID, method) + log.Ctx(ctx).Trace().Msgf("Sending request %+v to subject %s", request, subject) + + // We use Publish instead of Request as Orchestrator callbacks do not return a response, for now. + err = conn.Publish(subject, data) + if err != nil { + log.Ctx(ctx).Error().Err(err).Msgf("%s: failed to send callback to node %s", reflect.TypeOf(request), destNodeID) + return + } +} + +// Compile-time interface check: +var _ compute.Callback = (*CallbackProxy)(nil) diff --git a/pkg/nats/proxy/compute_handler.go b/pkg/nats/proxy/compute_handler.go new file mode 100644 index 0000000000..f3f83909b1 --- /dev/null +++ b/pkg/nats/proxy/compute_handler.go @@ -0,0 +1,118 @@ +package proxy + +import ( + "context" + "encoding/json" + "fmt" + "reflect" + "strings" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +// ComputeHandlerParams defines parameters for creating a new ComputeHandler. +type ComputeHandlerParams struct { + Name string + Conn *nats.Conn + ComputeEndpoint compute.Endpoint +} + +// ComputeHandler handles NATS messages for compute operations. +type ComputeHandler struct { + name string + conn *nats.Conn + computeEndpoint compute.Endpoint + subscription *nats.Subscription +} + +// handlerWithResponse represents a function that processes a request and returns a response. +type handlerWithResponse[Request, Response any] func(context.Context, Request) (Response, error) + +// NewComputeHandler creates a new ComputeHandler. +func NewComputeHandler(params ComputeHandlerParams) (*ComputeHandler, error) { + handler := &ComputeHandler{ + name: params.Name, + conn: params.Conn, + computeEndpoint: params.ComputeEndpoint, + } + + subject := computeEndpointSubscribeSubject(handler.name) + subscription, err := handler.conn.Subscribe(subject, func(m *nats.Msg) { + handleRequest(m, handler) + }) + if err != nil { + return nil, err + } + handler.subscription = subscription + log.Debug().Msgf("ComputeHandler %s subscribed to %s", handler.name, subject) + return handler, nil +} + +// handleRequest handles incoming NATS messages. 
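+// The method name is the last dot-separated token of the subject (e.g. node.compute.<nodeID>.AskForBid/v1)
+// and selects which compute endpoint method handles the payload; unknown methods are ignored.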
+func handleRequest(msg *nats.Msg, handler *ComputeHandler) { + ctx := context.Background() + + subjectParts := strings.Split(msg.Subject, ".") + method := subjectParts[len(subjectParts)-1] + + switch method { + case AskForBid: + processAndRespond(ctx, msg, handler.computeEndpoint.AskForBid) + case BidAccepted: + processAndRespond(ctx, msg, handler.computeEndpoint.BidAccepted) + case BidRejected: + processAndRespond(ctx, msg, handler.computeEndpoint.BidRejected) + case CancelExecution: + processAndRespond(ctx, msg, handler.computeEndpoint.CancelExecution) + case ExecutionLogs: + processAndRespond(ctx, msg, handler.computeEndpoint.ExecutionLogs) + default: + // Noop, not subscribed to this method + return + } +} + +// processAndRespond processes the request and sends a response. +func processAndRespond[Request, Response any](ctx context.Context, msg *nats.Msg, f handlerWithResponse[Request, Response]) { + response, err := processRequest(ctx, msg, f) + if err != nil { + log.Ctx(ctx).Error().Err(err) + } + + // We will wrap up the response/error in a Result type which can be decoded by the proxy itself. + result := newResult(response, err) + + err = sendResponse(result, msg) + if err != nil { + log.Ctx(ctx).Error().Msgf("error sending response: %s", err) + } +} + +// processRequest decodes the request, invokes the handler, and returns the response. +func processRequest[Request, Response any]( + ctx context.Context, msg *nats.Msg, f handlerWithResponse[Request, Response]) (*Response, error) { + request := new(Request) + err := json.Unmarshal(msg.Data, request) + if err != nil { + return nil, fmt.Errorf("error decoding %s: %s", reflect.TypeOf(request).Name(), err) + } + + response, err := f(ctx, *request) + if err != nil { + return nil, fmt.Errorf("error in handler %s: %s", reflect.TypeOf(request).Name(), err) + } + + return &response, nil +} + +// sendResponse marshals the response and sends it back to the requester. +func sendResponse[Response any](result Result[Response], msg *nats.Msg) error { + resultData, err := json.Marshal(result) + if err != nil { + return fmt.Errorf("error encoding %s: %s", reflect.TypeOf(result.Response).Name(), err) + } + + return msg.Respond(resultData) +} diff --git a/pkg/nats/proxy/compute_proxy.go b/pkg/nats/proxy/compute_proxy.go new file mode 100644 index 0000000000..d0812e650a --- /dev/null +++ b/pkg/nats/proxy/compute_proxy.go @@ -0,0 +1,94 @@ +package proxy + +import ( + "context" + "encoding/json" + "fmt" + "reflect" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +type ComputeProxyParams struct { + Conn *nats.Conn +} + +// ComputeProxy is a proxy to a compute node endpoint that will forward requests to remote compute nodes, or +// to a local compute node if the target peer ID is the same as the local host, and a LocalEndpoint implementation +// is provided. 
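+// Requests are serialized to JSON and sent via NATS request/reply to node.compute.<nodeID>.<method>;
+// the response comes back wrapped in a Result[Response] carrying either the response or the remote error.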
+type ComputeProxy struct { + conn *nats.Conn +} + +func NewComputeProxy(params ComputeProxyParams) *ComputeProxy { + proxy := &ComputeProxy{ + conn: params.Conn, + } + return proxy +} + +func (p *ComputeProxy) AskForBid(ctx context.Context, request compute.AskForBidRequest) (compute.AskForBidResponse, error) { + return proxyRequest[compute.AskForBidRequest, compute.AskForBidResponse]( + ctx, p.conn, request.TargetPeerID, AskForBid, request) +} + +func (p *ComputeProxy) BidAccepted(ctx context.Context, request compute.BidAcceptedRequest) (compute.BidAcceptedResponse, error) { + return proxyRequest[compute.BidAcceptedRequest, compute.BidAcceptedResponse]( + ctx, p.conn, request.TargetPeerID, BidAccepted, request) +} + +func (p *ComputeProxy) BidRejected(ctx context.Context, request compute.BidRejectedRequest) (compute.BidRejectedResponse, error) { + return proxyRequest[compute.BidRejectedRequest, compute.BidRejectedResponse]( + ctx, p.conn, request.TargetPeerID, BidRejected, request) +} + +func (p *ComputeProxy) CancelExecution( + ctx context.Context, request compute.CancelExecutionRequest) (compute.CancelExecutionResponse, error) { + return proxyRequest[compute.CancelExecutionRequest, compute.CancelExecutionResponse]( + ctx, p.conn, request.TargetPeerID, CancelExecution, request) +} + +func (p *ComputeProxy) ExecutionLogs( + ctx context.Context, request compute.ExecutionLogsRequest) (compute.ExecutionLogsResponse, error) { + return proxyRequest[compute.ExecutionLogsRequest, compute.ExecutionLogsResponse]( + ctx, p.conn, request.TargetPeerID, ExecutionLogs, request) +} + +func proxyRequest[Request any, Response any]( + ctx context.Context, + conn *nats.Conn, + destNodeID string, + method string, + request Request) (Response, error) { + // response object + response := new(Response) + + // deserialize the request object + data, err := json.Marshal(request) + if err != nil { + return *response, fmt.Errorf("%s: failed to marshal request: %w", reflect.TypeOf(request), err) + } + + subject := computeEndpointPublishSubject(destNodeID, method) + log.Ctx(ctx).Trace().Msgf("Sending request %+v to subject %s", request, subject) + res, err := conn.RequestWithContext(ctx, subject, data) + if err != nil { + return *response, fmt.Errorf("%s: failed to send request to node %s: %w", reflect.TypeOf(request), destNodeID, err) + } + + // The handler will have wrapped the response in a Result[T] along with + // any error that occurred, so we will decode it and pass the + // inner response/error on to the caller. 
+ result := &Result[Response]{} + err = json.Unmarshal(res.Data, result) + if err != nil { + return *response, fmt.Errorf("%s: failed to decode response from peer %s: %w", reflect.TypeOf(request), destNodeID, err) + } + + return result.Rehydrate() +} + +// Compile-time interface check: +var _ compute.Endpoint = (*ComputeProxy)(nil) diff --git a/pkg/nats/proxy/constants.go b/pkg/nats/proxy/constants.go new file mode 100644 index 0000000000..5ebf6337bb --- /dev/null +++ b/pkg/nats/proxy/constants.go @@ -0,0 +1,35 @@ +package proxy + +import "fmt" + +const ( + ComputeEndpointSubjectPrefix = "node.compute" + CallbackSubjectPrefix = "node.orchestrator" + + AskForBid = "AskForBid/v1" + BidAccepted = "BidAccepted/v1" + BidRejected = "BidRejected/v1" + CancelExecution = "CancelExecution/v1" + ExecutionLogs = "ExecutionLogs/v1" + + OnBidComplete = "OnBidComplete/v1" + OnRunComplete = "OnRunComplete/v1" + OnCancelComplete = "OnCancelComplete/v1" + OnComputeFailure = "OnComputeFailure/v1" +) + +func computeEndpointPublishSubject(nodeID string, method string) string { + return fmt.Sprintf("%s.%s.%s", ComputeEndpointSubjectPrefix, nodeID, method) +} + +func computeEndpointSubscribeSubject(nodeID string) string { + return fmt.Sprintf("%s.%s.>", ComputeEndpointSubjectPrefix, nodeID) +} + +func callbackPublishSubject(nodeID string, method string) string { + return fmt.Sprintf("%s.%s.%s", CallbackSubjectPrefix, nodeID, method) +} + +func callbackSubscribeSubject(nodeID string) string { + return fmt.Sprintf("%s.%s.>", CallbackSubjectPrefix, nodeID) +} diff --git a/pkg/nats/proxy/types.go b/pkg/nats/proxy/types.go new file mode 100644 index 0000000000..a5b2bdf18a --- /dev/null +++ b/pkg/nats/proxy/types.go @@ -0,0 +1,30 @@ +package proxy + +import "errors" + +type Result[T any] struct { + Response T + Error string +} + +func newResult[T any](response *T, err error) Result[T] { + if err != nil { + return Result[T]{ + Error: err.Error(), + } + } + + return Result[T]{ + Response: *response, + } +} + +func (r *Result[T]) Rehydrate() (T, error) { + var e error = nil + + if r.Error != "" { + e = errors.New(r.Error) + } + + return r.Response, e +} diff --git a/pkg/nats/pubsub/pubsub.go b/pkg/nats/pubsub/pubsub.go new file mode 100644 index 0000000000..5665067238 --- /dev/null +++ b/pkg/nats/pubsub/pubsub.go @@ -0,0 +1,117 @@ +package pubsub + +import ( + "context" + "errors" + "reflect" + realsync "sync" + + "github.com/bacalhau-project/bacalhau/pkg/lib/marshaller" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/system" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +type PubSubParams struct { + // Subject is the NATS subject to publish to. It is also used as the subscription subject if SubscriptionSubject is empty. + Subject string + // SubscriptionSubject is the NATS subject to subscribe to. If empty, Subject is used. + // This is useful when the subscription subject is different from the publishing subject, e.g. when using wildcards. + SubscriptionSubject string + // Conn is the NATS connection to use for publishing and subscribing. 
+ Conn *nats.Conn +} + +type PubSub[T any] struct { + subject string + subscriptionSubject string + conn *nats.Conn + + subscription *nats.Subscription + subscriber pubsub.Subscriber[T] + subscriberOnce realsync.Once + closeOnce realsync.Once +} + +func NewPubSub[T any](params PubSubParams) (*PubSub[T], error) { + newPubSub := &PubSub[T]{ + conn: params.Conn, + subject: params.Subject, + subscriptionSubject: params.SubscriptionSubject, + } + if newPubSub.subscriptionSubject == "" { + newPubSub.subscriptionSubject = newPubSub.subject + } + return newPubSub, nil +} + +func (p *PubSub[T]) Publish(ctx context.Context, message T) error { + ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/pubsub/nats.publish") + defer span.End() + + payload, err := marshaller.JSONMarshalWithMax(message) + if err != nil { + return err + } + + log.Ctx(ctx).Trace().Msgf("Sending message %+v", message) + return p.conn.Publish(p.subject, payload) +} + +func (p *PubSub[T]) Subscribe(ctx context.Context, subscriber pubsub.Subscriber[T]) (err error) { + var firstSubscriber bool + p.subscriberOnce.Do(func() { + log.Ctx(ctx).Debug().Msgf("Subscribing to subject %s", p.subscriptionSubject) + + // register the subscriber + p.subscriber = subscriber + + // subscribe to the subject + p.subscription, err = p.conn.Subscribe(p.subscriptionSubject, func(msg *nats.Msg) { + p.readMessage(context.Background(), msg) + }) + if err != nil { + return + } + + firstSubscriber = true + }) + if err != nil { + return err + } + if !firstSubscriber { + err = errors.New("only a single subscriber is allowed. Use ChainedSubscriber to chain multiple subscribers") + } + return err +} + +func (p *PubSub[T]) readMessage(ctx context.Context, msg *nats.Msg) { + var payload T + err := marshaller.JSONUnmarshalWithMax(msg.Data, &payload) + if err != nil { + log.Ctx(ctx).Error().Err(err).Msgf("error unmarshalling nats payload from subject %s", msg.Subject) + return + } + + err = p.subscriber.Handle(ctx, payload) + if err != nil { + log.Ctx(ctx).Error().Err(err).Msgf("error in handle message of type: %s", reflect.TypeOf(payload)) + } +} + +func (p *PubSub[T]) Close(ctx context.Context) (err error) { + p.closeOnce.Do(func() { + if p.subscription != nil { + err = p.subscription.Unsubscribe() + } + }) + if err != nil { + return err + } + log.Ctx(ctx).Info().Msgf("done closing nats pubsub for subject %s", p.subscriptionSubject) + return nil +} + +// compile-time interface assertions +var _ pubsub.PubSub[string] = (*PubSub[string])(nil) diff --git a/pkg/nats/pubsub/pubsub_test.go b/pkg/nats/pubsub/pubsub_test.go new file mode 100644 index 0000000000..3c8b18e3bf --- /dev/null +++ b/pkg/nats/pubsub/pubsub_test.go @@ -0,0 +1,131 @@ +//go:build unit || !integration + +package pubsub + +import ( + "context" + "testing" + "time" + + nats_helper "github.com/bacalhau-project/bacalhau/pkg/nats" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/nats-io/nats-server/v2/server" + "github.com/phayes/freeport" + "github.com/rs/zerolog/log" + "github.com/stretchr/testify/suite" +) + +const subjectName = "topic.greetings" + +type PubSubSuite struct { + suite.Suite + natsServer *server.Server + node1 *PubSub[string] + node2 *PubSub[string] + subscriber1 *pubsub.InMemorySubscriber[string] + subscriber2 *pubsub.InMemorySubscriber[string] +} + +func (s *PubSubSuite) SetupSuite() { + ctx := context.Background() + s.natsServer = s.createNatsServer() + s.node1 = s.createPubSub(ctx, subjectName, "", s.natsServer.ClientURL()) + s.node2 = s.createPubSub(ctx, 
subjectName, "topic.*", s.natsServer.ClientURL()) + + s.subscriber1 = pubsub.NewInMemorySubscriber[string]() + s.subscriber2 = pubsub.NewInMemorySubscriber[string]() + s.NoError(s.node1.Subscribe(context.Background(), s.subscriber1)) + s.NoError(s.node2.Subscribe(context.Background(), s.subscriber2)) + + // wait for up to 10 seconds (5 loops with 2 seconds each) for nodes to discover each other + var s1, s2 bool + for i := 0; i < 5; i++ { + s.NoError(s.node1.Publish(context.Background(), "ping")) + s1, s2 = s.waitForMessage("ping", 2*time.Second, true, true) + if s1 || s2 { + // still one of the subscribers is waiting for the message + continue + } + } + if s1 { + s.FailNow("subscriber 1 didn't receive initialization message") + } + if s2 { + s.FailNow("subscriber 2 didn't receive initialization message") + } + log.Debug().Msg("nats pubsub suite is ready") +} + +func (s *PubSubSuite) TearDownSuite() { + s.NoError(s.node1.Close(context.Background())) + s.NoError(s.node2.Close(context.Background())) +} + +// createNatsServer creates a new nats server +func (s *PubSubSuite) createNatsServer() *server.Server { + ctx := context.Background() + port, err := freeport.GetFreePort() + s.Require().NoError(err) + + serverOpts := server.Options{ + Port: port, + } + + ns, err := nats_helper.NewServerManager(ctx, nats_helper.ServerManagerParams{ + Options: &serverOpts, + }) + s.Require().NoError(err) + + return ns.Server +} + +func (s *PubSubSuite) createPubSub(ctx context.Context, subject, subscriptionSubject string, server string) *PubSub[string] { + clientManager, err := nats_helper.NewClientManager(ctx, nats_helper.ClientManagerParams{ + Name: "test", + Servers: server, + }) + s.Require().NoError(err) + + pubSub, err := NewPubSub[string](PubSubParams{ + Conn: clientManager.Client, + Subject: subject, + SubscriptionSubject: subscriptionSubject, + }) + s.Require().NoError(err) + + return pubSub +} + +func TestPubSubSuite(t *testing.T) { + suite.Run(t, new(PubSubSuite)) +} + +func (s *PubSubSuite) TestPubSub() { + msg := "TestPubSub" + s.NoError(s.node1.Publish(context.Background(), msg)) + s.waitForMessage(msg, 10*time.Second, true, true) +} + +func (s *PubSubSuite) waitForMessage(msg string, duration time.Duration, checkSubscriber1, checkSubscriber2 bool) (bool, bool) { + waitUntil := time.Now().Add(duration) + checkSubscriber := func(subscriber *pubsub.InMemorySubscriber[string]) bool { + events := subscriber.Events() + if len(events) == 0 { + return false + } + s.Equal([]string{msg}, events) + return true + } + + for time.Now().Before(waitUntil) && (checkSubscriber1 || checkSubscriber2) { + time.Sleep(100 * time.Millisecond) + if checkSubscriber1 && checkSubscriber(s.subscriber1) { + checkSubscriber1 = false + } + if checkSubscriber2 && checkSubscriber(s.subscriber2) { + checkSubscriber2 = false + } + } + + return checkSubscriber1, checkSubscriber1 +} diff --git a/pkg/nats/server.go b/pkg/nats/server.go new file mode 100644 index 0000000000..5903b86bcc --- /dev/null +++ b/pkg/nats/server.go @@ -0,0 +1,80 @@ +package nats + +import ( + "context" + "fmt" + "time" + + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/nats-io/nats-server/v2/server" + "github.com/rs/zerolog/log" +) + +const ReadyForConnectionsTimeout = 5 * time.Second + +type ServerManagerParams struct { + Options *server.Options + ConnectionTimeout time.Duration +} + +// ServerManager is a helper struct to manage a NATS server +type ServerManager struct { + Server *server.Server +} + +// NewServerManager is a helper 
function to create a NATS server with a given options +func NewServerManager(ctx context.Context, params ServerManagerParams) (*ServerManager, error) { + opts := params.Options + ns, err := server.NewServer(opts) + if err != nil { + return nil, err + } + ns.SetLoggerV2(NewZeroLogger(log.Logger, opts.ServerName), opts.Debug, opts.Trace, opts.TraceVerbose) + go ns.Start() + + if params.ConnectionTimeout == 0 { + params.ConnectionTimeout = ReadyForConnectionsTimeout + } + if !ns.ReadyForConnections(params.ConnectionTimeout) { + return nil, fmt.Errorf("could not start nats server on time") + } + log.Ctx(ctx).Info().Msgf("NATS server %s listening on %s", ns.ID(), ns.ClientURL()) + return &ServerManager{ + Server: ns, + }, err +} + +// Stop stops the NATS server +func (sm *ServerManager) Stop() { + sm.Server.Shutdown() +} + +// GetDebugInfo returns the debug info of the NATS server +func (sm *ServerManager) GetDebugInfo(ctx context.Context) (model.DebugInfo, error) { + varz, err := sm.Server.Varz(&server.VarzOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + connz, err := sm.Server.Connz(&server.ConnzOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + routez, err := sm.Server.Routez(&server.RoutezOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + subsz, err := sm.Server.Subsz(&server.SubszOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + return model.DebugInfo{ + Component: "NATSServer", + Info: map[string]interface{}{ + "ID": sm.Server.ID(), + "Varz": varz, + "Connz": connz, + "Routez": routez, + "Subsz": subsz, + }, + }, nil +} diff --git a/pkg/nats/transport/nats.go b/pkg/nats/transport/nats.go new file mode 100644 index 0000000000..f69086a1d2 --- /dev/null +++ b/pkg/nats/transport/nats.go @@ -0,0 +1,229 @@ +package transport + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + nats_helper "github.com/bacalhau-project/bacalhau/pkg/nats" + "github.com/bacalhau-project/bacalhau/pkg/nats/proxy" + nats_pubsub "github.com/bacalhau-project/bacalhau/pkg/nats/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/routing" + core_transport "github.com/bacalhau-project/bacalhau/pkg/transport" + "github.com/hashicorp/go-multierror" + "github.com/nats-io/nats-server/v2/server" + "github.com/rs/zerolog/log" +) + +const NodeInfoSubjectPrefix = "node.info." 
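+
+// Node info messages are published to NodeInfoSubjectPrefix + nodeID (e.g. "node.info.<nodeID>" for a
+// hypothetical node ID), while requester nodes subscribe to the wildcard "node.info.*" so they receive
+// info from every compute node. See the PubSub wiring in NewNATSTransport below.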
+ +type NATSTransportConfig struct { + NodeID string + Port int + AdvertisedAddress string + Orchestrators []string + IsRequesterNode bool + + // Cluster config for requester nodes to connect with each other + ClusterName string + ClusterPort int + ClusterAdvertisedAddress string + ClusterPeers []string +} + +func (c *NATSTransportConfig) Validate() error { + var mErr *multierror.Error + if validate.IsBlank(c.NodeID) { + mErr = multierror.Append(mErr, errors.New("missing node ID")) + } else if validate.ContainsSpaces(c.NodeID) { + mErr = multierror.Append(mErr, errors.New("node ID contains a space")) + } else if validate.ContainsNull(c.NodeID) { + mErr = multierror.Append(mErr, errors.New("node ID contains a null character")) + } + + if c.IsRequesterNode { + mErr = multierror.Append(mErr, validate.IsGreaterThanZero(c.Port, "port %d must be greater than zero", c.Port)) + + // if cluster config is set, validate it + if c.ClusterName != "" || c.ClusterPort != 0 || c.ClusterAdvertisedAddress != "" || len(c.ClusterPeers) > 0 { + mErr = multierror.Append(mErr, validate.IsGreaterThanZero(c.ClusterPort, "cluster port %d must be greater than zero", c.Port)) + } + } else { + if validate.IsEmpty(c.Orchestrators) { + mErr = multierror.Append(mErr, errors.New("missing orchestrators")) + } + } + return mErr.ErrorOrNil() +} + +type NATSTransport struct { + nodeID string + natsServer *nats_helper.ServerManager + natsClient *nats_helper.ClientManager + computeProxy compute.Endpoint + callbackProxy compute.Callback + nodeInfoPubSub pubsub.PubSub[models.NodeInfo] + nodeInfoDecorator models.NodeInfoDecorator +} + +func NewNATSTransport(ctx context.Context, + config NATSTransportConfig, + nodeInfoStore routing.NodeInfoStore) (*NATSTransport, error) { + log.Debug().Msgf("Creating NATS transport with config: %+v", config) + if err := config.Validate(); err != nil { + return nil, fmt.Errorf("error validating nats transport config. 
%w", err) + } + + var sm *nats_helper.ServerManager + if config.IsRequesterNode { + // create nats server with servers acting as its cluster peers + routes, err := nats_helper.RoutesFromSlice(config.ClusterPeers) + if err != nil { + return nil, err + } + serverOps := &server.Options{ + ServerName: config.NodeID, + Port: config.Port, + ClientAdvertise: config.AdvertisedAddress, + Routes: routes, + Debug: true, // will only be used if log level is debug + Cluster: server.ClusterOpts{ + Name: config.ClusterName, + Port: config.ClusterPort, + Advertise: config.ClusterAdvertisedAddress, + }, + } + log.Debug().Msgf("Creating NATS server with options: %+v", serverOps) + sm, err = nats_helper.NewServerManager(ctx, nats_helper.ServerManagerParams{ + Options: serverOps, + }) + if err != nil { + return nil, err + } + + config.Orchestrators = append(config.Orchestrators, sm.Server.ClientURL()) + } + + // create nats client + log.Debug().Msgf("Creating NATS client with servers: %s", strings.Join(config.Orchestrators, ",")) + nc, err := nats_helper.NewClientManager(ctx, nats_helper.ClientManagerParams{ + Name: config.NodeID, + Servers: strings.Join(config.Orchestrators, ","), + }) + if err != nil { + return nil, err + } + + // PubSub to publish and consume node info messages + nodeInfoPubSub, err := nats_pubsub.NewPubSub[models.NodeInfo](nats_pubsub.PubSubParams{ + Conn: nc.Client, + Subject: NodeInfoSubjectPrefix + config.NodeID, + SubscriptionSubject: NodeInfoSubjectPrefix + "*", + }) + if err != nil { + return nil, err + } + + if config.IsRequesterNode { + // subscribe to nodeInfo subject and add nodeInfo to nodeInfoStore + nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) + nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](nodeInfoStore.Add)) + err = nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) + if err != nil { + return nil, err + } + } + + // compute proxy + computeProxy := proxy.NewComputeProxy(proxy.ComputeProxyParams{ + Conn: nc.Client, + }) + + // Callback to send compute events (i.e. requester endpoint) + computeCallback := proxy.NewCallbackProxy(proxy.CallbackProxyParams{ + Conn: nc.Client, + }) + + return &NATSTransport{ + nodeID: config.NodeID, + natsServer: sm, + natsClient: nc, + computeProxy: computeProxy, + callbackProxy: computeCallback, + nodeInfoPubSub: nodeInfoPubSub, + nodeInfoDecorator: models.NoopNodeInfoDecorator{}, + }, nil +} + +// RegisterComputeCallback registers a compute callback with the transport layer. +func (t *NATSTransport) RegisterComputeCallback(callback compute.Callback) error { + _, err := proxy.NewCallbackHandler(proxy.CallbackHandlerParams{ + Name: t.nodeID, + Conn: t.natsClient.Client, + Callback: callback, + }) + return err +} + +// RegisterComputeEndpoint registers a compute endpoint with the transport layer. +func (t *NATSTransport) RegisterComputeEndpoint(endpoint compute.Endpoint) error { + _, err := proxy.NewComputeHandler(proxy.ComputeHandlerParams{ + Name: t.nodeID, + Conn: t.natsClient.Client, + ComputeEndpoint: endpoint, + }) + return err +} + +// ComputeProxy returns the compute proxy. +func (t *NATSTransport) ComputeProxy() compute.Endpoint { + return t.computeProxy +} + +// CallbackProxy returns the callback proxy. +func (t *NATSTransport) CallbackProxy() compute.Callback { + return t.callbackProxy +} + +// NodeInfoPubSub returns the node info pubsub. 
+func (t *NATSTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] { + return t.nodeInfoPubSub +} + +// NodeInfoDecorator returns the node info decorator. +func (t *NATSTransport) NodeInfoDecorator() models.NodeInfoDecorator { + return t.nodeInfoDecorator +} + +// DebugInfoProviders returns the debug info of the NATS transport layer +func (t *NATSTransport) DebugInfoProviders() []model.DebugInfoProvider { + var debugInfoProviders []model.DebugInfoProvider + if t.natsServer != nil { + debugInfoProviders = append(debugInfoProviders, t.natsServer) + } + if t.natsClient != nil { + debugInfoProviders = append(debugInfoProviders, t.natsClient) + } + return debugInfoProviders +} + +// Close closes the transport layer. +func (t *NATSTransport) Close(ctx context.Context) error { + if t.natsServer != nil { + log.Ctx(ctx).Debug().Msgf("Shutting down server %s", t.natsServer.Server.Name()) + t.natsServer.Stop() + } + if t.natsClient != nil { + t.natsClient.Stop() + } + return nil +} + +// compile-time interface check +var _ core_transport.TransportLayer = (*NATSTransport)(nil) diff --git a/pkg/nats/util.go b/pkg/nats/util.go new file mode 100644 index 0000000000..9a82803500 --- /dev/null +++ b/pkg/nats/util.go @@ -0,0 +1,41 @@ +package nats + +import ( + "net/url" + "regexp" + "strings" +) + +var schemeRegex = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9+-.]*://`) + +const defaultScheme = "nats://" + +// RoutesFromStr parses route URLs from a string +// e.g. "nats://localhost:4222,nats://localhost:4223" +func RoutesFromStr(routesStr string) ([]*url.URL, error) { + routes := strings.Split(routesStr, ",") + if len(routes) == 0 { + return nil, nil + } + var routeUrls []*url.URL + for _, r := range routes { + r = strings.TrimSpace(r) + if !schemeRegex.MatchString(r) { + r = defaultScheme + r + } + u, err := url.Parse(r) + if err != nil { + return nil, err + } + routeUrls = append(routeUrls, u) + } + return routeUrls, nil +} + +// RoutesFromSlice parses route URLs from a slice of strings +func RoutesFromSlice(routes []string) ([]*url.URL, error) { + if len(routes) == 0 { + return []*url.URL{}, nil + } + return RoutesFromStr(strings.Join(routes, ",")) +} diff --git a/pkg/node/compute.go b/pkg/node/compute.go index 2448a0a6ce..06c2f2e6f6 100644 --- a/pkg/node/compute.go +++ b/pkg/node/compute.go @@ -6,11 +6,6 @@ import ( "net/url" "github.com/bacalhau-project/bacalhau/pkg/bidstrategy" - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/publicapi" - compute_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/compute" - "github.com/libp2p/go-libp2p/core/host" - "github.com/bacalhau-project/bacalhau/pkg/bidstrategy/resource" "github.com/bacalhau-project/bacalhau/pkg/bidstrategy/semantic" "github.com/bacalhau-project/bacalhau/pkg/compute" @@ -22,33 +17,37 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/executor" executor_util "github.com/bacalhau-project/bacalhau/pkg/executor/util" "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/publicapi" + compute_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/compute" "github.com/bacalhau-project/bacalhau/pkg/publisher" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/storage" repo_storage "github.com/bacalhau-project/bacalhau/pkg/storage/repo" "github.com/bacalhau-project/bacalhau/pkg/system" - 
"github.com/bacalhau-project/bacalhau/pkg/transport/bprotocol" + "github.com/libp2p/go-libp2p/core/host" ) type Compute struct { // Visible for testing - ID string - LocalEndpoint compute.Endpoint - Capacity capacity.Tracker - ExecutionStore store.ExecutionStore - Executors executor.ExecutorProvider - Storages storage.StorageProvider - LogServer *logstream.LogStreamServer - Bidder compute.Bidder - computeCallback *bprotocol.CallbackProxy - cleanupFunc func(ctx context.Context) - computeInfoProvider models.ComputeNodeInfoProvider - autoLabelsProvider models.LabelsProvider + ID string + LocalEndpoint compute.Endpoint + Capacity capacity.Tracker + ExecutionStore store.ExecutionStore + Executors executor.ExecutorProvider + Storages storage.StorageProvider + LogServer *logstream.LogStreamServer + Bidder compute.Bidder + cleanupFunc func(ctx context.Context) + nodeInfoDecorator models.NodeInfoDecorator + autoLabelsProvider models.LabelsProvider + debugInfoProviders []model.DebugInfoProvider } //nolint:funlen func NewComputeNode( ctx context.Context, + nodeID string, cleanupManager *system.CleanupManager, host host.Host, apiServer *publicapi.Server, @@ -58,12 +57,13 @@ func NewComputeNode( executors executor.ExecutorProvider, publishers publisher.PublisherProvider, fsRepo *repo.FsRepo, + computeCallback compute.Callback, ) (*Compute, error) { var executionStore store.ExecutionStore // create the execution store if config.ExecutionStore == nil { var err error - executionStore, err = fsRepo.InitExecutionStore(ctx, host.ID().String()) + executionStore, err = fsRepo.InitExecutionStore(ctx, nodeID) if err != nil { return nil, err } @@ -79,17 +79,12 @@ func NewComputeNode( MaxCapacity: config.QueueResourceLimits, }) - // Callback to send compute events (i.e. requester endpoint) - computeCallback := bprotocol.NewCallbackProxy(bprotocol.CallbackProxyParams{ - Host: host, - }) - resultsPath, err := compute.NewResultsPath() if err != nil { return nil, err } baseExecutor := compute.NewBaseExecutor(compute.BaseExecutorParams{ - ID: host.ID().String(), + ID: nodeID, Callback: computeCallback, Store: executionStore, StorageDirectory: storagePath, @@ -101,7 +96,7 @@ func NewComputeNode( }) bufferRunner := compute.NewExecutorBuffer(compute.ExecutorBufferParams{ - ID: host.ID().String(), + ID: nodeID, DelegateExecutor: baseExecutor, Callback: computeCallback, RunningCapacityTracker: runningCapacityTracker, @@ -183,21 +178,25 @@ func NewComputeNode( } // logging server - logserver := logstream.NewLogStreamServer(logstream.LogStreamServerOptions{ - Ctx: ctx, - Host: host, - ExecutionStore: executionStore, - // - Executors: executors, - }) - _, loggingCancel := context.WithCancel(ctx) - cleanupManager.RegisterCallback(func() error { - loggingCancel() - return nil - }) + // TODO: make logging server agnostic to libp2p transport + var logserver *logstream.LogStreamServer + if host != nil { + logserver = logstream.NewLogStreamServer(logstream.LogStreamServerOptions{ + Ctx: ctx, + Host: host, + ExecutionStore: executionStore, + // + Executors: executors, + }) + _, loggingCancel := context.WithCancel(ctx) + cleanupManager.RegisterCallback(func() error { + loggingCancel() + return nil + }) + } // node info - nodeInfoProvider := compute.NewNodeInfoProvider(compute.NodeInfoProviderParams{ + nodeInfoDecorator := compute.NewNodeInfoDecorator(compute.NodeInfoDecoratorParams{ Executors: executors, Publisher: publishers, Storages: storages, @@ -208,7 +207,7 @@ func NewComputeNode( bidStrat := 
bidstrategy.NewChainedBidStrategy(semanticBidStrat, resourceBidStrat) bidder := compute.NewBidder(compute.BidderParams{ - NodeID: host.ID().String(), + NodeID: nodeID, SemanticStrategy: bidStrat, ResourceStrategy: bidStrat, Store: executionStore, @@ -220,17 +219,12 @@ func NewComputeNode( }) baseEndpoint := compute.NewBaseEndpoint(compute.BaseEndpointParams{ - ID: host.ID().String(), + ID: nodeID, ExecutionStore: executionStore, UsageCalculator: capacityCalculator, Bidder: bidder, Executor: bufferRunner, - LogServer: *logserver, - }) - - bprotocol.NewComputeHandler(bprotocol.ComputeHandlerParams{ - Host: host, - ComputeEndpoint: baseEndpoint, + LogServer: logserver, }) // register debug info providers for the /debug endpoint @@ -267,25 +261,21 @@ func NewComputeNode( ) return &Compute{ - ID: host.ID().String(), - LocalEndpoint: baseEndpoint, - Capacity: runningCapacityTracker, - ExecutionStore: executionStore, - Executors: executors, - Storages: storages, - Bidder: bidder, - LogServer: logserver, - computeCallback: computeCallback, - cleanupFunc: cleanupFunc, - computeInfoProvider: nodeInfoProvider, - autoLabelsProvider: labelsProvider, + ID: nodeID, + LocalEndpoint: baseEndpoint, + Capacity: runningCapacityTracker, + ExecutionStore: executionStore, + Executors: executors, + Storages: storages, + Bidder: bidder, + LogServer: logserver, + cleanupFunc: cleanupFunc, + nodeInfoDecorator: nodeInfoDecorator, + autoLabelsProvider: labelsProvider, + debugInfoProviders: debugInfoProviders, }, nil } -func (c *Compute) RegisterLocalComputeCallback(callback compute.Callback) { - c.computeCallback.RegisterLocalComputeCallback(callback) -} - func (c *Compute) cleanup(ctx context.Context) { c.cleanupFunc(ctx) } diff --git a/pkg/node/config_network.go b/pkg/node/config_network.go new file mode 100644 index 0000000000..f31cbe7991 --- /dev/null +++ b/pkg/node/config_network.go @@ -0,0 +1,45 @@ +package node + +import ( + "errors" + "fmt" + "time" + + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/hashicorp/go-multierror" + "github.com/libp2p/go-libp2p/core/host" + "github.com/samber/lo" +) + +var supportedNetworks = []string{ + models.NetworkTypeLibp2p, + models.NetworkTypeNATS, +} + +type NetworkConfig struct { + Type string + Libp2pHost host.Host // only set if using libp2p transport, nil otherwise + ReconnectDelay time.Duration + + // NATS config for requesters to be reachable by compute nodes + Port int + AdvertisedAddress string + Orchestrators []string + + // NATS config for requester nodes to connect with each other + ClusterName string + ClusterPort int + ClusterAdvertisedAddress string + ClusterPeers []string +} + +func (c *NetworkConfig) Validate() error { + var mErr *multierror.Error + if validate.IsBlank(c.Type) { + mErr = multierror.Append(mErr, errors.New("missing network type")) + } else if !lo.Contains(supportedNetworks, c.Type) { + mErr = multierror.Append(mErr, fmt.Errorf("network type %s not in supported values %s", c.Type, supportedNetworks)) + } + return mErr.ErrorOrNil() +} diff --git a/pkg/node/factories.go b/pkg/node/factories.go index 1745245510..1721f39682 100644 --- a/pkg/node/factories.go +++ b/pkg/node/factories.go @@ -70,7 +70,7 @@ func NewStandardExecutorsFactory() ExecutorsFactory { ctx, nodeConfig.CleanupManager, executor_util.StandardExecutorOptions{ - DockerID: fmt.Sprintf("bacalhau-%s", nodeConfig.Host.ID().String()), + DockerID: fmt.Sprintf("bacalhau-%s", nodeConfig.NodeID), }, ) if err != nil { 
@@ -142,9 +142,9 @@ func NewStandardAuthenticatorsFactory() AuthenticatorsFactory { map[string]authn.Authenticator{ "ClientKey": challenge.NewAuthenticator( challenge.AnonymousModePolicy, - nodeConfig.Host.ID(), + challenge.NewStringMarshaller(nodeConfig.NodeID), privKey, - nodeConfig.Host.ID().String(), + nodeConfig.NodeID, ), }, ), nil diff --git a/pkg/node/node.go b/pkg/node/node.go index 82049f85b7..4a05fc891e 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -5,36 +5,29 @@ import ( "fmt" "time" - "github.com/imdario/mergo" - "github.com/labstack/echo/v4" - libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" - "github.com/libp2p/go-libp2p/core/host" - basichost "github.com/libp2p/go-libp2p/p2p/host/basic" - routedhost "github.com/libp2p/go-libp2p/p2p/host/routed" - "github.com/libp2p/go-libp2p/p2p/protocol/identify" - "github.com/bacalhau-project/bacalhau/pkg/authz" + pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" + "github.com/bacalhau-project/bacalhau/pkg/ipfs" + libp2p_transport "github.com/bacalhau-project/bacalhau/pkg/libp2p/transport" + "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" + nats_transport "github.com/bacalhau-project/bacalhau/pkg/nats/transport" "github.com/bacalhau-project/bacalhau/pkg/publicapi" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/agent" "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/shared" - - pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" - "github.com/bacalhau-project/bacalhau/pkg/ipfs" - "github.com/bacalhau-project/bacalhau/pkg/pubsub" - "github.com/bacalhau-project/bacalhau/pkg/pubsub/libp2p" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/routing/inmemory" "github.com/bacalhau-project/bacalhau/pkg/system" - "github.com/bacalhau-project/bacalhau/pkg/util" + "github.com/bacalhau-project/bacalhau/pkg/transport" "github.com/bacalhau-project/bacalhau/pkg/version" + "github.com/hashicorp/go-multierror" + "github.com/imdario/mergo" + "github.com/labstack/echo/v4" + "github.com/libp2p/go-libp2p/core/host" ) -const JobInfoTopic = "bacalhau-job-info" -const NodeInfoTopic = "bacalhau-node-info" - type FeatureConfig struct { Engines []string Publishers []string @@ -43,9 +36,9 @@ type FeatureConfig struct { // Node configuration type NodeConfig struct { + NodeID string IPFSClient ipfs.Client CleanupManager *system.CleanupManager - Host host.Host HostAddress string APIPort uint16 RequesterAutoCert string @@ -64,7 +57,15 @@ type NodeConfig struct { AllowListedLocalPaths []string NodeInfoStoreTTL time.Duration - FsRepo *repo.FsRepo + FsRepo *repo.FsRepo + NetworkConfig NetworkConfig +} + +func (c *NodeConfig) Validate() error { + // TODO: add more validations + var mErr *multierror.Error + mErr = multierror.Append(mErr, c.NetworkConfig.Validate()) + return mErr.ErrorOrNil() } // Lazy node dependency injector that generate instances of different @@ -96,13 +97,14 @@ func NewStandardNodeDependencyInjector() NodeDependencyInjector { type Node struct { // Visible for testing + ID string APIServer *publicapi.Server ComputeNode *Compute RequesterNode *Requester NodeInfoStore routing.NodeInfoStore CleanupManager *system.CleanupManager IPFSClient ipfs.Client - Host host.Host + Libp2pHost host.Host // only set if using libp2p transport, nil otherwise } func (n *Node) Start(ctx context.Context) error { @@ -113,13 
+115,16 @@ func (n *Node) Start(ctx context.Context) error { func NewNode( ctx context.Context, config NodeConfig) (*Node, error) { - ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/node.NewNode") - defer span.End() - - identify.ActivationThresh = 2 + var err error + ctx, cancel := context.WithCancel(ctx) + defer func() { + if err != nil { + cancel() + } + }() config.DependencyInjector = mergeDependencyInjectors(config.DependencyInjector, NewStandardNodeDependencyInjector()) - err := mergo.Merge(&config.APIServerConfig, publicapi.DefaultConfig()) + err = mergo.Merge(&config.APIServerConfig, publicapi.DefaultConfig()) if err != nil { return nil, err } @@ -128,6 +133,11 @@ func NewNode( config.APIServerConfig.LogLevel = "trace" } + err = config.Validate() + if err != nil { + return nil, fmt.Errorf("error validating node config. %w", err) + } + storageProviders, err := config.DependencyInjector.StorageProvidersFactory.Get(ctx, config) if err != nil { return nil, err @@ -148,49 +158,6 @@ func NewNode( return nil, err } - // A single gossipSub instance that will be used by all topics - gossipSubCtx, gossipSubCancel := context.WithCancel(ctx) - gossipSub, err := newLibp2pPubSub(gossipSubCtx, config) - defer func() { - if err != nil { - gossipSubCancel() - } - }() - - if err != nil { - return nil, err - } - - // PubSub to publish node info to the network - nodeInfoPubSub, err := libp2p.NewPubSub[models.NodeInfo](libp2p.PubSubParams{ - Host: config.Host, - TopicName: NodeInfoTopic, - PubSub: gossipSub, - }) - if err != nil { - return nil, err - } - - // node info publisher - nodeInfoPublisherInterval := config.NodeInfoPublisherInterval - if nodeInfoPublisherInterval.IsZero() { - nodeInfoPublisherInterval = GetNodeInfoPublishConfig() - } - - // node info store that is used for both discovering compute nodes, as to find addresses of other nodes for routing requests. - nodeInfoStore := inmemory.NewNodeInfoStore(inmemory.NodeInfoStoreParams{ - TTL: config.NodeInfoStoreTTL, - }) - routedHost := routedhost.Wrap(config.Host, nodeInfoStore) - - // register consumers of node info published over gossipSub - nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) - nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](nodeInfoStore.Add)) - err = nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) - if err != nil { - return nil, err - } - // timeoutHandler doesn't implement http.Hijacker, so we need to skip it for websocket endpoints config.APIServerConfig.SkippedTimeoutPaths = append(config.APIServerConfig.SkippedTimeoutPaths, []string{ "/api/v1/requester/websocket/events", @@ -203,7 +170,7 @@ func NewNode( Router: echo.New(), Address: config.HostAddress, Port: config.APIPort, - HostID: config.Host.ID().String(), + HostID: config.NodeID, Config: config.APIServerConfig, Authorizer: authz.AlwaysAllow, Headers: map[string]string{ @@ -228,28 +195,67 @@ func NewNode( return nil, err } + // node info store that is used for both discovering compute nodes, as to find addresses of other nodes for routing requests. 
+ nodeInfoStore := inmemory.NewNodeInfoStore(inmemory.NodeInfoStoreParams{ + TTL: config.NodeInfoStoreTTL, + }) + + var transportLayer transport.TransportLayer + + if config.NetworkConfig.Type == models.NetworkTypeNATS { + natsConfig := nats_transport.NATSTransportConfig{ + NodeID: config.NodeID, + Port: config.NetworkConfig.Port, + AdvertisedAddress: config.NetworkConfig.AdvertisedAddress, + Orchestrators: config.NetworkConfig.Orchestrators, + ClusterName: config.NetworkConfig.ClusterName, + ClusterPort: config.NetworkConfig.ClusterPort, + ClusterPeers: config.NetworkConfig.ClusterPeers, + ClusterAdvertisedAddress: config.NetworkConfig.ClusterAdvertisedAddress, + IsRequesterNode: config.IsRequesterNode, + } + transportLayer, err = nats_transport.NewNATSTransport(ctx, natsConfig, nodeInfoStore) + } else { + libp2pConfig := libp2p_transport.Libp2pTransportConfig{ + Host: config.NetworkConfig.Libp2pHost, + Peers: config.NetworkConfig.ClusterPeers, + ReconnectDelay: config.NetworkConfig.ReconnectDelay, + CleanupManager: config.CleanupManager, + } + transportLayer, err = libp2p_transport.NewLibp2pTransport(ctx, libp2pConfig, nodeInfoStore) + } + if err != nil { + return nil, err + } + + var debugInfoProviders []model.DebugInfoProvider + debugInfoProviders = append(debugInfoProviders, transportLayer.DebugInfoProviders()...) + var requesterNode *Requester var computeNode *Compute - - var computeInfoProvider models.ComputeNodeInfoProvider var labelsProvider models.LabelsProvider = &ConfigLabelsProvider{staticLabels: config.Labels} // setup requester node if config.IsRequesterNode { requesterNode, err = NewRequesterNode( ctx, - routedHost, + config.NodeID, apiServer, config.RequesterNodeConfig, storageProviders, authenticators, nodeInfoStore, - gossipSub, config.FsRepo, + transportLayer.ComputeProxy(), ) if err != nil { return nil, err } + err = transportLayer.RegisterComputeCallback(requesterNode.localCallback) + if err != nil { + return nil, err + } + debugInfoProviders = append(debugInfoProviders, requesterNode.debugInfoProviders...) } if config.IsComputeNode { @@ -258,8 +264,9 @@ func NewNode( // setup compute node computeNode, err = NewComputeNode( ctx, + config.NodeID, config.CleanupManager, - routedHost, + config.NetworkConfig.Libp2pHost, apiServer, config.ComputeConfig, storagePath, @@ -267,47 +274,56 @@ func NewNode( executors, publishers, config.FsRepo, + transportLayer.CallbackProxy(), ) if err != nil { return nil, err } - computeInfoProvider = computeNode.computeInfoProvider + err = transportLayer.RegisterComputeEndpoint(computeNode.LocalEndpoint) + if err != nil { + return nil, err + } + labelsProvider = models.MergeLabelsInOrder( computeNode.autoLabelsProvider, labelsProvider, ) + debugInfoProviders = append(debugInfoProviders, computeNode.debugInfoProviders...) 
} - // node info provider - basicHost, ok := config.Host.(*basichost.BasicHost) - if !ok { - return nil, fmt.Errorf("host is not a basic host") - } nodeInfoProvider := routing.NewNodeInfoProvider(routing.NodeInfoProviderParams{ - Host: basicHost, - IdentityService: basicHost.IDService(), - LabelsProvider: labelsProvider, - ComputeInfoProvider: computeInfoProvider, - BacalhauVersion: *version.Get(), + NodeID: config.NodeID, + LabelsProvider: labelsProvider, + BacalhauVersion: *version.Get(), }) + nodeInfoProvider.RegisterNodeInfoDecorator(transportLayer.NodeInfoDecorator()) + if computeNode != nil { + nodeInfoProvider.RegisterNodeInfoDecorator(computeNode.nodeInfoDecorator) + } shared.NewEndpoint(shared.EndpointParams{ Router: apiServer.Router, - NodeID: config.Host.ID().String(), - PeerStore: config.Host.Peerstore(), + NodeID: config.NodeID, NodeInfoProvider: nodeInfoProvider, }) agent.NewEndpoint(agent.EndpointParams{ - Router: apiServer.Router, - NodeInfoProvider: nodeInfoProvider, + Router: apiServer.Router, + NodeInfoProvider: nodeInfoProvider, + DebugInfoProviders: debugInfoProviders, }) + // node info publisher + nodeInfoPublisherInterval := config.NodeInfoPublisherInterval + if nodeInfoPublisherInterval.IsZero() { + nodeInfoPublisherInterval = GetNodeInfoPublishConfig() + } + // NB(forrest): this must be done last to avoid eager publishing before nodes are constructed // TODO(forrest) [fixme] we should fix this to make it less racy in testing nodeInfoPublisher := routing.NewNodeInfoPublisher(routing.NodeInfoPublisherParams{ - PubSub: nodeInfoPubSub, + PubSub: transportLayer.NodeInfoPubSub(), NodeInfoProvider: nodeInfoProvider, IntervalConfig: nodeInfoPublisherInterval, }) @@ -333,31 +349,23 @@ func NewNode( requesterNode.cleanup(ctx) } nodeInfoPublisher.Stop(ctx) - cleanupErr := nodeInfoPubSub.Close(ctx) - util.LogDebugIfContextCancelled(ctx, cleanupErr, "node info pub sub") - gossipSubCancel() - - cleanupErr = config.Host.Close() - util.LogDebugIfContextCancelled(ctx, cleanupErr, "host") - cleanupErr = apiServer.Shutdown(ctx) - return cleanupErr + var errors *multierror.Error + errors = multierror.Append(errors, transportLayer.Close(ctx)) + errors = multierror.Append(errors, apiServer.Shutdown(ctx)) + cancel() + return errors.ErrorOrNil() }) - if requesterNode != nil && computeNode != nil { - // To enable nodes self-dialing themselves as libp2p doesn't support it. 
- computeNode.RegisterLocalComputeCallback(requesterNode.localCallback) - requesterNode.RegisterLocalComputeEndpoint(computeNode.LocalEndpoint) - } - node := &Node{ + ID: config.NodeID, CleanupManager: config.CleanupManager, APIServer: apiServer, IPFSClient: config.IPFSClient, ComputeNode: computeNode, RequesterNode: requesterNode, NodeInfoStore: nodeInfoStore, - Host: routedHost, + Libp2pHost: config.NetworkConfig.Libp2pHost, } return node, nil @@ -373,27 +381,6 @@ func (n *Node) IsComputeNode() bool { return n.ComputeNode != nil } -func newLibp2pPubSub(ctx context.Context, nodeConfig NodeConfig) (*libp2p_pubsub.PubSub, error) { - tracer, err := libp2p_pubsub.NewJSONTracer(pkgconfig.GetLibp2pTracerPath()) - if err != nil { - return nil, err - } - - pgParams := libp2p_pubsub.NewPeerGaterParams( - 0.33, //nolint:gomnd - libp2p_pubsub.ScoreParameterDecay(2*time.Minute), //nolint:gomnd - libp2p_pubsub.ScoreParameterDecay(10*time.Minute), //nolint:gomnd - ) - - return libp2p_pubsub.NewGossipSub( - ctx, - nodeConfig.Host, - libp2p_pubsub.WithPeerExchange(true), - libp2p_pubsub.WithPeerGater(pgParams), - libp2p_pubsub.WithEventTracer(tracer), - ) -} - func mergeDependencyInjectors(injector NodeDependencyInjector, defaultInjector NodeDependencyInjector) NodeDependencyInjector { if injector.StorageProvidersFactory == nil { injector.StorageProvidersFactory = defaultInjector.StorageProvidersFactory diff --git a/pkg/node/requester.go b/pkg/node/requester.go index 78b38f47f9..25b2dd90f5 100644 --- a/pkg/node/requester.go +++ b/pkg/node/requester.go @@ -17,15 +17,10 @@ import ( auth_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/auth" orchestrator_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/orchestrator" requester_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/requester" - "github.com/bacalhau-project/bacalhau/pkg/pubsub" - "github.com/bacalhau-project/bacalhau/pkg/pubsub/libp2p" - "github.com/bacalhau-project/bacalhau/pkg/requester/pubsub/jobinfo" + "github.com/bacalhau-project/bacalhau/pkg/routing" s3helper "github.com/bacalhau-project/bacalhau/pkg/s3" "github.com/bacalhau-project/bacalhau/pkg/translation" "github.com/bacalhau-project/bacalhau/pkg/util" - libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" - "github.com/libp2p/go-libp2p/core/crypto" - "github.com/libp2p/go-libp2p/core/host" "github.com/rs/zerolog/log" "github.com/bacalhau-project/bacalhau/pkg/compute" @@ -36,67 +31,41 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator/selection/ranking" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/requester" - "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/storage" "github.com/bacalhau-project/bacalhau/pkg/system" - "github.com/bacalhau-project/bacalhau/pkg/transport/bprotocol" ) type Requester struct { // Visible for testing - Endpoint requester.Endpoint - JobStore jobstore.Store - NodeDiscoverer orchestrator.NodeDiscoverer - computeProxy *bprotocol.ComputeProxy - localCallback compute.Callback - cleanupFunc func(ctx context.Context) + Endpoint requester.Endpoint + JobStore jobstore.Store + NodeDiscoverer orchestrator.NodeDiscoverer + localCallback compute.Callback + cleanupFunc func(ctx context.Context) + debugInfoProviders []model.DebugInfoProvider } //nolint:funlen func NewRequesterNode( ctx context.Context, - host host.Host, + nodeID string, apiServer *publicapi.Server, requesterConfig RequesterConfig, storageProvider 
storage.StorageProvider, authnProvider authn.Provider, nodeInfoStore routing.NodeInfoStore, - gossipSub *libp2p_pubsub.PubSub, fsRepo *repo.FsRepo, + computeProxy compute.Endpoint, ) (*Requester, error) { // prepare event handlers - tracerContextProvider := eventhandler.NewTracerContextProvider(host.ID().String()) + tracerContextProvider := eventhandler.NewTracerContextProvider(nodeID) localJobEventConsumer := eventhandler.NewChainedJobEventHandler(tracerContextProvider) - // compute proxy - computeProxy := bprotocol.NewComputeProxy(bprotocol.ComputeProxyParams{ - Host: host, - }) - eventEmitter := orchestrator.NewEventEmitter(orchestrator.EventEmitterParams{ EventConsumer: localJobEventConsumer, }) - jobStore, err := fsRepo.InitJobStore(ctx, host.ID().String()) - if err != nil { - return nil, err - } - - // PubSub to publish job events to the network - jobInfoPubSub, err := libp2p.NewPubSub[jobinfo.Envelope](libp2p.PubSubParams{ - Host: host, - TopicName: JobInfoTopic, - PubSub: gossipSub, - IgnoreLocal: true, - }) - if err != nil { - return nil, err - } - jobInfoPublisher := jobinfo.NewPublisher(jobinfo.PublisherParams{ - JobStore: jobStore, - PubSub: jobInfoPubSub, - }) - err = jobInfoPubSub.Subscribe(ctx, pubsub.NewNoopSubscriber[jobinfo.Envelope]()) + jobStore, err := fsRepo.InitJobStore(ctx, nodeID) if err != nil { return nil, err } @@ -153,14 +122,14 @@ func NewRequesterNode( // planner that forwards the desired state to the compute nodes, // and updates the observed state if the compute node accepts the desired state planner.NewComputeForwarder(planner.ComputeForwarderParams{ - ID: host.ID().String(), + ID: nodeID, ComputeService: computeProxy, JobStore: jobStore, }), // planner that publishes events on job completion or failure planner.NewEventEmitter(planner.EventEmitterParams{ - ID: host.ID().String(), + ID: nodeID, EventEmitter: eventEmitter, }), @@ -214,12 +183,6 @@ func NewRequesterNode( worker.Start(ctx) } - publicKey := host.Peerstore().PubKey(host.ID()) - marshaledPublicKey, err := crypto.MarshalPublicKey(publicKey) - if err != nil { - return nil, err - } - // result transformers that are applied to the result before it is returned to the user resultTransformers := transformer.ChainedTransformer[*models.SpecConfig]{} @@ -239,8 +202,7 @@ func NewRequesterNode( } endpoint := requester.NewBaseEndpoint(&requester.BaseEndpointParams{ - ID: host.ID().String(), - PublicKey: marshaledPublicKey, + ID: nodeID, EvaluationBroker: evalBroker, EventEmitter: eventEmitter, ComputeEndpoint: computeProxy, @@ -255,7 +217,7 @@ func NewRequesterNode( } endpointV2 := orchestrator.NewBaseEndpoint(&orchestrator.BaseEndpointParams{ - ID: host.ID().String(), + ID: nodeID, EvaluationBroker: evalBroker, Store: jobStore, EventEmitter: eventEmitter, @@ -264,7 +226,7 @@ func NewRequesterNode( transformer.JobFn(transformer.IDGenerator), transformer.NameOptional(), transformer.DefaultsApplier(requesterConfig.JobDefaults), - transformer.RequesterInfo(host.ID().String(), marshaledPublicKey), + transformer.RequesterInfo(nodeID), transformer.NewInlineStoragePinner(storageProvider), }, TaskTranslator: translationProvider, @@ -274,16 +236,10 @@ func NewRequesterNode( housekeeping := requester.NewHousekeeping(requester.HousekeepingParams{ Endpoint: endpoint, JobStore: jobStore, - NodeID: host.ID().String(), + NodeID: nodeID, Interval: requesterConfig.HousekeepingBackgroundTaskInterval, }) - // register a handler for the bacalhau protocol handler that will forward requests to the scheduler - 
bprotocol.NewCallbackHandler(bprotocol.CallbackHandlerParams{ - Host: host, - Callback: endpoint, - }) - // register debug info providers for the /debug endpoint debugInfoProviders := []model.DebugInfoProvider{ discovery.NewDebugInfoProvider(nodeDiscoveryChain), @@ -308,7 +264,7 @@ func NewRequesterNode( auth_endpoint.BindEndpoint(ctx, apiServer.Router, authnProvider) // Register event handlers - lifecycleEventHandler := system.NewJobLifecycleEventHandler(host.ID().String()) + lifecycleEventHandler := system.NewJobLifecycleEventHandler(nodeID) eventTracer, err := eventhandler.NewTracer() if err != nil { return nil, err @@ -324,8 +280,6 @@ func NewRequesterNode( eventTracer, // dispatches events to listening websockets requesterAPIServer, - // publish job events to the network - jobInfoPublisher, ) // A single cleanup function to make sure the order of closing dependencies is correct @@ -337,12 +291,7 @@ func NewRequesterNode( } evalBroker.SetEnabled(false) - cleanupErr := jobInfoPubSub.Close(ctx) - if cleanupErr != nil { - util.LogDebugIfContextCancelled(ctx, cleanupErr, "failed to shutdown job info pubsub") - } - - cleanupErr = tracerContextProvider.Shutdown() + cleanupErr := tracerContextProvider.Shutdown() if cleanupErr != nil { util.LogDebugIfContextCancelled(ctx, cleanupErr, "failed to shutdown tracer context provider") } @@ -359,19 +308,15 @@ func NewRequesterNode( } return &Requester{ - Endpoint: endpoint, - localCallback: endpoint, - NodeDiscoverer: nodeDiscoveryChain, - JobStore: jobStore, - computeProxy: computeProxy, - cleanupFunc: cleanupFunc, + Endpoint: endpoint, + localCallback: endpoint, + NodeDiscoverer: nodeDiscoveryChain, + JobStore: jobStore, + cleanupFunc: cleanupFunc, + debugInfoProviders: debugInfoProviders, }, nil } -func (r *Requester) RegisterLocalComputeEndpoint(endpoint compute.Endpoint) { - r.computeProxy.RegisterLocalComputeEndpoint(endpoint) -} - func (r *Requester) cleanup(ctx context.Context) { r.cleanupFunc(ctx) } diff --git a/pkg/orchestrator/scheduler/batch_job_test.go b/pkg/orchestrator/scheduler/batch_job_test.go index 7c929074c7..2cfa9c449b 100644 --- a/pkg/orchestrator/scheduler/batch_job_test.go +++ b/pkg/orchestrator/scheduler/batch_job_test.go @@ -12,7 +12,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator/retry" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -26,11 +25,11 @@ const ( ) var nodeIDs = []string{ - "QmdZQ7ZbhnvWY1J12XYKGHApJ6aufKyLNSvf8jZBrBaAVL", - "QmXaXu9N5GNetatsvwnTfQqNtSeKAD6uCmarbh3LMRYAcF", - "QmYgxZiySj3MRkwLSL4X2MF5F9f2PMhAE3LV49XkfNL1o3", - "QmcWJnVXJ82DKJq8ED79LADR4ZBTnwgTK7yn6JQbNVMbbC", - "QmXRdLruWyETS2Z8XFrXxBFYXctfjT8T9mZWyuqwUm6rQk", + "Node0", + "Node1", + "Node2", + "Node3", + "Node4", } type BatchJobSchedulerTestSuite struct { @@ -80,10 +79,10 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_ShouldCreateEnoughExecutions() matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -230,7 +229,7 @@ func (s *BatchJobSchedulerTestSuite) TestFailUnhealthyExecs_ShouldMarkExecutions matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - 
NewExecutionsNodes: []peer.ID{nodeInfos[0].PeerInfo.ID}, + NewExecutionsNodes: []string{nodeInfos[0].ID()}, StoppedExecutions: []string{ executions[execBidAccepted].ID, }, diff --git a/pkg/orchestrator/scheduler/daemon_job_test.go b/pkg/orchestrator/scheduler/daemon_job_test.go index 71b6a63aaa..bf97c62886 100644 --- a/pkg/orchestrator/scheduler/daemon_job_test.go +++ b/pkg/orchestrator/scheduler/daemon_job_test.go @@ -11,7 +11,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -59,10 +58,10 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldCreateNewExecutions() { Evaluation: evaluation, JobState: models.JobStateTypeRunning, NewExecutionDesiredState: models.ExecutionDesiredStateRunning, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -105,7 +104,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldMarkLostExecutionsOnUnhe matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{}, + NewExecutionsNodes: []string{}, StoppedExecutions: []string{ executions[0].ID, }, diff --git a/pkg/orchestrator/scheduler/ops_job_test.go b/pkg/orchestrator/scheduler/ops_job_test.go index 359b3617c5..156db876d7 100644 --- a/pkg/orchestrator/scheduler/ops_job_test.go +++ b/pkg/orchestrator/scheduler/ops_job_test.go @@ -11,7 +11,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -59,10 +58,10 @@ func (s *OpsJobSchedulerTestSuite) TestProcess_ShouldCreateNewExecutions() { Evaluation: evaluation, JobState: models.JobStateTypeRunning, NewExecutionDesiredState: models.ExecutionDesiredStateRunning, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -100,7 +99,7 @@ func (s *OpsJobSchedulerTestSuite) TestProcess_ShouldMarkLostExecutionsOnUnhealt matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{}, + NewExecutionsNodes: []string{}, StoppedExecutions: []string{ executions[0].ID, }, diff --git a/pkg/orchestrator/scheduler/service_job_test.go b/pkg/orchestrator/scheduler/service_job_test.go index 80eb975ba3..6784e922ab 100644 --- a/pkg/orchestrator/scheduler/service_job_test.go +++ b/pkg/orchestrator/scheduler/service_job_test.go @@ -12,7 +12,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator/retry" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -72,10 +71,10 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_ShouldCreateEnoughExecutions( matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - 
nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -226,9 +225,9 @@ func (s *ServiceJobSchedulerTestSuite) TestFailUnhealthyExecs_ShouldMarkExecutio matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), }, StoppedExecutions: []string{ executions[execServiceBidAccepted1].ID, @@ -262,9 +261,9 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_TreatCompletedExecutionsAsFai matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) diff --git a/pkg/orchestrator/scheduler/utils_test.go b/pkg/orchestrator/scheduler/utils_test.go index 587367fcfb..cc8b243822 100644 --- a/pkg/orchestrator/scheduler/utils_test.go +++ b/pkg/orchestrator/scheduler/utils_test.go @@ -7,15 +7,13 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/stretchr/testify/require" ) type PlanMatcher struct { t *testing.T JobState models.JobStateType Evaluation *models.Evaluation - NewExecutionsNodes []peer.ID + NewExecutionsNodes []string NewExecutionsDesiredState models.ExecutionDesiredStateType StoppedExecutions []string ApprovedExecutions []string @@ -24,7 +22,7 @@ type PlanMatcher struct { type PlanMatcherParams struct { JobState models.JobStateType Evaluation *models.Evaluation - NewExecutionsNodes []peer.ID + NewExecutionsNodes []string NewExecutionDesiredState models.ExecutionDesiredStateType StoppedExecutions []string ApprovedExecutions []string @@ -68,7 +66,7 @@ func (m PlanMatcher) Matches(x interface{}) bool { return false } for _, node := range m.NewExecutionsNodes { - desiredState, ok := newExecutionNodes[node.String()] + desiredState, ok := newExecutionNodes[node] if !ok { m.t.Logf("NewExecutionsNodes: %v != %s", newExecutionNodes, m.NewExecutionsNodes) return false @@ -123,11 +121,7 @@ func (m PlanMatcher) String() string { } func mockNodeInfo(t *testing.T, nodeID string) *models.NodeInfo { - id, err := peer.Decode(nodeID) - require.NoError(t, err) return &models.NodeInfo{ - PeerInfo: peer.AddrInfo{ - ID: id, - }, + NodeID: nodeID, } } diff --git a/pkg/orchestrator/selection/discovery/chained.go b/pkg/orchestrator/selection/discovery/chained.go index a3278cc7a8..b1aeb0f8bf 100644 --- a/pkg/orchestrator/selection/discovery/chained.go +++ b/pkg/orchestrator/selection/discovery/chained.go @@ -5,7 +5,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" "github.com/pkg/errors" "github.com/rs/zerolog/log" "go.uber.org/multierr" @@ -39,14 +38,14 @@ func (c *Chain) chainDiscovery( getNodes func(orchestrator.NodeDiscoverer) ([]models.NodeInfo, error), ) ([]models.NodeInfo, error) { var err error - uniqueNodes := make(map[peer.ID]models.NodeInfo, 0) + uniqueNodes := make(map[string]models.NodeInfo, 0) for _, discoverer := range c.discoverers { nodeInfos, discoverErr := getNodes(discoverer) err = multierr.Append(err, errors.Wrapf(discoverErr, "error 
finding nodes from %T", discoverer)) currentNodesCount := len(uniqueNodes) for _, nodeInfo := range nodeInfos { - if _, ok := uniqueNodes[nodeInfo.PeerInfo.ID]; !ok { - uniqueNodes[nodeInfo.PeerInfo.ID] = nodeInfo + if _, ok := uniqueNodes[nodeInfo.ID()]; !ok { + uniqueNodes[nodeInfo.ID()] = nodeInfo } } log.Ctx(ctx).Debug().Msgf("[%s] found %d more nodes by %T", caller, len(uniqueNodes)-currentNodesCount, discoverer) diff --git a/pkg/orchestrator/selection/discovery/chained_test.go b/pkg/orchestrator/selection/discovery/chained_test.go index 88673d74d2..b542a5eb77 100644 --- a/pkg/orchestrator/selection/discovery/chained_test.go +++ b/pkg/orchestrator/selection/discovery/chained_test.go @@ -8,7 +8,6 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -21,9 +20,9 @@ type ChainedSuite struct { } func (s *ChainedSuite) SetupSuite() { - s.peerID1 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID1")}} - s.peerID2 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID2")}} - s.peerID3 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID3")}} + s.peerID1 = models.NodeInfo{NodeID: "peerID1"} + s.peerID2 = models.NodeInfo{NodeID: "peerID2"} + s.peerID3 = models.NodeInfo{NodeID: "peerID3"} } func (s *ChainedSuite) SetupTest() { diff --git a/pkg/orchestrator/selection/discovery/store_test.go b/pkg/orchestrator/selection/discovery/store_test.go index 6f32a7ea75..0e1cc6699e 100644 --- a/pkg/orchestrator/selection/discovery/store_test.go +++ b/pkg/orchestrator/selection/discovery/store_test.go @@ -10,8 +10,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/routing/inmemory" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/multiformats/go-multiaddr" "github.com/stretchr/testify/suite" ) @@ -63,12 +61,7 @@ func (s *StoreNodeDiscovererSuite) TestListNodes_Empty() { func generateNodeInfo(id string, engines ...string) models.NodeInfo { return models.NodeInfo{ - PeerInfo: peer.AddrInfo{ - ID: peer.ID(id), - Addrs: []multiaddr.Multiaddr{ - multiaddr.StringCast("/ip4/0.0.0.0/tcp/1234"), - }, - }, + NodeID: id, NodeType: models.NodeTypeCompute, ComputeNodeInfo: &models.ComputeNodeInfo{ ExecutionEngines: engines, diff --git a/pkg/orchestrator/selection/ranking/chain.go b/pkg/orchestrator/selection/ranking/chain.go index 2c2a621e8b..e2a636c4f1 100644 --- a/pkg/orchestrator/selection/ranking/chain.go +++ b/pkg/orchestrator/selection/ranking/chain.go @@ -5,7 +5,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" ) // Chain assigns a random rank to each node to allow the orchestrator to select random top nodes @@ -25,9 +24,9 @@ func (c *Chain) Add(ranker ...orchestrator.NodeRanker) { func (c *Chain) RankNodes(ctx context.Context, job models.Job, nodes []models.NodeInfo) ([]orchestrator.NodeRank, error) { // initialize map of node ranks - ranksMap := make(map[peer.ID]*orchestrator.NodeRank, len(nodes)) + ranksMap := make(map[string]*orchestrator.NodeRank, len(nodes)) for _, node := range nodes { - ranksMap[node.PeerInfo.ID] = &orchestrator.NodeRank{NodeInfo: node, Rank: orchestrator.RankPossible} + ranksMap[node.ID()] = &orchestrator.NodeRank{NodeInfo: node, Rank: orchestrator.RankPossible} } // iterate over the rankers and add their ranks to the map @@ -40,10 +39,10 
@@ func (c *Chain) RankNodes(ctx context.Context, job models.Job, nodes []models.No } for _, nodeRank := range nodeRanks { if !nodeRank.MeetsRequirement() { - ranksMap[nodeRank.NodeInfo.PeerInfo.ID].Rank = orchestrator.RankUnsuitable - ranksMap[nodeRank.NodeInfo.PeerInfo.ID].Reason = nodeRank.Reason - } else if ranksMap[nodeRank.NodeInfo.PeerInfo.ID].MeetsRequirement() { - ranksMap[nodeRank.NodeInfo.PeerInfo.ID].Rank += nodeRank.Rank + ranksMap[nodeRank.NodeInfo.ID()].Rank = orchestrator.RankUnsuitable + ranksMap[nodeRank.NodeInfo.ID()].Reason = nodeRank.Reason + } else if ranksMap[nodeRank.NodeInfo.ID()].MeetsRequirement() { + ranksMap[nodeRank.NodeInfo.ID()].Rank += nodeRank.Rank } } } diff --git a/pkg/orchestrator/selection/ranking/chain_test.go b/pkg/orchestrator/selection/ranking/chain_test.go index 5aeae01c13..14baebbe59 100644 --- a/pkg/orchestrator/selection/ranking/chain_test.go +++ b/pkg/orchestrator/selection/ranking/chain_test.go @@ -7,7 +7,6 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -20,9 +19,9 @@ type ChainSuite struct { } func (s *ChainSuite) SetupSuite() { - s.peerID1 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID1")}} - s.peerID2 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID2")}} - s.peerID3 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID3")}} + s.peerID1 = models.NodeInfo{NodeID: "peerID1"} + s.peerID2 = models.NodeInfo{NodeID: "peerID2"} + s.peerID3 = models.NodeInfo{NodeID: "peerID3"} } func (s *ChainSuite) SetupTest() { diff --git a/pkg/orchestrator/selection/ranking/features_test.go b/pkg/orchestrator/selection/ranking/features_test.go index fff6578606..5d577ccd12 100644 --- a/pkg/orchestrator/selection/ranking/features_test.go +++ b/pkg/orchestrator/selection/ranking/features_test.go @@ -8,7 +8,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -22,23 +21,23 @@ type FeatureNodeRankerSuite struct { func (s *FeatureNodeRankerSuite) Nodes() []models.NodeInfo { return []models.NodeInfo{ { - PeerInfo: peer.AddrInfo{ID: peer.ID("docker")}, + NodeID: "docker", ComputeNodeInfo: &models.ComputeNodeInfo{ExecutionEngines: []string{models.EngineDocker}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("wasm")}, + NodeID: "wasm", ComputeNodeInfo: &models.ComputeNodeInfo{ExecutionEngines: []string{models.EngineWasm}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("ipfs")}, + NodeID: "ipfs", ComputeNodeInfo: &models.ComputeNodeInfo{StorageSources: []string{models.StorageSourceIPFS}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("url")}, + NodeID: "url", ComputeNodeInfo: &models.ComputeNodeInfo{StorageSources: []string{models.StorageSourceURL}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("combo")}, + NodeID: "combo", ComputeNodeInfo: &models.ComputeNodeInfo{ ExecutionEngines: []string{models.EngineDocker, models.EngineWasm}, Publishers: []string{models.PublisherIPFS, models.PublisherS3}, @@ -46,7 +45,7 @@ func (s *FeatureNodeRankerSuite) Nodes() []models.NodeInfo { }, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("unknown")}, + NodeID: "unknown", }, } } diff --git a/pkg/orchestrator/selection/ranking/max_usage_test.go b/pkg/orchestrator/selection/ranking/max_usage_test.go index f49df4f789..11df116190 100644 --- a/pkg/orchestrator/selection/ranking/max_usage_test.go +++ 
b/pkg/orchestrator/selection/ranking/max_usage_test.go @@ -8,7 +8,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -22,15 +21,15 @@ type MaxUsageNodeRankerSuite struct { func (s *MaxUsageNodeRankerSuite) SetupSuite() { s.smallPeer = models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID("small")}, + NodeID: "small", ComputeNodeInfo: &models.ComputeNodeInfo{MaxJobRequirements: models.Resources{CPU: 1}}, } s.medPeer = models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID("med")}, + NodeID: "med", ComputeNodeInfo: &models.ComputeNodeInfo{MaxJobRequirements: models.Resources{CPU: 2}}, } s.largePeer = models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID("large")}, + NodeID: "large", ComputeNodeInfo: &models.ComputeNodeInfo{MaxJobRequirements: models.Resources{CPU: 3}}, } } diff --git a/pkg/orchestrator/selection/ranking/min_version_test.go b/pkg/orchestrator/selection/ranking/min_version_test.go index 5f882bd370..74db51cfae 100644 --- a/pkg/orchestrator/selection/ranking/min_version_test.go +++ b/pkg/orchestrator/selection/ranking/min_version_test.go @@ -7,7 +7,6 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -89,7 +88,7 @@ func (s *MinVersionNodeRankerSuite) TestRankNodes() { var nodes []models.NodeInfo for _, t := range minVersionNodeRankerTestCases { nodes = append(nodes, models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID(t.name)}, + NodeID: t.name, BacalhauVersion: t.version, }) } @@ -110,7 +109,7 @@ func (s *MinVersionNodeRankerSuite) TestRankNodes_NilMinVersion() { var nodes []models.NodeInfo for _, t := range minVersionNodeRankerTestCases { nodes = append(nodes, models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID(t.name)}, + NodeID: t.name, BacalhauVersion: t.version, }) } diff --git a/pkg/orchestrator/selection/ranking/random_test.go b/pkg/orchestrator/selection/ranking/random_test.go index 30e7c597d7..9ac48d84b6 100644 --- a/pkg/orchestrator/selection/ranking/random_test.go +++ b/pkg/orchestrator/selection/ranking/random_test.go @@ -8,7 +8,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -28,7 +27,7 @@ func (s *RandomNodeRankerSuite) TestRankNodes() { var nodes []models.NodeInfo for i := 0; i < nodeCount; i++ { nodes = append(nodes, models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID(rune(i))}, + NodeID: "node" + string(rune(i)), }) } s.RandomNodeRanker = NewRandomNodeRanker(RandomNodeRankerParams{RandomnessRange: randomnessRange}) diff --git a/pkg/orchestrator/selection/ranking/utils_test.go b/pkg/orchestrator/selection/ranking/utils_test.go index c5484256aa..bb6951c12a 100644 --- a/pkg/orchestrator/selection/ranking/utils_test.go +++ b/pkg/orchestrator/selection/ranking/utils_test.go @@ -4,12 +4,11 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" ) func assertEquals(t *testing.T, ranks []orchestrator.NodeRank, nodeID string, expectedRank int) { for _, rank := range ranks { - if rank.NodeInfo.PeerInfo.ID == peer.ID(nodeID) { + if rank.NodeInfo.ID() == nodeID { if rank.Rank != expectedRank { t.Errorf("expected rank %d for node %s, got %d", expectedRank, nodeID, rank.Rank) } diff --git 
a/pkg/orchestrator/transformer/job.go b/pkg/orchestrator/transformer/job.go index b8773b8f11..c21a9e7b93 100644 --- a/pkg/orchestrator/transformer/job.go +++ b/pkg/orchestrator/transformer/job.go @@ -4,7 +4,6 @@ import ( "context" "time" - "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/util/idgen" ) @@ -37,11 +36,10 @@ func DefaultsApplier(defaults JobDefaults) JobTransformer { return JobFn(f) } -// RequesterInfo is a transformer that sets the requester ID and public key in the job meta. -func RequesterInfo(requesterNodeID string, requesterPubKey model.PublicKey) JobTransformer { +// RequesterInfo is a transformer that sets the requester ID in the job meta. +func RequesterInfo(requesterNodeID string) JobTransformer { f := func(ctx context.Context, job *models.Job) error { job.Meta[models.MetaRequesterID] = requesterNodeID - job.Meta[models.MetaRequesterPublicKey] = requesterPubKey.String() return nil } return JobFn(f) diff --git a/pkg/orchestrator/types.go b/pkg/orchestrator/types.go index 537ae71e13..beebc076be 100644 --- a/pkg/orchestrator/types.go +++ b/pkg/orchestrator/types.go @@ -70,7 +70,7 @@ func (r NodeRank) MeetsRequirement() bool { } func (r NodeRank) MarshalZerologObject(e *zerolog.Event) { - e.Stringer("Node", r.NodeInfo.PeerInfo.ID). + e.Str("Node", r.NodeInfo.ID()). Bool("MeetsRequirement", r.MeetsRequirement()). Str("Reason", r.Reason) } diff --git a/pkg/publicapi/endpoint/agent/endpoint.go b/pkg/publicapi/endpoint/agent/endpoint.go index 8d791e6f8f..ad0e5a82ea 100644 --- a/pkg/publicapi/endpoint/agent/endpoint.go +++ b/pkg/publicapi/endpoint/agent/endpoint.go @@ -3,27 +3,32 @@ package agent import ( "net/http" + "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" "github.com/bacalhau-project/bacalhau/pkg/publicapi/middleware" "github.com/bacalhau-project/bacalhau/pkg/version" "github.com/labstack/echo/v4" + "github.com/rs/zerolog/log" ) type EndpointParams struct { - Router *echo.Echo - NodeInfoProvider models.NodeInfoProvider + Router *echo.Echo + NodeInfoProvider models.NodeInfoProvider + DebugInfoProviders []model.DebugInfoProvider } type Endpoint struct { - router *echo.Echo - nodeInfoProvider models.NodeInfoProvider + router *echo.Echo + nodeInfoProvider models.NodeInfoProvider + debugInfoProviders []model.DebugInfoProvider } func NewEndpoint(params EndpointParams) *Endpoint { e := &Endpoint{ - router: params.Router, - nodeInfoProvider: params.NodeInfoProvider, + router: params.Router, + nodeInfoProvider: params.NodeInfoProvider, + debugInfoProviders: params.DebugInfoProviders, } // JSON group @@ -32,6 +37,7 @@ func NewEndpoint(params EndpointParams) *Endpoint { g.GET("/alive", e.alive) g.GET("/version", e.version) g.GET("/node", e.node) + g.GET("/debug", e.debug) return e } @@ -79,3 +85,25 @@ func (e *Endpoint) node(c echo.Context) error { NodeInfo: &nodeInfo, }) } + +// debug godoc +// +// @ID agent/debug +// @Summary Returns debug information on what the current node is doing. 
+// @Tags Ops +// @Produce json +// @Success 200 {object} model.DebugInfo +// @Failure 500 {object} string +// @Router /api/v1/agent/debug [get] +func (e *Endpoint) debug(c echo.Context) error { + debugInfoMap := make(map[string]interface{}) + for _, provider := range e.debugInfoProviders { + debugInfo, err := provider.GetDebugInfo(c.Request().Context()) + if err != nil { + log.Ctx(c.Request().Context()).Error().Msgf("could not get debug info from some providers: %s", err) + continue + } + debugInfoMap[debugInfo.Component] = debugInfo.Info + } + return c.JSON(http.StatusOK, debugInfoMap) +} diff --git a/pkg/publicapi/endpoint/orchestrator/node.go b/pkg/publicapi/endpoint/orchestrator/node.go index f0f12b18b6..7b095ecc56 100644 --- a/pkg/publicapi/endpoint/orchestrator/node.go +++ b/pkg/publicapi/endpoint/orchestrator/node.go @@ -51,7 +51,7 @@ func (e *Endpoint) listNodes(c echo.Context) error { var sortFnc func(a, b *models.NodeInfo) bool switch args.OrderBy { case "id", "": - sortFnc = func(a, b *models.NodeInfo) bool { return a.PeerInfo.ID < b.PeerInfo.ID } + sortFnc = func(a, b *models.NodeInfo) bool { return a.ID() < b.ID() } case "type": sortFnc = func(a, b *models.NodeInfo) bool { return a.NodeType < b.NodeType } case "available_cpu": diff --git a/pkg/publicapi/endpoint/shared/endpoint.go b/pkg/publicapi/endpoint/shared/endpoint.go index 801bb73cfe..0235e474a3 100644 --- a/pkg/publicapi/endpoint/shared/endpoint.go +++ b/pkg/publicapi/endpoint/shared/endpoint.go @@ -8,21 +8,17 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/publicapi/middleware" "github.com/bacalhau-project/bacalhau/pkg/version" "github.com/labstack/echo/v4" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/libp2p/go-libp2p/core/peerstore" ) type EndpointParams struct { Router *echo.Echo NodeID string - PeerStore peerstore.Peerstore NodeInfoProvider models.NodeInfoProvider } type Endpoint struct { router *echo.Echo nodeID string - peerStore peerstore.Peerstore nodeInfoProvider models.NodeInfoProvider } @@ -30,14 +26,12 @@ func NewEndpoint(params EndpointParams) *Endpoint { e := &Endpoint{ router: params.Router, nodeID: params.NodeID, - peerStore: params.PeerStore, nodeInfoProvider: params.NodeInfoProvider, } // JSON group g := e.router.Group("/api/v1") g.Use(middleware.SetContentType(echo.MIMEApplicationJSON)) - g.GET("/peers", e.peers) g.GET("/node_info", e.nodeInfo) g.POST("/version", e.version) g.GET("/healthz", e.healthz) @@ -70,23 +64,6 @@ func (e *Endpoint) id(c echo.Context) error { return c.String(http.StatusOK, e.nodeID) } -// @ID peers -// @Summary Returns the peers connected to the host via the transport layer. -// @Description As described in the [architecture docs](https://docs.bacalhau.org/about-bacalhau/architecture), -// @Description each node is connected to a number of peer nodes. 
-// @Tags Utils -// @Produce json -// @Success 200 {object} []peer.AddrInfo -// @Failure 500 {object} string -// @Router /api/v1/peers [get] -func (e *Endpoint) peers(c echo.Context) error { - var peerInfos []peer.AddrInfo - for _, p := range e.peerStore.Peers() { - peerInfos = append(peerInfos, e.peerStore.PeerInfo(p)) - } - return c.JSON(http.StatusOK, peerInfos) -} - // nodeInfo godoc // // @ID nodeInfo diff --git a/pkg/publicapi/test/agent_test.go b/pkg/publicapi/test/agent_test.go index 568ead488e..1f5d32f5c3 100644 --- a/pkg/publicapi/test/agent_test.go +++ b/pkg/publicapi/test/agent_test.go @@ -34,7 +34,7 @@ func (s *ServerSuite) TestAgentNode() { s.Require().NotEmpty(resp) s.Require().NotNil(resp.NodeInfo) - expectedNode, err := s.requesterNode.NodeInfoStore.Get(context.Background(), s.requesterNode.Host.ID().String()) + expectedNode, err := s.requesterNode.NodeInfoStore.Get(context.Background(), s.requesterNode.ID) s.Require().NoError(err) equalNodeInfo(s.T(), expectedNode, *resp.NodeInfo) @@ -45,16 +45,11 @@ func (s *ServerSuite) TestAgentNodeCompute() { s.Require().NoError(err) s.Require().NotEmpty(resp) s.Require().NotNil(resp.NodeInfo) - - expectedNode, err := s.computeNode.NodeInfoStore.Get(context.Background(), s.computeNode.Host.ID().String()) - s.Require().NoError(err) - - equalNodeInfo(s.T(), expectedNode, *resp.NodeInfo) } func equalNodeInfo(t *testing.T, a, b models.NodeInfo) { require.Equal(t, a.BacalhauVersion, b.BacalhauVersion) - require.Equal(t, a.PeerInfo, b.PeerInfo) + require.Equal(t, a.ID(), b.ID()) require.Equal(t, a.NodeType, b.NodeType) require.Equal(t, a.Labels, b.Labels) diff --git a/pkg/publicapi/test/requester_server_test.go b/pkg/publicapi/test/requester_server_test.go index 63eacabbcc..fa92a007a5 100644 --- a/pkg/publicapi/test/requester_server_test.go +++ b/pkg/publicapi/test/requester_server_test.go @@ -42,7 +42,9 @@ func (s *RequesterSuite) SetupTest() { // After each test func (s *RequesterSuite) TearDownTest() { - s.node.CleanupManager.Cleanup(context.Background()) + if s.node != nil { + s.node.CleanupManager.Cleanup(context.Background()) + } } func (s *RequesterSuite) TestList() { diff --git a/pkg/publicapi/test/util_test.go b/pkg/publicapi/test/util_test.go index 2a8336a571..2f136ea132 100644 --- a/pkg/publicapi/test/util_test.go +++ b/pkg/publicapi/test/util_test.go @@ -2,9 +2,13 @@ package test import ( "context" + "os" "testing" "time" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peer" "github.com/phayes/freeport" "github.com/stretchr/testify/require" @@ -35,8 +39,21 @@ func setupNodeForTestWithConfig(t *testing.T, apiCfg publicapi.Config) (*node.No privKey, err := config.GetLibp2pPrivKey() require.NoError(t, err) - libp2pHost, err := libp2p.NewHost(libp2pPort, privKey) + + peerID, err := peer.IDFromPrivateKey(privKey) require.NoError(t, err) + nodeID := peerID.String() + + var libp2pHost host.Host + networkType, ok := os.LookupEnv("BACALHAU_NODE_NETWORK_TYPE") + if !ok { + networkType = models.NetworkTypeLibp2p + } + + if networkType == models.NetworkTypeLibp2p { + libp2pHost, err = libp2p.NewHost(libp2pPort, privKey) + require.NoError(t, err) + } computeConfig, err := node.NewComputeConfigWithDefaults() require.NoError(t, err) @@ -44,8 +61,8 @@ func setupNodeForTestWithConfig(t *testing.T, apiCfg publicapi.Config) (*node.No require.NoError(t, err) nodeConfig := node.NodeConfig{ + NodeID: nodeID, CleanupManager: cm, - Host: libp2pHost, HostAddress: 
"0.0.0.0", APIPort: 0, ComputeConfig: computeConfig, @@ -57,6 +74,10 @@ func setupNodeForTestWithConfig(t *testing.T, apiCfg publicapi.Config) (*node.No NodeInfoPublisherInterval: node.TestNodeInfoPublishConfig, FsRepo: fsRepo, NodeInfoStoreTTL: 10 * time.Minute, + NetworkConfig: node.NetworkConfig{ + Type: networkType, + Libp2pHost: libp2pHost, + }, } n, err := node.NewNode(ctx, nodeConfig) diff --git a/pkg/pubsub/libp2p/pubsub.go b/pkg/pubsub/libp2p/pubsub.go index f13f47f02c..f04f7e07ee 100644 --- a/pkg/pubsub/libp2p/pubsub.go +++ b/pkg/pubsub/libp2p/pubsub.go @@ -63,9 +63,11 @@ func (p *PubSub[T]) Publish(ctx context.Context, message T) error { return p.topic.Publish(ctx, payload) } -func (p *PubSub[T]) Subscribe(_ context.Context, subscriber pubsub.Subscriber[T]) (err error) { +func (p *PubSub[T]) Subscribe(ctx context.Context, subscriber pubsub.Subscriber[T]) (err error) { var firstSubscriber bool p.subscriberOnce.Do(func() { + log.Ctx(ctx).Debug().Msgf("Subscribing to subject %s", p.topicName) + // register the subscriber p.subscriber = subscriber diff --git a/pkg/repo/fs.go b/pkg/repo/fs.go index c1c0e5db83..43cef17481 100644 --- a/pkg/repo/fs.go +++ b/pkg/repo/fs.go @@ -65,7 +65,7 @@ func (fsr *FsRepo) Exists() (bool, error) { if err != nil { return false, err } - if version != RepoVersion1 && version != RepoVersion2 { + if !IsValidVersion(version) { return false, fmt.Errorf("unknown repo version %d", version) } return true, nil diff --git a/pkg/repo/version.go b/pkg/repo/version.go index d27b7d9f8a..506b307eb9 100644 --- a/pkg/repo/version.go +++ b/pkg/repo/version.go @@ -17,6 +17,11 @@ const ( RepoVersionFile = "repo.version" ) +// IsValidVersion returns true if the version is valid. +func IsValidVersion(version int) bool { + return version == RepoVersion1 || version == RepoVersion2 +} + type RepoVersion struct { Version int } diff --git a/pkg/requester/endpoint.go b/pkg/requester/endpoint.go index b2a46c9f0a..9342f6352a 100644 --- a/pkg/requester/endpoint.go +++ b/pkg/requester/endpoint.go @@ -24,7 +24,6 @@ import ( type BaseEndpointParams struct { ID string - PublicKey []byte EvaluationBroker orchestrator.EvaluationBroker Store jobstore.Store EventEmitter orchestrator.EventEmitter @@ -48,7 +47,7 @@ type BaseEndpoint struct { func NewBaseEndpoint(params *BaseEndpointParams) *BaseEndpoint { transforms := []jobtransform.Transformer{ jobtransform.NewTimeoutApplier(params.MinJobExecutionTimeout, params.DefaultJobExecutionTimeout), - jobtransform.NewRequesterInfo(params.ID, params.PublicKey), + jobtransform.NewRequesterInfo(params.ID), jobtransform.RepoExistsOnIPFS(params.StorageProviders), jobtransform.NewPublisherMigrator(), jobtransform.NewEngineMigrator(), diff --git a/pkg/requester/jobtransform/requester_info.go b/pkg/requester/jobtransform/requester_info.go index d37303f8f1..7da5b0aae0 100644 --- a/pkg/requester/jobtransform/requester_info.go +++ b/pkg/requester/jobtransform/requester_info.go @@ -6,11 +6,10 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/model" ) -func NewRequesterInfo(requesterNodeID string, requesterPubKey model.PublicKey) Transformer { +func NewRequesterInfo(requesterNodeID string) Transformer { return func(ctx context.Context, j *model.Job) (modified bool, err error) { j.Metadata.Requester = model.JobRequester{ - RequesterNodeID: requesterNodeID, - RequesterPublicKey: requesterPubKey, + RequesterNodeID: requesterNodeID, } return true, nil } diff --git a/pkg/routing/inmemory/inmemory.go b/pkg/routing/inmemory/inmemory.go index 527775ad24..36173b9856 
100644 --- a/pkg/routing/inmemory/inmemory.go +++ b/pkg/routing/inmemory/inmemory.go @@ -113,8 +113,8 @@ func (r *NodeInfoStore) FindPeer(ctx context.Context, peerID peer.ID) (peer.Addr if !ok { return peer.AddrInfo{}, nil } - if len(infoWrapper.PeerInfo.Addrs) > 0 { - return infoWrapper.PeerInfo, nil + if infoWrapper.PeerInfo != nil && len(infoWrapper.PeerInfo.Addrs) > 0 { + return *infoWrapper.PeerInfo, nil } return peer.AddrInfo{}, nil } diff --git a/pkg/routing/inmemory/inmemory_test.go b/pkg/routing/inmemory/inmemory_test.go index 5c0544a7ac..d8ad7e1ed8 100644 --- a/pkg/routing/inmemory/inmemory_test.go +++ b/pkg/routing/inmemory/inmemory_test.go @@ -9,8 +9,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/routing" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" ) @@ -145,7 +143,7 @@ func (s *InMemoryNodeInfoStoreSuite) Test_Replace() { s.NoError(s.store.Add(ctx, nodeInfo0)) nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineWasm) - nodeInfo1.PeerInfo.ID = nodeInfo0.PeerInfo.ID + nodeInfo1.NodeID = nodeInfo0.NodeID s.NoError(s.store.Add(ctx, nodeInfo1)) res, err := s.store.Get(ctx, nodeInfo0.ID()) @@ -180,12 +178,8 @@ func (s *InMemoryNodeInfoStoreSuite) Test_Eviction() { } func generateNodeInfo(t *testing.T, peerID string, engines ...string) models.NodeInfo { - id, err := peer.Decode(peerID) - require.NoError(t, err) return models.NodeInfo{ - PeerInfo: peer.AddrInfo{ - ID: id, - }, + NodeID: peerID, NodeType: models.NodeTypeCompute, ComputeNodeInfo: &models.ComputeNodeInfo{ ExecutionEngines: engines, diff --git a/pkg/routing/node_info_provider.go b/pkg/routing/node_info_provider.go index 3ae5f4265c..3678bfa2ff 100644 --- a/pkg/routing/node_info_provider.go +++ b/pkg/routing/node_info_provider.go @@ -4,51 +4,44 @@ import ( "context" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/host" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/libp2p/go-libp2p/p2p/protocol/identify" ) type NodeInfoProviderParams struct { - Host host.Host - IdentityService identify.IDService - LabelsProvider models.LabelsProvider - ComputeInfoProvider models.ComputeNodeInfoProvider - BacalhauVersion models.BuildVersionInfo + NodeID string + LabelsProvider models.LabelsProvider + BacalhauVersion models.BuildVersionInfo } type NodeInfoProvider struct { - h host.Host - identityService identify.IDService - labelsProvider models.LabelsProvider - computeInfoProvider models.ComputeNodeInfoProvider - bacalhauVersion models.BuildVersionInfo + nodeID string + labelsProvider models.LabelsProvider + bacalhauVersion models.BuildVersionInfo + nodeInfoDecorators []models.NodeInfoDecorator } func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider { return &NodeInfoProvider{ - h: params.Host, - identityService: params.IdentityService, - labelsProvider: params.LabelsProvider, - computeInfoProvider: params.ComputeInfoProvider, - bacalhauVersion: params.BacalhauVersion, + nodeID: params.NodeID, + labelsProvider: params.LabelsProvider, + bacalhauVersion: params.BacalhauVersion, + nodeInfoDecorators: make([]models.NodeInfoDecorator, 0), } } +// RegisterNodeInfoDecorator registers a node info decorator with the node info provider. 
+func (n *NodeInfoProvider) RegisterNodeInfoDecorator(decorator models.NodeInfoDecorator) { + n.nodeInfoDecorators = append(n.nodeInfoDecorators, decorator) +} + func (n *NodeInfoProvider) GetNodeInfo(ctx context.Context) models.NodeInfo { res := models.NodeInfo{ + NodeID: n.nodeID, BacalhauVersion: n.bacalhauVersion, - PeerInfo: peer.AddrInfo{ - ID: n.h.ID(), - Addrs: n.identityService.OwnObservedAddrs(), - }, - Labels: n.labelsProvider.GetLabels(ctx), - NodeType: models.NodeTypeRequester, + Labels: n.labelsProvider.GetLabels(ctx), + NodeType: models.NodeTypeRequester, } - if n.computeInfoProvider != nil { - info := n.computeInfoProvider.GetComputeInfo(ctx) - res.NodeType = models.NodeTypeCompute - res.ComputeNodeInfo = &info + for _, decorator := range n.nodeInfoDecorators { + res = decorator.DecorateNodeInfo(ctx, res) } return res } diff --git a/pkg/test/compute/resourcelimits_test.go b/pkg/test/compute/resourcelimits_test.go index ac21a30732..1624ef83e3 100644 --- a/pkg/test/compute/resourcelimits_test.go +++ b/pkg/test/compute/resourcelimits_test.go @@ -359,11 +359,11 @@ func (suite *ComputeNodeResourceLimitsSuite) TestParallelGPU() { } // test that each node has 2 job allocated to it - node1Count, ok := allocationMap[stack.Nodes[0].Host.ID().String()] + node1Count, ok := allocationMap[stack.Nodes[0].ID] require.True(suite.T(), ok) require.Equal(suite.T(), jobsPerNode, node1Count) - node2Count, ok := allocationMap[stack.Nodes[1].Host.ID().String()] + node2Count, ok := allocationMap[stack.Nodes[1].ID] require.True(suite.T(), ok) require.Equal(suite.T(), jobsPerNode, node2Count) } diff --git a/pkg/test/compute/setup_test.go b/pkg/test/compute/setup_test.go index 247342432c..c1f5f9ceea 100644 --- a/pkg/test/compute/setup_test.go +++ b/pkg/test/compute/setup_test.go @@ -91,8 +91,21 @@ func (s *ComputeSuite) setupNode() { storagePath := s.T().TempDir() noopstorage := noop_storage.NewNoopStorage() + callback := compute.CallbackMock{ + OnBidCompleteHandler: func(ctx context.Context, result compute.BidResult) { + s.bidChannel <- result + }, + OnRunCompleteHandler: func(ctx context.Context, result compute.RunResult) { + s.completedChannel <- result + }, + OnComputeFailureHandler: func(ctx context.Context, err compute.ComputeError) { + s.failureChannel <- err + }, + } + s.node, err = node.NewComputeNode( context.Background(), + host.ID().String(), s.cm, host, apiServer, @@ -102,23 +115,13 @@ func (s *ComputeSuite) setupNode() { provider.NewNoopProvider[executor.Executor](s.executor), provider.NewNoopProvider[publisher.Publisher](s.publisher), repo, + callback, ) s.NoError(err) s.stateResolver = *resolver.NewStateResolver(resolver.StateResolverParams{ ExecutionStore: s.node.ExecutionStore, }) - s.node.RegisterLocalComputeCallback(compute.CallbackMock{ - OnBidCompleteHandler: func(ctx context.Context, result compute.BidResult) { - s.bidChannel <- result - }, - OnRunCompleteHandler: func(ctx context.Context, result compute.RunResult) { - s.completedChannel <- result - }, - OnComputeFailureHandler: func(ctx context.Context, err compute.ComputeError) { - s.failureChannel <- err - }, - }) s.T().Cleanup(func() { close(s.bidChannel) }) } diff --git a/pkg/test/logstream/stream_address_test.go b/pkg/test/logstream/stream_address_test.go index b4af812c5a..ea49a4658c 100644 --- a/pkg/test/logstream/stream_address_test.go +++ b/pkg/test/logstream/stream_address_test.go @@ -20,6 +20,10 @@ func (s *LogStreamTestSuite) TestStreamAddress() { docker.MustHaveDocker(s.T()) + if s.stack.Nodes[0].Libp2pHost == nil { 
+ // TODO: un-skip once we add log stream support for nats transport layer + s.T().Skip("skipping log stream tests for non-libp2p transports") + } node := s.stack.Nodes[0] task := mock.TaskBuilder(). @@ -31,7 +35,7 @@ func (s *LogStreamTestSuite) TestStreamAddress() { job.Tasks[0] = task execution := mock.ExecutionForJob(job) - execution.NodeID = node.Host.ID().Pretty() + execution.NodeID = node.ID execution.AllocateResources(task.Name, models.Resources{}) err := node.RequesterNode.JobStore.CreateJob(s.ctx, *job) diff --git a/pkg/test/requester/node_selection_test.go b/pkg/test/requester/node_selection_test.go index 695e206924..9c6b41ea7e 100644 --- a/pkg/test/requester/node_selection_test.go +++ b/pkg/test/requester/node_selection_test.go @@ -189,7 +189,7 @@ func (s *NodeSelectionSuite) getSelectedNodes(jobID string) []*node.Node { for _, executionState := range completedExecutionStates { nodeFound := false for _, n := range s.computeNodes { - if n.Host.ID().String() == executionState.NodeID { + if n.ID == executionState.NodeID { nodes = append(nodes, n) nodeFound = true break @@ -206,10 +206,10 @@ func (s *NodeSelectionSuite) assertNodesMatch(expected, selected []*node.Node) { expectedNodeNames := make([]string, 0, len(expected)) selectedNodeNames := make([]string, 0, len(selected)) for _, n := range expected { - expectedNodeNames = append(expectedNodeNames, n.Host.ID().String()) + expectedNodeNames = append(expectedNodeNames, n.ID) } for _, n := range selected { - selectedNodeNames = append(selectedNodeNames, n.Host.ID().String()) + selectedNodeNames = append(selectedNodeNames, n.ID) } s.ElementsMatch(expectedNodeNames, selectedNodeNames) } diff --git a/pkg/test/teststack/stack.go b/pkg/test/teststack/stack.go index 76a4267a0f..58613deb24 100644 --- a/pkg/test/teststack/stack.go +++ b/pkg/test/teststack/stack.go @@ -93,7 +93,7 @@ func WithNoopExecutor(noopConfig noop_executor.ExecutorConfig) devstack.ConfigOp func allNodesDiscovered(t testing.TB, stack *devstack.DevStack) bool { for _, node := range stack.Nodes { - ctx := logger.ContextWithNodeIDLogger(context.Background(), node.Host.ID().String()) + ctx := logger.ContextWithNodeIDLogger(context.Background(), node.ID) if !node.IsRequesterNode() || node.RequesterNode == nil { continue diff --git a/pkg/transport/interfaces.go b/pkg/transport/interfaces.go new file mode 100644 index 0000000000..c314a71302 --- /dev/null +++ b/pkg/transport/interfaces.go @@ -0,0 +1,34 @@ +package transport + +import ( + "context" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" +) + +// TransportLayer is the interface for the transport layer. +type TransportLayer interface { + // ComputeProxy enables orchestrator nodes to send job requests to compute nodes. + ComputeProxy() compute.Endpoint + // CallbackProxy enables compute nodes to send results and responses back to orchestrator nodes + CallbackProxy() compute.Callback + // NodeInfoPubSub enables compute nodes to publish their info and capabilities + // to orchestrator nodes for job matching and discovery. 
+	NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] +	// NodeInfoDecorator enables the transport layer to enrich node info with data +	// required for request routing. +	NodeInfoDecorator() models.NodeInfoDecorator +	// DebugInfoProviders enables the transport layer to provide meaningful debug info to operators. +	DebugInfoProviders() []model.DebugInfoProvider +	// RegisterComputeCallback registers a compute callback with the transport layer +	// so that incoming compute responses are forwarded to the handler. +	RegisterComputeCallback(callback compute.Callback) error +	// RegisterComputeEndpoint registers a compute endpoint with the transport layer +	// so that incoming orchestrator requests are forwarded to the handler. +	RegisterComputeEndpoint(endpoint compute.Endpoint) error +	// Close closes the transport layer. +	Close(ctx context.Context) error +}
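
For orientation, the sketch below shows one way a compute node could wire its local handlers into an implementation of this `TransportLayer` interface. It is an illustrative example only, not part of the patch; the function and variable names (`wireComputeNode`, `localEndpoint`) are hypothetical.

```go
package example

import (
	"context"

	"github.com/bacalhau-project/bacalhau/pkg/compute"
	"github.com/bacalhau-project/bacalhau/pkg/transport"
)

// wireComputeNode is a hypothetical helper showing how a compute node could
// attach its local handlers to a TransportLayer implementation (NATS or libp2p).
func wireComputeNode(ctx context.Context, t transport.TransportLayer, localEndpoint compute.Endpoint) (compute.Callback, error) {
	// Incoming orchestrator requests (e.g. AskForBid, BidAccepted) are routed
	// to the node's local compute endpoint.
	if err := t.RegisterComputeEndpoint(localEndpoint); err != nil {
		return nil, err
	}
	// The callback proxy is what the compute node uses to report results and
	// failures back to the orchestrator over the same transport.
	return t.CallbackProxy(), nil
}
```

The same shape works on the orchestrator side, where the node would instead call `RegisterComputeCallback` for incoming responses and use `ComputeProxy()` to dispatch requests to compute nodes.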