diff --git a/cmd/cli/agent/node_test.go b/cmd/cli/agent/node_test.go index b6250fa663..78f1d7c1d9 100644 --- a/cmd/cli/agent/node_test.go +++ b/cmd/cli/agent/node_test.go @@ -33,7 +33,7 @@ func (s *NodeSuite) TestNodeJSONOutput() { nodeInfo := &models.NodeInfo{} err = marshaller.JSONUnmarshalWithMax([]byte(out), &nodeInfo) s.Require().NoError(err, "Could not unmarshall the output into json - %+v", err) - s.Require().Equal(s.Node.Host.ID(), nodeInfo.PeerInfo.ID, "Node ID does not match in json.") + s.Require().Equal(s.Node.ID, nodeInfo.ID(), "Node ID does not match in json.") } func (s *NodeSuite) TestNodeYAMLOutput() { @@ -46,5 +46,5 @@ func (s *NodeSuite) TestNodeYAMLOutput() { nodeInfo := &models.NodeInfo{} err = marshaller.YAMLUnmarshalWithMax([]byte(out), &nodeInfo) s.Require().NoError(err, "Could not unmarshall the output into yaml - %+v", err) - s.Require().Equal(s.Node.Host.ID(), nodeInfo.PeerInfo.ID, "Node ID does not match in yaml.") + s.Require().Equal(s.Node.ID, nodeInfo.ID(), "Node ID does not match in yaml.") } diff --git a/cmd/cli/devstack/devstack.go b/cmd/cli/devstack/devstack.go index cb198205cf..2ad4254e2a 100644 --- a/cmd/cli/devstack/devstack.go +++ b/cmd/cli/devstack/devstack.go @@ -147,7 +147,9 @@ func NewCmd() *cobra.Command { &ODs.ConfigurationRepo, "stack-repo", ODs.ConfigurationRepo, "Folder to act as the devstack configuration repo", ) - + devstackCmd.PersistentFlags().StringVar( + &ODs.NetworkType, "network", ODs.NetworkType, + "Type of inter-node network layer. e.g. nats and libp2p") return devstackCmd } diff --git a/cmd/cli/get/get_test.go b/cmd/cli/get/get_test.go index 3dded99b3b..9f1c2a4531 100644 --- a/cmd/cli/get/get_test.go +++ b/cmd/cli/get/get_test.go @@ -136,7 +136,7 @@ func (s *GetSuite) TestDockerRunWriteToJobFolderAutoDownload() { _, runOutput, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(runOutput) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID outputFolder := filepath.Join(tempDir, util.GetDefaultJobFolder(jobID)) testDownloadOutput(s.T(), runOutput, jobID, tempDir) testResultsFolderStructure(s.T(), outputFolder, hostID, nil) @@ -157,7 +157,7 @@ func (s *GetSuite) TestDockerRunWriteToJobFolderNamedDownload() { _, runOutput, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(runOutput) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID testDownloadOutput(s.T(), runOutput, jobID, tempDir) testResultsFolderStructure(s.T(), tempDir, hostID, nil) } @@ -177,7 +177,7 @@ func (s *GetSuite) TestGetWriteToJobFolderAutoDownload() { _, out, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, @@ -224,7 +224,7 @@ func (s *GetSuite) TestGetSingleFileFromOutput() { _, out, err := cmdtesting.ExecuteTestCobraCommand(args...) 
require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, @@ -250,7 +250,7 @@ func (s *GetSuite) TestGetSingleNestedFileFromOutput() { _, out, err := cmdtesting.ExecuteTestCobraCommand(args...) require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, @@ -288,7 +288,7 @@ func (s *GetSuite) TestGetWriteToJobFolderNamedDownload() { require.NoError(s.T(), err, "Error submitting job") jobID := system.FindJobIDInTestOutputLegacy(out) - hostID := s.Node.Host.ID().String() + hostID := s.Node.ID _, getOutput, err := cmdtesting.ExecuteTestCobraCommand("get", "--api-host", s.Node.APIServer.Address, diff --git a/cmd/cli/list/list_test.go b/cmd/cli/list/list_test.go index a3ca33b8e7..0b22115d46 100644 --- a/cmd/cli/list/list_test.go +++ b/cmd/cli/list/list_test.go @@ -38,6 +38,15 @@ func TestListSuite(t *testing.T) { suite.Run(t, new(ListSuite)) } +func (suite *ListSuite) setupRun() { + // have to create a fresh node for each test case to avoid jobs of different runs to be mixed up + suite.TearDownTest() + // Clear the repo that was created by the previous run so a fresh one is created + // TODO: find a better solution to set the repo path for tests in pkg/setup/setup.go:49 instead of env vars to avoid such hacks + suite.T().Setenv("BACALHAU_DIR", "") + suite.SetupTest() +} + func (suite *ListSuite) TestList_NumberOfJobs() { tests := []struct { numberOfJobs int @@ -167,9 +176,7 @@ func (suite *ListSuite) TestList_AnnotationFilter() { for _, tc := range testCases { suite.Run(tc.Name, func() { ctx := context.Background() - // have to create a fresh node for each test case to avoid jobs of different runs to be mixed up - suite.TearDownTest() - suite.SetupTest() + suite.setupRun() testJob := testutils.MakeJobWithOpts(suite.T(), jobutils.WithAnnotations(tc.JobLabels...), @@ -257,10 +264,7 @@ func (suite *ListSuite) TestList_SortFlags() { for _, sortFlags := range sortFlagsToTest { suite.Run(fmt.Sprintf("%+v/%+v", tc, sortFlags), func() { ctx := context.Background() - - // have to create a fresh node for each test case to avoid jobs of different runs to be mixed up - suite.TearDownTest() - suite.SetupTest() + suite.setupRun() var jobIDs []string for i := 0; i < tc.numberOfJobs; i++ { diff --git a/cmd/cli/serve/serve.go b/cmd/cli/serve/serve.go index 9e0ffbc18a..943a3c31c7 100644 --- a/cmd/cli/serve/serve.go +++ b/cmd/cli/serve/serve.go @@ -1,25 +1,29 @@ package serve import ( + "context" "fmt" "os" "sort" "strings" "time" - "github.com/multiformats/go-multiaddr" - "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/cmd/util/flags/configflags" "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/config/types" bac_libp2p "github.com/bacalhau-project/bacalhau/pkg/libp2p" + "github.com/bacalhau-project/bacalhau/pkg/libp2p/rcmgr" "github.com/bacalhau-project/bacalhau/pkg/logger" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/node" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/system" "github.com/bacalhau-project/bacalhau/pkg/util/templates" 
"github.com/bacalhau-project/bacalhau/webui" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/multiformats/go-multiaddr" "github.com/spf13/cobra" "k8s.io/kubectl/pkg/util/i18n" @@ -92,6 +96,7 @@ func NewCmd() *cobra.Command { serveFlags := map[string][]configflags.Definition{ "requester-tls": configflags.RequesterTLSFlags, "server-api": configflags.ServerAPIFlags, + "network": configflags.NetworkFlags, "libp2p": configflags.Libp2pFlags, "ipfs": configflags.IPFSFlags, "capacity": configflags.CapacityFlags, @@ -173,40 +178,41 @@ func serve(cmd *cobra.Command) error { return err } - // configure node type - isRequesterNode, isComputeNode, err := getNodeType() + nodeID, err := getNodeID() if err != nil { return err } + ctx = logger.ContextWithNodeIDLogger(ctx, nodeID) - libp2pCfg, err := config.GetLibp2pConfig() + // configure node type + isRequesterNode, isComputeNode, err := getNodeType() if err != nil { return err } - peers, err := GetPeers(libp2pCfg.PeerConnect) + // Establishing IPFS connection + ipfsConfig, err := getIPFSConfig() if err != nil { return err } - // configure libp2p - libp2pHost, err := setupLibp2pHost(libp2pCfg) + ipfsClient, err := SetupIPFSClient(ctx, cm, ipfsConfig) if err != nil { return err } - cm.RegisterCallback(libp2pHost.Close) - // add nodeID to logging context - ctx = logger.ContextWithNodeIDLogger(ctx, libp2pHost.ID().String()) - // Establishing IPFS connection - ipfsConfig, err := getIPFSConfig() + networkConfig, err := getNetworkConfig() if err != nil { return err } - ipfsClient, err := SetupIPFSClient(ctx, cm, ipfsConfig) - if err != nil { - return err + if networkConfig.Type == models.NetworkTypeLibp2p { + libp2pHost, peers, err := setupLibp2p() + if err != nil { + return err + } + networkConfig.Libp2pHost = libp2pHost + networkConfig.ClusterPeers = peers } computeConfig, err := GetComputeConfig() @@ -233,9 +239,9 @@ func serve(cmd *cobra.Command) error { // Create node config from cmd arguments nodeConfig := node.NodeConfig{ + NodeID: nodeID, CleanupManager: cm, IPFSClient: ipfsClient, - Host: libp2pHost, DisabledFeatures: featureConfig, HostAddress: config.ServerAPIHost(), APIPort: config.ServerAPIPort(), @@ -247,6 +253,7 @@ func serve(cmd *cobra.Command) error { AllowListedLocalPaths: allowedListLocalPaths, FsRepo: fsRepo, NodeInfoStoreTTL: nodeInfoStoreTTL, + NetworkConfig: networkConfig, } if isRequesterNode { @@ -266,12 +273,6 @@ func serve(cmd *cobra.Command) error { return fmt.Errorf("error creating node: %w", err) } - // Start transport layer - err = bac_libp2p.ConnectToPeersContinuously(ctx, cm, libp2pHost, peers) - if err != nil { - return err - } - // Start node if err := standardNode.Start(ctx); err != nil { return fmt.Errorf("error starting node: %w", err) @@ -308,95 +309,211 @@ func serve(cmd *cobra.Command) error { cmd.Printf("API: %s\n", standardNode.APIServer.GetURI().JoinPath("/api/v1/compute/debug")) } - if ipfsConfig.PrivateInternal && libp2pCfg.PeerConnect == DefaultPeerConnect { - if isComputeNode && !isRequesterNode { - cmd.Println("Make sure there's at least one requester node in your network.") - } + connectCmd, err := buildConnectCommand(ctx, &nodeConfig, ipfsConfig) + if err != nil { + return err + } + cmd.Println() + cmd.Println(connectCmd) - ipfsAddresses, err := ipfsClient.SwarmMultiAddresses(ctx) - if err != nil { - return fmt.Errorf("error looking up IPFS addresses: %w", err) - } + envVars, err := buildEnvVariables(ctx, &nodeConfig, ipfsConfig) + if err != nil { + return err + 
} + cmd.Println() + cmd.Println("To connect to this node from the client, run the following commands in your shell:") + cmd.Println(envVars) + + ripath, err := fsRepo.WriteRunInfo(ctx, envVars) + if err != nil { + return fmt.Errorf("writing run info to repo: %w", err) + } else { + cmd.Printf("A copy of these variables have been written to: %s\n", ripath) + } + if err != nil { + return err + } - p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + libp2pHost.ID().String()) + cm.RegisterCallback(func() error { + return os.Remove(ripath) + }) + + <-ctx.Done() // block until killed + return nil +} + +func setupLibp2p() (libp2pHost host.Host, peers []string, err error) { + defer func() { if err != nil { - return err + err = fmt.Errorf("failed to setup libp2p node. %w", err) } + }() + libp2pCfg, err := config.GetLibp2pConfig() + if err != nil { + return + } + + privKey, err := config.GetLibp2pPrivKey() + if err != nil { + return + } - peerAddress := pickP2pAddress(libp2pHost.Addrs()).Encapsulate(p2pAddr).String() - ipfsSwarmAddress := pickP2pAddress(ipfsAddresses).String() + libp2pHost, err = bac_libp2p.NewHost(libp2pCfg.SwarmPort, privKey, rcmgr.DefaultResourceManager) + if err != nil { + return + } - sb := strings.Builder{} - sb.WriteString("\n") - sb.WriteString("To connect another node to this private one, run the following command in your shell:\n") + peersAddrs, err := GetPeers(libp2pCfg.PeerConnect) + if err != nil { + return + } + peers = make([]string, len(peersAddrs)) + for i, p := range peersAddrs { + peers[i] = p.String() + } + return +} - sb.WriteString(fmt.Sprintf("%s serve ", os.Args[0])) +func getNodeID() (string, error) { + // for now, use libp2p host ID as node ID, regardless of using NATS or Libp2p + // TODO: allow users to specify node ID + privKey, err := config.GetLibp2pPrivKey() + if err != nil { + return "", err + } + peerID, err := peer.IDFromPrivateKey(privKey) + if err != nil { + return "", err + } + return peerID.String(), nil +} + +func buildConnectCommand(ctx context.Context, nodeConfig *node.NodeConfig, ipfsConfig types.IpfsConfig) (string, error) { + headerB := strings.Builder{} + cmdB := strings.Builder{} + if nodeConfig.IsRequesterNode { + cmdB.WriteString(fmt.Sprintf("%s serve ", os.Args[0])) // other nodes can be just compute nodes // no need to spawn 1+ requester nodes - sb.WriteString(fmt.Sprintf("%s=compute ", + cmdB.WriteString(fmt.Sprintf("%s=compute ", configflags.FlagNameForKey(types.NodeType, configflags.NodeTypeFlags...))) - sb.WriteString(fmt.Sprintf("%s ", - configflags.FlagNameForKey(types.NodeIPFSPrivateInternal, configflags.IPFSFlags...))) + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeNetworkType, configflags.NetworkFlags...), + nodeConfig.NetworkConfig.Type)) - sb.WriteString(fmt.Sprintf("%s=%s ", - configflags.FlagNameForKey(types.NodeLibp2pPeerConnect, configflags.Libp2pFlags...), - peerAddress, - )) - sb.WriteString(fmt.Sprintf("%s=%s ", - configflags.FlagNameForKey(types.NodeIPFSSwarmAddresses, configflags.IPFSFlags...), - ipfsSwarmAddress, - )) - cmd.Println(sb.String()) + switch nodeConfig.NetworkConfig.Type { + case models.NetworkTypeNATS: + advertisedAddr := nodeConfig.NetworkConfig.AdvertisedAddress + if advertisedAddr == "" { + advertisedAddr = fmt.Sprintf("127.0.0.1:%d", nodeConfig.NetworkConfig.Port) + } - summaryBuilder := strings.Builder{} - summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%s\n", - config.KeyAsEnvVar(types.NodeIPFSSwarmAddresses), - ipfsSwarmAddress, - )) - 
summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%s\n", - config.KeyAsEnvVar(types.NodeClientAPIHost), - config.ServerAPIHost(), - )) - summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%d\n", - config.KeyAsEnvVar(types.NodeClientAPIPort), - config.ServerAPIPort(), - )) - summaryBuilder.WriteString(fmt.Sprintf( - "export %s=%s\n", - config.KeyAsEnvVar(types.NodeLibp2pPeerConnect), - peerAddress, - )) + headerB.WriteString("To connect a compute node to this orchestrator, run the following command in your shell:\n") + + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeNetworkOrchestrators, configflags.NetworkFlags...), + advertisedAddr, + )) - // Just convenience below - print out the last of the nodes information as the global variable - summaryShellVariablesString := summaryBuilder.String() + case models.NetworkTypeLibp2p: + headerB.WriteString("To connect another node to this one, run the following command in your shell:\n") - if isRequesterNode { - cmd.Println() - cmd.Println("To use this requester node from the client, run the following commands in your shell:") - cmd.Println(summaryShellVariablesString) + p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodeConfig.NetworkConfig.Libp2pHost.ID().String()) + if err != nil { + return "", err + } + peerAddress := pickP2pAddress(nodeConfig.NetworkConfig.Libp2pHost.Addrs()).Encapsulate(p2pAddr).String() + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeLibp2pPeerConnect, configflags.Libp2pFlags...), + peerAddress, + )) } - ripath, err := fsRepo.WriteRunInfo(ctx, summaryShellVariablesString) - if err != nil { - return fmt.Errorf("writing run info to repo: %w", err) - } else { - cmd.Printf("A copy of these variables have been written to: %s\n", ripath) + if ipfsConfig.PrivateInternal { + ipfsAddresses, err := nodeConfig.IPFSClient.SwarmMultiAddresses(ctx) + if err != nil { + return "", fmt.Errorf("error looking up IPFS addresses: %w", err) + } + + cmdB.WriteString(fmt.Sprintf("%s ", + configflags.FlagNameForKey(types.NodeIPFSPrivateInternal, configflags.IPFSFlags...))) + + cmdB.WriteString(fmt.Sprintf("%s=%s ", + configflags.FlagNameForKey(types.NodeIPFSSwarmAddresses, configflags.IPFSFlags...), + pickP2pAddress(ipfsAddresses).String(), + )) } - if err != nil { - return err + } else { + if nodeConfig.NetworkConfig.Type == models.NetworkTypeLibp2p { + headerB.WriteString("Make sure there's at least one requester node in your network.") + } + } + + return headerB.String() + cmdB.String(), nil +} + +func buildEnvVariables(ctx context.Context, nodeConfig *node.NodeConfig, ipfsConfig types.IpfsConfig) (string, error) { + // build shell variables to connect to this node + envVarBuilder := strings.Builder{} + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeClientAPIHost), + config.ServerAPIHost(), + )) + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%d\n", + config.KeyAsEnvVar(types.NodeClientAPIPort), + config.ServerAPIPort(), + )) + + if nodeConfig.IsRequesterNode { + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeNetworkType), nodeConfig.NetworkConfig.Type, + )) + + switch nodeConfig.NetworkConfig.Type { + case models.NetworkTypeNATS: + advertisedAddr := nodeConfig.NetworkConfig.AdvertisedAddress + if advertisedAddr == "" { + advertisedAddr = fmt.Sprintf("127.0.0.1:%d", nodeConfig.NetworkConfig.Port) + } + + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + 
config.KeyAsEnvVar(types.NodeNetworkOrchestrators), + advertisedAddr, + )) + case models.NetworkTypeLibp2p: + p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodeConfig.NetworkConfig.Libp2pHost.ID().String()) + if err != nil { + return "", err + } + peerAddress := pickP2pAddress(nodeConfig.NetworkConfig.Libp2pHost.Addrs()).Encapsulate(p2pAddr).String() + + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeLibp2pPeerConnect), + peerAddress, + )) } - cm.RegisterCallback(func() error { - return os.Remove(ripath) - }) + if ipfsConfig.PrivateInternal { + ipfsAddresses, err := nodeConfig.IPFSClient.SwarmMultiAddresses(ctx) + if err != nil { + return "", fmt.Errorf("error looking up IPFS addresses: %w", err) + } + + envVarBuilder.WriteString(fmt.Sprintf( + "export %s=%s\n", + config.KeyAsEnvVar(types.NodeIPFSSwarmAddresses), + pickP2pAddress(ipfsAddresses).String(), + )) + } } - <-ctx.Done() // block until killed - return nil + return envVarBuilder.String(), nil } // pickP2pAddress will aim to select a non-localhost IPv4 TCP address, or at least a non-localhost IPv6 one, from a list diff --git a/cmd/cli/serve/util.go b/cmd/cli/serve/util.go index e6f213aec0..ae63b933e1 100644 --- a/cmd/cli/serve/util.go +++ b/cmd/cli/serve/util.go @@ -5,7 +5,6 @@ import ( "fmt" "time" - "github.com/libp2p/go-libp2p/core/host" "github.com/rs/zerolog/log" "github.com/spf13/viper" "go.uber.org/multierr" @@ -17,8 +16,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/config/types" "github.com/bacalhau-project/bacalhau/pkg/ipfs" - bac_libp2p "github.com/bacalhau-project/bacalhau/pkg/libp2p" - "github.com/bacalhau-project/bacalhau/pkg/libp2p/rcmgr" "github.com/bacalhau-project/bacalhau/pkg/node" "github.com/bacalhau-project/bacalhau/pkg/system" ) @@ -112,18 +109,6 @@ func getNodeType() (requester, compute bool, err error) { return } -func setupLibp2pHost(cfg types.Libp2pConfig) (host.Host, error) { - privKey, err := config.GetLibp2pPrivKey() - if err != nil { - return nil, err - } - libp2pHost, err := bac_libp2p.NewHost(cfg.SwarmPort, privKey, rcmgr.DefaultResourceManager) - if err != nil { - return nil, fmt.Errorf("error creating libp2p host: %w", err) - } - return libp2pHost, nil -} - func getIPFSConfig() (types.IpfsConfig, error) { var ipfsConfig types.IpfsConfig if err := config.ForKey(types.NodeIPFS, &ipfsConfig); err != nil { @@ -188,3 +173,20 @@ func getDisabledFeatures() (node.FeatureConfig, error) { func getAllowListedLocalPathsConfig() []string { return viper.GetStringSlice(types.NodeAllowListedLocalPaths) } + +func getNetworkConfig() (node.NetworkConfig, error) { + var networkCfg types.NetworkConfig + if err := config.ForKey(types.NodeNetwork, &networkCfg); err != nil { + return node.NetworkConfig{}, err + } + return node.NetworkConfig{ + Type: networkCfg.Type, + Port: networkCfg.Port, + AdvertisedAddress: networkCfg.AdvertisedAddress, + Orchestrators: networkCfg.Orchestrators, + ClusterName: networkCfg.Cluster.Name, + ClusterPort: networkCfg.Cluster.Port, + ClusterAdvertisedAddress: networkCfg.Cluster.AdvertisedAddress, + ClusterPeers: networkCfg.Cluster.Peers, + }, nil +} diff --git a/cmd/util/flags/configflags/network.go b/cmd/util/flags/configflags/network.go new file mode 100644 index 0000000000..1d3c2a68b6 --- /dev/null +++ b/cmd/util/flags/configflags/network.go @@ -0,0 +1,54 @@ +package configflags + +import "github.com/bacalhau-project/bacalhau/pkg/config/types" + +var NetworkFlags = []Definition{ + { + 
FlagName: "network", + ConfigPath: types.NodeNetworkType, + DefaultValue: Default.Node.Network.Type, + Description: `Inter-node network layer type (e.g. nats, libp2p).`, + }, + { + FlagName: "network-port", + ConfigPath: types.NodeNetworkPort, + DefaultValue: Default.Node.Network.Port, + Description: `Port to listen for connections from other nodes. Applies to orchestrator nodes.`, + }, + { + FlagName: "orchestrators", + ConfigPath: types.NodeNetworkOrchestrators, + DefaultValue: Default.Node.Network.Orchestrators, + Description: `Comma-separated list of orchestrators to connect to. Applies to compute nodes.`, + }, + { + FlagName: "advertised-address", + ConfigPath: types.NodeNetworkAdvertisedAddress, + DefaultValue: Default.Node.Network.AdvertisedAddress, + Description: `Address to advertise to compute nodes to connect to.`, + }, + { + FlagName: "cluster-name", + ConfigPath: types.NodeNetworkClusterName, + DefaultValue: Default.Node.Network.Cluster.Name, + Description: `Name of the cluster to join.`, + }, + { + FlagName: "cluster-port", + ConfigPath: types.NodeNetworkClusterPort, + DefaultValue: Default.Node.Network.Cluster.Port, + Description: `Port to listen for connections from other orchestrators to form a cluster.`, + }, + { + FlagName: "cluster-advertised-address", + ConfigPath: types.NodeNetworkClusterAdvertisedAddress, + DefaultValue: Default.Node.Network.Cluster.AdvertisedAddress, + Description: `Address to advertise to other orchestrators to connect to.`, + }, + { + FlagName: "cluster-peers", + ConfigPath: types.NodeNetworkClusterPeers, + DefaultValue: Default.Node.Network.Cluster.Peers, + Description: `Comma-separated list of other orchestrators to connect to to form a cluster.`, + }, +} diff --git a/cmd/util/flags/types.go b/cmd/util/flags/types.go index f52d52604d..4408470c73 100644 --- a/cmd/util/flags/types.go +++ b/cmd/util/flags/types.go @@ -7,7 +7,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/bidstrategy/semantic" "github.com/bacalhau-project/bacalhau/pkg/config/types" - "github.com/spf13/pflag" "golang.org/x/exp/slices" diff --git a/go.mod b/go.mod index b0b4895f0b..3ac505c7e5 100644 --- a/go.mod +++ b/go.mod @@ -48,6 +48,8 @@ require ( github.com/multiformats/go-multiaddr v0.9.0 github.com/multiformats/go-multicodec v0.9.0 github.com/multiformats/go-multihash v0.2.3 + github.com/nats-io/nats-server/v2 v2.10.7 + github.com/nats-io/nats.go v1.31.0 github.com/open-policy-agent/opa v0.60.0 github.com/opencontainers/image-spec v1.1.0-rc5 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 @@ -148,8 +150,12 @@ require ( github.com/lestrrat-go/iter v1.0.2 // indirect github.com/lestrrat-go/option v1.0.1 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect + github.com/minio/highwayhash v1.0.2 // indirect github.com/mitchellh/go-testing-interface v1.0.0 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + github.com/nats-io/jwt/v2 v2.5.3 // indirect + github.com/nats-io/nkeys v0.4.6 // indirect + github.com/nats-io/nuid v1.0.1 // indirect github.com/oklog/run v1.0.0 // indirect github.com/pelletier/go-toml/v2 v2.0.8 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect @@ -262,7 +268,7 @@ require ( github.com/jbenet/go-temp-err-catcher v0.1.0 // indirect github.com/jbenet/goprocess v0.1.4 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.16.5 // indirect + github.com/klauspost/compress v1.17.4 // indirect github.com/klauspost/cpuid/v2 v2.2.5 // 
indirect github.com/koron/go-ssdp v0.0.4 // indirect github.com/libp2p/go-buffer-pool v0.1.0 // indirect diff --git a/go.sum b/go.sum index 82b4919d35..7efaabd9da 100644 --- a/go.sum +++ b/go.sum @@ -700,8 +700,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kkdai/bstream v0.0.0-20161212061736-f391b8402d23/go.mod h1:J+Gs4SYgM6CZQHDETBtE9HaSEkGmuNXF86RwHhHUvq4= github.com/klauspost/compress v1.10.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= -github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= -github.com/klauspost/compress v1.16.5/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4= +github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= @@ -855,6 +855,8 @@ github.com/mikioh/tcpinfo v0.0.0-20190314235526-30a79bb1804b/go.mod h1:lxPUiZwKo github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc h1:PTfri+PuQmWDqERdnNMiD9ZejrlswWrCpBEZgWOiTrc= github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc/go.mod h1:cGKTAVKx4SxOuR/czcZ/E2RSJ3sfHs8FpHhQ5CWMf9s= github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1/go.mod h1:pD8RvIylQ358TN4wwqatJ8rNavkEINozVn9DtGI3dfQ= +github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= +github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16/go.mod h1:2FMWW+8GMoPweT6+pI63m9YE3Lmw4J71hV56Chs1E/U= github.com/minio/sha256-simd v0.1.1-0.20190913151208-6de447530771/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM= github.com/minio/sha256-simd v0.1.1/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM= @@ -933,6 +935,16 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/nats-io/jwt/v2 v2.5.3 h1:/9SWvzc6hTfamcgXJ3uYRpgj+QuY2aLNqRiqrKcrpEo= +github.com/nats-io/jwt/v2 v2.5.3/go.mod h1:iysuPemFcc7p4IoYots3IuELSI4EDe9Y0bQMe+I3Bf4= +github.com/nats-io/nats-server/v2 v2.10.7 h1:f5VDy+GMu7JyuFA0Fef+6TfulfCs5nBTgq7MMkFJx5Y= +github.com/nats-io/nats-server/v2 v2.10.7/go.mod h1:V2JHOvPiPdtfDXTuEUsthUnCvSDeFrK4Xn9hRo6du7c= +github.com/nats-io/nats.go v1.31.0 h1:/WFBHEc/dOKBF6qf1TZhrdEfTmOZ5JzdJ+Y3m6Y/p7E= +github.com/nats-io/nats.go v1.31.0/go.mod h1:di3Bm5MLsoB4Bx61CBTsxuarI36WbhAwOm8QrW39+i8= +github.com/nats-io/nkeys v0.4.6 h1:IzVe95ru2CT6ta874rt9saQRkWfe2nFj1NtvYSLqMzY= +github.com/nats-io/nkeys v0.4.6/go.mod h1:4DxZNzenSVd1cYQoAa8948QY3QDjrHfcfVADymtkpts= +github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= +github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/neelance/astrewrite 
v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJEU3ofeGjhHklVoIGuVj85JJwZ6kWPaJwCIxgnFmo= github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab/go.mod h1:Qr6/a/Q4r9LP1IltGz7tA7iOK1WonHEYhu1HRBA7ZiM= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= @@ -1471,6 +1483,7 @@ golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20181029174526-d69651ed3497/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190219092855-153ac476189d/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= diff --git a/go.work.sum b/go.work.sum index ee29128933..f8a4445a9b 100644 --- a/go.work.sum +++ b/go.work.sum @@ -1075,7 +1075,6 @@ github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec h1:EdRZT3IeKQmfCSrgo8SZ8V3MEnskuJP0wCYNpe+aiXo= github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec/go.mod h1:jMjuTZXRI4dUb/I5gc9Hdhagfvm9+RyrPryS/auMzxE= github.com/client9/misspell v0.3.4 h1:ta993UF76GwbvJcIo3Y68y/M3WxlpEHPWIGDkJYwzJI= -github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 h1:F1EaeKL/ta07PY/k9Os/UFtwERei2/XzGemhpGnBKNg= github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58/go.mod h1:EOBUe0h4xcZ5GoxqC5SDxFQ8gwyZPKQoEzownBlhI80= github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4 h1:hzAQntlaYRkVSFEfj9OTWlVV1H155FMD8BTKktLv0QI= @@ -1158,7 +1157,6 @@ github.com/danieljoos/wincred v1.1.2/go.mod h1:GijpziifJoIBfYh+S7BbkdUTU4LfM+QnG github.com/daviddengcn/go-colortext v1.0.0 h1:ANqDyC0ys6qCSvuEK7l3g5RaehL/Xck9EX8ATG8oKsE= github.com/daviddengcn/go-colortext v1.0.0/go.mod h1:zDqEI5NVUop5QPpVJUxE9UO10hRnmkD5G4Pmri9+m4c= github.com/davidlazar/go-crypto v0.0.0-20170701192655-dcfb0a7ac018/go.mod h1:rQYf4tfk5sSwFsnDg3qYaBxSjsD9S8+59vW0dKUgme4= -github.com/decred/dcrd/crypto/blake256 v1.0.1/go.mod h1:2OfgNZ5wDpcsFmHmCK5gZTPcCXqlm2ArzUIkw9czNJo= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0/go.mod h1:DZGJHZMqrU4JJqFAWUS2UO1+lbSKsdiOoYi9Zzey7Fc= github.com/decred/dcrd/lru v1.0.0 h1:Kbsb1SFDsIlaupWPwsPp+dkxiBY1frcS07PCPgotKz8= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= @@ -1291,7 +1289,6 @@ github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07 h1:OTlfMvwR1rLyf9go github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9 h1:NxXI5pTAtpEaU49bpLpQoDsu1zrteW/vxzTz8Cd2UAs= github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9/go.mod h1:gWuR/CrFDDeVRFQwHPvsv9soJVB/iqymhuZQuJ3a9OM= github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs= github.com/go-openapi/jsonreference v0.19.6/go.mod h1:diGHMEHg2IqXZGKxqyvWdfWU/aim5Dprw5bqpKkTvns= 
github.com/go-openapi/spec v0.20.4/go.mod h1:faYFR1CvsJZ0mNsmsphTMSoRrNV3TEDoAM7FOEWeq8I= github.com/go-openapi/spec v0.20.6/go.mod h1:2OpW+JddWPrpXSCIX8eOx7lZ5iyuWj3RYR6VaaBKcWA= @@ -2149,8 +2146,6 @@ github.com/nats-io/nats.go v1.9.1/go.mod h1:ZjDU1L/7fJ09jvUSRVBR2e7+RnLiiIQyqyzE github.com/nats-io/nkeys v0.1.0/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= github.com/nats-io/nkeys v0.1.3 h1:6JrEfig+HzTH85yxzhSVbjHRJv9cn0p6n3IngIcM5/k= github.com/nats-io/nkeys v0.1.3/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= -github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= -github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/ncw/swift v1.0.47 h1:4DQRPj35Y41WogBxyhOXlrI37nzGlyEcsforeudyYPQ= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86 h1:D6paGObi5Wud7xg83MaEFyjxQB1W5bz5d0IFppr+ymk= github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab h1:eFXv9Nu1lGbrNbj619aWwZfVF5HBrm9Plte8aNptuTI= @@ -2189,6 +2184,7 @@ github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3ev github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/op/go-logging v0.0.0-20160315200505-970db520ece7 h1:lDH9UUVJtmYCjyT0CI4q8xvlXPxeZ0gYCVvWbmPlp88= github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk= +github.com/opencontainers/image-spec v1.0.2/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= github.com/opencontainers/runc v1.1.0 h1:O9+X96OcDjkmmZyfaG996kV7yq8HsoU2h1XRRQcefG8= github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39 h1:H7DMc6FAjgwZZi8BRqjrAAHWoqEr5e5L6pS4V0ezet4= github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= @@ -2241,6 +2237,8 @@ github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndr github.com/posener/complete v1.2.3 h1:NP0eAhjcjImqslEwo/1hq7gpajME0fTLTezBKDqfXqo= github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s= github.com/pquerna/cachecontrol v0.0.0-20171018203845-0dec1b30a021 h1:0XM1XL/OFFJjXsYXlG30spTkV/E9+gmd5GD1w2HE8xM= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= github.com/prometheus/client_golang v1.3.0/go.mod h1:hJaj2vgQTGQmVCsAACORcieXFeDPbaTKGT+JTgUa3og= @@ -2365,7 +2363,6 @@ github.com/streadway/handy v0.0.0-20190108123426-d5acb3125c2a h1:AhmOdSHeswKHBjh github.com/streadway/handy v0.0.0-20190108123426-d5acb3125c2a/go.mod h1:qNTQ5P5JnDBl6z3cMAg/SywNDC5ABu5ApDIw6lUbRmI= github.com/stvp/go-udp-testing v0.0.0-20201019212854-469649b16807 h1:LUsDduamlucuNnWcaTbXQ6aLILFcLXADpOzeEH3U+OI= github.com/swaggo/swag v1.8.1/go.mod h1:ugemnJsPZm/kRwFUnzBlbHRd0JY9zE1M4F+uy2pAaPQ= -github.com/swaggo/swag v1.16.2/go.mod h1:6YzXnDcpr0767iOejs318CwYkCQqyGer6BizOg03f+E= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07 h1:UyzmZLoiDWMRywV4DUYb9Fbt8uiOSooupjTq10vpvnU= github.com/tchap/go-patricia v2.2.6+incompatible h1:JvoDL7JSoIP2HDE8AbDH3zC8QBPxmzYe32HHy5yQ+Ck= @@ -2445,7 +2442,6 @@ 
gitlab.com/nyarla/go-crypt v0.0.0-20160106005555-d9a5dc2b789b/go.mod h1:T3BPAOm2 go.etcd.io/bbolt v1.3.3 h1:MUGmc65QhB3pIlaQ5bB4LwqSj6GIonVJXpZiaKNyaKk= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.6 h1:/ecaJf0sk1l4l6V4awd65v2C3ILy7MSj+s/x1ADCIMU= -go.etcd.io/bbolt v1.3.8/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw= go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738 h1:VcrIfasaLFkyjk6KNlXQSzO+B0fZcnECiDrKJsfxka0= go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= go.etcd.io/etcd v0.5.0-alpha.5.0.20200910180754-dd1b699fc489 h1:1JFLBqwIgdyHN1ZtgjTBwO+blA6gVOmZurpiMEsETKo= @@ -2596,7 +2592,6 @@ golang.org/x/net v0.0.0-20220923203811-8be639271d50/go.mod h1:YDH+HFinaLZZlnHAfS golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= golang.org/x/net v0.13.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= @@ -2613,7 +2608,7 @@ golang.org/x/perf v0.0.0-20180704124530-6e6d33e29852 h1:xYq6+9AtI+xP3M4r0N1hCkHr golang.org/x/sync v0.2.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.4.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -2638,17 +2633,14 @@ golang.org/x/sys v0.0.0-20220825204002-c680a09ffe64/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220919091848-fb04ddd9f9c8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20220722155259-a9ba230a4035/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 
golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= diff --git a/ops/terraform/dev.tfvars b/ops/terraform/dev.tfvars index 3858f58fb0..54275dc638 100644 --- a/ops/terraform/dev.tfvars +++ b/ops/terraform/dev.tfvars @@ -1,5 +1,5 @@ -bacalhau_version = "v1.1.3" -bacalhau_branch = "" +bacalhau_version = "" +bacalhau_branch = "nats" bacalhau_port = "1235" bacalhau_node_id_0 = "QmfYBQ3HouX9zKcANNXbgJnpyLpTYS9nKBANw6RUQKZffu" bacalhau_node_id_1 = "QmNjEQByyK8GiMTvnZqGyURuwXDCtzp9X6gJRKkpWfai7S" @@ -28,4 +28,5 @@ public_ip_addresses = ["34.86.177.175", "35.245.221.171"] num_gpu_machines = 0 log_level = "debug" otel_collector_version = "0.70.0" -otel_collector_endpoint = "http://localhost:4318" \ No newline at end of file +otel_collector_endpoint = "http://localhost:4318" +network_type = "nats" \ No newline at end of file diff --git a/ops/terraform/main.tf b/ops/terraform/main.tf index ea40f96230..ac170b5ae9 100644 --- a/ops/terraform/main.tf +++ b/ops/terraform/main.tf @@ -71,6 +71,10 @@ export GRAFANA_CLOUD_TEMPO_ENDPOINT="${var.grafana_cloud_tempo_endpoint}" export OTEL_COLLECTOR_VERSION="${var.otel_collector_version}" export OTEL_EXPORTER_OTLP_ENDPOINT="${var.otel_collector_endpoint}" export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=${terraform.workspace}" +export BACALHAU_NODE_NETWORK_TYPE=${var.network_type} +export BACALHAU_NODE_NETWORK_ORCHESTRATORS="${var.internal_ip_addresses[0]}:4222" +export BACALHAU_NODE_NETWORK_ADVERTISEDADDRESS="${var.public_ip_addresses[count.index]}:4222" +export BACALHAU_NODE_NETWORK_CLUSTER_PEERS="${var.internal_ip_addresses[0]}:6222" ### secrets are installed in the install-node.sh script export SECRETS_GRAFANA_CLOUD_PROMETHEUS_API_KEY="${var.grafana_cloud_prometheus_api_key}" @@ -295,6 +299,8 @@ resource "google_compute_firewall" "bacalhau_ingress_firewall" { "55679", // otel collector zpages extension "44443", // nginx is healthy - for running health check scripts "44444", // nginx node health check scripts + "4222", // nats + "6222", // nats cluster ] } @@ -320,6 +326,8 @@ resource "google_compute_firewall" "bacalhau_egress_firewall" { ports = [ "4001", // ipfs swarm "1235", // bacalhau swarm + "4222", // nats + "6222", // nats cluster ] } diff --git a/ops/terraform/remote_files/scripts/install-node.sh b/ops/terraform/remote_files/scripts/install-node.sh index fec079b7f3..8f6deab21f 100644 --- a/ops/terraform/remote_files/scripts/install-node.sh +++ b/ops/terraform/remote_files/scripts/install-node.sh @@ -117,12 +117,14 @@ function install-bacalhau-from-release() { function install-bacalhau-from-source() { echo "Installing Bacalhau from branch ${BACALHAU_BRANCH}" - sudo apt-get -y install --no-install-recommends jq nodejs npm make + # make sure we have the desired version of nodejs to build webui + curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - + sudo apt-get -y install --no-install-recommends jq nodejs make git clone --branch ${BACALHAU_BRANCH} https://github.com/bacalhau-project/bacalhau.git pushd bacalhau pushd webui && npm install && popd make build-bacalhau - sudo mv ./bin/*/bacalhau /usr/local/bin/bacalhau + sudo mv ./bin/*/*/bacalhau /usr/local/bin/bacalhau popd } diff --git a/ops/terraform/remote_files/scripts/start-bacalhau.sh b/ops/terraform/remote_files/scripts/start-bacalhau.sh index 512304ec8a..53a5a1ad8e 100644 --- a/ops/terraform/remote_files/scripts/start-bacalhau.sh +++ 
b/ops/terraform/remote_files/scripts/start-bacalhau.sh @@ -20,61 +20,79 @@ mount /dev/sdb /data || true # import the secrets source /data/secrets.sh -function getMultiaddress() { - echo -n "/ip4/${1}/tcp/${BACALHAU_PORT}/p2p/${2}" -} - -# we start with none as the default ("none" prevents the node connecting to our default bootstrap list) -export CONNECT_PEER="none" - -# use the BACALHAU_CONNECT_PEER env var if it is set -if [[ -n "${BACALHAU_CONNECT_PEER}" ]]; then - export CONNECT_PEER=$BACALHAU_CONNECT_PEER -# if we are node0 then we do not connect to anything -elif [[ "${TERRAFORM_NODE_INDEX}" != "0" ]]; then - # if we are in unsafe mode - then we connect to a single node and it's ID - # is pre-determined by the $BACALHAU_NODE0_UNSAFE_ID variable - if [[ -n "${BACALHAU_UNSAFE_CLUSTER}" ]]; then - export UNSAFE_NODE0_ID="$BACALHAU_NODE_ID_0" - if [[ -z "$UNSAFE_NODE0_ID" ]]; then - export UNSAFE_NODE0_ID="$BACALHAU_NODE0_UNSAFE_ID" - fi - export CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$UNSAFE_NODE0_ID") - # otherwise we will construct our connect string based on - # what node index we are - else - # we are > node0 so we can connect to node0 - export CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$BACALHAU_NODE_ID_0") - # we are > node1 so we can also connect to node1 - if [[ "${TERRAFORM_NODE_INDEX}" -ge "2" ]]; then - export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE1_IP" "$BACALHAU_NODE_ID_1")" - fi - # we are > node2 so we can also connect to node2 - if [[ "${TERRAFORM_NODE_INDEX}" -ge "3" ]]; then - export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE2_IP" "$BACALHAU_NODE_ID_2")" - fi - fi -fi - BACALHAU_PROBE_EXEC='/terraform_node/apply-http-allowlist.sh' - TRUSTED_CLIENT_IDS="\ 1df7b01ed77ca81bb6d6f06f6cbcd76a6a9e450d175dfac1e4ba70494fddd576,\ b43517b5449d383ab00ca1d2b1c558d710ba79f51c800fbf4c35ed4d0198aec5" -bacalhau serve \ - --node-type "${BACALHAU_NODE_TYPE}" \ - --job-selection-data-locality anywhere \ - --job-selection-accept-networked \ - --job-selection-probe-exec "${BACALHAU_PROBE_EXEC}" \ - --max-job-execution-timeout '60m' \ - --job-execution-timeout-bypass-client-id="${TRUSTED_CLIENT_IDS}" \ - --ipfs-swarm-addrs "" \ - --ipfs-connect /ip4/127.0.0.1/tcp/5001 \ - --swarm-port "${BACALHAU_PORT}" \ - --api-port 1234 \ - --peer "${CONNECT_PEER}" \ - --private-internal-ipfs=false \ - --web-ui "${BACALHAU_NODE_WEBUI}" \ - --labels owner=bacalhau \ - --requester-job-translation-enabled +# Check if using NATS +if [[ "${BACALHAU_NODE_NETWORK_TYPE}" == "nats" ]]; then + # nats related config as set as env vars in main.tf and no need to pass them to serve command + bacalhau serve \ + --node-type "${BACALHAU_NODE_TYPE}" \ + --job-selection-data-locality anywhere \ + --job-selection-accept-networked \ + --job-selection-probe-exec "${BACALHAU_PROBE_EXEC}" \ + --max-job-execution-timeout '60m' \ + --job-execution-timeout-bypass-client-id="${TRUSTED_CLIENT_IDS}" \ + --ipfs-swarm-addrs "" \ + --ipfs-connect /ip4/127.0.0.1/tcp/5001 \ + --api-port 1234 \ + --private-internal-ipfs=false \ + --web-ui "${BACALHAU_NODE_WEBUI}" \ + --web-ui-port 80 \ + --labels owner=bacalhau \ + --requester-job-translation-enabled + +else + function getMultiaddress() { + echo -n "/ip4/${1}/tcp/${BACALHAU_PORT}/p2p/${2}" + } + + # use the BACALHAU_CONNECT_PEER env var if it is set + if [[ -n "${BACALHAU_CONNECT_PEER}" ]]; then + export CONNECT_PEER=$BACALHAU_CONNECT_PEER + # if we are node0 then we do not connect to anything + elif [[ 
"${TERRAFORM_NODE_INDEX}" != "0" ]]; then + # if we are in unsafe mode - then we connect to a single node and it's ID + # is pre-determined by the $BACALHAU_NODE0_UNSAFE_ID variable + if [[ -n "${BACALHAU_UNSAFE_CLUSTER}" ]]; then + export UNSAFE_NODE0_ID="$BACALHAU_NODE_ID_0" + if [[ -z "$UNSAFE_NODE0_ID" ]]; then + export UNSAFE_NODE0_ID="$BACALHAU_NODE0_UNSAFE_ID" + fi + export CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$UNSAFE_NODE0_ID") + # otherwise we will construct our connect string based on + # what node index we are + else + # we are > node0 so we can connect to node0 + export CONNECT_PEER=$(getMultiaddress "$TERRAFORM_NODE0_IP" "$BACALHAU_NODE_ID_0") + # we are > node1 so we can also connect to node1 + if [[ "${TERRAFORM_NODE_INDEX}" -ge "2" ]]; then + export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE1_IP" "$BACALHAU_NODE_ID_1")" + fi + # we are > node2 so we can also connect to node2 + if [[ "${TERRAFORM_NODE_INDEX}" -ge "3" ]]; then + export CONNECT_PEER="$CONNECT_PEER,$(getMultiaddress "$TERRAFORM_NODE2_IP" "$BACALHAU_NODE_ID_2")" + fi + fi + fi + + bacalhau serve \ + --node-type "${BACALHAU_NODE_TYPE}" \ + --job-selection-data-locality anywhere \ + --job-selection-accept-networked \ + --job-selection-probe-exec "${BACALHAU_PROBE_EXEC}" \ + --max-job-execution-timeout '60m' \ + --job-execution-timeout-bypass-client-id="${TRUSTED_CLIENT_IDS}" \ + --ipfs-swarm-addrs "" \ + --ipfs-connect /ip4/127.0.0.1/tcp/5001 \ + --swarm-port "${BACALHAU_PORT}" \ + --api-port 1234 \ + --peer "${CONNECT_PEER}" \ + --private-internal-ipfs=false \ + --web-ui "${BACALHAU_NODE_WEBUI}" \ + --web-ui-port 80 \ + --labels owner=bacalhau \ + --requester-job-translation-enabled +fi \ No newline at end of file diff --git a/ops/terraform/variables.tf b/ops/terraform/variables.tf index 9839690d78..5b24fd1459 100644 --- a/ops/terraform/variables.tf +++ b/ops/terraform/variables.tf @@ -230,3 +230,9 @@ variable "docker_password" { default = "" sensitive = true } + +// Use NATs for transport instead of libp2p +variable "network_type" { + type = string + default = "libp2p" +} \ No newline at end of file diff --git a/pkg/authn/challenge/authenticator_test.go b/pkg/authn/challenge/authenticator_test.go index a82c0d6380..e91c5a879c 100644 --- a/pkg/authn/challenge/authenticator_test.go +++ b/pkg/authn/challenge/authenticator_test.go @@ -18,12 +18,6 @@ import ( "github.com/stretchr/testify/require" ) -type testData []byte - -func (t testData) MarshalBinary() ([]byte, error) { - return t, nil -} - func setup(t *testing.T) authn.Authenticator { logger.ConfigureTestLogging(t) @@ -33,7 +27,7 @@ func setup(t *testing.T) authn.Authenticator { rsaKey, err := rsa.GenerateKey(rand.Reader, 2048) require.NoError(t, err) - return NewAuthenticator(anonPolicy, testData([]byte("test")), rsaKey, "node") + return NewAuthenticator(anonPolicy, NewStringMarshaller("test"), rsaKey, "node") } func try(t *testing.T, authenticator authn.Authenticator, r any) authn.Authentication { diff --git a/pkg/authn/challenge/marshaller.go b/pkg/authn/challenge/marshaller.go new file mode 100644 index 0000000000..1f1e9a318d --- /dev/null +++ b/pkg/authn/challenge/marshaller.go @@ -0,0 +1,20 @@ +package challenge + +// StringMarshaller is a struct that implements the encoding.BinaryMarshaler interface for strings. +// It holds a string value that can be marshaled into a byte slice. 
+type StringMarshaller struct { + Input string +} + +// NewStringMarshaller returns a pointer to a new StringMarshaller initialized with the given input string. +// This function is typically used to prepare a string for binary marshaling. +func NewStringMarshaller(input string) *StringMarshaller { + return &StringMarshaller{Input: input} +} + +// MarshalBinary implements the encoding.BinaryMarshaler interface. +// It converts the string held by StringMarshaller into a slice of bytes. +// As string to byte conversion in Go is straightforward and error-free, this method returns nil for the error. +func (m *StringMarshaller) MarshalBinary() ([]byte, error) { + return []byte(m.Input), nil +} diff --git a/pkg/authn/challenge/marshaller_test.go b/pkg/authn/challenge/marshaller_test.go new file mode 100644 index 0000000000..4e252219f0 --- /dev/null +++ b/pkg/authn/challenge/marshaller_test.go @@ -0,0 +1,29 @@ +//go:build unit || !integration + +package challenge + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +// TestStringMarshaller_MarshalBinary tests the MarshalBinary method of StringMarshaller. +func TestStringMarshaller_MarshalBinary(t *testing.T) { + testCases := []struct { + input string + }{ + {"hello"}, + {""}, + {"12345"}, + } + + for _, tc := range testCases { + m := NewStringMarshaller(tc.input) + marshaled, err := m.MarshalBinary() + require.NoError(t, err, "MarshalBinary() with input %s returned an unexpected error", tc.input) + + // Manually unmarshal and compare with the original input + require.Equal(t, []byte(tc.input), marshaled, "MarshalBinary() with input %s returned an unexpected byte slice", tc.input) + } +} diff --git a/pkg/compute/endpoint.go b/pkg/compute/endpoint.go index 3134b3c979..4085bee950 100644 --- a/pkg/compute/endpoint.go +++ b/pkg/compute/endpoint.go @@ -19,7 +19,7 @@ type BaseEndpointParams struct { UsageCalculator capacity.UsageCalculator Bidder Bidder Executor Executor - LogServer logstream.LogStreamServer + LogServer *logstream.LogStreamServer } // Base implementation of Endpoint @@ -29,7 +29,7 @@ type BaseEndpoint struct { usageCalculator capacity.UsageCalculator bidder Bidder executor Executor - logServer logstream.LogStreamServer + logServer *logstream.LogStreamServer } func NewBaseEndpoint(params BaseEndpointParams) BaseEndpoint { @@ -142,6 +142,10 @@ func (s BaseEndpoint) CancelExecution(ctx context.Context, request CancelExecuti func (s BaseEndpoint) ExecutionLogs(ctx context.Context, request ExecutionLogsRequest) (ExecutionLogsResponse, error) { log.Ctx(ctx).Debug().Msgf("processing log request for %s", request.ExecutionID) + // TODO: remove this once we support log streaming with nats + if s.logServer == nil { + return ExecutionLogsResponse{}, fmt.Errorf("log server not configured") + } execution, err := s.executionStore.GetExecution(ctx, request.ExecutionID) if err != nil { return ExecutionLogsResponse{}, err diff --git a/pkg/compute/node_info_provider.go b/pkg/compute/node_info_decorator.go similarity index 77% rename from pkg/compute/node_info_provider.go rename to pkg/compute/node_info_decorator.go index c24abc9e3a..fe848c482b 100644 --- a/pkg/compute/node_info_provider.go +++ b/pkg/compute/node_info_decorator.go @@ -10,7 +10,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/storage" ) -type NodeInfoProviderParams struct { +type NodeInfoDecoratorParams struct { Executors executor.ExecutorProvider Publisher publisher.PublisherProvider Storages storage.StorageProvider @@ -19,7 +19,7 @@ type NodeInfoProviderParams 
struct { MaxJobRequirements models.Resources } -type NodeInfoProvider struct { +type NodeInfoDecorator struct { executors executor.ExecutorProvider publishers publisher.PublisherProvider storages storage.StorageProvider @@ -28,8 +28,8 @@ type NodeInfoProvider struct { maxJobRequirements models.Resources } -func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider { - return &NodeInfoProvider{ +func NewNodeInfoDecorator(params NodeInfoDecoratorParams) *NodeInfoDecorator { + return &NodeInfoDecorator{ executors: params.Executors, publishers: params.Publisher, storages: params.Storages, @@ -39,8 +39,9 @@ func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider { } } -func (n *NodeInfoProvider) GetComputeInfo(ctx context.Context) models.ComputeNodeInfo { - return models.ComputeNodeInfo{ +func (n *NodeInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo models.NodeInfo) models.NodeInfo { + nodeInfo.NodeType = models.NodeTypeCompute + nodeInfo.ComputeNodeInfo = &models.ComputeNodeInfo{ ExecutionEngines: n.executors.Keys(ctx), Publishers: n.publishers.Keys(ctx), StorageSources: n.storages.Keys(ctx), @@ -50,7 +51,8 @@ func (n *NodeInfoProvider) GetComputeInfo(ctx context.Context) models.ComputeNod RunningExecutions: len(n.executorBuffer.RunningExecutions()), EnqueuedExecutions: n.executorBuffer.EnqueuedExecutionsCount(), } + return nodeInfo } // compile-time interface check -var _ models.ComputeNodeInfoProvider = &NodeInfoProvider{} +var _ models.NodeInfoDecorator = &NodeInfoDecorator{} diff --git a/pkg/compute/store/boltdb/store.go b/pkg/compute/store/boltdb/store.go index bc49781f7c..3c3e4d82c1 100644 --- a/pkg/compute/store/boltdb/store.go +++ b/pkg/compute/store/boltdb/store.go @@ -83,7 +83,7 @@ func NewStore(ctx context.Context, dbPath string) (*Store, error) { starting: sync.WaitGroup{}, stateCounter: NewStateCounter(), } - log.Ctx(ctx).Info().Msg("creating new bbolt database") + log.Ctx(ctx).Info().Msgf("creating new bbolt database at %s", dbPath) database, err := GetDatabase(dbPath) if err != nil { diff --git a/pkg/config/configenv/dev.go b/pkg/config/configenv/dev.go index ebca3f2824..f9ad62089c 100644 --- a/pkg/config/configenv/dev.go +++ b/pkg/config/configenv/dev.go @@ -30,6 +30,14 @@ var Development = types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/34.86.177.175/tcp/1235/p2p/QmfYBQ3HouX9zKcANNXbgJnpyLpTYS9nKBANw6RUQKZffu", "/ip4/35.245.221.171/tcp/1235/p2p/QmNjEQByyK8GiMTvnZqGyURuwXDCtzp9X6gJRKkpWfai7S", diff --git a/pkg/config/configenv/local.go b/pkg/config/configenv/local.go index 8b8f50ad83..54617bd9b9 100644 --- a/pkg/config/configenv/local.go +++ b/pkg/config/configenv/local.go @@ -30,6 +30,14 @@ var Local = types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{}, DownloadURLRequestTimeout: types.Duration(300 * time.Second), VolumeSizeRequestTimeout: types.Duration(2 * time.Minute), diff --git a/pkg/config/configenv/production.go b/pkg/config/configenv/production.go index 604f22b1bc..77ed8d4b19 100644 --- a/pkg/config/configenv/production.go +++ b/pkg/config/configenv/production.go @@ -31,6 +31,14 @@ var Production = 
types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/35.245.161.250/tcp/1235/p2p/QmbxGSsM6saCTyKkiWSxhJCt6Fgj7M9cns1vzYtfDbB5Ws", "/ip4/34.86.254.26/tcp/1235/p2p/QmeXjeQDinxm7zRiEo8ekrJdbs7585BM6j7ZeLVFrA7GPe", diff --git a/pkg/config/configenv/staging.go b/pkg/config/configenv/staging.go index 3a2d05b020..8ae1e7a56e 100644 --- a/pkg/config/configenv/staging.go +++ b/pkg/config/configenv/staging.go @@ -31,6 +31,14 @@ var Staging = types.BacalhauConfig{ Port: 1234, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/34.85.228.65/tcp/1235/p2p/QmafZ9oCXCJZX9Wt1nhrGS9FVVq41qhcBRSNWCkVhz3Nvv", "/ip4/34.86.73.105/tcp/1235/p2p/QmVHCeiLzhFJPCyCj5S1RTAk1vBEvxd8r5A6E4HyJGQtbJ", diff --git a/pkg/config/configenv/test.go b/pkg/config/configenv/test.go index b5611f3831..8fa4ccc7de 100644 --- a/pkg/config/configenv/test.go +++ b/pkg/config/configenv/test.go @@ -30,6 +30,14 @@ var Testing = types.BacalhauConfig{ Port: 9999, TLS: types.TLSConfiguration{}, }, + Network: types.NetworkConfig{ + Type: models.NetworkTypeLibp2p, + Port: 4222, + Cluster: types.NetworkClusterConfig{ + Name: "global", + Port: 6222, + }, + }, BootstrapAddresses: []string{ "/ip4/0.0.0.0/tcp/1235/p2p/QmcWJnVXJ82DKJq8ED79LADR4ZBTnwgTK7yn6JQbNVMbbC", }, diff --git a/pkg/config/types/generated_constants.go b/pkg/config/types/generated_constants.go index 89d75afd81..55b23a3896 100644 --- a/pkg/config/types/generated_constants.go +++ b/pkg/config/types/generated_constants.go @@ -27,6 +27,15 @@ const NodeServerAPITLSAutoCert = "Node.ServerAPI.TLS.AutoCert" const NodeServerAPITLSAutoCertCachePath = "Node.ServerAPI.TLS.AutoCertCachePath" const NodeServerAPITLSServerCertificate = "Node.ServerAPI.TLS.ServerCertificate" const NodeServerAPITLSServerKey = "Node.ServerAPI.TLS.ServerKey" +const NodeNetwork = "Node.Network" +const NodeNetworkType = "Node.Network.Type" +const NodeNetworkPort = "Node.Network.Port" +const NodeNetworkAdvertisedAddress = "Node.Network.AdvertisedAddress" +const NodeNetworkOrchestrators = "Node.Network.Orchestrators" +const NodeNetworkClusterName = "Node.Network.Cluster.Name" +const NodeNetworkClusterPort = "Node.Network.Cluster.Port" +const NodeNetworkClusterAdvertisedAddress = "Node.Network.Cluster.AdvertisedAddress" +const NodeNetworkClusterPeers = "Node.Network.Cluster.Peers" const NodeLibp2p = "Node.Libp2p" const NodeLibp2pSwarmPort = "Node.Libp2p.SwarmPort" const NodeLibp2pPeerConnect = "Node.Libp2p.PeerConnect" diff --git a/pkg/config/types/generated_viper_defaults.go b/pkg/config/types/generated_viper_defaults.go index cf96b1b3c8..c529e8295f 100644 --- a/pkg/config/types/generated_viper_defaults.go +++ b/pkg/config/types/generated_viper_defaults.go @@ -1,4 +1,3 @@ - // CODE GENERATED BY pkg/config/types/gen_viper DO NOT EDIT package types @@ -50,6 +49,15 @@ func SetDefaults(cfg BacalhauConfig, opts ...SetOption) { p.Viper.SetDefault(NodeServerAPITLSAutoCertCachePath, cfg.Node.ServerAPI.TLS.AutoCertCachePath) p.Viper.SetDefault(NodeServerAPITLSServerCertificate, cfg.Node.ServerAPI.TLS.ServerCertificate) p.Viper.SetDefault(NodeServerAPITLSServerKey, cfg.Node.ServerAPI.TLS.ServerKey) + p.Viper.SetDefault(NodeNetwork, cfg.Node.Network) 
+ p.Viper.SetDefault(NodeNetworkType, cfg.Node.Network.Type) + p.Viper.SetDefault(NodeNetworkPort, cfg.Node.Network.Port) + p.Viper.SetDefault(NodeNetworkAdvertisedAddress, cfg.Node.Network.AdvertisedAddress) + p.Viper.SetDefault(NodeNetworkOrchestrators, cfg.Node.Network.Orchestrators) + p.Viper.SetDefault(NodeNetworkClusterName, cfg.Node.Network.Cluster.Name) + p.Viper.SetDefault(NodeNetworkClusterPort, cfg.Node.Network.Cluster.Port) + p.Viper.SetDefault(NodeNetworkClusterAdvertisedAddress, cfg.Node.Network.Cluster.AdvertisedAddress) + p.Viper.SetDefault(NodeNetworkClusterPeers, cfg.Node.Network.Cluster.Peers) p.Viper.SetDefault(NodeLibp2p, cfg.Node.Libp2p) p.Viper.SetDefault(NodeLibp2pSwarmPort, cfg.Node.Libp2p.SwarmPort) p.Viper.SetDefault(NodeLibp2pPeerConnect, cfg.Node.Libp2p.PeerConnect) @@ -205,6 +213,15 @@ func Set(cfg BacalhauConfig, opts ...SetOption) { p.Viper.Set(NodeServerAPITLSAutoCertCachePath, cfg.Node.ServerAPI.TLS.AutoCertCachePath) p.Viper.Set(NodeServerAPITLSServerCertificate, cfg.Node.ServerAPI.TLS.ServerCertificate) p.Viper.Set(NodeServerAPITLSServerKey, cfg.Node.ServerAPI.TLS.ServerKey) + p.Viper.Set(NodeNetwork, cfg.Node.Network) + p.Viper.Set(NodeNetworkType, cfg.Node.Network.Type) + p.Viper.Set(NodeNetworkPort, cfg.Node.Network.Port) + p.Viper.Set(NodeNetworkAdvertisedAddress, cfg.Node.Network.AdvertisedAddress) + p.Viper.Set(NodeNetworkOrchestrators, cfg.Node.Network.Orchestrators) + p.Viper.Set(NodeNetworkClusterName, cfg.Node.Network.Cluster.Name) + p.Viper.Set(NodeNetworkClusterPort, cfg.Node.Network.Cluster.Port) + p.Viper.Set(NodeNetworkClusterAdvertisedAddress, cfg.Node.Network.Cluster.AdvertisedAddress) + p.Viper.Set(NodeNetworkClusterPeers, cfg.Node.Network.Cluster.Peers) p.Viper.Set(NodeLibp2p, cfg.Node.Libp2p) p.Viper.Set(NodeLibp2pSwarmPort, cfg.Node.Libp2p.SwarmPort) p.Viper.Set(NodeLibp2pPeerConnect, cfg.Node.Libp2p.PeerConnect) diff --git a/pkg/config/types/node.go b/pkg/config/types/node.go index b06071beb4..6bc6d28728 100644 --- a/pkg/config/types/node.go +++ b/pkg/config/types/node.go @@ -42,6 +42,8 @@ type NodeConfig struct { // Configuration for the web UI WebUI WebUIConfig `yaml:"WebUI"` + Network NetworkConfig `yaml:"Network"` + StrictVersionMatch bool `yaml:"StrictVersionMatch"` } @@ -138,3 +140,18 @@ type FeatureConfig struct { Publishers []string `yaml:"Publishers"` Storages []string `yaml:"Storages"` } + +type NetworkConfig struct { + Type string `yaml:"Type"` + Port int `yaml:"Port"` + AdvertisedAddress string `yaml:"AdvertisedAddress"` + Orchestrators []string `yaml:"Orchestrators"` + Cluster NetworkClusterConfig `yaml:"Cluster"` +} + +type NetworkClusterConfig struct { + Name string `yaml:"Name"` + Port int `yaml:"Port"` + AdvertisedAddress string `yaml:"AdvertisedAddress"` + Peers []string `yaml:"Peers"` +} diff --git a/pkg/devstack/devstack.go b/pkg/devstack/devstack.go index 8746987758..9b7078a1f9 100644 --- a/pkg/devstack/devstack.go +++ b/pkg/devstack/devstack.go @@ -5,9 +5,11 @@ import ( "fmt" "os" "strings" - "time" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/util/multiaddresses" "github.com/imdario/mergo" + "github.com/libp2p/go-libp2p/core/host" "github.com/multiformats/go-multiaddr" "github.com/phayes/freeport" "github.com/rs/zerolog/log" @@ -17,12 +19,10 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/ipfs" bac_libp2p "github.com/bacalhau-project/bacalhau/pkg/libp2p" "github.com/bacalhau-project/bacalhau/pkg/logger" + "github.com/bacalhau-project/bacalhau/pkg/node" 
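As a reading aid for the NetworkConfig and NetworkClusterConfig types introduced above, here is a minimal sketch, not part of the patch, of a Network block populated for a NATS orchestrator; the 4222/6222 ports echo the configenv defaults added here, while the peer address is purely illustrative:

cfg.Node.Network = types.NetworkConfig{
	Type: models.NetworkTypeNATS,
	Port: 4222, // client-facing NATS port
	Cluster: types.NetworkClusterConfig{
		Name:  "global",
		Port:  6222,                             // cluster route port
		Peers: []string{"nats://10.0.0.2:6222"}, // hypothetical peer
	},
}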
"github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/system" - "github.com/bacalhau-project/bacalhau/pkg/util/multiaddresses" - - "github.com/bacalhau-project/bacalhau/pkg/node" ) const ( @@ -44,10 +44,11 @@ type DevStackOptions struct { NodeInfoPublisherInterval routing.NodeInfoPublisherIntervalConfig ExecutorPlugins bool // when true pluggable executors will be used. ConfigurationRepo string // A custom config repo + NetworkType string } func (o *DevStackOptions) Options() []ConfigOption { - return []ConfigOption{ + opts := []ConfigOption{ WithNumberOfHybridNodes(o.NumberOfHybridNodes), WithNumberOfRequesterOnlyNodes(o.NumberOfRequesterOnlyNodes), WithNumberOfComputeOnlyNodes(o.NumberOfComputeOnlyNodes), @@ -61,7 +62,9 @@ func (o *DevStackOptions) Options() []ConfigOption { WithAllowListedLocalPaths(o.AllowListedLocalPaths), WithNodeInfoPublisherInterval(o.NodeInfoPublisherInterval), WithExecutorPlugins(o.ExecutorPlugins), + WithNetworkType(o.NetworkType), } + return opts } type DevStack struct { @@ -93,6 +96,8 @@ func Setup( defer span.End() var nodes []*node.Node + orchestratorAddrs := make([]string, 0) + clusterPeersAddrs := make([]string, 0) totalNodeCount := stackConfig.NumberOfHybridNodes + stackConfig.NumberOfRequesterOnlyNodes + stackConfig.NumberOfComputeOnlyNodes requesterNodeCount := stackConfig.NumberOfHybridNodes + stackConfig.NumberOfRequesterOnlyNodes @@ -101,7 +106,20 @@ func Setup( if requesterNodeCount == 0 { return nil, fmt.Errorf("at least one requester node is required") } + + // Enable testing using different network stacks by setting env variable + if stackConfig.NetworkType == "" { + networkType, ok := os.LookupEnv("BACALHAU_NODE_NETWORK_TYPE") + if !ok { + networkType = models.NetworkTypeLibp2p + } + stackConfig.NetworkType = networkType + } + for i := 0; i < totalNodeCount; i++ { + nodeID := fmt.Sprintf("node-%d", i) + ctx = logger.ContextWithNodeIDLogger(ctx, nodeID) + isRequesterNode := i < requesterNodeCount isComputeNode := (totalNodeCount - i) <= computeNodeCount log.Ctx(ctx).Debug().Msgf(`Creating Node #%d as {RequesterNode: %t, ComputeNode: %t}`, i+1, isRequesterNode, isComputeNode) @@ -128,57 +146,66 @@ func Setup( } // //////////////////////////////////// - // libp2p + // Transport layer (NATS or Libp2p) // //////////////////////////////////// - var libp2pPeer []multiaddr.Multiaddr - libp2pPort, err := freeport.GetFreePort() - if err != nil { - return nil, err + var swarmPort int + if os.Getenv("PREDICTABLE_API_PORT") != "" { + const startSwarmPort = 4222 // 4222 is the default NATS port + swarmPort = startSwarmPort + i + } else { + swarmPort, err = freeport.GetFreePort() + if err != nil { + return nil, err + } + } + clusterConfig := node.NetworkConfig{ + Type: stackConfig.NetworkType, + Orchestrators: orchestratorAddrs, + Port: swarmPort, + ClusterPeers: clusterPeersAddrs, } - if i == 0 { - if stackConfig.Peer != "" { - // connect 0'th node to external peer if specified - log.Ctx(ctx).Debug().Msgf("Connecting 0'th node to remote peer: %s", stackConfig.Peer) - peerAddr, addrErr := multiaddr.NewMultiaddr(stackConfig.Peer) - if addrErr != nil { - return nil, fmt.Errorf("failed to parse peer address: %w", addrErr) + if stackConfig.NetworkType == models.NetworkTypeNATS { + var clusterPort int + if os.Getenv("PREDICTABLE_API_PORT") != "" { + const startClusterPort = 6222 + clusterPort = startClusterPort + i + } else { + clusterPort, err = freeport.GetFreePort() + if err 
!= nil { + return nil, err } - libp2pPeer = append(libp2pPeer, peerAddr) + } + + if isRequesterNode { + clusterConfig.ClusterName = "devstack" + clusterConfig.ClusterPort = clusterPort + orchestratorAddrs = append(orchestratorAddrs, fmt.Sprintf("127.0.0.1:%d", swarmPort)) + clusterPeersAddrs = append(clusterPeersAddrs, fmt.Sprintf("127.0.0.1:%d", clusterPort)) } } else { - p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodes[0].Host.ID().String()) + if i == 0 { + if stackConfig.Peer != "" { + clusterConfig.ClusterPeers = append(clusterConfig.ClusterPeers, stackConfig.Peer) + } + } else { + p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + nodes[0].Libp2pHost.ID().String()) + if err != nil { + return nil, err + } + addresses := multiaddresses.SortLocalhostFirst(nodes[0].Libp2pHost.Addrs()) + clusterConfig.ClusterPeers = append(clusterConfig.ClusterPeers, addresses[0].Encapsulate(p2pAddr).String()) + } + + clusterConfig.Libp2pHost, err = createLibp2pHost(ctx, cm, swarmPort) if err != nil { return nil, err } - addresses := multiaddresses.SortLocalhostFirst(nodes[0].Host.Addrs()) - // Only use a single address as libp2p seems to have concurrency issues, like two nodes not able to finish - // connecting/joining topics, when using multiple addresses for a single host. - libp2pPeer = append(libp2pPeer, addresses[0].Encapsulate(p2pAddr)) - log.Ctx(ctx).Debug().Msgf("Connecting to first libp2p requester node: %s", libp2pPeer) - } - - // TODO(forrest): [devstack] Refactor the devstack s.t. each node has its own repo and config. - // previously the config would generate a key using the host port as the postfix - // this is not longer the case as a node should have a single libp2p key, but since - // all devstack nodes share a repo we will get a self dial error if we use the same - // key from the config for each devstack node. The solution here is to refactor the - // the devstack such that all nodes in the stack have their own repos and configuration - // rather than rely on global values and one off key gen via the config. - // Creates a new RSA key pair for this host. 
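Because Setup falls back to the BACALHAU_NODE_NETWORK_TYPE environment variable when stackConfig.NetworkType is empty, a test can exercise the NATS path without adding flags. A sketch, with a hypothetical test name:

func TestDevstackOverNATS(t *testing.T) {
	t.Setenv("BACALHAU_NODE_NETWORK_TYPE", models.NetworkTypeNATS)
	// ...spin up the devstack as usual; Setup reads the variable because
	// no explicit network type was configured.
}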
- privKey, err := bac_libp2p.GeneratePrivateKey(DefaultLibp2pKeySize) - if err != nil { - return nil, err + // nodeID must match the libp2p host ID + nodeID = clusterConfig.Libp2pHost.ID().String() + ctx = logger.ContextWithNodeIDLogger(ctx, nodeID) } - libp2pHost, err := bac_libp2p.NewHost(libp2pPort, privKey) - if err != nil { - return nil, err - } - cm.RegisterCallback(libp2pHost.Close) - - // add NodeID to logging context - ctx = logger.ContextWithNodeIDLogger(ctx, libp2pHost.ID().String()) // //////////////////////////////////// // port for API @@ -212,9 +239,9 @@ func Setup( } nodeConfig := node.NodeConfig{ + NodeID: nodeID, IPFSClient: ipfsNode.Client(), CleanupManager: cm, - Host: libp2pHost, HostAddress: "0.0.0.0", APIPort: apiPort, ComputeConfig: stackConfig.ComputeConfig, @@ -222,8 +249,8 @@ func Setup( IsComputeNode: isComputeNode, IsRequesterNode: isRequesterNode, Labels: map[string]string{ + "id": nodeID, "name": fmt.Sprintf("node-%d", i), - "id": libp2pHost.ID().String(), "env": "devstack", }, DependencyInjector: stackConfig.NodeDependencyInjector, @@ -232,6 +259,7 @@ func Setup( NodeInfoPublisherInterval: nodeInfoPublisherInterval, FsRepo: fsRepo, NodeInfoStoreTTL: stackConfig.NodeInfoStoreTTL, + NetworkConfig: clusterConfig, } if isRequesterNode && stackConfig.TLS.Certificate != "" && stackConfig.TLS.Key != "" { @@ -257,12 +285,6 @@ func Setup( return nil, err } - // Start transport layer - err = bac_libp2p.ConnectToPeersContinuouslyWithRetryDuration(ctx, cm, libp2pHost, libp2pPeer, 2*time.Second) - if err != nil { - return nil, err - } - // start the node err = n.Start(ctx) if err != nil { @@ -284,6 +306,30 @@ func Setup( }, nil } +func createLibp2pHost(ctx context.Context, cm *system.CleanupManager, port int) (host.Host, error) { + var err error + + // TODO(forrest): [devstack] Refactor the devstack s.t. each node has its own repo and config. + // previously the config would generate a key using the host port as the postfix + // this is not longer the case as a node should have a single libp2p key, but since + // all devstack nodes share a repo we will get a self dial error if we use the same + // key from the config for each devstack node. The solution here is to refactor the + // the devstack such that all nodes in the stack have their own repos and configuration + // rather than rely on global values and one off key gen via the config. 
+ + privKey, err := bac_libp2p.GeneratePrivateKey(DefaultLibp2pKeySize) + if err != nil { + return nil, err + } + + libp2pHost, err := bac_libp2p.NewHost(port, privKey) + if err != nil { + return nil, fmt.Errorf("error creating libp2p host: %w", err) + } + + return libp2pHost, nil +} + func createIPFSNode(ctx context.Context, cm *system.CleanupManager, publicIPFSMode bool, @@ -326,34 +372,8 @@ func (stack *DevStack) PrintNodeInfo(ctx context.Context, fsRepo *repo.FsRepo, c swarmAddrrs = strings.Join(swarmAddresses, ",") } - var libp2pPeer []string - for _, addrs := range node.Host.Addrs() { - p2pAddr, p2pAddrErr := multiaddr.NewMultiaddr("/p2p/" + node.Host.ID().String()) - if p2pAddrErr != nil { - return "", p2pAddrErr - } - libp2pPeer = append(libp2pPeer, addrs.Encapsulate(p2pAddr).String()) - } - devstackPeerAddr := strings.Join(libp2pPeer, ",") - if len(libp2pPeer) > 0 { - chosen := false - preferredAddress := config.PreferredAddress() - if preferredAddress != "" { - for _, addr := range libp2pPeer { - if strings.Contains(addr, preferredAddress) { - devstackPeerAddrs = append(devstackPeerAddrs, addr) - chosen = true - break - } - } - } - - if !chosen { - // only add one of the addrs for this peer and we will choose the first - // in the absence of a preference - devstackPeerAddrs = append(devstackPeerAddrs, libp2pPeer[0]) - } - } + peerConnect := fmt.Sprintf("/ip4/%s/tcp/%d/http", node.APIServer.Address, node.APIServer.Port) + devstackPeerAddrs = append(devstackPeerAddrs, peerConnect) logString += fmt.Sprintf(` export BACALHAU_IPFS_%d=%s @@ -366,7 +386,7 @@ export BACALHAU_API_PORT_%d=%d`, nodeIndex, swarmAddrrs, nodeIndex, - devstackPeerAddr, + peerConnect, nodeIndex, stack.Nodes[nodeIndex].APIServer.Address, nodeIndex, @@ -449,7 +469,7 @@ The above variables were also written to this file (will be deleted when devstac func (stack *DevStack) GetNode(_ context.Context, nodeID string) ( *node.Node, error) { for _, node := range stack.Nodes { - if node.Host.ID().String() == nodeID { + if node.ID == nodeID { return node, nil } } @@ -467,7 +487,7 @@ func (stack *DevStack) IPFSClients() []ipfs.Client { func (stack *DevStack) GetNodeIds() []string { var ids []string for _, node := range stack.Nodes { - ids = append(ids, node.Host.ID().String()) + ids = append(ids, node.ID) } return ids } diff --git a/pkg/devstack/option.go b/pkg/devstack/option.go index 73d2570253..b2a7213028 100644 --- a/pkg/devstack/option.go +++ b/pkg/devstack/option.go @@ -75,6 +75,7 @@ type DevStackConfig struct { ExecutorPlugins bool // when true pluggable executors will be used. NodeInfoStoreTTL time.Duration TLS DevstackTLSSettings + NetworkType string } func (o *DevStackConfig) MarshalZerologObject(e *zerolog.Event) { @@ -90,7 +91,8 @@ func (o *DevStackConfig) MarshalZerologObject(e *zerolog.Event) { Strs("AllowListedLocalPaths", o.AllowListedLocalPaths). Str("NodeInfoPublisherInterval", fmt.Sprintf("%v", o.NodeInfoPublisherInterval)). Bool("PublicIPFSMode", o.PublicIPFSMode). - Bool("ExecutorPlugins", o.ExecutorPlugins) + Bool("ExecutorPlugins", o.ExecutorPlugins). 
+ Str("NetworkType", o.NetworkType) } func (o *DevStackConfig) Validate() error { @@ -220,6 +222,12 @@ func WithExecutorPlugins(enabled bool) ConfigOption { } } +func WithNetworkType(typ string) ConfigOption { + return func(cfg *DevStackConfig) { + cfg.NetworkType = typ + } +} + func WithSelfSignedCertificate(cert string, key string) ConfigOption { return func(cfg *DevStackConfig) { cfg.TLS = DevstackTLSSettings{ diff --git a/pkg/lib/validate/general.go b/pkg/lib/validate/general.go new file mode 100644 index 0000000000..9c1266e4eb --- /dev/null +++ b/pkg/lib/validate/general.go @@ -0,0 +1,18 @@ +package validate + +import "reflect" + +// IsNotNil checks if the provided value is not nil. +// Returns an error if the value is nil, using the provided message and arguments. +func IsNotNil(value any, msg string, args ...any) error { + if value == nil { + return createError(msg, args...) + } + + // Use reflection to handle cases where value is a nil pointer wrapped in an interface + val := reflect.ValueOf(value) + if val.Kind() == reflect.Ptr && val.IsNil() { + return createError(msg, args...) + } + return nil +} diff --git a/pkg/lib/validate/general_test.go b/pkg/lib/validate/general_test.go new file mode 100644 index 0000000000..bc9a4e351d --- /dev/null +++ b/pkg/lib/validate/general_test.go @@ -0,0 +1,38 @@ +//go:build unit || !integration + +package validate + +import "testing" + +// TestIsNotNil tests the IsNotNil function for various scenarios. +func TestIsNotNil(t *testing.T) { + t.Run("NilValue", func(t *testing.T) { + err := IsNotNil(nil, "value should not be nil") + if err == nil { + t.Errorf("IsNotNil failed: expected error for nil value") + } + }) + + t.Run("NonNilValue", func(t *testing.T) { + err := IsNotNil(42, "value should not be nil") + if err != nil { + t.Errorf("IsNotNil failed: unexpected error for non-nil value") + } + }) + + t.Run("NilPointer", func(t *testing.T) { + var nilPointer *int + err := IsNotNil(nilPointer, "value should not be nil") + if err == nil { + t.Errorf("IsNotNil failed: expected error for nil pointer") + } + }) + + t.Run("NonNilPointer", func(t *testing.T) { + nonNilPointer := new(int) + err := IsNotNil(nonNilPointer, "value should not be nil") + if err != nil { + t.Errorf("IsNotNil failed: unexpected error for non-nil pointer") + } + }) +} diff --git a/pkg/lib/validate/numbers.go b/pkg/lib/validate/numbers.go new file mode 100644 index 0000000000..67f3d7cd76 --- /dev/null +++ b/pkg/lib/validate/numbers.go @@ -0,0 +1,15 @@ +package validate + +import ( + "github.com/bacalhau-project/bacalhau/pkg/lib/math" +) + +// IsGreaterThanZero checks if the provided numeric value (of type T) is greater than zero. +// It returns an error if the value is not greater than zero, using the provided message and arguments. +// T is a generic type constrained to math.Number, allowing the function to work with various numeric types. +func IsGreaterThanZero[T math.Number](value T, msg string, args ...any) error { + if value <= 0 { + return createError(msg, args...) 
+ } + return nil +} diff --git a/pkg/lib/validate/numbers_test.go b/pkg/lib/validate/numbers_test.go new file mode 100644 index 0000000000..47d8722cc7 --- /dev/null +++ b/pkg/lib/validate/numbers_test.go @@ -0,0 +1,34 @@ +//go:build unit || !integration + +package validate + +import ( + "testing" +) + +func TestIsGreaterThanZero(t *testing.T) { + // Test with value less than zero + err := IsGreaterThanZero(-1, "value should be greater than zero") + if err == nil || err.Error() != "value should be greater than zero" { + t.Errorf("IsGreaterThanZero failed: expected error for value -1") + } + + // Test with zero + err = IsGreaterThanZero(0, "value should be greater than zero") + if err == nil || err.Error() != "value should be greater than zero" { + t.Errorf("IsGreaterThanZero failed: expected error for value 0") + } + + // Test with value greater than zero + err = IsGreaterThanZero(1, "value should be greater than zero") + if err != nil { + t.Errorf("IsGreaterThanZero failed: unexpected error for value 1") + } + + // Test with different numeric types + var floatValue float64 = 1.5 + err = IsGreaterThanZero(floatValue, "value should be greater than zero") + if err != nil { + t.Errorf("IsGreaterThanZero failed: unexpected error for float value %v", floatValue) + } +} diff --git a/pkg/lib/validate/util.go b/pkg/lib/validate/util.go new file mode 100644 index 0000000000..cc445e4d47 --- /dev/null +++ b/pkg/lib/validate/util.go @@ -0,0 +1,14 @@ +package validate + +import "fmt" + +// createError constructs an error with a formatted message. +// 'msg' is a format string and 'args' are the values to be formatted into the message. +func createError(msg string, args ...any) error { + if len(args) == 0 { + // If no arguments, return the message as-is. + return fmt.Errorf(msg) + } + // If arguments are provided, format the message. + return fmt.Errorf(msg, args...) 
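The reflection branch in IsNotNil above exists because a typed nil pointer stored in an interface does not compare equal to nil; a small illustration, not part of the patch:

var p *int    // nil pointer
var v any = p // the interface now holds a typed nil
fmt.Println(v == nil)                           // false
fmt.Println(IsNotNil(v, "value is nil") != nil) // true: caught by the reflect check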
+} diff --git a/pkg/lib/validate/util_test.go b/pkg/lib/validate/util_test.go new file mode 100644 index 0000000000..eb9d21b60e --- /dev/null +++ b/pkg/lib/validate/util_test.go @@ -0,0 +1,29 @@ +//go:build unit || !integration + +package validate + +import ( + "strings" + "testing" +) + +func TestCreateError(t *testing.T) { + // Test with no arguments + err := createError("simple error") + if err == nil || err.Error() != "simple error" { + t.Errorf("createError failed: expected 'simple error', got '%v'", err) + } + + // Test with arguments + err = createError("error with argument: %v", 42) + if err == nil || !strings.Contains(err.Error(), "42") { + t.Errorf("createError failed: expected string containing '42', got '%v'", err) + } + + // Test with multiple arguments + err = createError("error with multiple arguments: %v %s", 42, "test") + expectedMsg := "error with multiple arguments: 42 test" + if err == nil || err.Error() != expectedMsg { + t.Errorf("createError failed: expected '%s', got '%v'", expectedMsg, err) + } +} diff --git a/pkg/libp2p/host.go b/pkg/libp2p/host.go index ab9990be00..1e2dcdd9fa 100644 --- a/pkg/libp2p/host.go +++ b/pkg/libp2p/host.go @@ -78,6 +78,9 @@ func ConnectToPeersContinuouslyWithRetryDuration( peers []multiaddr.Multiaddr, tickDuration time.Duration, ) error { + if tickDuration == 0 { + tickDuration = continuouslyConnectPeersLoopDelay + } if err := connectToPeers(ctx, h, peers); err != nil { return err } diff --git a/pkg/libp2p/info_decorator.go b/pkg/libp2p/info_decorator.go new file mode 100644 index 0000000000..7c796c4960 --- /dev/null +++ b/pkg/libp2p/info_decorator.go @@ -0,0 +1,38 @@ +package libp2p + +import ( + "context" + + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/p2p/protocol/identify" +) + +type PeerInfoDecoratorParams struct { + Host host.Host + IdentityService identify.IDService +} + +type PeerInfoDecorator struct { + host host.Host + identityService identify.IDService +} + +func NewPeerInfoDecorator(params PeerInfoDecoratorParams) *PeerInfoDecorator { + return &PeerInfoDecorator{ + host: params.Host, + identityService: params.IdentityService, + } +} + +func (l *PeerInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo models.NodeInfo) models.NodeInfo { + nodeInfo.PeerInfo = &peer.AddrInfo{ + ID: l.host.ID(), + Addrs: l.identityService.OwnObservedAddrs(), + } + return nodeInfo +} + +// compile-time check whether the PeerInfoDecorator implements the PeerInfoDecorator interface. 
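For context, a decorator like this is applied to a node's NodeInfo before it is published to the network. A rough sketch of manual application, where nodeID, ctx and peerInfoDecorator are assumed to already exist:

info := models.NodeInfo{NodeID: nodeID}
for _, d := range []models.NodeInfoDecorator{peerInfoDecorator, models.NoopNodeInfoDecorator{}} {
	info = d.DecorateNodeInfo(ctx, info)
}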
+var _ models.NodeInfoDecorator = (*PeerInfoDecorator)(nil) diff --git a/pkg/libp2p/transport/libp2p.go b/pkg/libp2p/transport/libp2p.go new file mode 100644 index 0000000000..b69d8d34c3 --- /dev/null +++ b/pkg/libp2p/transport/libp2p.go @@ -0,0 +1,217 @@ +package transport + +import ( + "context" + "fmt" + "time" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" + libp2p_host "github.com/bacalhau-project/bacalhau/pkg/libp2p" + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/pubsub/libp2p" + "github.com/bacalhau-project/bacalhau/pkg/routing" + "github.com/bacalhau-project/bacalhau/pkg/system" + core_transport "github.com/bacalhau-project/bacalhau/pkg/transport" + "github.com/bacalhau-project/bacalhau/pkg/transport/bprotocol" + "github.com/hashicorp/go-multierror" + libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" + "github.com/libp2p/go-libp2p/core/host" + basichost "github.com/libp2p/go-libp2p/p2p/host/basic" + routedhost "github.com/libp2p/go-libp2p/p2p/host/routed" + "github.com/libp2p/go-libp2p/p2p/protocol/identify" + "github.com/multiformats/go-multiaddr" +) + +const NodeInfoTopic = "bacalhau-node-info" + +type Libp2pTransportConfig struct { + Host host.Host + Peers []string + ReconnectDelay time.Duration + CleanupManager *system.CleanupManager +} + +func (c *Libp2pTransportConfig) Validate() error { + var mErr *multierror.Error + mErr = multierror.Append(mErr, validate.IsNotNil(c.Host, "libp2p host cannot be nil")) + mErr = multierror.Append(mErr, validate.IsNotNil(c.CleanupManager, "cleanupManager cannot be nil")) + return mErr.ErrorOrNil() +} + +type Libp2pTransport struct { + Host host.Host + computeProxy *bprotocol.ComputeProxy + callbackProxy *bprotocol.CallbackProxy + nodeInfoPubSub pubsub.PubSub[models.NodeInfo] + nodeInfoDecorator models.NodeInfoDecorator +} + +func NewLibp2pTransport(ctx context.Context, + config Libp2pTransportConfig, + nodeInfoStore routing.NodeInfoStore) (*Libp2pTransport, error) { + if err := config.Validate(); err != nil { + return nil, fmt.Errorf("error validating libp2p transport config. %w", err) + } + + // Monkey patch the identify protocol to allow discovering advertised addresses of networks of 3 or more nodes, instead of 5. + // Setting the value to 2 means two other nodes must see the same addr for a node to discover its observed addr, which enables a network + // of at least 3 nodes. 
+ identify.ActivationThresh = 2 + + libp2pHost := config.Host + + // A single gossipSub instance that will be used by all topics + gossipSub, err := newLibp2pPubSub(ctx, libp2pHost) + if err != nil { + return nil, err + } + + // PubSub to publish node info to the network + nodeInfoPubSub, err := libp2p.NewPubSub[models.NodeInfo](libp2p.PubSubParams{ + Host: libp2pHost, + TopicName: NodeInfoTopic, + PubSub: gossipSub, + }) + if err != nil { + return nil, err + } + + // node info provider + basicHost, ok := libp2pHost.(*basichost.BasicHost) + if !ok { + return nil, fmt.Errorf("host is not a basic host") + } + + peerInfoDecorator := libp2p_host.NewPeerInfoDecorator(libp2p_host.PeerInfoDecoratorParams{ + Host: basicHost, + IdentityService: basicHost.IDService(), + }) + + libp2pHost = routedhost.Wrap(libp2pHost, nodeInfoStore) + + // register consumers of node info published over gossipSub + nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) + nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](nodeInfoStore.Add)) + err = nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) + if err != nil { + return nil, err + } + + // compute proxy + computeProxy := bprotocol.NewComputeProxy(bprotocol.ComputeProxyParams{ + Host: libp2pHost, + }) + + // Callback to send compute events (i.e. requester endpoint) + computeCallback := bprotocol.NewCallbackProxy(bprotocol.CallbackProxyParams{ + Host: libp2pHost, + }) + + var libp2pPeer []multiaddr.Multiaddr + for _, addr := range config.Peers { + maddr, err := multiaddr.NewMultiaddr(addr) + if err != nil { + return nil, err + } + libp2pPeer = append(libp2pPeer, maddr) + } + + err = libp2p_host.ConnectToPeersContinuouslyWithRetryDuration( + ctx, config.CleanupManager, libp2pHost, libp2pPeer, config.ReconnectDelay) + if err != nil { + return nil, err + } + + return &Libp2pTransport{ + Host: libp2pHost, + computeProxy: computeProxy, + callbackProxy: computeCallback, + nodeInfoPubSub: nodeInfoPubSub, + nodeInfoDecorator: peerInfoDecorator, + }, nil +} + +// RegisterComputeCallback registers a compute callback with the transport layer. +func (t *Libp2pTransport) RegisterComputeCallback(callback compute.Callback) error { + bprotocol.NewCallbackHandler(bprotocol.CallbackHandlerParams{ + Host: t.Host, + Callback: callback, + }) + // To enable nodes self-dialing themselves as libp2p doesn't support it. + t.callbackProxy.RegisterLocalComputeCallback(callback) + + return nil +} + +// RegisterComputeEndpoint registers a compute endpoint with the transport layer. +func (t *Libp2pTransport) RegisterComputeEndpoint(endpoint compute.Endpoint) error { + bprotocol.NewComputeHandler(bprotocol.ComputeHandlerParams{ + Host: t.Host, + ComputeEndpoint: endpoint, + }) + // To enable nodes self-dialing themselves as libp2p doesn't support it. + t.computeProxy.RegisterLocalComputeEndpoint(endpoint) + + return nil +} + +// ComputeProxy returns the compute proxy. +func (t *Libp2pTransport) ComputeProxy() compute.Endpoint { + return t.computeProxy +} + +// CallbackProxy returns the callback proxy. +func (t *Libp2pTransport) CallbackProxy() compute.Callback { + return t.callbackProxy +} + +// NodeInfoPubSub returns the node info pubsub. +func (t *Libp2pTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] { + return t.nodeInfoPubSub +} + +// NodeInfoDecorator returns the node info decorator. +func (t *Libp2pTransport) NodeInfoDecorator() models.NodeInfoDecorator { + return t.nodeInfoDecorator +} + +// DebugInfoProviders returns the debug info. 
+func (t *Libp2pTransport) DebugInfoProviders() []model.DebugInfoProvider { + return []model.DebugInfoProvider{} +} + +// Close closes the transport layer. +func (t *Libp2pTransport) Close(ctx context.Context) error { + var errors *multierror.Error + errors = multierror.Append(errors, t.nodeInfoPubSub.Close(ctx)) + errors = multierror.Append(errors, t.Host.Close()) + return errors.ErrorOrNil() +} + +func newLibp2pPubSub(ctx context.Context, host host.Host) (*libp2p_pubsub.PubSub, error) { + tracer, err := libp2p_pubsub.NewJSONTracer(pkgconfig.GetLibp2pTracerPath()) + if err != nil { + return nil, err + } + + pgParams := libp2p_pubsub.NewPeerGaterParams( + 0.33, //nolint:gomnd + libp2p_pubsub.ScoreParameterDecay(2*time.Minute), //nolint:gomnd + libp2p_pubsub.ScoreParameterDecay(10*time.Minute), //nolint:gomnd + ) + + return libp2p_pubsub.NewGossipSub( + ctx, + host, + libp2p_pubsub.WithPeerExchange(true), + libp2p_pubsub.WithPeerGater(pgParams), + libp2p_pubsub.WithEventTracer(tracer), + ) +} + +// compile-time interface check +var _ core_transport.TransportLayer = (*Libp2pTransport)(nil) diff --git a/pkg/models/constants.go b/pkg/models/constants.go index fac87da4cc..58822f604e 100644 --- a/pkg/models/constants.go +++ b/pkg/models/constants.go @@ -47,6 +47,11 @@ const ( PublisherS3 = "s3" ) +const ( + NetworkTypeNATS = "nats" + NetworkTypeLibp2p = "libp2p" +) + const ( DownloadFilenameStdout = "stdout" DownloadFilenameStderr = "stderr" @@ -57,10 +62,9 @@ const ( ) const ( - MetaReservedPrefix = "bacalhau.org/" - MetaRequesterID = "bacalhau.org/requester.id" - MetaRequesterPublicKey = "bacalhau.org/requester.publicKey" - MetaClientID = "bacalhau.org/client.id" + MetaReservedPrefix = "bacalhau.org/" + MetaRequesterID = "bacalhau.org/requester.id" + MetaClientID = "bacalhau.org/client.id" // Job provenance metadata used to track the origin of a job where // it may have been translated from another job. 
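Putting the transport pieces above together, a caller would construct the libp2p transport roughly as follows; the host, cleanup manager and node info store are assumed to already exist, and the import alias libp2p_transport and the reconnect delay are illustrative:

tl, err := libp2p_transport.NewLibp2pTransport(ctx, libp2p_transport.Libp2pTransportConfig{
	Host:           libp2pHost,
	Peers:          bootstrapPeers, // multiaddr strings of peers to dial
	ReconnectDelay: 2 * time.Second,
	CleanupManager: cm,
}, nodeInfoStore)
if err != nil {
	return err
}
defer func() { _ = tl.Close(ctx) }()

A compute node would then hand its endpoint to RegisterComputeEndpoint, and a requester its callback to RegisterComputeCallback.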
diff --git a/pkg/models/migration/legacy/from.go b/pkg/models/migration/legacy/from.go index 99e1549465..dbe5f9f16b 100644 --- a/pkg/models/migration/legacy/from.go +++ b/pkg/models/migration/legacy/from.go @@ -28,7 +28,6 @@ func FromLegacyJob(legacy *model.Job) (*models.Job, error) { metadata := make(map[string]string) metadata[models.MetaRequesterID] = legacy.Metadata.Requester.RequesterNodeID - metadata[models.MetaRequesterPublicKey] = legacy.Metadata.Requester.RequesterPublicKey.String() metadata[models.MetaClientID] = legacy.Metadata.ClientID labels := make(map[string]string) diff --git a/pkg/models/migration/legacy/to.go b/pkg/models/migration/legacy/to.go index 9dc1247d5b..ff250aa87d 100644 --- a/pkg/models/migration/legacy/to.go +++ b/pkg/models/migration/legacy/to.go @@ -10,12 +10,6 @@ import ( ) func ToLegacyJob(job *models.Job) (*model.Job, error) { - pk := new(model.PublicKey) - err := pk.UnmarshalText([]byte(job.Meta[models.MetaRequesterPublicKey])) - if err != nil { - return nil, err - } - spec, err := ToLegacyJobSpec(job) if err != nil { return nil, err @@ -28,8 +22,7 @@ func ToLegacyJob(job *models.Job) (*model.Job, error) { CreatedAt: time.Unix(0, job.CreateTime), ClientID: job.Meta[models.MetaClientID], Requester: model.JobRequester{ - RequesterNodeID: job.Meta[models.MetaRequesterID], - RequesterPublicKey: *pk, + RequesterNodeID: job.Meta[models.MetaRequesterID], }, }, Spec: *spec, diff --git a/pkg/models/node_info.go b/pkg/models/node_info.go index 7d7a72aac0..778bcc199e 100644 --- a/pkg/models/node_info.go +++ b/pkg/models/node_info.go @@ -42,10 +42,6 @@ type NodeInfoProvider interface { GetNodeInfo(ctx context.Context) NodeInfo } -type ComputeNodeInfoProvider interface { - GetComputeInfo(ctx context.Context) ComputeNodeInfo -} - type LabelsProvider interface { GetLabels(ctx context.Context) map[string]string } @@ -67,8 +63,22 @@ func MergeLabelsInOrder(providers ...LabelsProvider) LabelsProvider { return mergeProvider{providers: providers} } +type NodeInfoDecorator interface { + DecorateNodeInfo(ctx context.Context, nodeInfo NodeInfo) NodeInfo +} + +// NoopNodeInfoDecorator is a decorator that does nothing +type NoopNodeInfoDecorator struct{} + +func (n NoopNodeInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo NodeInfo) NodeInfo { + return nodeInfo +} + +// NodeInfo +// TODO: add Validate() method to NodeInfo and make sure it is called in all the places where it is initialized type NodeInfo struct { - PeerInfo peer.AddrInfo `json:"PeerInfo"` + NodeID string `json:"NodeID"` + PeerInfo *peer.AddrInfo `json:"PeerInfo,omitempty" yaml:",omitempty"` NodeType NodeType `json:"NodeType"` Labels map[string]string `json:"Labels"` ComputeNodeInfo *ComputeNodeInfo `json:"ComputeNodeInfo,omitempty" yaml:",omitempty"` @@ -77,7 +87,12 @@ type NodeInfo struct { // ID returns the node ID func (n NodeInfo) ID() string { - return n.PeerInfo.ID.String() + if n.NodeID != "" { + return n.NodeID + } else if n.PeerInfo != nil { + return n.PeerInfo.ID.String() + } + return "" } // IsComputeNode returns true if the node is a compute node diff --git a/pkg/nats/client.go b/pkg/nats/client.go new file mode 100644 index 0000000000..9cc837e334 --- /dev/null +++ b/pkg/nats/client.go @@ -0,0 +1,61 @@ +package nats + +import ( + "context" + + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/nats-io/nats.go" +) + +type ClientManagerParams struct { + Name string + Servers string +} + +type ClientManager struct { + Client *nats.Conn +} + +// NewClientManager is a helper function 
to create a NATS client connection with a given name and servers string +func NewClientManager(ctx context.Context, params ClientManagerParams) (*ClientManager, error) { + nc, err := nats.Connect(params.Servers, nats.Name(params.Name)) + if err != nil { + return nil, err + } + return &ClientManager{ + Client: nc, + }, nil +} + +// Stop stops the NATS client +func (cm *ClientManager) Stop() { + cm.Client.Close() +} + +// DebugInfo returns the debug info of the NATS client +func (cm *ClientManager) GetDebugInfo(ctx context.Context) (model.DebugInfo, error) { + stats := cm.Client.Stats() + servers := cm.Client.Servers() + buffered, err := cm.Client.Buffered() + if err != nil { + return model.DebugInfo{}, err + } + + return model.DebugInfo{ + Component: "NATSClient", + Info: map[string]interface{}{ + "Name": cm.Client.Opts.Name, + "Stats": stats, + "Servers": servers, + "Buffered": buffered, + "Connection": map[string]interface{}{ + "IsConnected": cm.Client.IsConnected(), + "Addr": cm.Client.ConnectedAddr(), + "Url": cm.Client.ConnectedUrl(), + "ServerId": cm.Client.ConnectedServerId(), + "ServerName": cm.Client.ConnectedServerName(), + "ClusterName": cm.Client.ConnectedClusterName(), + }, + }, + }, nil +} diff --git a/pkg/nats/logger.go b/pkg/nats/logger.go new file mode 100644 index 0000000000..98adc24e37 --- /dev/null +++ b/pkg/nats/logger.go @@ -0,0 +1,51 @@ +package nats + +import ( + "github.com/nats-io/nats-server/v2/server" + "github.com/rs/zerolog" +) + +// ZeroLogger is a wrapper around zerolog.Logger to implement the NATS Logger interface +type ZeroLogger struct { + logger zerolog.Logger + serverID string +} + +// NewZeroLogger creates a new ZeroLogger +func NewZeroLogger(logger zerolog.Logger, serverID string) ZeroLogger { + return ZeroLogger{ + logger: logger, + serverID: serverID, + } +} + +func (l ZeroLogger) Noticef(format string, v ...interface{}) { + l.logWithLevel(zerolog.InfoLevel, format, v) +} + +func (l ZeroLogger) Warnf(format string, v ...interface{}) { + l.logWithLevel(zerolog.WarnLevel, format, v) +} + +func (l ZeroLogger) Fatalf(format string, v ...interface{}) { + l.logWithLevel(zerolog.FatalLevel, format, v) +} + +func (l ZeroLogger) Errorf(format string, v ...interface{}) { + l.logWithLevel(zerolog.ErrorLevel, format, v) +} + +func (l ZeroLogger) Debugf(format string, v ...interface{}) { + l.logWithLevel(zerolog.DebugLevel, format, v) +} + +func (l ZeroLogger) Tracef(format string, v ...interface{}) { + l.logWithLevel(zerolog.TraceLevel, format, v) +} + +func (l ZeroLogger) logWithLevel(level zerolog.Level, format string, v []interface{}) { + l.logger.WithLevel(level).Str("Server", l.serverID).Msgf(format, v...) +} + +// compile-time check whether the ZeroLogger implements the Logger interface +var _ server.Logger = (*ZeroLogger)(nil) diff --git a/pkg/nats/proxy/callback_handler.go b/pkg/nats/proxy/callback_handler.go new file mode 100644 index 0000000000..e739854342 --- /dev/null +++ b/pkg/nats/proxy/callback_handler.go @@ -0,0 +1,82 @@ +package proxy + +import ( + "context" + "encoding/json" + "reflect" + "strings" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +type CallbackHandlerParams struct { + Name string + Conn *nats.Conn + Callback compute.Callback +} + +// CallbackHandler is a handler for callback events that registers for incoming nats requests to Bacalhau callback +// protocol, and delegates the handling of the request to the provided callback. 
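A sketch of how a node obtains a NATS connection through this helper; the client name and server URL are illustrative, and nats_helper is the import alias used for pkg/nats elsewhere in this patch:

ncm, err := nats_helper.NewClientManager(ctx, nats_helper.ClientManagerParams{
	Name:    "compute-node-0",
	Servers: "nats://127.0.0.1:4222",
})
if err != nil {
	return err
}
defer ncm.Stop()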
+type CallbackHandler struct { + name string + conn *nats.Conn + callback compute.Callback +} + +type callbackHandler[Request any] func(context.Context, Request) + +func NewCallbackHandler(params CallbackHandlerParams) (*CallbackHandler, error) { + handler := &CallbackHandler{ + name: params.Name, + conn: params.Conn, + callback: params.Callback, + } + + subject := callbackSubscribeSubject(handler.name) + _, err := handler.conn.Subscribe(subject, func(m *nats.Msg) { + handler.handle(m) + }) + if err != nil { + return nil, err + } + log.Debug().Msgf("ComputeHandler %s subscribed to %s", handler.name, subject) + return handler, nil +} + +// handle handles incoming NATS messages. +func (h *CallbackHandler) handle(msg *nats.Msg) { + ctx := context.Background() + + subjectParts := strings.Split(msg.Subject, ".") + method := subjectParts[len(subjectParts)-1] + + switch method { + case OnBidComplete: + processCallback(ctx, msg, h.callback.OnBidComplete) + case OnRunComplete: + processCallback(ctx, msg, h.callback.OnRunComplete) + case OnCancelComplete: + processCallback(ctx, msg, h.callback.OnCancelComplete) + case OnComputeFailure: + processCallback(ctx, msg, h.callback.OnComputeFailure) + default: + // Noop, not subscribed to this method + return + } +} + +func processCallback[Request any]( + ctx context.Context, + msg *nats.Msg, + f callbackHandler[Request]) { + request := new(Request) + err := json.Unmarshal(msg.Data, request) + if err != nil { + log.Ctx(ctx).Error().Msgf("error decoding %s: %s", reflect.TypeOf(request), err) + return + } + + go f(ctx, *request) +} diff --git a/pkg/nats/proxy/callback_proxy.go b/pkg/nats/proxy/callback_proxy.go new file mode 100644 index 0000000000..8d34e05a9a --- /dev/null +++ b/pkg/nats/proxy/callback_proxy.go @@ -0,0 +1,74 @@ +package proxy + +import ( + "context" + "encoding/json" + "reflect" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/pkg/errors" + "github.com/rs/zerolog/log" +) + +type CallbackProxyParams struct { + Conn *nats.Conn +} + +// CallbackProxy is a proxy for a compute.Callback that can be used to send compute callbacks to the requester node, +// such as when the execution is completed or when a failure occurs. +// The proxy can forward callbacks to a remote requester node, or locally if the node is the requester and a +// LocalCallback is provided. 
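On the requester side the handler is registered once against the node's own subject; a sketch where callbackImpl is an assumed compute.Callback implementation and ncm a connected ClientManager:

_, err := proxy.NewCallbackHandler(proxy.CallbackHandlerParams{
	Name:     requesterNodeID,
	Conn:     ncm.Client,
	Callback: callbackImpl,
})
if err != nil {
	return err
}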
+type CallbackProxy struct { + conn *nats.Conn +} + +func NewCallbackProxy(params CallbackProxyParams) *CallbackProxy { + proxy := &CallbackProxy{ + conn: params.Conn, + } + return proxy +} + +func (p *CallbackProxy) OnBidComplete(ctx context.Context, result compute.BidResult) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnBidComplete, result) +} + +func (p *CallbackProxy) OnRunComplete(ctx context.Context, result compute.RunResult) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnRunComplete, result) +} + +func (p *CallbackProxy) OnCancelComplete(ctx context.Context, result compute.CancelResult) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnCancelComplete, result) +} + +func (p *CallbackProxy) OnComputeFailure(ctx context.Context, result compute.ComputeError) { + proxyCallbackRequest(ctx, p.conn, result.RoutingMetadata.TargetPeerID, OnComputeFailure, result) +} + +func proxyCallbackRequest( + ctx context.Context, + conn *nats.Conn, + destNodeID string, + method string, + request interface{}) { + // deserialize the request object + data, err := json.Marshal(request) + if err != nil { + log.Ctx(ctx).Error().Err(errors.WithStack(err)).Msgf("%s: failed to marshal request", reflect.TypeOf(request)) + return + } + + subject := callbackPublishSubject(destNodeID, method) + log.Ctx(ctx).Trace().Msgf("Sending request %+v to subject %s", request, subject) + + // We use Publish instead of Request as Orchestrator callbacks do not return a response, for now. + err = conn.Publish(subject, data) + if err != nil { + log.Ctx(ctx).Error().Err(err).Msgf("%s: failed to send callback to node %s", reflect.TypeOf(request), destNodeID) + return + } +} + +// Compile-time interface check: +var _ compute.Callback = (*CallbackProxy)(nil) diff --git a/pkg/nats/proxy/compute_handler.go b/pkg/nats/proxy/compute_handler.go new file mode 100644 index 0000000000..f3f83909b1 --- /dev/null +++ b/pkg/nats/proxy/compute_handler.go @@ -0,0 +1,118 @@ +package proxy + +import ( + "context" + "encoding/json" + "fmt" + "reflect" + "strings" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +// ComputeHandlerParams defines parameters for creating a new ComputeHandler. +type ComputeHandlerParams struct { + Name string + Conn *nats.Conn + ComputeEndpoint compute.Endpoint +} + +// ComputeHandler handles NATS messages for compute operations. +type ComputeHandler struct { + name string + conn *nats.Conn + computeEndpoint compute.Endpoint + subscription *nats.Subscription +} + +// handlerWithResponse represents a function that processes a request and returns a response. +type handlerWithResponse[Request, Response any] func(context.Context, Request) (Response, error) + +// NewComputeHandler creates a new ComputeHandler. +func NewComputeHandler(params ComputeHandlerParams) (*ComputeHandler, error) { + handler := &ComputeHandler{ + name: params.Name, + conn: params.Conn, + computeEndpoint: params.ComputeEndpoint, + } + + subject := computeEndpointSubscribeSubject(handler.name) + subscription, err := handler.conn.Subscribe(subject, func(m *nats.Msg) { + handleRequest(m, handler) + }) + if err != nil { + return nil, err + } + handler.subscription = subscription + log.Debug().Msgf("ComputeHandler %s subscribed to %s", handler.name, subject) + return handler, nil +} + +// handleRequest handles incoming NATS messages. 
+func handleRequest(msg *nats.Msg, handler *ComputeHandler) { + ctx := context.Background() + + subjectParts := strings.Split(msg.Subject, ".") + method := subjectParts[len(subjectParts)-1] + + switch method { + case AskForBid: + processAndRespond(ctx, msg, handler.computeEndpoint.AskForBid) + case BidAccepted: + processAndRespond(ctx, msg, handler.computeEndpoint.BidAccepted) + case BidRejected: + processAndRespond(ctx, msg, handler.computeEndpoint.BidRejected) + case CancelExecution: + processAndRespond(ctx, msg, handler.computeEndpoint.CancelExecution) + case ExecutionLogs: + processAndRespond(ctx, msg, handler.computeEndpoint.ExecutionLogs) + default: + // Noop, not subscribed to this method + return + } +} + +// processAndRespond processes the request and sends a response. +func processAndRespond[Request, Response any](ctx context.Context, msg *nats.Msg, f handlerWithResponse[Request, Response]) { + response, err := processRequest(ctx, msg, f) + if err != nil { + log.Ctx(ctx).Error().Err(err) + } + + // We will wrap up the response/error in a Result type which can be decoded by the proxy itself. + result := newResult(response, err) + + err = sendResponse(result, msg) + if err != nil { + log.Ctx(ctx).Error().Msgf("error sending response: %s", err) + } +} + +// processRequest decodes the request, invokes the handler, and returns the response. +func processRequest[Request, Response any]( + ctx context.Context, msg *nats.Msg, f handlerWithResponse[Request, Response]) (*Response, error) { + request := new(Request) + err := json.Unmarshal(msg.Data, request) + if err != nil { + return nil, fmt.Errorf("error decoding %s: %s", reflect.TypeOf(request).Name(), err) + } + + response, err := f(ctx, *request) + if err != nil { + return nil, fmt.Errorf("error in handler %s: %s", reflect.TypeOf(request).Name(), err) + } + + return &response, nil +} + +// sendResponse marshals the response and sends it back to the requester. +func sendResponse[Response any](result Result[Response], msg *nats.Msg) error { + resultData, err := json.Marshal(result) + if err != nil { + return fmt.Errorf("error encoding %s: %s", reflect.TypeOf(result.Response).Name(), err) + } + + return msg.Respond(resultData) +} diff --git a/pkg/nats/proxy/compute_proxy.go b/pkg/nats/proxy/compute_proxy.go new file mode 100644 index 0000000000..d0812e650a --- /dev/null +++ b/pkg/nats/proxy/compute_proxy.go @@ -0,0 +1,94 @@ +package proxy + +import ( + "context" + "encoding/json" + "fmt" + "reflect" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +type ComputeProxyParams struct { + Conn *nats.Conn +} + +// ComputeProxy is a proxy to a compute node endpoint that will forward requests to remote compute nodes, or +// to a local compute node if the target peer ID is the same as the local host, and a LocalEndpoint implementation +// is provided. 
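The proxy/handler pairs agree on subjects built by the helpers in constants.go further down. For an illustrative node ID, the resulting subjects are:

// Request/response path (compute endpoint):
computeEndpointPublishSubject("node-0", AskForBid) // "node.compute.node-0.AskForBid/v1"
computeEndpointSubscribeSubject("node-0")          // "node.compute.node-0.>"

// Fire-and-forget path (orchestrator callbacks):
callbackPublishSubject("node-0", OnRunComplete) // "node.orchestrator.node-0.OnRunComplete/v1"
callbackSubscribeSubject("node-0")              // "node.orchestrator.node-0.>"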
+type ComputeProxy struct { + conn *nats.Conn +} + +func NewComputeProxy(params ComputeProxyParams) *ComputeProxy { + proxy := &ComputeProxy{ + conn: params.Conn, + } + return proxy +} + +func (p *ComputeProxy) AskForBid(ctx context.Context, request compute.AskForBidRequest) (compute.AskForBidResponse, error) { + return proxyRequest[compute.AskForBidRequest, compute.AskForBidResponse]( + ctx, p.conn, request.TargetPeerID, AskForBid, request) +} + +func (p *ComputeProxy) BidAccepted(ctx context.Context, request compute.BidAcceptedRequest) (compute.BidAcceptedResponse, error) { + return proxyRequest[compute.BidAcceptedRequest, compute.BidAcceptedResponse]( + ctx, p.conn, request.TargetPeerID, BidAccepted, request) +} + +func (p *ComputeProxy) BidRejected(ctx context.Context, request compute.BidRejectedRequest) (compute.BidRejectedResponse, error) { + return proxyRequest[compute.BidRejectedRequest, compute.BidRejectedResponse]( + ctx, p.conn, request.TargetPeerID, BidRejected, request) +} + +func (p *ComputeProxy) CancelExecution( + ctx context.Context, request compute.CancelExecutionRequest) (compute.CancelExecutionResponse, error) { + return proxyRequest[compute.CancelExecutionRequest, compute.CancelExecutionResponse]( + ctx, p.conn, request.TargetPeerID, CancelExecution, request) +} + +func (p *ComputeProxy) ExecutionLogs( + ctx context.Context, request compute.ExecutionLogsRequest) (compute.ExecutionLogsResponse, error) { + return proxyRequest[compute.ExecutionLogsRequest, compute.ExecutionLogsResponse]( + ctx, p.conn, request.TargetPeerID, ExecutionLogs, request) +} + +func proxyRequest[Request any, Response any]( + ctx context.Context, + conn *nats.Conn, + destNodeID string, + method string, + request Request) (Response, error) { + // response object + response := new(Response) + + // deserialize the request object + data, err := json.Marshal(request) + if err != nil { + return *response, fmt.Errorf("%s: failed to marshal request: %w", reflect.TypeOf(request), err) + } + + subject := computeEndpointPublishSubject(destNodeID, method) + log.Ctx(ctx).Trace().Msgf("Sending request %+v to subject %s", request, subject) + res, err := conn.RequestWithContext(ctx, subject, data) + if err != nil { + return *response, fmt.Errorf("%s: failed to send request to node %s: %w", reflect.TypeOf(request), destNodeID, err) + } + + // The handler will have wrapped the response in a Result[T] along with + // any error that occurred, so we will decode it and pass the + // inner response/error on to the caller. 
+ result := &Result[Response]{} + err = json.Unmarshal(res.Data, result) + if err != nil { + return *response, fmt.Errorf("%s: failed to decode response from peer %s: %w", reflect.TypeOf(request), destNodeID, err) + } + + return result.Rehydrate() +} + +// Compile-time interface check: +var _ compute.Endpoint = (*ComputeProxy)(nil) diff --git a/pkg/nats/proxy/constants.go b/pkg/nats/proxy/constants.go new file mode 100644 index 0000000000..5ebf6337bb --- /dev/null +++ b/pkg/nats/proxy/constants.go @@ -0,0 +1,35 @@ +package proxy + +import "fmt" + +const ( + ComputeEndpointSubjectPrefix = "node.compute" + CallbackSubjectPrefix = "node.orchestrator" + + AskForBid = "AskForBid/v1" + BidAccepted = "BidAccepted/v1" + BidRejected = "BidRejected/v1" + CancelExecution = "CancelExecution/v1" + ExecutionLogs = "ExecutionLogs/v1" + + OnBidComplete = "OnBidComplete/v1" + OnRunComplete = "OnRunComplete/v1" + OnCancelComplete = "OnCancelComplete/v1" + OnComputeFailure = "OnComputeFailure/v1" +) + +func computeEndpointPublishSubject(nodeID string, method string) string { + return fmt.Sprintf("%s.%s.%s", ComputeEndpointSubjectPrefix, nodeID, method) +} + +func computeEndpointSubscribeSubject(nodeID string) string { + return fmt.Sprintf("%s.%s.>", ComputeEndpointSubjectPrefix, nodeID) +} + +func callbackPublishSubject(nodeID string, method string) string { + return fmt.Sprintf("%s.%s.%s", CallbackSubjectPrefix, nodeID, method) +} + +func callbackSubscribeSubject(nodeID string) string { + return fmt.Sprintf("%s.%s.>", CallbackSubjectPrefix, nodeID) +} diff --git a/pkg/nats/proxy/types.go b/pkg/nats/proxy/types.go new file mode 100644 index 0000000000..a5b2bdf18a --- /dev/null +++ b/pkg/nats/proxy/types.go @@ -0,0 +1,30 @@ +package proxy + +import "errors" + +type Result[T any] struct { + Response T + Error string +} + +func newResult[T any](response *T, err error) Result[T] { + if err != nil { + return Result[T]{ + Error: err.Error(), + } + } + + return Result[T]{ + Response: *response, + } +} + +func (r *Result[T]) Rehydrate() (T, error) { + var e error = nil + + if r.Error != "" { + e = errors.New(r.Error) + } + + return r.Response, e +} diff --git a/pkg/nats/pubsub/pubsub.go b/pkg/nats/pubsub/pubsub.go new file mode 100644 index 0000000000..5665067238 --- /dev/null +++ b/pkg/nats/pubsub/pubsub.go @@ -0,0 +1,117 @@ +package pubsub + +import ( + "context" + "errors" + "reflect" + realsync "sync" + + "github.com/bacalhau-project/bacalhau/pkg/lib/marshaller" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/system" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" +) + +type PubSubParams struct { + // Subject is the NATS subject to publish to. It is also used as the subscription subject if SubscriptionSubject is empty. + Subject string + // SubscriptionSubject is the NATS subject to subscribe to. If empty, Subject is used. + // This is useful when the subscription subject is different from the publishing subject, e.g. when using wildcards. + SubscriptionSubject string + // Conn is the NATS connection to use for publishing and subscribing. 
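The split between Subject and SubscriptionSubject lets a publisher use a concrete subject while a subscriber listens on a wildcard, as the test further down does with "topic.*". A sketch; subjects are illustrative, nc is an existing *nats.Conn, nats_pubsub is an assumed import alias for this package, and error handling is elided:

publisher, _ := nats_pubsub.NewPubSub[models.NodeInfo](nats_pubsub.PubSubParams{
	Conn:    nc,
	Subject: "node.info.node-0",
})
listener, _ := nats_pubsub.NewPubSub[models.NodeInfo](nats_pubsub.PubSubParams{
	Conn:                nc,
	Subject:             "node.info.node-0",
	SubscriptionSubject: "node.info.*", // receives node info from every node
})
_ = listener.Subscribe(ctx, nodeInfoSubscriber) // a pubsub.Subscriber[models.NodeInfo]
_ = publisher.Publish(ctx, myNodeInfo)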
+ Conn *nats.Conn +} + +type PubSub[T any] struct { + subject string + subscriptionSubject string + conn *nats.Conn + + subscription *nats.Subscription + subscriber pubsub.Subscriber[T] + subscriberOnce realsync.Once + closeOnce realsync.Once +} + +func NewPubSub[T any](params PubSubParams) (*PubSub[T], error) { + newPubSub := &PubSub[T]{ + conn: params.Conn, + subject: params.Subject, + subscriptionSubject: params.SubscriptionSubject, + } + if newPubSub.subscriptionSubject == "" { + newPubSub.subscriptionSubject = newPubSub.subject + } + return newPubSub, nil +} + +func (p *PubSub[T]) Publish(ctx context.Context, message T) error { + ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/pubsub/nats.publish") + defer span.End() + + payload, err := marshaller.JSONMarshalWithMax(message) + if err != nil { + return err + } + + log.Ctx(ctx).Trace().Msgf("Sending message %+v", message) + return p.conn.Publish(p.subject, payload) +} + +func (p *PubSub[T]) Subscribe(ctx context.Context, subscriber pubsub.Subscriber[T]) (err error) { + var firstSubscriber bool + p.subscriberOnce.Do(func() { + log.Ctx(ctx).Debug().Msgf("Subscribing to subject %s", p.subscriptionSubject) + + // register the subscriber + p.subscriber = subscriber + + // subscribe to the subject + p.subscription, err = p.conn.Subscribe(p.subscriptionSubject, func(msg *nats.Msg) { + p.readMessage(context.Background(), msg) + }) + if err != nil { + return + } + + firstSubscriber = true + }) + if err != nil { + return err + } + if !firstSubscriber { + err = errors.New("only a single subscriber is allowed. Use ChainedSubscriber to chain multiple subscribers") + } + return err +} + +func (p *PubSub[T]) readMessage(ctx context.Context, msg *nats.Msg) { + var payload T + err := marshaller.JSONUnmarshalWithMax(msg.Data, &payload) + if err != nil { + log.Ctx(ctx).Error().Err(err).Msgf("error unmarshalling nats payload from subject %s", msg.Subject) + return + } + + err = p.subscriber.Handle(ctx, payload) + if err != nil { + log.Ctx(ctx).Error().Err(err).Msgf("error in handle message of type: %s", reflect.TypeOf(payload)) + } +} + +func (p *PubSub[T]) Close(ctx context.Context) (err error) { + p.closeOnce.Do(func() { + if p.subscription != nil { + err = p.subscription.Unsubscribe() + } + }) + if err != nil { + return err + } + log.Ctx(ctx).Info().Msgf("done closing nats pubsub for subject %s", p.subscriptionSubject) + return nil +} + +// compile-time interface assertions +var _ pubsub.PubSub[string] = (*PubSub[string])(nil) diff --git a/pkg/nats/pubsub/pubsub_test.go b/pkg/nats/pubsub/pubsub_test.go new file mode 100644 index 0000000000..3c8b18e3bf --- /dev/null +++ b/pkg/nats/pubsub/pubsub_test.go @@ -0,0 +1,131 @@ +//go:build unit || !integration + +package pubsub + +import ( + "context" + "testing" + "time" + + nats_helper "github.com/bacalhau-project/bacalhau/pkg/nats" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/nats-io/nats-server/v2/server" + "github.com/phayes/freeport" + "github.com/rs/zerolog/log" + "github.com/stretchr/testify/suite" +) + +const subjectName = "topic.greetings" + +type PubSubSuite struct { + suite.Suite + natsServer *server.Server + node1 *PubSub[string] + node2 *PubSub[string] + subscriber1 *pubsub.InMemorySubscriber[string] + subscriber2 *pubsub.InMemorySubscriber[string] +} + +func (s *PubSubSuite) SetupSuite() { + ctx := context.Background() + s.natsServer = s.createNatsServer() + s.node1 = s.createPubSub(ctx, subjectName, "", s.natsServer.ClientURL()) + s.node2 = s.createPubSub(ctx, 
subjectName, "topic.*", s.natsServer.ClientURL()) + + s.subscriber1 = pubsub.NewInMemorySubscriber[string]() + s.subscriber2 = pubsub.NewInMemorySubscriber[string]() + s.NoError(s.node1.Subscribe(context.Background(), s.subscriber1)) + s.NoError(s.node2.Subscribe(context.Background(), s.subscriber2)) + + // wait for up to 10 seconds (5 loops with 2 seconds each) for nodes to discover each other + var s1, s2 bool + for i := 0; i < 5; i++ { + s.NoError(s.node1.Publish(context.Background(), "ping")) + s1, s2 = s.waitForMessage("ping", 2*time.Second, true, true) + if s1 || s2 { + // still one of the subscribers is waiting for the message + continue + } + } + if s1 { + s.FailNow("subscriber 1 didn't receive initialization message") + } + if s2 { + s.FailNow("subscriber 2 didn't receive initialization message") + } + log.Debug().Msg("nats pubsub suite is ready") +} + +func (s *PubSubSuite) TearDownSuite() { + s.NoError(s.node1.Close(context.Background())) + s.NoError(s.node2.Close(context.Background())) +} + +// createNatsServer creates a new nats server +func (s *PubSubSuite) createNatsServer() *server.Server { + ctx := context.Background() + port, err := freeport.GetFreePort() + s.Require().NoError(err) + + serverOpts := server.Options{ + Port: port, + } + + ns, err := nats_helper.NewServerManager(ctx, nats_helper.ServerManagerParams{ + Options: &serverOpts, + }) + s.Require().NoError(err) + + return ns.Server +} + +func (s *PubSubSuite) createPubSub(ctx context.Context, subject, subscriptionSubject string, server string) *PubSub[string] { + clientManager, err := nats_helper.NewClientManager(ctx, nats_helper.ClientManagerParams{ + Name: "test", + Servers: server, + }) + s.Require().NoError(err) + + pubSub, err := NewPubSub[string](PubSubParams{ + Conn: clientManager.Client, + Subject: subject, + SubscriptionSubject: subscriptionSubject, + }) + s.Require().NoError(err) + + return pubSub +} + +func TestPubSubSuite(t *testing.T) { + suite.Run(t, new(PubSubSuite)) +} + +func (s *PubSubSuite) TestPubSub() { + msg := "TestPubSub" + s.NoError(s.node1.Publish(context.Background(), msg)) + s.waitForMessage(msg, 10*time.Second, true, true) +} + +func (s *PubSubSuite) waitForMessage(msg string, duration time.Duration, checkSubscriber1, checkSubscriber2 bool) (bool, bool) { + waitUntil := time.Now().Add(duration) + checkSubscriber := func(subscriber *pubsub.InMemorySubscriber[string]) bool { + events := subscriber.Events() + if len(events) == 0 { + return false + } + s.Equal([]string{msg}, events) + return true + } + + for time.Now().Before(waitUntil) && (checkSubscriber1 || checkSubscriber2) { + time.Sleep(100 * time.Millisecond) + if checkSubscriber1 && checkSubscriber(s.subscriber1) { + checkSubscriber1 = false + } + if checkSubscriber2 && checkSubscriber(s.subscriber2) { + checkSubscriber2 = false + } + } + + return checkSubscriber1, checkSubscriber1 +} diff --git a/pkg/nats/server.go b/pkg/nats/server.go new file mode 100644 index 0000000000..5903b86bcc --- /dev/null +++ b/pkg/nats/server.go @@ -0,0 +1,80 @@ +package nats + +import ( + "context" + "fmt" + "time" + + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/nats-io/nats-server/v2/server" + "github.com/rs/zerolog/log" +) + +const ReadyForConnectionsTimeout = 5 * time.Second + +type ServerManagerParams struct { + Options *server.Options + ConnectionTimeout time.Duration +} + +// ServerManager is a helper struct to manage a NATS server +type ServerManager struct { + Server *server.Server +} + +// NewServerManager is a helper 
function to create a NATS server with the given options +func NewServerManager(ctx context.Context, params ServerManagerParams) (*ServerManager, error) { + opts := params.Options + ns, err := server.NewServer(opts) + if err != nil { + return nil, err + } + ns.SetLoggerV2(NewZeroLogger(log.Logger, opts.ServerName), opts.Debug, opts.Trace, opts.TraceVerbose) + go ns.Start() + + if params.ConnectionTimeout == 0 { + params.ConnectionTimeout = ReadyForConnectionsTimeout + } + if !ns.ReadyForConnections(params.ConnectionTimeout) { + return nil, fmt.Errorf("could not start nats server in time") + } + log.Ctx(ctx).Info().Msgf("NATS server %s listening on %s", ns.ID(), ns.ClientURL()) + return &ServerManager{ + Server: ns, + }, err +} + +// Stop stops the NATS server +func (sm *ServerManager) Stop() { + sm.Server.Shutdown() +} + +// GetDebugInfo returns the debug info of the NATS server +func (sm *ServerManager) GetDebugInfo(ctx context.Context) (model.DebugInfo, error) { + varz, err := sm.Server.Varz(&server.VarzOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + connz, err := sm.Server.Connz(&server.ConnzOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + routez, err := sm.Server.Routez(&server.RoutezOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + subsz, err := sm.Server.Subsz(&server.SubszOptions{}) + if err != nil { + return model.DebugInfo{}, err + } + return model.DebugInfo{ + Component: "NATSServer", + Info: map[string]interface{}{ + "ID": sm.Server.ID(), + "Varz": varz, + "Connz": connz, + "Routez": routez, + "Subsz": subsz, + }, + }, nil +} diff --git a/pkg/nats/transport/nats.go b/pkg/nats/transport/nats.go new file mode 100644 index 0000000000..f69086a1d2 --- /dev/null +++ b/pkg/nats/transport/nats.go @@ -0,0 +1,229 @@ +package transport + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/bacalhau-project/bacalhau/pkg/compute" + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + nats_helper "github.com/bacalhau-project/bacalhau/pkg/nats" + "github.com/bacalhau-project/bacalhau/pkg/nats/proxy" + nats_pubsub "github.com/bacalhau-project/bacalhau/pkg/nats/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/routing" + core_transport "github.com/bacalhau-project/bacalhau/pkg/transport" + "github.com/hashicorp/go-multierror" + "github.com/nats-io/nats-server/v2/server" + "github.com/rs/zerolog/log" +) + +const NodeInfoSubjectPrefix = "node.info."
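+ +// Illustrative sketch of how this subject prefix is used (it mirrors the actual wiring in NewNATSTransport below; the snippet is an example only, not additional API): each node publishes its NodeInfo to its own subject, while requester nodes subscribe to the wildcard subject so a single subscription receives updates from every compute node. +// +// nodeInfoPubSub, err := nats_pubsub.NewPubSub[models.NodeInfo](nats_pubsub.PubSubParams{ +// Conn: nc.Client, +// Subject: NodeInfoSubjectPrefix + config.NodeID, // e.g. "node.info.<nodeID>" +// SubscriptionSubject: NodeInfoSubjectPrefix + "*", +// }) +// +// On requester nodes this subscription feeds nodeInfoStore.Add (see the ChainedSubscriber below).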
+ +type NATSTransportConfig struct { + NodeID string + Port int + AdvertisedAddress string + Orchestrators []string + IsRequesterNode bool + + // Cluster config for requester nodes to connect with each other + ClusterName string + ClusterPort int + ClusterAdvertisedAddress string + ClusterPeers []string +} + +func (c *NATSTransportConfig) Validate() error { + var mErr *multierror.Error + if validate.IsBlank(c.NodeID) { + mErr = multierror.Append(mErr, errors.New("missing node ID")) + } else if validate.ContainsSpaces(c.NodeID) { + mErr = multierror.Append(mErr, errors.New("node ID contains a space")) + } else if validate.ContainsNull(c.NodeID) { + mErr = multierror.Append(mErr, errors.New("node ID contains a null character")) + } + + if c.IsRequesterNode { + mErr = multierror.Append(mErr, validate.IsGreaterThanZero(c.Port, "port %d must be greater than zero", c.Port)) + + // if cluster config is set, validate it + if c.ClusterName != "" || c.ClusterPort != 0 || c.ClusterAdvertisedAddress != "" || len(c.ClusterPeers) > 0 { + mErr = multierror.Append(mErr, validate.IsGreaterThanZero(c.ClusterPort, "cluster port %d must be greater than zero", c.ClusterPort)) + } + } else { + if validate.IsEmpty(c.Orchestrators) { + mErr = multierror.Append(mErr, errors.New("missing orchestrators")) + } + } + return mErr.ErrorOrNil() +} + +type NATSTransport struct { + nodeID string + natsServer *nats_helper.ServerManager + natsClient *nats_helper.ClientManager + computeProxy compute.Endpoint + callbackProxy compute.Callback + nodeInfoPubSub pubsub.PubSub[models.NodeInfo] + nodeInfoDecorator models.NodeInfoDecorator +} + +func NewNATSTransport(ctx context.Context, + config NATSTransportConfig, + nodeInfoStore routing.NodeInfoStore) (*NATSTransport, error) { + log.Debug().Msgf("Creating NATS transport with config: %+v", config) + if err := config.Validate(); err != nil { + return nil, fmt.Errorf("error validating nats transport config: 
%w", err) + } + + var sm *nats_helper.ServerManager + if config.IsRequesterNode { + // create nats server with servers acting as its cluster peers + routes, err := nats_helper.RoutesFromSlice(config.ClusterPeers) + if err != nil { + return nil, err + } + serverOps := &server.Options{ + ServerName: config.NodeID, + Port: config.Port, + ClientAdvertise: config.AdvertisedAddress, + Routes: routes, + Debug: true, // will only be used if log level is debug + Cluster: server.ClusterOpts{ + Name: config.ClusterName, + Port: config.ClusterPort, + Advertise: config.ClusterAdvertisedAddress, + }, + } + log.Debug().Msgf("Creating NATS server with options: %+v", serverOps) + sm, err = nats_helper.NewServerManager(ctx, nats_helper.ServerManagerParams{ + Options: serverOps, + }) + if err != nil { + return nil, err + } + + config.Orchestrators = append(config.Orchestrators, sm.Server.ClientURL()) + } + + // create nats client + log.Debug().Msgf("Creating NATS client with servers: %s", strings.Join(config.Orchestrators, ",")) + nc, err := nats_helper.NewClientManager(ctx, nats_helper.ClientManagerParams{ + Name: config.NodeID, + Servers: strings.Join(config.Orchestrators, ","), + }) + if err != nil { + return nil, err + } + + // PubSub to publish and consume node info messages + nodeInfoPubSub, err := nats_pubsub.NewPubSub[models.NodeInfo](nats_pubsub.PubSubParams{ + Conn: nc.Client, + Subject: NodeInfoSubjectPrefix + config.NodeID, + SubscriptionSubject: NodeInfoSubjectPrefix + "*", + }) + if err != nil { + return nil, err + } + + if config.IsRequesterNode { + // subscribe to nodeInfo subject and add nodeInfo to nodeInfoStore + nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) + nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](nodeInfoStore.Add)) + err = nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) + if err != nil { + return nil, err + } + } + + // compute proxy + computeProxy := proxy.NewComputeProxy(proxy.ComputeProxyParams{ + Conn: nc.Client, + }) + + // Callback to send compute events (i.e. requester endpoint) + computeCallback := proxy.NewCallbackProxy(proxy.CallbackProxyParams{ + Conn: nc.Client, + }) + + return &NATSTransport{ + nodeID: config.NodeID, + natsServer: sm, + natsClient: nc, + computeProxy: computeProxy, + callbackProxy: computeCallback, + nodeInfoPubSub: nodeInfoPubSub, + nodeInfoDecorator: models.NoopNodeInfoDecorator{}, + }, nil +} + +// RegisterComputeCallback registers a compute callback with the transport layer. +func (t *NATSTransport) RegisterComputeCallback(callback compute.Callback) error { + _, err := proxy.NewCallbackHandler(proxy.CallbackHandlerParams{ + Name: t.nodeID, + Conn: t.natsClient.Client, + Callback: callback, + }) + return err +} + +// RegisterComputeEndpoint registers a compute endpoint with the transport layer. +func (t *NATSTransport) RegisterComputeEndpoint(endpoint compute.Endpoint) error { + _, err := proxy.NewComputeHandler(proxy.ComputeHandlerParams{ + Name: t.nodeID, + Conn: t.natsClient.Client, + ComputeEndpoint: endpoint, + }) + return err +} + +// ComputeProxy returns the compute proxy. +func (t *NATSTransport) ComputeProxy() compute.Endpoint { + return t.computeProxy +} + +// CallbackProxy returns the callback proxy. +func (t *NATSTransport) CallbackProxy() compute.Callback { + return t.callbackProxy +} + +// NodeInfoPubSub returns the node info pubsub. 
+func (t *NATSTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] { + return t.nodeInfoPubSub +} + +// NodeInfoDecorator returns the node info decorator. +func (t *NATSTransport) NodeInfoDecorator() models.NodeInfoDecorator { + return t.nodeInfoDecorator +} + +// DebugInfoProviders returns the debug info of the NATS transport layer +func (t *NATSTransport) DebugInfoProviders() []model.DebugInfoProvider { + var debugInfoProviders []model.DebugInfoProvider + if t.natsServer != nil { + debugInfoProviders = append(debugInfoProviders, t.natsServer) + } + if t.natsClient != nil { + debugInfoProviders = append(debugInfoProviders, t.natsClient) + } + return debugInfoProviders +} + +// Close closes the transport layer. +func (t *NATSTransport) Close(ctx context.Context) error { + if t.natsServer != nil { + log.Ctx(ctx).Debug().Msgf("Shutting down server %s", t.natsServer.Server.Name()) + t.natsServer.Stop() + } + if t.natsClient != nil { + t.natsClient.Stop() + } + return nil +} + +// compile-time interface check +var _ core_transport.TransportLayer = (*NATSTransport)(nil) diff --git a/pkg/nats/util.go b/pkg/nats/util.go new file mode 100644 index 0000000000..9a82803500 --- /dev/null +++ b/pkg/nats/util.go @@ -0,0 +1,41 @@ +package nats + +import ( + "net/url" + "regexp" + "strings" +) + +var schemeRegex = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9+-.]*://`) + +const defaultScheme = "nats://" + +// RoutesFromStr parses route URLs from a string +// e.g. "nats://localhost:4222,nats://localhost:4223" +func RoutesFromStr(routesStr string) ([]*url.URL, error) { + routes := strings.Split(routesStr, ",") + if len(routes) == 0 { + return nil, nil + } + var routeUrls []*url.URL + for _, r := range routes { + r = strings.TrimSpace(r) + if !schemeRegex.MatchString(r) { + r = defaultScheme + r + } + u, err := url.Parse(r) + if err != nil { + return nil, err + } + routeUrls = append(routeUrls, u) + } + return routeUrls, nil +} + +// RoutesFromSlice parses route URLs from a slice of strings +func RoutesFromSlice(routes []string) ([]*url.URL, error) { + if len(routes) == 0 { + return []*url.URL{}, nil + } + return RoutesFromStr(strings.Join(routes, ",")) +} diff --git a/pkg/node/compute.go b/pkg/node/compute.go index 2448a0a6ce..06c2f2e6f6 100644 --- a/pkg/node/compute.go +++ b/pkg/node/compute.go @@ -6,11 +6,6 @@ import ( "net/url" "github.com/bacalhau-project/bacalhau/pkg/bidstrategy" - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/publicapi" - compute_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/compute" - "github.com/libp2p/go-libp2p/core/host" - "github.com/bacalhau-project/bacalhau/pkg/bidstrategy/resource" "github.com/bacalhau-project/bacalhau/pkg/bidstrategy/semantic" "github.com/bacalhau-project/bacalhau/pkg/compute" @@ -22,33 +17,37 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/executor" executor_util "github.com/bacalhau-project/bacalhau/pkg/executor/util" "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/publicapi" + compute_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/compute" "github.com/bacalhau-project/bacalhau/pkg/publisher" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/storage" repo_storage "github.com/bacalhau-project/bacalhau/pkg/storage/repo" "github.com/bacalhau-project/bacalhau/pkg/system" - 
"github.com/bacalhau-project/bacalhau/pkg/transport/bprotocol" + "github.com/libp2p/go-libp2p/core/host" ) type Compute struct { // Visible for testing - ID string - LocalEndpoint compute.Endpoint - Capacity capacity.Tracker - ExecutionStore store.ExecutionStore - Executors executor.ExecutorProvider - Storages storage.StorageProvider - LogServer *logstream.LogStreamServer - Bidder compute.Bidder - computeCallback *bprotocol.CallbackProxy - cleanupFunc func(ctx context.Context) - computeInfoProvider models.ComputeNodeInfoProvider - autoLabelsProvider models.LabelsProvider + ID string + LocalEndpoint compute.Endpoint + Capacity capacity.Tracker + ExecutionStore store.ExecutionStore + Executors executor.ExecutorProvider + Storages storage.StorageProvider + LogServer *logstream.LogStreamServer + Bidder compute.Bidder + cleanupFunc func(ctx context.Context) + nodeInfoDecorator models.NodeInfoDecorator + autoLabelsProvider models.LabelsProvider + debugInfoProviders []model.DebugInfoProvider } //nolint:funlen func NewComputeNode( ctx context.Context, + nodeID string, cleanupManager *system.CleanupManager, host host.Host, apiServer *publicapi.Server, @@ -58,12 +57,13 @@ func NewComputeNode( executors executor.ExecutorProvider, publishers publisher.PublisherProvider, fsRepo *repo.FsRepo, + computeCallback compute.Callback, ) (*Compute, error) { var executionStore store.ExecutionStore // create the execution store if config.ExecutionStore == nil { var err error - executionStore, err = fsRepo.InitExecutionStore(ctx, host.ID().String()) + executionStore, err = fsRepo.InitExecutionStore(ctx, nodeID) if err != nil { return nil, err } @@ -79,17 +79,12 @@ func NewComputeNode( MaxCapacity: config.QueueResourceLimits, }) - // Callback to send compute events (i.e. requester endpoint) - computeCallback := bprotocol.NewCallbackProxy(bprotocol.CallbackProxyParams{ - Host: host, - }) - resultsPath, err := compute.NewResultsPath() if err != nil { return nil, err } baseExecutor := compute.NewBaseExecutor(compute.BaseExecutorParams{ - ID: host.ID().String(), + ID: nodeID, Callback: computeCallback, Store: executionStore, StorageDirectory: storagePath, @@ -101,7 +96,7 @@ func NewComputeNode( }) bufferRunner := compute.NewExecutorBuffer(compute.ExecutorBufferParams{ - ID: host.ID().String(), + ID: nodeID, DelegateExecutor: baseExecutor, Callback: computeCallback, RunningCapacityTracker: runningCapacityTracker, @@ -183,21 +178,25 @@ func NewComputeNode( } // logging server - logserver := logstream.NewLogStreamServer(logstream.LogStreamServerOptions{ - Ctx: ctx, - Host: host, - ExecutionStore: executionStore, - // - Executors: executors, - }) - _, loggingCancel := context.WithCancel(ctx) - cleanupManager.RegisterCallback(func() error { - loggingCancel() - return nil - }) + // TODO: make logging server agnostic to libp2p transport + var logserver *logstream.LogStreamServer + if host != nil { + logserver = logstream.NewLogStreamServer(logstream.LogStreamServerOptions{ + Ctx: ctx, + Host: host, + ExecutionStore: executionStore, + // + Executors: executors, + }) + _, loggingCancel := context.WithCancel(ctx) + cleanupManager.RegisterCallback(func() error { + loggingCancel() + return nil + }) + } // node info - nodeInfoProvider := compute.NewNodeInfoProvider(compute.NodeInfoProviderParams{ + nodeInfoDecorator := compute.NewNodeInfoDecorator(compute.NodeInfoDecoratorParams{ Executors: executors, Publisher: publishers, Storages: storages, @@ -208,7 +207,7 @@ func NewComputeNode( bidStrat := 
bidstrategy.NewChainedBidStrategy(semanticBidStrat, resourceBidStrat) bidder := compute.NewBidder(compute.BidderParams{ - NodeID: host.ID().String(), + NodeID: nodeID, SemanticStrategy: bidStrat, ResourceStrategy: bidStrat, Store: executionStore, @@ -220,17 +219,12 @@ func NewComputeNode( }) baseEndpoint := compute.NewBaseEndpoint(compute.BaseEndpointParams{ - ID: host.ID().String(), + ID: nodeID, ExecutionStore: executionStore, UsageCalculator: capacityCalculator, Bidder: bidder, Executor: bufferRunner, - LogServer: *logserver, - }) - - bprotocol.NewComputeHandler(bprotocol.ComputeHandlerParams{ - Host: host, - ComputeEndpoint: baseEndpoint, + LogServer: logserver, }) // register debug info providers for the /debug endpoint @@ -267,25 +261,21 @@ func NewComputeNode( ) return &Compute{ - ID: host.ID().String(), - LocalEndpoint: baseEndpoint, - Capacity: runningCapacityTracker, - ExecutionStore: executionStore, - Executors: executors, - Storages: storages, - Bidder: bidder, - LogServer: logserver, - computeCallback: computeCallback, - cleanupFunc: cleanupFunc, - computeInfoProvider: nodeInfoProvider, - autoLabelsProvider: labelsProvider, + ID: nodeID, + LocalEndpoint: baseEndpoint, + Capacity: runningCapacityTracker, + ExecutionStore: executionStore, + Executors: executors, + Storages: storages, + Bidder: bidder, + LogServer: logserver, + cleanupFunc: cleanupFunc, + nodeInfoDecorator: nodeInfoDecorator, + autoLabelsProvider: labelsProvider, + debugInfoProviders: debugInfoProviders, }, nil } -func (c *Compute) RegisterLocalComputeCallback(callback compute.Callback) { - c.computeCallback.RegisterLocalComputeCallback(callback) -} - func (c *Compute) cleanup(ctx context.Context) { c.cleanupFunc(ctx) } diff --git a/pkg/node/config_network.go b/pkg/node/config_network.go new file mode 100644 index 0000000000..f31cbe7991 --- /dev/null +++ b/pkg/node/config_network.go @@ -0,0 +1,45 @@ +package node + +import ( + "errors" + "fmt" + "time" + + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/hashicorp/go-multierror" + "github.com/libp2p/go-libp2p/core/host" + "github.com/samber/lo" +) + +var supportedNetworks = []string{ + models.NetworkTypeLibp2p, + models.NetworkTypeNATS, +} + +type NetworkConfig struct { + Type string + Libp2pHost host.Host // only set if using libp2p transport, nil otherwise + ReconnectDelay time.Duration + + // NATS config for requesters to be reachable by compute nodes + Port int + AdvertisedAddress string + Orchestrators []string + + // NATS config for requester nodes to connect with each other + ClusterName string + ClusterPort int + ClusterAdvertisedAddress string + ClusterPeers []string +} + +func (c *NetworkConfig) Validate() error { + var mErr *multierror.Error + if validate.IsBlank(c.Type) { + mErr = multierror.Append(mErr, errors.New("missing network type")) + } else if !lo.Contains(supportedNetworks, c.Type) { + mErr = multierror.Append(mErr, fmt.Errorf("network type %s not in supported values %s", c.Type, supportedNetworks)) + } + return mErr.ErrorOrNil() +} diff --git a/pkg/node/factories.go b/pkg/node/factories.go index 1745245510..1721f39682 100644 --- a/pkg/node/factories.go +++ b/pkg/node/factories.go @@ -70,7 +70,7 @@ func NewStandardExecutorsFactory() ExecutorsFactory { ctx, nodeConfig.CleanupManager, executor_util.StandardExecutorOptions{ - DockerID: fmt.Sprintf("bacalhau-%s", nodeConfig.Host.ID().String()), + DockerID: fmt.Sprintf("bacalhau-%s", nodeConfig.NodeID), }, ) if err != nil { 
@@ -142,9 +142,9 @@ func NewStandardAuthenticatorsFactory() AuthenticatorsFactory { map[string]authn.Authenticator{ "ClientKey": challenge.NewAuthenticator( challenge.AnonymousModePolicy, - nodeConfig.Host.ID(), + challenge.NewStringMarshaller(nodeConfig.NodeID), privKey, - nodeConfig.Host.ID().String(), + nodeConfig.NodeID, ), }, ), nil diff --git a/pkg/node/node.go b/pkg/node/node.go index 82049f85b7..4a05fc891e 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -5,36 +5,29 @@ import ( "fmt" "time" - "github.com/imdario/mergo" - "github.com/labstack/echo/v4" - libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" - "github.com/libp2p/go-libp2p/core/host" - basichost "github.com/libp2p/go-libp2p/p2p/host/basic" - routedhost "github.com/libp2p/go-libp2p/p2p/host/routed" - "github.com/libp2p/go-libp2p/p2p/protocol/identify" - "github.com/bacalhau-project/bacalhau/pkg/authz" + pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" + "github.com/bacalhau-project/bacalhau/pkg/ipfs" + libp2p_transport "github.com/bacalhau-project/bacalhau/pkg/libp2p/transport" + "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" + nats_transport "github.com/bacalhau-project/bacalhau/pkg/nats/transport" "github.com/bacalhau-project/bacalhau/pkg/publicapi" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/agent" "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/shared" - - pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" - "github.com/bacalhau-project/bacalhau/pkg/ipfs" - "github.com/bacalhau-project/bacalhau/pkg/pubsub" - "github.com/bacalhau-project/bacalhau/pkg/pubsub/libp2p" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/routing/inmemory" "github.com/bacalhau-project/bacalhau/pkg/system" - "github.com/bacalhau-project/bacalhau/pkg/util" + "github.com/bacalhau-project/bacalhau/pkg/transport" "github.com/bacalhau-project/bacalhau/pkg/version" + "github.com/hashicorp/go-multierror" + "github.com/imdario/mergo" + "github.com/labstack/echo/v4" + "github.com/libp2p/go-libp2p/core/host" ) -const JobInfoTopic = "bacalhau-job-info" -const NodeInfoTopic = "bacalhau-node-info" - type FeatureConfig struct { Engines []string Publishers []string @@ -43,9 +36,9 @@ type FeatureConfig struct { // Node configuration type NodeConfig struct { + NodeID string IPFSClient ipfs.Client CleanupManager *system.CleanupManager - Host host.Host HostAddress string APIPort uint16 RequesterAutoCert string @@ -64,7 +57,15 @@ type NodeConfig struct { AllowListedLocalPaths []string NodeInfoStoreTTL time.Duration - FsRepo *repo.FsRepo + FsRepo *repo.FsRepo + NetworkConfig NetworkConfig +} + +func (c *NodeConfig) Validate() error { + // TODO: add more validations + var mErr *multierror.Error + mErr = multierror.Append(mErr, c.NetworkConfig.Validate()) + return mErr.ErrorOrNil() } // Lazy node dependency injector that generate instances of different @@ -96,13 +97,14 @@ func NewStandardNodeDependencyInjector() NodeDependencyInjector { type Node struct { // Visible for testing + ID string APIServer *publicapi.Server ComputeNode *Compute RequesterNode *Requester NodeInfoStore routing.NodeInfoStore CleanupManager *system.CleanupManager IPFSClient ipfs.Client - Host host.Host + Libp2pHost host.Host // only set if using libp2p transport, nil otherwise } func (n *Node) Start(ctx context.Context) error { @@ -113,13 
+115,16 @@ func (n *Node) Start(ctx context.Context) error { func NewNode( ctx context.Context, config NodeConfig) (*Node, error) { - ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/node.NewNode") - defer span.End() - - identify.ActivationThresh = 2 + var err error + ctx, cancel := context.WithCancel(ctx) + defer func() { + if err != nil { + cancel() + } + }() config.DependencyInjector = mergeDependencyInjectors(config.DependencyInjector, NewStandardNodeDependencyInjector()) - err := mergo.Merge(&config.APIServerConfig, publicapi.DefaultConfig()) + err = mergo.Merge(&config.APIServerConfig, publicapi.DefaultConfig()) if err != nil { return nil, err } @@ -128,6 +133,11 @@ func NewNode( config.APIServerConfig.LogLevel = "trace" } + err = config.Validate() + if err != nil { + return nil, fmt.Errorf("error validating node config. %w", err) + } + storageProviders, err := config.DependencyInjector.StorageProvidersFactory.Get(ctx, config) if err != nil { return nil, err @@ -148,49 +158,6 @@ func NewNode( return nil, err } - // A single gossipSub instance that will be used by all topics - gossipSubCtx, gossipSubCancel := context.WithCancel(ctx) - gossipSub, err := newLibp2pPubSub(gossipSubCtx, config) - defer func() { - if err != nil { - gossipSubCancel() - } - }() - - if err != nil { - return nil, err - } - - // PubSub to publish node info to the network - nodeInfoPubSub, err := libp2p.NewPubSub[models.NodeInfo](libp2p.PubSubParams{ - Host: config.Host, - TopicName: NodeInfoTopic, - PubSub: gossipSub, - }) - if err != nil { - return nil, err - } - - // node info publisher - nodeInfoPublisherInterval := config.NodeInfoPublisherInterval - if nodeInfoPublisherInterval.IsZero() { - nodeInfoPublisherInterval = GetNodeInfoPublishConfig() - } - - // node info store that is used for both discovering compute nodes, as to find addresses of other nodes for routing requests. - nodeInfoStore := inmemory.NewNodeInfoStore(inmemory.NodeInfoStoreParams{ - TTL: config.NodeInfoStoreTTL, - }) - routedHost := routedhost.Wrap(config.Host, nodeInfoStore) - - // register consumers of node info published over gossipSub - nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) - nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](nodeInfoStore.Add)) - err = nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) - if err != nil { - return nil, err - } - // timeoutHandler doesn't implement http.Hijacker, so we need to skip it for websocket endpoints config.APIServerConfig.SkippedTimeoutPaths = append(config.APIServerConfig.SkippedTimeoutPaths, []string{ "/api/v1/requester/websocket/events", @@ -203,7 +170,7 @@ func NewNode( Router: echo.New(), Address: config.HostAddress, Port: config.APIPort, - HostID: config.Host.ID().String(), + HostID: config.NodeID, Config: config.APIServerConfig, Authorizer: authz.AlwaysAllow, Headers: map[string]string{ @@ -228,28 +195,67 @@ func NewNode( return nil, err } + // node info store that is used for both discovering compute nodes, as to find addresses of other nodes for routing requests. 
+ nodeInfoStore := inmemory.NewNodeInfoStore(inmemory.NodeInfoStoreParams{ + TTL: config.NodeInfoStoreTTL, + }) + + var transportLayer transport.TransportLayer + + if config.NetworkConfig.Type == models.NetworkTypeNATS { + natsConfig := nats_transport.NATSTransportConfig{ + NodeID: config.NodeID, + Port: config.NetworkConfig.Port, + AdvertisedAddress: config.NetworkConfig.AdvertisedAddress, + Orchestrators: config.NetworkConfig.Orchestrators, + ClusterName: config.NetworkConfig.ClusterName, + ClusterPort: config.NetworkConfig.ClusterPort, + ClusterPeers: config.NetworkConfig.ClusterPeers, + ClusterAdvertisedAddress: config.NetworkConfig.ClusterAdvertisedAddress, + IsRequesterNode: config.IsRequesterNode, + } + transportLayer, err = nats_transport.NewNATSTransport(ctx, natsConfig, nodeInfoStore) + } else { + libp2pConfig := libp2p_transport.Libp2pTransportConfig{ + Host: config.NetworkConfig.Libp2pHost, + Peers: config.NetworkConfig.ClusterPeers, + ReconnectDelay: config.NetworkConfig.ReconnectDelay, + CleanupManager: config.CleanupManager, + } + transportLayer, err = libp2p_transport.NewLibp2pTransport(ctx, libp2pConfig, nodeInfoStore) + } + if err != nil { + return nil, err + } + + var debugInfoProviders []model.DebugInfoProvider + debugInfoProviders = append(debugInfoProviders, transportLayer.DebugInfoProviders()...) + var requesterNode *Requester var computeNode *Compute - - var computeInfoProvider models.ComputeNodeInfoProvider var labelsProvider models.LabelsProvider = &ConfigLabelsProvider{staticLabels: config.Labels} // setup requester node if config.IsRequesterNode { requesterNode, err = NewRequesterNode( ctx, - routedHost, + config.NodeID, apiServer, config.RequesterNodeConfig, storageProviders, authenticators, nodeInfoStore, - gossipSub, config.FsRepo, + transportLayer.ComputeProxy(), ) if err != nil { return nil, err } + err = transportLayer.RegisterComputeCallback(requesterNode.localCallback) + if err != nil { + return nil, err + } + debugInfoProviders = append(debugInfoProviders, requesterNode.debugInfoProviders...) } if config.IsComputeNode { @@ -258,8 +264,9 @@ func NewNode( // setup compute node computeNode, err = NewComputeNode( ctx, + config.NodeID, config.CleanupManager, - routedHost, + config.NetworkConfig.Libp2pHost, apiServer, config.ComputeConfig, storagePath, @@ -267,47 +274,56 @@ func NewNode( executors, publishers, config.FsRepo, + transportLayer.CallbackProxy(), ) if err != nil { return nil, err } - computeInfoProvider = computeNode.computeInfoProvider + err = transportLayer.RegisterComputeEndpoint(computeNode.LocalEndpoint) + if err != nil { + return nil, err + } + labelsProvider = models.MergeLabelsInOrder( computeNode.autoLabelsProvider, labelsProvider, ) + debugInfoProviders = append(debugInfoProviders, computeNode.debugInfoProviders...) 
} - // node info provider - basicHost, ok := config.Host.(*basichost.BasicHost) - if !ok { - return nil, fmt.Errorf("host is not a basic host") - } nodeInfoProvider := routing.NewNodeInfoProvider(routing.NodeInfoProviderParams{ - Host: basicHost, - IdentityService: basicHost.IDService(), - LabelsProvider: labelsProvider, - ComputeInfoProvider: computeInfoProvider, - BacalhauVersion: *version.Get(), + NodeID: config.NodeID, + LabelsProvider: labelsProvider, + BacalhauVersion: *version.Get(), }) + nodeInfoProvider.RegisterNodeInfoDecorator(transportLayer.NodeInfoDecorator()) + if computeNode != nil { + nodeInfoProvider.RegisterNodeInfoDecorator(computeNode.nodeInfoDecorator) + } shared.NewEndpoint(shared.EndpointParams{ Router: apiServer.Router, - NodeID: config.Host.ID().String(), - PeerStore: config.Host.Peerstore(), + NodeID: config.NodeID, NodeInfoProvider: nodeInfoProvider, }) agent.NewEndpoint(agent.EndpointParams{ - Router: apiServer.Router, - NodeInfoProvider: nodeInfoProvider, + Router: apiServer.Router, + NodeInfoProvider: nodeInfoProvider, + DebugInfoProviders: debugInfoProviders, }) + // node info publisher + nodeInfoPublisherInterval := config.NodeInfoPublisherInterval + if nodeInfoPublisherInterval.IsZero() { + nodeInfoPublisherInterval = GetNodeInfoPublishConfig() + } + // NB(forrest): this must be done last to avoid eager publishing before nodes are constructed // TODO(forrest) [fixme] we should fix this to make it less racy in testing nodeInfoPublisher := routing.NewNodeInfoPublisher(routing.NodeInfoPublisherParams{ - PubSub: nodeInfoPubSub, + PubSub: transportLayer.NodeInfoPubSub(), NodeInfoProvider: nodeInfoProvider, IntervalConfig: nodeInfoPublisherInterval, }) @@ -333,31 +349,23 @@ func NewNode( requesterNode.cleanup(ctx) } nodeInfoPublisher.Stop(ctx) - cleanupErr := nodeInfoPubSub.Close(ctx) - util.LogDebugIfContextCancelled(ctx, cleanupErr, "node info pub sub") - gossipSubCancel() - - cleanupErr = config.Host.Close() - util.LogDebugIfContextCancelled(ctx, cleanupErr, "host") - cleanupErr = apiServer.Shutdown(ctx) - return cleanupErr + var errors *multierror.Error + errors = multierror.Append(errors, transportLayer.Close(ctx)) + errors = multierror.Append(errors, apiServer.Shutdown(ctx)) + cancel() + return errors.ErrorOrNil() }) - if requesterNode != nil && computeNode != nil { - // To enable nodes self-dialing themselves as libp2p doesn't support it. 
- computeNode.RegisterLocalComputeCallback(requesterNode.localCallback) - requesterNode.RegisterLocalComputeEndpoint(computeNode.LocalEndpoint) - } - node := &Node{ + ID: config.NodeID, CleanupManager: config.CleanupManager, APIServer: apiServer, IPFSClient: config.IPFSClient, ComputeNode: computeNode, RequesterNode: requesterNode, NodeInfoStore: nodeInfoStore, - Host: routedHost, + Libp2pHost: config.NetworkConfig.Libp2pHost, } return node, nil @@ -373,27 +381,6 @@ func (n *Node) IsComputeNode() bool { return n.ComputeNode != nil } -func newLibp2pPubSub(ctx context.Context, nodeConfig NodeConfig) (*libp2p_pubsub.PubSub, error) { - tracer, err := libp2p_pubsub.NewJSONTracer(pkgconfig.GetLibp2pTracerPath()) - if err != nil { - return nil, err - } - - pgParams := libp2p_pubsub.NewPeerGaterParams( - 0.33, //nolint:gomnd - libp2p_pubsub.ScoreParameterDecay(2*time.Minute), //nolint:gomnd - libp2p_pubsub.ScoreParameterDecay(10*time.Minute), //nolint:gomnd - ) - - return libp2p_pubsub.NewGossipSub( - ctx, - nodeConfig.Host, - libp2p_pubsub.WithPeerExchange(true), - libp2p_pubsub.WithPeerGater(pgParams), - libp2p_pubsub.WithEventTracer(tracer), - ) -} - func mergeDependencyInjectors(injector NodeDependencyInjector, defaultInjector NodeDependencyInjector) NodeDependencyInjector { if injector.StorageProvidersFactory == nil { injector.StorageProvidersFactory = defaultInjector.StorageProvidersFactory diff --git a/pkg/node/requester.go b/pkg/node/requester.go index 78b38f47f9..25b2dd90f5 100644 --- a/pkg/node/requester.go +++ b/pkg/node/requester.go @@ -17,15 +17,10 @@ import ( auth_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/auth" orchestrator_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/orchestrator" requester_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/requester" - "github.com/bacalhau-project/bacalhau/pkg/pubsub" - "github.com/bacalhau-project/bacalhau/pkg/pubsub/libp2p" - "github.com/bacalhau-project/bacalhau/pkg/requester/pubsub/jobinfo" + "github.com/bacalhau-project/bacalhau/pkg/routing" s3helper "github.com/bacalhau-project/bacalhau/pkg/s3" "github.com/bacalhau-project/bacalhau/pkg/translation" "github.com/bacalhau-project/bacalhau/pkg/util" - libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" - "github.com/libp2p/go-libp2p/core/crypto" - "github.com/libp2p/go-libp2p/core/host" "github.com/rs/zerolog/log" "github.com/bacalhau-project/bacalhau/pkg/compute" @@ -36,67 +31,41 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator/selection/ranking" "github.com/bacalhau-project/bacalhau/pkg/repo" "github.com/bacalhau-project/bacalhau/pkg/requester" - "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/storage" "github.com/bacalhau-project/bacalhau/pkg/system" - "github.com/bacalhau-project/bacalhau/pkg/transport/bprotocol" ) type Requester struct { // Visible for testing - Endpoint requester.Endpoint - JobStore jobstore.Store - NodeDiscoverer orchestrator.NodeDiscoverer - computeProxy *bprotocol.ComputeProxy - localCallback compute.Callback - cleanupFunc func(ctx context.Context) + Endpoint requester.Endpoint + JobStore jobstore.Store + NodeDiscoverer orchestrator.NodeDiscoverer + localCallback compute.Callback + cleanupFunc func(ctx context.Context) + debugInfoProviders []model.DebugInfoProvider } //nolint:funlen func NewRequesterNode( ctx context.Context, - host host.Host, + nodeID string, apiServer *publicapi.Server, requesterConfig RequesterConfig, storageProvider 
storage.StorageProvider, authnProvider authn.Provider, nodeInfoStore routing.NodeInfoStore, - gossipSub *libp2p_pubsub.PubSub, fsRepo *repo.FsRepo, + computeProxy compute.Endpoint, ) (*Requester, error) { // prepare event handlers - tracerContextProvider := eventhandler.NewTracerContextProvider(host.ID().String()) + tracerContextProvider := eventhandler.NewTracerContextProvider(nodeID) localJobEventConsumer := eventhandler.NewChainedJobEventHandler(tracerContextProvider) - // compute proxy - computeProxy := bprotocol.NewComputeProxy(bprotocol.ComputeProxyParams{ - Host: host, - }) - eventEmitter := orchestrator.NewEventEmitter(orchestrator.EventEmitterParams{ EventConsumer: localJobEventConsumer, }) - jobStore, err := fsRepo.InitJobStore(ctx, host.ID().String()) - if err != nil { - return nil, err - } - - // PubSub to publish job events to the network - jobInfoPubSub, err := libp2p.NewPubSub[jobinfo.Envelope](libp2p.PubSubParams{ - Host: host, - TopicName: JobInfoTopic, - PubSub: gossipSub, - IgnoreLocal: true, - }) - if err != nil { - return nil, err - } - jobInfoPublisher := jobinfo.NewPublisher(jobinfo.PublisherParams{ - JobStore: jobStore, - PubSub: jobInfoPubSub, - }) - err = jobInfoPubSub.Subscribe(ctx, pubsub.NewNoopSubscriber[jobinfo.Envelope]()) + jobStore, err := fsRepo.InitJobStore(ctx, nodeID) if err != nil { return nil, err } @@ -153,14 +122,14 @@ func NewRequesterNode( // planner that forwards the desired state to the compute nodes, // and updates the observed state if the compute node accepts the desired state planner.NewComputeForwarder(planner.ComputeForwarderParams{ - ID: host.ID().String(), + ID: nodeID, ComputeService: computeProxy, JobStore: jobStore, }), // planner that publishes events on job completion or failure planner.NewEventEmitter(planner.EventEmitterParams{ - ID: host.ID().String(), + ID: nodeID, EventEmitter: eventEmitter, }), @@ -214,12 +183,6 @@ func NewRequesterNode( worker.Start(ctx) } - publicKey := host.Peerstore().PubKey(host.ID()) - marshaledPublicKey, err := crypto.MarshalPublicKey(publicKey) - if err != nil { - return nil, err - } - // result transformers that are applied to the result before it is returned to the user resultTransformers := transformer.ChainedTransformer[*models.SpecConfig]{} @@ -239,8 +202,7 @@ func NewRequesterNode( } endpoint := requester.NewBaseEndpoint(&requester.BaseEndpointParams{ - ID: host.ID().String(), - PublicKey: marshaledPublicKey, + ID: nodeID, EvaluationBroker: evalBroker, EventEmitter: eventEmitter, ComputeEndpoint: computeProxy, @@ -255,7 +217,7 @@ func NewRequesterNode( } endpointV2 := orchestrator.NewBaseEndpoint(&orchestrator.BaseEndpointParams{ - ID: host.ID().String(), + ID: nodeID, EvaluationBroker: evalBroker, Store: jobStore, EventEmitter: eventEmitter, @@ -264,7 +226,7 @@ func NewRequesterNode( transformer.JobFn(transformer.IDGenerator), transformer.NameOptional(), transformer.DefaultsApplier(requesterConfig.JobDefaults), - transformer.RequesterInfo(host.ID().String(), marshaledPublicKey), + transformer.RequesterInfo(nodeID), transformer.NewInlineStoragePinner(storageProvider), }, TaskTranslator: translationProvider, @@ -274,16 +236,10 @@ func NewRequesterNode( housekeeping := requester.NewHousekeeping(requester.HousekeepingParams{ Endpoint: endpoint, JobStore: jobStore, - NodeID: host.ID().String(), + NodeID: nodeID, Interval: requesterConfig.HousekeepingBackgroundTaskInterval, }) - // register a handler for the bacalhau protocol handler that will forward requests to the scheduler - 
bprotocol.NewCallbackHandler(bprotocol.CallbackHandlerParams{ - Host: host, - Callback: endpoint, - }) - // register debug info providers for the /debug endpoint debugInfoProviders := []model.DebugInfoProvider{ discovery.NewDebugInfoProvider(nodeDiscoveryChain), @@ -308,7 +264,7 @@ func NewRequesterNode( auth_endpoint.BindEndpoint(ctx, apiServer.Router, authnProvider) // Register event handlers - lifecycleEventHandler := system.NewJobLifecycleEventHandler(host.ID().String()) + lifecycleEventHandler := system.NewJobLifecycleEventHandler(nodeID) eventTracer, err := eventhandler.NewTracer() if err != nil { return nil, err @@ -324,8 +280,6 @@ func NewRequesterNode( eventTracer, // dispatches events to listening websockets requesterAPIServer, - // publish job events to the network - jobInfoPublisher, ) // A single cleanup function to make sure the order of closing dependencies is correct @@ -337,12 +291,7 @@ func NewRequesterNode( } evalBroker.SetEnabled(false) - cleanupErr := jobInfoPubSub.Close(ctx) - if cleanupErr != nil { - util.LogDebugIfContextCancelled(ctx, cleanupErr, "failed to shutdown job info pubsub") - } - - cleanupErr = tracerContextProvider.Shutdown() + cleanupErr := tracerContextProvider.Shutdown() if cleanupErr != nil { util.LogDebugIfContextCancelled(ctx, cleanupErr, "failed to shutdown tracer context provider") } @@ -359,19 +308,15 @@ func NewRequesterNode( } return &Requester{ - Endpoint: endpoint, - localCallback: endpoint, - NodeDiscoverer: nodeDiscoveryChain, - JobStore: jobStore, - computeProxy: computeProxy, - cleanupFunc: cleanupFunc, + Endpoint: endpoint, + localCallback: endpoint, + NodeDiscoverer: nodeDiscoveryChain, + JobStore: jobStore, + cleanupFunc: cleanupFunc, + debugInfoProviders: debugInfoProviders, }, nil } -func (r *Requester) RegisterLocalComputeEndpoint(endpoint compute.Endpoint) { - r.computeProxy.RegisterLocalComputeEndpoint(endpoint) -} - func (r *Requester) cleanup(ctx context.Context) { r.cleanupFunc(ctx) } diff --git a/pkg/orchestrator/scheduler/batch_job_test.go b/pkg/orchestrator/scheduler/batch_job_test.go index 7c929074c7..2cfa9c449b 100644 --- a/pkg/orchestrator/scheduler/batch_job_test.go +++ b/pkg/orchestrator/scheduler/batch_job_test.go @@ -12,7 +12,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator/retry" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -26,11 +25,11 @@ const ( ) var nodeIDs = []string{ - "QmdZQ7ZbhnvWY1J12XYKGHApJ6aufKyLNSvf8jZBrBaAVL", - "QmXaXu9N5GNetatsvwnTfQqNtSeKAD6uCmarbh3LMRYAcF", - "QmYgxZiySj3MRkwLSL4X2MF5F9f2PMhAE3LV49XkfNL1o3", - "QmcWJnVXJ82DKJq8ED79LADR4ZBTnwgTK7yn6JQbNVMbbC", - "QmXRdLruWyETS2Z8XFrXxBFYXctfjT8T9mZWyuqwUm6rQk", + "Node0", + "Node1", + "Node2", + "Node3", + "Node4", } type BatchJobSchedulerTestSuite struct { @@ -80,10 +79,10 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_ShouldCreateEnoughExecutions() matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -230,7 +229,7 @@ func (s *BatchJobSchedulerTestSuite) TestFailUnhealthyExecs_ShouldMarkExecutions matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - 
NewExecutionsNodes: []peer.ID{nodeInfos[0].PeerInfo.ID}, + NewExecutionsNodes: []string{nodeInfos[0].ID()}, StoppedExecutions: []string{ executions[execBidAccepted].ID, }, diff --git a/pkg/orchestrator/scheduler/daemon_job_test.go b/pkg/orchestrator/scheduler/daemon_job_test.go index 71b6a63aaa..bf97c62886 100644 --- a/pkg/orchestrator/scheduler/daemon_job_test.go +++ b/pkg/orchestrator/scheduler/daemon_job_test.go @@ -11,7 +11,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -59,10 +58,10 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldCreateNewExecutions() { Evaluation: evaluation, JobState: models.JobStateTypeRunning, NewExecutionDesiredState: models.ExecutionDesiredStateRunning, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -105,7 +104,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldMarkLostExecutionsOnUnhe matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{}, + NewExecutionsNodes: []string{}, StoppedExecutions: []string{ executions[0].ID, }, diff --git a/pkg/orchestrator/scheduler/ops_job_test.go b/pkg/orchestrator/scheduler/ops_job_test.go index 359b3617c5..156db876d7 100644 --- a/pkg/orchestrator/scheduler/ops_job_test.go +++ b/pkg/orchestrator/scheduler/ops_job_test.go @@ -11,7 +11,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -59,10 +58,10 @@ func (s *OpsJobSchedulerTestSuite) TestProcess_ShouldCreateNewExecutions() { Evaluation: evaluation, JobState: models.JobStateTypeRunning, NewExecutionDesiredState: models.ExecutionDesiredStateRunning, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -100,7 +99,7 @@ func (s *OpsJobSchedulerTestSuite) TestProcess_ShouldMarkLostExecutionsOnUnhealt matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{}, + NewExecutionsNodes: []string{}, StoppedExecutions: []string{ executions[0].ID, }, diff --git a/pkg/orchestrator/scheduler/service_job_test.go b/pkg/orchestrator/scheduler/service_job_test.go index 80eb975ba3..6784e922ab 100644 --- a/pkg/orchestrator/scheduler/service_job_test.go +++ b/pkg/orchestrator/scheduler/service_job_test.go @@ -12,7 +12,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/orchestrator/retry" "github.com/bacalhau-project/bacalhau/pkg/test/mock" "github.com/google/uuid" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" "go.uber.org/mock/gomock" ) @@ -72,10 +71,10 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_ShouldCreateEnoughExecutions( matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - 
nodeInfos[1].PeerInfo.ID, - nodeInfos[2].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), + nodeInfos[2].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) @@ -226,9 +225,9 @@ func (s *ServiceJobSchedulerTestSuite) TestFailUnhealthyExecs_ShouldMarkExecutio matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), }, StoppedExecutions: []string{ executions[execServiceBidAccepted1].ID, @@ -262,9 +261,9 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_TreatCompletedExecutionsAsFai matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, - NewExecutionsNodes: []peer.ID{ - nodeInfos[0].PeerInfo.ID, - nodeInfos[1].PeerInfo.ID, + NewExecutionsNodes: []string{ + nodeInfos[0].ID(), + nodeInfos[1].ID(), }, }) s.planner.EXPECT().Process(gomock.Any(), matcher).Times(1) diff --git a/pkg/orchestrator/scheduler/utils_test.go b/pkg/orchestrator/scheduler/utils_test.go index 587367fcfb..cc8b243822 100644 --- a/pkg/orchestrator/scheduler/utils_test.go +++ b/pkg/orchestrator/scheduler/utils_test.go @@ -7,15 +7,13 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/stretchr/testify/require" ) type PlanMatcher struct { t *testing.T JobState models.JobStateType Evaluation *models.Evaluation - NewExecutionsNodes []peer.ID + NewExecutionsNodes []string NewExecutionsDesiredState models.ExecutionDesiredStateType StoppedExecutions []string ApprovedExecutions []string @@ -24,7 +22,7 @@ type PlanMatcher struct { type PlanMatcherParams struct { JobState models.JobStateType Evaluation *models.Evaluation - NewExecutionsNodes []peer.ID + NewExecutionsNodes []string NewExecutionDesiredState models.ExecutionDesiredStateType StoppedExecutions []string ApprovedExecutions []string @@ -68,7 +66,7 @@ func (m PlanMatcher) Matches(x interface{}) bool { return false } for _, node := range m.NewExecutionsNodes { - desiredState, ok := newExecutionNodes[node.String()] + desiredState, ok := newExecutionNodes[node] if !ok { m.t.Logf("NewExecutionsNodes: %v != %s", newExecutionNodes, m.NewExecutionsNodes) return false @@ -123,11 +121,7 @@ func (m PlanMatcher) String() string { } func mockNodeInfo(t *testing.T, nodeID string) *models.NodeInfo { - id, err := peer.Decode(nodeID) - require.NoError(t, err) return &models.NodeInfo{ - PeerInfo: peer.AddrInfo{ - ID: id, - }, + NodeID: nodeID, } } diff --git a/pkg/orchestrator/selection/discovery/chained.go b/pkg/orchestrator/selection/discovery/chained.go index a3278cc7a8..b1aeb0f8bf 100644 --- a/pkg/orchestrator/selection/discovery/chained.go +++ b/pkg/orchestrator/selection/discovery/chained.go @@ -5,7 +5,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" "github.com/pkg/errors" "github.com/rs/zerolog/log" "go.uber.org/multierr" @@ -39,14 +38,14 @@ func (c *Chain) chainDiscovery( getNodes func(orchestrator.NodeDiscoverer) ([]models.NodeInfo, error), ) ([]models.NodeInfo, error) { var err error - uniqueNodes := make(map[peer.ID]models.NodeInfo, 0) + uniqueNodes := make(map[string]models.NodeInfo, 0) for _, discoverer := range c.discoverers { nodeInfos, discoverErr := getNodes(discoverer) err = multierr.Append(err, errors.Wrapf(discoverErr, "error 
finding nodes from %T", discoverer)) currentNodesCount := len(uniqueNodes) for _, nodeInfo := range nodeInfos { - if _, ok := uniqueNodes[nodeInfo.PeerInfo.ID]; !ok { - uniqueNodes[nodeInfo.PeerInfo.ID] = nodeInfo + if _, ok := uniqueNodes[nodeInfo.ID()]; !ok { + uniqueNodes[nodeInfo.ID()] = nodeInfo } } log.Ctx(ctx).Debug().Msgf("[%s] found %d more nodes by %T", caller, len(uniqueNodes)-currentNodesCount, discoverer) diff --git a/pkg/orchestrator/selection/discovery/chained_test.go b/pkg/orchestrator/selection/discovery/chained_test.go index 88673d74d2..b542a5eb77 100644 --- a/pkg/orchestrator/selection/discovery/chained_test.go +++ b/pkg/orchestrator/selection/discovery/chained_test.go @@ -8,7 +8,6 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -21,9 +20,9 @@ type ChainedSuite struct { } func (s *ChainedSuite) SetupSuite() { - s.peerID1 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID1")}} - s.peerID2 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID2")}} - s.peerID3 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID3")}} + s.peerID1 = models.NodeInfo{NodeID: "peerID1"} + s.peerID2 = models.NodeInfo{NodeID: "peerID2"} + s.peerID3 = models.NodeInfo{NodeID: "peerID3"} } func (s *ChainedSuite) SetupTest() { diff --git a/pkg/orchestrator/selection/discovery/store_test.go b/pkg/orchestrator/selection/discovery/store_test.go index 6f32a7ea75..0e1cc6699e 100644 --- a/pkg/orchestrator/selection/discovery/store_test.go +++ b/pkg/orchestrator/selection/discovery/store_test.go @@ -10,8 +10,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/routing/inmemory" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/multiformats/go-multiaddr" "github.com/stretchr/testify/suite" ) @@ -63,12 +61,7 @@ func (s *StoreNodeDiscovererSuite) TestListNodes_Empty() { func generateNodeInfo(id string, engines ...string) models.NodeInfo { return models.NodeInfo{ - PeerInfo: peer.AddrInfo{ - ID: peer.ID(id), - Addrs: []multiaddr.Multiaddr{ - multiaddr.StringCast("/ip4/0.0.0.0/tcp/1234"), - }, - }, + NodeID: id, NodeType: models.NodeTypeCompute, ComputeNodeInfo: &models.ComputeNodeInfo{ ExecutionEngines: engines, diff --git a/pkg/orchestrator/selection/ranking/chain.go b/pkg/orchestrator/selection/ranking/chain.go index 2c2a621e8b..e2a636c4f1 100644 --- a/pkg/orchestrator/selection/ranking/chain.go +++ b/pkg/orchestrator/selection/ranking/chain.go @@ -5,7 +5,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" ) // Chain assigns a random rank to each node to allow the orchestrator to select random top nodes @@ -25,9 +24,9 @@ func (c *Chain) Add(ranker ...orchestrator.NodeRanker) { func (c *Chain) RankNodes(ctx context.Context, job models.Job, nodes []models.NodeInfo) ([]orchestrator.NodeRank, error) { // initialize map of node ranks - ranksMap := make(map[peer.ID]*orchestrator.NodeRank, len(nodes)) + ranksMap := make(map[string]*orchestrator.NodeRank, len(nodes)) for _, node := range nodes { - ranksMap[node.PeerInfo.ID] = &orchestrator.NodeRank{NodeInfo: node, Rank: orchestrator.RankPossible} + ranksMap[node.ID()] = &orchestrator.NodeRank{NodeInfo: node, Rank: orchestrator.RankPossible} } // iterate over the rankers and add their ranks to the map @@ -40,10 +39,10 
@@ func (c *Chain) RankNodes(ctx context.Context, job models.Job, nodes []models.No } for _, nodeRank := range nodeRanks { if !nodeRank.MeetsRequirement() { - ranksMap[nodeRank.NodeInfo.PeerInfo.ID].Rank = orchestrator.RankUnsuitable - ranksMap[nodeRank.NodeInfo.PeerInfo.ID].Reason = nodeRank.Reason - } else if ranksMap[nodeRank.NodeInfo.PeerInfo.ID].MeetsRequirement() { - ranksMap[nodeRank.NodeInfo.PeerInfo.ID].Rank += nodeRank.Rank + ranksMap[nodeRank.NodeInfo.ID()].Rank = orchestrator.RankUnsuitable + ranksMap[nodeRank.NodeInfo.ID()].Reason = nodeRank.Reason + } else if ranksMap[nodeRank.NodeInfo.ID()].MeetsRequirement() { + ranksMap[nodeRank.NodeInfo.ID()].Rank += nodeRank.Rank } } } diff --git a/pkg/orchestrator/selection/ranking/chain_test.go b/pkg/orchestrator/selection/ranking/chain_test.go index 5aeae01c13..14baebbe59 100644 --- a/pkg/orchestrator/selection/ranking/chain_test.go +++ b/pkg/orchestrator/selection/ranking/chain_test.go @@ -7,7 +7,6 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -20,9 +19,9 @@ type ChainSuite struct { } func (s *ChainSuite) SetupSuite() { - s.peerID1 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID1")}} - s.peerID2 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID2")}} - s.peerID3 = models.NodeInfo{PeerInfo: peer.AddrInfo{ID: peer.ID("peerID3")}} + s.peerID1 = models.NodeInfo{NodeID: "peerID1"} + s.peerID2 = models.NodeInfo{NodeID: "peerID2"} + s.peerID3 = models.NodeInfo{NodeID: "peerID3"} } func (s *ChainSuite) SetupTest() { diff --git a/pkg/orchestrator/selection/ranking/features_test.go b/pkg/orchestrator/selection/ranking/features_test.go index fff6578606..5d577ccd12 100644 --- a/pkg/orchestrator/selection/ranking/features_test.go +++ b/pkg/orchestrator/selection/ranking/features_test.go @@ -8,7 +8,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -22,23 +21,23 @@ type FeatureNodeRankerSuite struct { func (s *FeatureNodeRankerSuite) Nodes() []models.NodeInfo { return []models.NodeInfo{ { - PeerInfo: peer.AddrInfo{ID: peer.ID("docker")}, + NodeID: "docker", ComputeNodeInfo: &models.ComputeNodeInfo{ExecutionEngines: []string{models.EngineDocker}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("wasm")}, + NodeID: "wasm", ComputeNodeInfo: &models.ComputeNodeInfo{ExecutionEngines: []string{models.EngineWasm}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("ipfs")}, + NodeID: "ipfs", ComputeNodeInfo: &models.ComputeNodeInfo{StorageSources: []string{models.StorageSourceIPFS}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("url")}, + NodeID: "url", ComputeNodeInfo: &models.ComputeNodeInfo{StorageSources: []string{models.StorageSourceURL}}, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("combo")}, + NodeID: "combo", ComputeNodeInfo: &models.ComputeNodeInfo{ ExecutionEngines: []string{models.EngineDocker, models.EngineWasm}, Publishers: []string{models.PublisherIPFS, models.PublisherS3}, @@ -46,7 +45,7 @@ func (s *FeatureNodeRankerSuite) Nodes() []models.NodeInfo { }, }, { - PeerInfo: peer.AddrInfo{ID: peer.ID("unknown")}, + NodeID: "unknown", }, } } diff --git a/pkg/orchestrator/selection/ranking/max_usage_test.go b/pkg/orchestrator/selection/ranking/max_usage_test.go index f49df4f789..11df116190 100644 --- a/pkg/orchestrator/selection/ranking/max_usage_test.go +++ 
b/pkg/orchestrator/selection/ranking/max_usage_test.go @@ -8,7 +8,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -22,15 +21,15 @@ type MaxUsageNodeRankerSuite struct { func (s *MaxUsageNodeRankerSuite) SetupSuite() { s.smallPeer = models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID("small")}, + NodeID: "small", ComputeNodeInfo: &models.ComputeNodeInfo{MaxJobRequirements: models.Resources{CPU: 1}}, } s.medPeer = models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID("med")}, + NodeID: "med", ComputeNodeInfo: &models.ComputeNodeInfo{MaxJobRequirements: models.Resources{CPU: 2}}, } s.largePeer = models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID("large")}, + NodeID: "large", ComputeNodeInfo: &models.ComputeNodeInfo{MaxJobRequirements: models.Resources{CPU: 3}}, } } diff --git a/pkg/orchestrator/selection/ranking/min_version_test.go b/pkg/orchestrator/selection/ranking/min_version_test.go index 5f882bd370..74db51cfae 100644 --- a/pkg/orchestrator/selection/ranking/min_version_test.go +++ b/pkg/orchestrator/selection/ranking/min_version_test.go @@ -7,7 +7,6 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -89,7 +88,7 @@ func (s *MinVersionNodeRankerSuite) TestRankNodes() { var nodes []models.NodeInfo for _, t := range minVersionNodeRankerTestCases { nodes = append(nodes, models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID(t.name)}, + NodeID: t.name, BacalhauVersion: t.version, }) } @@ -110,7 +109,7 @@ func (s *MinVersionNodeRankerSuite) TestRankNodes_NilMinVersion() { var nodes []models.NodeInfo for _, t := range minVersionNodeRankerTestCases { nodes = append(nodes, models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID(t.name)}, + NodeID: t.name, BacalhauVersion: t.version, }) } diff --git a/pkg/orchestrator/selection/ranking/random_test.go b/pkg/orchestrator/selection/ranking/random_test.go index 30e7c597d7..9ac48d84b6 100644 --- a/pkg/orchestrator/selection/ranking/random_test.go +++ b/pkg/orchestrator/selection/ranking/random_test.go @@ -8,7 +8,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" "github.com/stretchr/testify/suite" ) @@ -28,7 +27,7 @@ func (s *RandomNodeRankerSuite) TestRankNodes() { var nodes []models.NodeInfo for i := 0; i < nodeCount; i++ { nodes = append(nodes, models.NodeInfo{ - PeerInfo: peer.AddrInfo{ID: peer.ID(rune(i))}, + NodeID: "node" + string(rune(i)), }) } s.RandomNodeRanker = NewRandomNodeRanker(RandomNodeRankerParams{RandomnessRange: randomnessRange}) diff --git a/pkg/orchestrator/selection/ranking/utils_test.go b/pkg/orchestrator/selection/ranking/utils_test.go index c5484256aa..bb6951c12a 100644 --- a/pkg/orchestrator/selection/ranking/utils_test.go +++ b/pkg/orchestrator/selection/ranking/utils_test.go @@ -4,12 +4,11 @@ import ( "testing" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/libp2p/go-libp2p/core/peer" ) func assertEquals(t *testing.T, ranks []orchestrator.NodeRank, nodeID string, expectedRank int) { for _, rank := range ranks { - if rank.NodeInfo.PeerInfo.ID == peer.ID(nodeID) { + if rank.NodeInfo.ID() == nodeID { if rank.Rank != expectedRank { t.Errorf("expected rank %d for node %s, got %d", expectedRank, nodeID, rank.Rank) } diff --git 
diff --git a/pkg/orchestrator/transformer/job.go b/pkg/orchestrator/transformer/job.go
index b8773b8f11..c21a9e7b93 100644
--- a/pkg/orchestrator/transformer/job.go
+++ b/pkg/orchestrator/transformer/job.go
@@ -4,7 +4,6 @@ import (
 	"context"
 	"time"
 
-	"github.com/bacalhau-project/bacalhau/pkg/model"
 	"github.com/bacalhau-project/bacalhau/pkg/models"
 	"github.com/bacalhau-project/bacalhau/pkg/util/idgen"
 )
@@ -37,11 +36,10 @@ func DefaultsApplier(defaults JobDefaults) JobTransformer {
 	return JobFn(f)
 }
 
-// RequesterInfo is a transformer that sets the requester ID and public key in the job meta.
-func RequesterInfo(requesterNodeID string, requesterPubKey model.PublicKey) JobTransformer {
+// RequesterInfo is a transformer that sets the requester ID in the job meta.
+func RequesterInfo(requesterNodeID string) JobTransformer {
 	f := func(ctx context.Context, job *models.Job) error {
 		job.Meta[models.MetaRequesterID] = requesterNodeID
-		job.Meta[models.MetaRequesterPublicKey] = requesterPubKey.String()
 		return nil
 	}
 	return JobFn(f)
diff --git a/pkg/orchestrator/types.go b/pkg/orchestrator/types.go
index 537ae71e13..beebc076be 100644
--- a/pkg/orchestrator/types.go
+++ b/pkg/orchestrator/types.go
@@ -70,7 +70,7 @@ func (r NodeRank) MeetsRequirement() bool {
 }
 
 func (r NodeRank) MarshalZerologObject(e *zerolog.Event) {
-	e.Stringer("Node", r.NodeInfo.PeerInfo.ID).
+	e.Str("Node", r.NodeInfo.ID()).
 		Bool("MeetsRequirement", r.MeetsRequirement()).
 		Str("Reason", r.Reason)
 }
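NodeRank keeps satisfying zerolog's LogObjectMarshaler after this change, so existing structured-logging call sites continue to work; the only difference is that the "Node" field now carries a plain string ID. A hypothetical call site, for illustration only:

// Illustrative sketch, not part of this patch: logging a NodeRank as a
// structured object with the updated marshaller.
package example

import (
	"context"

	"github.com/bacalhau-project/bacalhau/pkg/orchestrator"
	"github.com/rs/zerolog/log"
)

func logRank(ctx context.Context, rank orchestrator.NodeRank) {
	log.Ctx(ctx).Debug().Object("Rank", rank).Msg("node ranked")
}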
diff --git a/pkg/publicapi/endpoint/agent/endpoint.go b/pkg/publicapi/endpoint/agent/endpoint.go
index 8d791e6f8f..ad0e5a82ea 100644
--- a/pkg/publicapi/endpoint/agent/endpoint.go
+++ b/pkg/publicapi/endpoint/agent/endpoint.go
@@ -3,27 +3,32 @@ package agent
 import (
 	"net/http"
 
+	"github.com/bacalhau-project/bacalhau/pkg/model"
 	"github.com/bacalhau-project/bacalhau/pkg/models"
 	"github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels"
 	"github.com/bacalhau-project/bacalhau/pkg/publicapi/middleware"
 	"github.com/bacalhau-project/bacalhau/pkg/version"
 	"github.com/labstack/echo/v4"
+	"github.com/rs/zerolog/log"
 )
 
 type EndpointParams struct {
-	Router           *echo.Echo
-	NodeInfoProvider models.NodeInfoProvider
+	Router             *echo.Echo
+	NodeInfoProvider   models.NodeInfoProvider
+	DebugInfoProviders []model.DebugInfoProvider
 }
 
 type Endpoint struct {
-	router           *echo.Echo
-	nodeInfoProvider models.NodeInfoProvider
+	router             *echo.Echo
+	nodeInfoProvider   models.NodeInfoProvider
+	debugInfoProviders []model.DebugInfoProvider
 }
 
 func NewEndpoint(params EndpointParams) *Endpoint {
 	e := &Endpoint{
-		router:           params.Router,
-		nodeInfoProvider: params.NodeInfoProvider,
+		router:             params.Router,
+		nodeInfoProvider:   params.NodeInfoProvider,
+		debugInfoProviders: params.DebugInfoProviders,
 	}
 
 	// JSON group
@@ -32,6 +37,7 @@ func NewEndpoint(params EndpointParams) *Endpoint {
 	g.GET("/alive", e.alive)
 	g.GET("/version", e.version)
 	g.GET("/node", e.node)
+	g.GET("/debug", e.debug)
 	return e
 }
 
@@ -79,3 +85,25 @@ func (e *Endpoint) node(c echo.Context) error {
 		NodeInfo: &nodeInfo,
 	})
 }
+
+// debug godoc
+//
+// @ID agent/debug
+// @Summary Returns debug information on what the current node is doing.
+// @Tags Ops
+// @Produce json
+// @Success 200 {object} model.DebugInfo
+// @Failure 500 {object} string
+// @Router /api/v1/agent/debug [get]
+func (e *Endpoint) debug(c echo.Context) error {
+	debugInfoMap := make(map[string]interface{})
+	for _, provider := range e.debugInfoProviders {
+		debugInfo, err := provider.GetDebugInfo(c.Request().Context())
+		if err != nil {
+			log.Ctx(c.Request().Context()).Error().Msgf("could not get debug info from some providers: %s", err)
+			continue
+		}
+		debugInfoMap[debugInfo.Component] = debugInfo.Info
+	}
+	return c.JSON(http.StatusOK, debugInfoMap)
+}
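The handler above simply fans out to whatever model.DebugInfoProvider values are wired into EndpointParams.DebugInfoProviders. A minimal provider sketch follows; the GetDebugInfo signature and the Component/Info fields are inferred from how the handler uses them, and the uptime component itself is made up for illustration:

// Illustrative sketch, not part of this patch: a trivial debug info provider
// of the shape the /api/v1/agent/debug handler expects.
package example

import (
	"context"
	"time"

	"github.com/bacalhau-project/bacalhau/pkg/model"
)

type uptimeDebugInfoProvider struct {
	startedAt time.Time
}

func (p uptimeDebugInfoProvider) GetDebugInfo(ctx context.Context) (model.DebugInfo, error) {
	// Report how long the node has been up; a real provider would surface
	// component state such as queued executions or connected peers.
	return model.DebugInfo{
		Component: "uptime",
		Info:      time.Since(p.startedAt).String(),
	}, nil
}

Such a provider would be passed in through EndpointParams.DebugInfoProviders when the agent endpoint is constructed; the transport interface introduced later in this patch can also contribute its own providers.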
diff --git a/pkg/publicapi/endpoint/orchestrator/node.go b/pkg/publicapi/endpoint/orchestrator/node.go
index f0f12b18b6..7b095ecc56 100644
--- a/pkg/publicapi/endpoint/orchestrator/node.go
+++ b/pkg/publicapi/endpoint/orchestrator/node.go
@@ -51,7 +51,7 @@ func (e *Endpoint) listNodes(c echo.Context) error {
 	var sortFnc func(a, b *models.NodeInfo) bool
 	switch args.OrderBy {
 	case "id", "":
-		sortFnc = func(a, b *models.NodeInfo) bool { return a.PeerInfo.ID < b.PeerInfo.ID }
+		sortFnc = func(a, b *models.NodeInfo) bool { return a.ID() < b.ID() }
 	case "type":
 		sortFnc = func(a, b *models.NodeInfo) bool { return a.NodeType < b.NodeType }
 	case "available_cpu":
diff --git a/pkg/publicapi/endpoint/shared/endpoint.go b/pkg/publicapi/endpoint/shared/endpoint.go
index 801bb73cfe..0235e474a3 100644
--- a/pkg/publicapi/endpoint/shared/endpoint.go
+++ b/pkg/publicapi/endpoint/shared/endpoint.go
@@ -8,21 +8,17 @@ import (
 	"github.com/bacalhau-project/bacalhau/pkg/publicapi/middleware"
 	"github.com/bacalhau-project/bacalhau/pkg/version"
 	"github.com/labstack/echo/v4"
-	"github.com/libp2p/go-libp2p/core/peer"
-	"github.com/libp2p/go-libp2p/core/peerstore"
 )
 
 type EndpointParams struct {
 	Router           *echo.Echo
 	NodeID           string
-	PeerStore        peerstore.Peerstore
 	NodeInfoProvider models.NodeInfoProvider
 }
 
 type Endpoint struct {
 	router           *echo.Echo
 	nodeID           string
-	peerStore        peerstore.Peerstore
 	nodeInfoProvider models.NodeInfoProvider
 }
 
@@ -30,14 +26,12 @@ func NewEndpoint(params EndpointParams) *Endpoint {
 	e := &Endpoint{
 		router:           params.Router,
 		nodeID:           params.NodeID,
-		peerStore:        params.PeerStore,
 		nodeInfoProvider: params.NodeInfoProvider,
 	}
 
 	// JSON group
 	g := e.router.Group("/api/v1")
 	g.Use(middleware.SetContentType(echo.MIMEApplicationJSON))
-	g.GET("/peers", e.peers)
 	g.GET("/node_info", e.nodeInfo)
 	g.POST("/version", e.version)
 	g.GET("/healthz", e.healthz)
@@ -70,23 +64,6 @@ func (e *Endpoint) id(c echo.Context) error {
 	return c.String(http.StatusOK, e.nodeID)
 }
 
-// @ID peers
-// @Summary Returns the peers connected to the host via the transport layer.
-// @Description As described in the [architecture docs](https://docs.bacalhau.org/about-bacalhau/architecture),
-// @Description each node is connected to a number of peer nodes.
-// @Tags Utils
-// @Produce json
-// @Success 200 {object} []peer.AddrInfo
-// @Failure 500 {object} string
-// @Router /api/v1/peers [get]
-func (e *Endpoint) peers(c echo.Context) error {
-	var peerInfos []peer.AddrInfo
-	for _, p := range e.peerStore.Peers() {
-		peerInfos = append(peerInfos, e.peerStore.PeerInfo(p))
-	}
-	return c.JSON(http.StatusOK, peerInfos)
-}
-
 // nodeInfo godoc
 //
 // @ID nodeInfo
diff --git a/pkg/publicapi/test/agent_test.go b/pkg/publicapi/test/agent_test.go
index 568ead488e..1f5d32f5c3 100644
--- a/pkg/publicapi/test/agent_test.go
+++ b/pkg/publicapi/test/agent_test.go
@@ -34,7 +34,7 @@ func (s *ServerSuite) TestAgentNode() {
 	s.Require().NotEmpty(resp)
 	s.Require().NotNil(resp.NodeInfo)
 
-	expectedNode, err := s.requesterNode.NodeInfoStore.Get(context.Background(), s.requesterNode.Host.ID().String())
+	expectedNode, err := s.requesterNode.NodeInfoStore.Get(context.Background(), s.requesterNode.ID)
 	s.Require().NoError(err)
 
 	equalNodeInfo(s.T(), expectedNode, *resp.NodeInfo)
@@ -45,16 +45,11 @@ func (s *ServerSuite) TestAgentNodeCompute() {
 	s.Require().NoError(err)
 	s.Require().NotEmpty(resp)
 	s.Require().NotNil(resp.NodeInfo)
-
-	expectedNode, err := s.computeNode.NodeInfoStore.Get(context.Background(), s.computeNode.Host.ID().String())
-	s.Require().NoError(err)
-
-	equalNodeInfo(s.T(), expectedNode, *resp.NodeInfo)
 }
 
 func equalNodeInfo(t *testing.T, a, b models.NodeInfo) {
 	require.Equal(t, a.BacalhauVersion, b.BacalhauVersion)
-	require.Equal(t, a.PeerInfo, b.PeerInfo)
+	require.Equal(t, a.ID(), b.ID())
 	require.Equal(t, a.NodeType, b.NodeType)
 	require.Equal(t, a.Labels, b.Labels)
diff --git a/pkg/publicapi/test/requester_server_test.go b/pkg/publicapi/test/requester_server_test.go
index 63eacabbcc..fa92a007a5 100644
--- a/pkg/publicapi/test/requester_server_test.go
+++ b/pkg/publicapi/test/requester_server_test.go
@@ -42,7 +42,9 @@ func (s *RequesterSuite) SetupTest() {
 
 // After each test
 func (s *RequesterSuite) TearDownTest() {
-	s.node.CleanupManager.Cleanup(context.Background())
+	if s.node != nil {
+		s.node.CleanupManager.Cleanup(context.Background())
+	}
 }
 
 func (s *RequesterSuite) TestList() {
diff --git a/pkg/publicapi/test/util_test.go b/pkg/publicapi/test/util_test.go
index 2a8336a571..2f136ea132 100644
--- a/pkg/publicapi/test/util_test.go
+++ b/pkg/publicapi/test/util_test.go
@@ -2,9 +2,13 @@ package test
 
 import (
 	"context"
+	"os"
 	"testing"
 	"time"
 
+	"github.com/bacalhau-project/bacalhau/pkg/models"
+	"github.com/libp2p/go-libp2p/core/host"
+	"github.com/libp2p/go-libp2p/core/peer"
 	"github.com/phayes/freeport"
 	"github.com/stretchr/testify/require"
 
@@ -35,8 +39,21 @@ func setupNodeForTestWithConfig(t *testing.T, apiCfg publicapi.Config) (*node.No
 	privKey, err := config.GetLibp2pPrivKey()
 	require.NoError(t, err)
 
-	libp2pHost, err := libp2p.NewHost(libp2pPort, privKey)
+
+	peerID, err := peer.IDFromPrivateKey(privKey)
 	require.NoError(t, err)
+	nodeID := peerID.String()
+
+	var libp2pHost host.Host
+	networkType, ok := os.LookupEnv("BACALHAU_NODE_NETWORK_TYPE")
+	if !ok {
+		networkType = models.NetworkTypeLibp2p
+	}
+
+	if networkType == models.NetworkTypeLibp2p {
+		libp2pHost, err = libp2p.NewHost(libp2pPort, privKey)
+		require.NoError(t, err)
+	}
 
 	computeConfig, err := node.NewComputeConfigWithDefaults()
 	require.NoError(t, err)
@@ -44,8 +61,8 @@ func setupNodeForTestWithConfig(t *testing.T, apiCfg publicapi.Config) (*node.No
 	require.NoError(t, err)
 
 	nodeConfig := node.NodeConfig{
+		NodeID:         nodeID,
 		CleanupManager: cm,
-		Host:           libp2pHost,
 		HostAddress:    "0.0.0.0",
 		APIPort:        0,
 		ComputeConfig:  computeConfig,
@@ -57,6 +74,10 @@ func setupNodeForTestWithConfig(t *testing.T, apiCfg publicapi.Config) (*node.No
 		NodeInfoPublisherInterval: node.TestNodeInfoPublishConfig,
 		FsRepo:                    fsRepo,
 		NodeInfoStoreTTL:          10 * time.Minute,
+		NetworkConfig: node.NetworkConfig{
+			Type:       networkType,
+			Libp2pHost: libp2pHost,
+		},
 	}
 
 	n, err := node.NewNode(ctx, nodeConfig)
diff --git a/pkg/pubsub/libp2p/pubsub.go b/pkg/pubsub/libp2p/pubsub.go
index f13f47f02c..f04f7e07ee 100644
--- a/pkg/pubsub/libp2p/pubsub.go
+++ b/pkg/pubsub/libp2p/pubsub.go
@@ -63,9 +63,11 @@ func (p *PubSub[T]) Publish(ctx context.Context, message T) error {
 	return p.topic.Publish(ctx, payload)
 }
 
-func (p *PubSub[T]) Subscribe(_ context.Context, subscriber pubsub.Subscriber[T]) (err error) {
+func (p *PubSub[T]) Subscribe(ctx context.Context, subscriber pubsub.Subscriber[T]) (err error) {
 	var firstSubscriber bool
 	p.subscriberOnce.Do(func() {
+		log.Ctx(ctx).Debug().Msgf("Subscribing to subject %s", p.topicName)
+
 		// register the subscriber
 		p.subscriber = subscriber
diff --git a/pkg/repo/fs.go b/pkg/repo/fs.go
index c1c0e5db83..43cef17481 100644
--- a/pkg/repo/fs.go
+++ b/pkg/repo/fs.go
@@ -65,7 +65,7 @@ func (fsr *FsRepo) Exists() (bool, error) {
 	if err != nil {
 		return false, err
 	}
-	if version != RepoVersion1 && version != RepoVersion2 {
+	if !IsValidVersion(version) {
 		return false, fmt.Errorf("unknown repo version %d", version)
 	}
 	return true, nil
diff --git a/pkg/repo/version.go b/pkg/repo/version.go
index d27b7d9f8a..506b307eb9 100644
--- a/pkg/repo/version.go
+++ b/pkg/repo/version.go
@@ -17,6 +17,11 @@ const (
 	RepoVersionFile = "repo.version"
 )
 
+// IsValidVersion returns true if the version is valid.
+func IsValidVersion(version int) bool {
+	return version == RepoVersion1 || version == RepoVersion2
+}
+
 type RepoVersion struct {
 	Version int
 }
diff --git a/pkg/requester/endpoint.go b/pkg/requester/endpoint.go
index b2a46c9f0a..9342f6352a 100644
--- a/pkg/requester/endpoint.go
+++ b/pkg/requester/endpoint.go
@@ -24,7 +24,6 @@ import (
 
 type BaseEndpointParams struct {
 	ID                         string
-	PublicKey                  []byte
 	EvaluationBroker           orchestrator.EvaluationBroker
 	Store                      jobstore.Store
 	EventEmitter               orchestrator.EventEmitter
@@ -48,7 +47,7 @@ type BaseEndpoint struct {
 func NewBaseEndpoint(params *BaseEndpointParams) *BaseEndpoint {
 	transforms := []jobtransform.Transformer{
 		jobtransform.NewTimeoutApplier(params.MinJobExecutionTimeout, params.DefaultJobExecutionTimeout),
-		jobtransform.NewRequesterInfo(params.ID, params.PublicKey),
+		jobtransform.NewRequesterInfo(params.ID),
 		jobtransform.RepoExistsOnIPFS(params.StorageProviders),
 		jobtransform.NewPublisherMigrator(),
 		jobtransform.NewEngineMigrator(),
diff --git a/pkg/requester/jobtransform/requester_info.go b/pkg/requester/jobtransform/requester_info.go
index d37303f8f1..7da5b0aae0 100644
--- a/pkg/requester/jobtransform/requester_info.go
+++ b/pkg/requester/jobtransform/requester_info.go
@@ -6,11 +6,10 @@ import (
 	"github.com/bacalhau-project/bacalhau/pkg/model"
 )
 
-func NewRequesterInfo(requesterNodeID string, requesterPubKey model.PublicKey) Transformer {
+func NewRequesterInfo(requesterNodeID string) Transformer {
 	return func(ctx context.Context, j *model.Job) (modified bool, err error) {
 		j.Metadata.Requester = model.JobRequester{
-			RequesterNodeID:    requesterNodeID,
-			RequesterPublicKey: requesterPubKey,
+			RequesterNodeID: requesterNodeID,
 		}
 		return true, nil
 	}
diff --git a/pkg/routing/inmemory/inmemory.go b/pkg/routing/inmemory/inmemory.go
index 527775ad24..36173b9856 100644
--- a/pkg/routing/inmemory/inmemory.go
+++ b/pkg/routing/inmemory/inmemory.go
@@ -113,8 +113,8 @@ func (r *NodeInfoStore) FindPeer(ctx context.Context, peerID peer.ID) (peer.Addr
 	if !ok {
 		return peer.AddrInfo{}, nil
 	}
-	if len(infoWrapper.PeerInfo.Addrs) > 0 {
-		return infoWrapper.PeerInfo, nil
+	if infoWrapper.PeerInfo != nil && len(infoWrapper.PeerInfo.Addrs) > 0 {
+		return *infoWrapper.PeerInfo, nil
 	}
 	return peer.AddrInfo{}, nil
 }
diff --git a/pkg/routing/inmemory/inmemory_test.go b/pkg/routing/inmemory/inmemory_test.go
index 5c0544a7ac..d8ad7e1ed8 100644
--- a/pkg/routing/inmemory/inmemory_test.go
+++ b/pkg/routing/inmemory/inmemory_test.go
@@ -9,8 +9,6 @@ import (
 	"github.com/bacalhau-project/bacalhau/pkg/models"
 	"github.com/bacalhau-project/bacalhau/pkg/routing"
-	"github.com/libp2p/go-libp2p/core/peer"
-	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 )
 
@@ -145,7 +143,7 @@ func (s *InMemoryNodeInfoStoreSuite) Test_Replace() {
 	s.NoError(s.store.Add(ctx, nodeInfo0))
 
 	nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineWasm)
-	nodeInfo1.PeerInfo.ID = nodeInfo0.PeerInfo.ID
+	nodeInfo1.NodeID = nodeInfo0.NodeID
 	s.NoError(s.store.Add(ctx, nodeInfo1))
 
 	res, err := s.store.Get(ctx, nodeInfo0.ID())
@@ -180,12 +178,8 @@ func (s *InMemoryNodeInfoStoreSuite) Test_Eviction() {
 }
 
 func generateNodeInfo(t *testing.T, peerID string, engines ...string) models.NodeInfo {
-	id, err := peer.Decode(peerID)
-	require.NoError(t, err)
 	return models.NodeInfo{
-		PeerInfo: peer.AddrInfo{
-			ID: id,
-		},
+		NodeID:   peerID,
 		NodeType: models.NodeTypeCompute,
 		ComputeNodeInfo: &models.ComputeNodeInfo{
 			ExecutionEngines: engines,
diff --git a/pkg/routing/node_info_provider.go b/pkg/routing/node_info_provider.go
index 3ae5f4265c..3678bfa2ff 100644
--- a/pkg/routing/node_info_provider.go
+++ b/pkg/routing/node_info_provider.go
@@ -4,51 +4,44 @@ import (
 	"context"
 
 	"github.com/bacalhau-project/bacalhau/pkg/models"
-	"github.com/libp2p/go-libp2p/core/host"
-	"github.com/libp2p/go-libp2p/core/peer"
-	"github.com/libp2p/go-libp2p/p2p/protocol/identify"
 )
 
 type NodeInfoProviderParams struct {
-	Host                 host.Host
-	IdentityService      identify.IDService
-	LabelsProvider       models.LabelsProvider
-	ComputeInfoProvider  models.ComputeNodeInfoProvider
-	BacalhauVersion      models.BuildVersionInfo
+	NodeID          string
+	LabelsProvider  models.LabelsProvider
+	BacalhauVersion models.BuildVersionInfo
 }
 
 type NodeInfoProvider struct {
-	h                    host.Host
-	identityService      identify.IDService
-	labelsProvider       models.LabelsProvider
-	computeInfoProvider  models.ComputeNodeInfoProvider
-	bacalhauVersion      models.BuildVersionInfo
+	nodeID             string
+	labelsProvider     models.LabelsProvider
+	bacalhauVersion    models.BuildVersionInfo
+	nodeInfoDecorators []models.NodeInfoDecorator
 }
 
 func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider {
 	return &NodeInfoProvider{
-		h:                    params.Host,
-		identityService:      params.IdentityService,
-		labelsProvider:       params.LabelsProvider,
-		computeInfoProvider:  params.ComputeInfoProvider,
-		bacalhauVersion:      params.BacalhauVersion,
+		nodeID:             params.NodeID,
+		labelsProvider:     params.LabelsProvider,
+		bacalhauVersion:    params.BacalhauVersion,
+		nodeInfoDecorators: make([]models.NodeInfoDecorator, 0),
 	}
 }
 
+// RegisterNodeInfoDecorator registers a node info decorator with the node info provider.
+func (n *NodeInfoProvider) RegisterNodeInfoDecorator(decorator models.NodeInfoDecorator) {
+	n.nodeInfoDecorators = append(n.nodeInfoDecorators, decorator)
+}
+
 func (n *NodeInfoProvider) GetNodeInfo(ctx context.Context) models.NodeInfo {
 	res := models.NodeInfo{
+		NodeID:          n.nodeID,
 		BacalhauVersion: n.bacalhauVersion,
-		PeerInfo: peer.AddrInfo{
-			ID:    n.h.ID(),
-			Addrs: n.identityService.OwnObservedAddrs(),
-		},
-		Labels:   n.labelsProvider.GetLabels(ctx),
-		NodeType: models.NodeTypeRequester,
+		Labels:          n.labelsProvider.GetLabels(ctx),
+		NodeType:        models.NodeTypeRequester,
 	}
-	if n.computeInfoProvider != nil {
-		info := n.computeInfoProvider.GetComputeInfo(ctx)
-		res.NodeType = models.NodeTypeCompute
-		res.ComputeNodeInfo = &info
+	for _, decorator := range n.nodeInfoDecorators {
+		res = decorator.DecorateNodeInfo(ctx, res)
 	}
 	return res
 }
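GetNodeInfo now starts from a requester-flavoured NodeInfo and lets registered decorators layer on compute- or transport-specific details. As a sketch of how the removed computeInfoProvider branch could be expressed in the new model (the DecorateNodeInfo signature is inferred from its use above, and the static engine list is purely illustrative):

// Illustrative sketch, not part of this patch: a decorator that marks the node
// as a compute node, replacing the removed computeInfoProvider branch. In
// practice the compute node would populate ComputeNodeInfo from its real
// capabilities rather than a hard-coded engine list.
package example

import (
	"context"

	"github.com/bacalhau-project/bacalhau/pkg/models"
	"github.com/bacalhau-project/bacalhau/pkg/routing"
)

type computeInfoDecorator struct{}

func (d computeInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo models.NodeInfo) models.NodeInfo {
	nodeInfo.NodeType = models.NodeTypeCompute
	nodeInfo.ComputeNodeInfo = &models.ComputeNodeInfo{
		ExecutionEngines: []string{models.EngineDocker, models.EngineWasm},
	}
	return nodeInfo
}

func registerExample(provider *routing.NodeInfoProvider) {
	// Decorators are attached through the new registration hook.
	provider.RegisterNodeInfoDecorator(computeInfoDecorator{})
}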
diff --git a/pkg/test/compute/resourcelimits_test.go b/pkg/test/compute/resourcelimits_test.go
index ac21a30732..1624ef83e3 100644
--- a/pkg/test/compute/resourcelimits_test.go
+++ b/pkg/test/compute/resourcelimits_test.go
@@ -359,11 +359,11 @@ func (suite *ComputeNodeResourceLimitsSuite) TestParallelGPU() {
 	}
 
 	// test that each node has 2 job allocated to it
-	node1Count, ok := allocationMap[stack.Nodes[0].Host.ID().String()]
+	node1Count, ok := allocationMap[stack.Nodes[0].ID]
 	require.True(suite.T(), ok)
 	require.Equal(suite.T(), jobsPerNode, node1Count)
 
-	node2Count, ok := allocationMap[stack.Nodes[1].Host.ID().String()]
+	node2Count, ok := allocationMap[stack.Nodes[1].ID]
 	require.True(suite.T(), ok)
 	require.Equal(suite.T(), jobsPerNode, node2Count)
 }
diff --git a/pkg/test/compute/setup_test.go b/pkg/test/compute/setup_test.go
index 247342432c..c1f5f9ceea 100644
--- a/pkg/test/compute/setup_test.go
+++ b/pkg/test/compute/setup_test.go
@@ -91,8 +91,21 @@ func (s *ComputeSuite) setupNode() {
 	storagePath := s.T().TempDir()
 
 	noopstorage := noop_storage.NewNoopStorage()
+	callback := compute.CallbackMock{
+		OnBidCompleteHandler: func(ctx context.Context, result compute.BidResult) {
+			s.bidChannel <- result
+		},
+		OnRunCompleteHandler: func(ctx context.Context, result compute.RunResult) {
+			s.completedChannel <- result
+		},
+		OnComputeFailureHandler: func(ctx context.Context, err compute.ComputeError) {
+			s.failureChannel <- err
+		},
+	}
+
 	s.node, err = node.NewComputeNode(
 		context.Background(),
+		host.ID().String(),
 		s.cm,
 		host,
 		apiServer,
@@ -102,23 +115,13 @@ func (s *ComputeSuite) setupNode() {
 		provider.NewNoopProvider[executor.Executor](s.executor),
 		provider.NewNoopProvider[publisher.Publisher](s.publisher),
 		repo,
+		callback,
 	)
 	s.NoError(err)
 
 	s.stateResolver = *resolver.NewStateResolver(resolver.StateResolverParams{
 		ExecutionStore: s.node.ExecutionStore,
 	})
-	s.node.RegisterLocalComputeCallback(compute.CallbackMock{
-		OnBidCompleteHandler: func(ctx context.Context, result compute.BidResult) {
-			s.bidChannel <- result
-		},
-		OnRunCompleteHandler: func(ctx context.Context, result compute.RunResult) {
-			s.completedChannel <- result
-		},
-		OnComputeFailureHandler: func(ctx context.Context, err compute.ComputeError) {
-			s.failureChannel <- err
-		},
-	})
 	s.T().Cleanup(func() { close(s.bidChannel) })
 }
diff --git a/pkg/test/logstream/stream_address_test.go b/pkg/test/logstream/stream_address_test.go
index b4af812c5a..ea49a4658c 100644
--- a/pkg/test/logstream/stream_address_test.go
+++ b/pkg/test/logstream/stream_address_test.go
@@ -20,6 +20,10 @@ func (s *LogStreamTestSuite) TestStreamAddress() {
 
 	docker.MustHaveDocker(s.T())
 
+	if s.stack.Nodes[0].Libp2pHost == nil {
+		// TODO: un-skip once we add log stream support for nats transport layer
+		s.T().Skip("skipping log stream tests for non-libp2p transports")
+	}
 	node := s.stack.Nodes[0]
 
 	task := mock.TaskBuilder().
@@ -31,7 +35,7 @@ func (s *LogStreamTestSuite) TestStreamAddress() {
 	job.Tasks[0] = task
 
 	execution := mock.ExecutionForJob(job)
-	execution.NodeID = node.Host.ID().Pretty()
+	execution.NodeID = node.ID
 	execution.AllocateResources(task.Name, models.Resources{})
 
 	err := node.RequesterNode.JobStore.CreateJob(s.ctx, *job)
diff --git a/pkg/test/requester/node_selection_test.go b/pkg/test/requester/node_selection_test.go
index 695e206924..9c6b41ea7e 100644
--- a/pkg/test/requester/node_selection_test.go
+++ b/pkg/test/requester/node_selection_test.go
@@ -189,7 +189,7 @@ func (s *NodeSelectionSuite) getSelectedNodes(jobID string) []*node.Node {
 	for _, executionState := range completedExecutionStates {
 		nodeFound := false
 		for _, n := range s.computeNodes {
-			if n.Host.ID().String() == executionState.NodeID {
+			if n.ID == executionState.NodeID {
 				nodes = append(nodes, n)
 				nodeFound = true
 				break
@@ -206,10 +206,10 @@ func (s *NodeSelectionSuite) assertNodesMatch(expected, selected []*node.Node) {
 	expectedNodeNames := make([]string, 0, len(expected))
 	selectedNodeNames := make([]string, 0, len(selected))
 	for _, n := range expected {
-		expectedNodeNames = append(expectedNodeNames, n.Host.ID().String())
+		expectedNodeNames = append(expectedNodeNames, n.ID)
 	}
 	for _, n := range selected {
-		selectedNodeNames = append(selectedNodeNames, n.Host.ID().String())
+		selectedNodeNames = append(selectedNodeNames, n.ID)
 	}
 	s.ElementsMatch(expectedNodeNames, selectedNodeNames)
 }
diff --git a/pkg/test/teststack/stack.go b/pkg/test/teststack/stack.go
index 76a4267a0f..58613deb24 100644
--- a/pkg/test/teststack/stack.go
+++ b/pkg/test/teststack/stack.go
@@ -93,7 +93,7 @@ func WithNoopExecutor(noopConfig noop_executor.ExecutorConfig) devstack.ConfigOp
 func allNodesDiscovered(t testing.TB, stack *devstack.DevStack) bool {
 	for _, node := range stack.Nodes {
-		ctx := logger.ContextWithNodeIDLogger(context.Background(), node.Host.ID().String())
+		ctx := logger.ContextWithNodeIDLogger(context.Background(), node.ID)
 
 		if !node.IsRequesterNode() || node.RequesterNode == nil {
 			continue
diff --git a/pkg/transport/interfaces.go b/pkg/transport/interfaces.go
new file mode 100644
index 0000000000..c314a71302
--- /dev/null
+++ b/pkg/transport/interfaces.go
@@ -0,0 +1,34 @@
+package transport
+
+import (
+	"context"
+
+	"github.com/bacalhau-project/bacalhau/pkg/compute"
+	"github.com/bacalhau-project/bacalhau/pkg/model"
+	"github.com/bacalhau-project/bacalhau/pkg/models"
+	"github.com/bacalhau-project/bacalhau/pkg/pubsub"
+)
+
+// TransportLayer is the interface for the transport layer.
+type TransportLayer interface {
+	// ComputeProxy enables orchestrator nodes to send job requests to compute nodes.
+	ComputeProxy() compute.Endpoint
+	// CallbackProxy enables compute nodes to send results and responses back to orchestrator nodes
+	CallbackProxy() compute.Callback
+	// NodeInfoPubSub enables compute nodes to publish their info and capabilities
+	// to orchestrator nodes for job matching and discovery.
+	NodeInfoPubSub() pubsub.PubSub[models.NodeInfo]
+	// NodeInfoDecorator enables transport layer to enrich node info with data
+	// required for request routing
+	NodeInfoDecorator() models.NodeInfoDecorator
+	// DebugInfoProviders enables transport layer to provide meaningful debug info to operators
+	DebugInfoProviders() []model.DebugInfoProvider
+	// RegisterComputeCallback registers a compute callback with the transport layer
+	// so that incoming compute responses are forwarded to the handler
+	RegisterComputeCallback(callback compute.Callback) error
+	// RegisterComputeEndpoint registers a compute endpoint with the transport layer
+	// so that incoming orchestrator requests are forwarded to the handler
+	RegisterComputeEndpoint(endpoint compute.Endpoint) error
+	// Close closes the transport layer.
+	Close(ctx context.Context) error
+}
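To make the intended call pattern concrete, here is a hedged wiring sketch. Only the TransportLayer methods come from the interface above; the helper function, its parameters, and where a node obtains them are assumptions for illustration:

// Illustrative sketch, not part of this patch: how a node might attach its
// local handlers to a TransportLayer implementation during startup. The
// surrounding wiring (where endpoint and callback come from, and when Close
// is called) is assumed, not taken from this change.
package example

import (
	"github.com/bacalhau-project/bacalhau/pkg/compute"
	"github.com/bacalhau-project/bacalhau/pkg/transport"
)

func wireTransport(tl transport.TransportLayer, endpoint compute.Endpoint, callback compute.Callback) error {
	// A compute node exposes its endpoint so orchestrator requests arriving
	// over the transport are forwarded to the local handler.
	if err := tl.RegisterComputeEndpoint(endpoint); err != nil {
		return err
	}
	// An orchestrator node registers a callback so results and failures from
	// compute nodes are routed back to it.
	if err := tl.RegisterComputeCallback(callback); err != nil {
		return err
	}
	// The owning node is expected to call tl.Close(ctx) during shutdown.
	return nil
}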