-
Notifications
You must be signed in to change notification settings - Fork 198
ai/live: Support for multiple instances per orchestrator in discovery #3719
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,6 +85,7 @@ type LivepeerConfig struct { | |
CliAddr *string | ||
HttpAddr *string | ||
ServiceAddr *string | ||
Nodes *string | ||
OrchAddr *string | ||
VerifierURL *string | ||
EthController *string | ||
|
@@ -112,6 +113,7 @@ type LivepeerConfig struct { | |
IgnoreMaxPriceIfNeeded *bool | ||
MinPerfScore *float64 | ||
DiscoveryTimeout *time.Duration | ||
ExtraNodes *int | ||
MaxSessions *string | ||
CurrentManifest *bool | ||
Nvidia *string | ||
|
@@ -194,6 +196,7 @@ func DefaultLivepeerConfig() LivepeerConfig { | |
defaultCliAddr := "" | ||
defaultHttpAddr := "" | ||
defaultServiceAddr := "" | ||
defaultNodes := "" | ||
defaultOrchAddr := "" | ||
defaultVerifierURL := "" | ||
defaultVerifierPath := "" | ||
|
@@ -215,6 +218,7 @@ func DefaultLivepeerConfig() LivepeerConfig { | |
defaultRegion := "" | ||
defaultMinPerfScore := 0.0 | ||
defaultDiscoveryTimeout := 500 * time.Millisecond | ||
defaultExtraNodes := 0 | ||
defaultCurrentManifest := false | ||
defaultNvidia := "" | ||
defaultNetint := "" | ||
|
@@ -310,6 +314,7 @@ func DefaultLivepeerConfig() LivepeerConfig { | |
CliAddr: &defaultCliAddr, | ||
HttpAddr: &defaultHttpAddr, | ||
ServiceAddr: &defaultServiceAddr, | ||
Nodes: &defaultNodes, | ||
OrchAddr: &defaultOrchAddr, | ||
VerifierURL: &defaultVerifierURL, | ||
VerifierPath: &defaultVerifierPath, | ||
|
@@ -331,6 +336,7 @@ func DefaultLivepeerConfig() LivepeerConfig { | |
Region: &defaultRegion, | ||
MinPerfScore: &defaultMinPerfScore, | ||
DiscoveryTimeout: &defaultDiscoveryTimeout, | ||
ExtraNodes: &defaultExtraNodes, | ||
CurrentManifest: &defaultCurrentManifest, | ||
Nvidia: &defaultNvidia, | ||
Netint: &defaultNetint, | ||
|
@@ -561,6 +567,16 @@ func StartLivepeer(ctx context.Context, cfg LivepeerConfig) { | |
n.OrchSecret, _ = common.ReadFromFile(*cfg.OrchSecret) | ||
} | ||
|
||
// Parse -instances flag and store parsed canonicalized URLs in the node | ||
if cfg.Nodes != nil && *cfg.Nodes != "" { | ||
n.Nodes, err = parseNodes(*cfg.Nodes) | ||
if err != nil || len(n.Nodes) == 0 { | ||
glog.Exit("No valid instance URLs parsed from -nodes: ", err) | ||
} else { | ||
glog.Infof("Configured nodes: %v", strings.Join(n.Nodes, ",")) | ||
} | ||
} | ||
|
||
var transcoderCaps []core.Capability | ||
if *cfg.Transcoder { | ||
core.WorkDir = *cfg.Datadir | ||
|
@@ -1521,6 +1537,8 @@ func StartLivepeer(ctx context.Context, cfg LivepeerConfig) { | |
go refreshOrchPerfScoreLoop(ctx, strings.ToUpper(*cfg.Region), *cfg.OrchPerfStatsURL, n.OrchPerfScore) | ||
} | ||
|
||
n.ExtraNodes = *cfg.ExtraNodes | ||
|
||
// Set up orchestrator discovery | ||
if *cfg.OrchWebhookURL != "" { | ||
whurl, err := validateURL(*cfg.OrchWebhookURL) | ||
|
@@ -1608,6 +1626,10 @@ func StartLivepeer(ctx context.Context, cfg LivepeerConfig) { | |
glog.Exit("Error getting service URI: ", err) | ||
} | ||
|
||
if suri.String() == "" && len(n.Nodes) == 0 { | ||
glog.Exit("Empty service URI and no additional nodes specified; set -serviceAddr or -nodes") | ||
} | ||
|
||
if *cfg.Network != "offchain" && !common.ValidateServiceURI(suri) { | ||
glog.Warning("**Warning -serviceAddr is a not a public address or hostname; this is not recommended for onchain networks**") | ||
} | ||
|
@@ -1749,17 +1771,23 @@ func StartLivepeer(ctx context.Context, cfg LivepeerConfig) { | |
tc <- struct{}{} | ||
}() | ||
|
||
doingWork := orch.ServiceURI().String() != "" | ||
|
||
// check whether or not the orchestrator is available | ||
if *cfg.TestOrchAvail { | ||
if *cfg.TestOrchAvail && doingWork { | ||
time.Sleep(2 * time.Second) | ||
orchAvail := server.CheckOrchestratorAvailability(orch) | ||
if !orchAvail { | ||
// shut down orchestrator | ||
glog.Infof("Orchestrator not available at %v; shutting down", orch.ServiceURI()) | ||
glog.Infof("Orchestrator not available at %v (%v); shutting down", orch.ServiceURI(), *cfg.HttpAddr) | ||
tc <- struct{}{} | ||
} | ||
} | ||
|
||
if !doingWork { | ||
glog.Infof("Orchestrator is not performing work") | ||
} | ||
|
||
}() | ||
|
||
if n.NodeType == core.TranscoderNode || n.NodeType == core.AIWorkerNode { | ||
|
@@ -1840,6 +1868,34 @@ func parseOrchAddrs(addrs string) []*url.URL { | |
return res | ||
} | ||
|
||
// parseNodes parses a comma-separated list of node addresses (the value of
// the -nodes flag) and returns each one as a canonical https URL string.
//
// Entries are trimmed of surrounding whitespace and blank entries are
// skipped. An entry without a scheme gets "https://" prepended. An entry that
// carries an explicit scheme other than https (for example "http://host") is
// rejected rather than silently rewritten. An entry that does not name a host
// is rejected as well.
//
// An error is returned when addrs is empty or when any entry is invalid. When
// addrs contains only blank entries the result is an empty slice and a nil
// error; callers are expected to check the length.
func parseNodes(addrs string) ([]string, error) {
	if len(addrs) == 0 {
		return nil, fmt.Errorf("instances empty")
	}
	var res []string
	for _, addr := range strings.Split(addrs, ",") {
		addr = strings.TrimSpace(addr)
		if addr == "" {
			continue
		}
		// Only treat the text before "://" as a scheme when it appears ahead
		// of any path, query or fragment; otherwise something like
		// "host/cb?u=http://x" would be mistaken for having a scheme.
		idx := strings.Index(addr, "://")
		hasScheme := idx > 0 && !strings.ContainsAny(addr[:idx], "/?#")
		if !hasScheme {
			// Add https if no scheme was provided
			addr = "https://" + addr
		}
		parsed, err := url.ParseRequestURI(addr)
		if err != nil {
			return nil, fmt.Errorf("Could not parse instance URI '%s': %w", addr, err)
		}
		// The parser lower-cases the scheme, so "HTTPS://" is accepted here
		// while "http://" and any other explicit scheme are refused.
		if parsed.Scheme != "https" {
			return nil, fmt.Errorf("Node URI must start with https '%s'", addr)
		}
		if parsed.Host == "" {
			return nil, fmt.Errorf("node URI is missing a host '%s'", addr)
		}
		// Use the canonical string form
		res = append(res, parsed.String())
	}
	return res, nil
}
|
||
func parseOrchBlacklist(b *string) []string { | ||
if b == nil { | ||
return []string{} | ||
|
@@ -1885,6 +1941,10 @@ func isLocalURL(u string) (bool, error) { | |
func getServiceURI(n *core.LivepeerNode, serviceAddr string) (*url.URL, error) { | ||
// Passed in via CLI | ||
if serviceAddr != "" { | ||
if serviceAddr == "none" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where is it actually set to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The user sets it in the |
||
// special value to signal this node is not to be used for work | ||
return url.Parse("") | ||
} | ||
return url.ParseRequestURI("https://" + serviceAddr) | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe `maxNodes` or `maxExtraNodes` (but then I'd rename `nodes` to `extraNodes`).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I really struggled with naming this.
My first pass at this PR named it `maxNodes` too, but that got a little confusing in practice since this does not include the initial set of orchestrators in the pool ... these are literally extra nodes in addition to the initial set of orchestrators in the pool.
We might be able to sidestep this if `maxNodes` had a default of 1, but that means changing more of the current behavior (which does not depend on this value) and I tried to avoid doing that.