@@ -29,14 +29,16 @@ type AISessionPool struct {
29
29
sessMap map [string ]* BroadcastSession
30
30
inUseSess []* BroadcastSession
31
31
suspender * suspender
32
+ penalty int
32
33
mu sync.RWMutex
33
34
}
34
35
35
- func NewAISessionPool (selector BroadcastSessionsSelector , suspender * suspender ) * AISessionPool {
36
+ func NewAISessionPool (selector BroadcastSessionsSelector , suspender * suspender , penalty int ) * AISessionPool {
36
37
return & AISessionPool {
37
38
selector : selector ,
38
39
sessMap : make (map [string ]* BroadcastSession ),
39
40
suspender : suspender ,
41
+ penalty : penalty ,
40
42
mu : sync.RWMutex {},
41
43
}
42
44
}
@@ -122,10 +124,17 @@ func (pool *AISessionPool) Remove(sess *BroadcastSession) {
122
124
delete (pool .sessMap , sess .Transcoder ())
123
125
pool .inUseSess = removeSessionFromList (pool .inUseSess , sess )
124
126
125
- // Magic number for now
126
- penalty := 3
127
+ penalty := 0
127
128
// If this method is called assume that the orch should be suspended
128
- // as well
129
+ // as well. Since AISessionManager re-uses the pools the suspension
130
+ // penalty needs to consider the current suspender count to set the penalty
131
+ last_count , ok := pool .suspender .list [sess .Transcoder ()]
132
+ if ok {
133
+ penalty = pool .suspender .count - last_count + pool .penalty
134
+ } else {
135
+ penalty = pool .suspender .count + pool .penalty
136
+ }
137
+
129
138
pool .suspender .suspend (sess .Transcoder (), penalty )
130
139
}
131
140
@@ -152,12 +161,14 @@ type AISessionSelector struct {
152
161
// The time until the pools should be refreshed with orchs from discovery
153
162
ttl time.Duration
154
163
lastRefreshTime time.Time
164
+ initialPoolSize int
155
165
156
166
cap core.Capability
157
167
modelID string
158
168
159
169
node * core.LivepeerNode
160
170
suspender * suspender
171
+ penalty int
161
172
os drivers.OSSession
162
173
}
163
174
@@ -172,8 +183,9 @@ func NewAISessionSelector(cap core.Capability, modelID string, node *core.Livepe
172
183
// The latency score in this context is just the latency of the last completed request for a session
173
184
// The "good enough" latency score is set to 0.0 so the selector will always select unknown sessions first
174
185
minLS := 0.0
175
- warmPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore ), suspender )
176
- coldPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore ), suspender )
186
+ penalty := 3
187
+ warmPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore ), suspender , penalty )
188
+ coldPool := NewAISessionPool (NewMinLSSelector (stakeRdr , minLS , node .SelectionAlgorithm , node .OrchPerfScore ), suspender , penalty )
177
189
sel := & AISessionSelector {
178
190
warmPool : warmPool ,
179
191
coldPool : coldPool ,
@@ -182,6 +194,7 @@ func NewAISessionSelector(cap core.Capability, modelID string, node *core.Livepe
182
194
modelID : modelID ,
183
195
node : node ,
184
196
suspender : suspender ,
197
+ penalty : penalty ,
185
198
os : drivers .NodeStorage .NewSession (strconv .Itoa (int (cap )) + "_" + modelID ),
186
199
}
187
200
@@ -196,7 +209,17 @@ func (sel *AISessionSelector) Select(ctx context.Context) *AISession {
196
209
shouldRefreshSelector := func () bool {
197
210
// Refresh if the # of sessions across warm and cold pools falls below the smaller of the maxRefreshSessionsThreshold and
198
211
// 1/2 the total # of orchs that can be queried during discovery
199
- discoveryPoolSize := sel .node .OrchestratorPool .Size ()
212
+ discoveryPoolSize := int (math .Min (float64 (sel .node .OrchestratorPool .Size ()), float64 (sel .initialPoolSize )))
213
+
214
+ if (sel .warmPool .Size () + sel .coldPool .Size ()) == 0 {
215
+ //release all orchestrators from suspension and try refresh
216
+ //if penalty in
217
+ clog .Infof (ctx , "refreshing sessions, no orchestrators in pools" )
218
+ for i := 0 ; i < sel .penalty ; i ++ {
219
+ sel .suspender .signalRefresh ()
220
+ }
221
+ }
222
+
200
223
if sel .warmPool .Size ()+ sel .coldPool .Size () < int (math .Min (maxRefreshSessionsThreshold , math .Ceil (float64 (discoveryPoolSize )/ 2.0 ))) {
201
224
return true
202
225
}
@@ -257,6 +280,7 @@ func (sel *AISessionSelector) Refresh(ctx context.Context) error {
257
280
258
281
var warmSessions []* BroadcastSession
259
282
var coldSessions []* BroadcastSession
283
+
260
284
for _ , sess := range sessions {
261
285
// If the constraints are missing for this capability skip this session
262
286
constraints , ok := sess .OrchestratorInfo .Capabilities .Constraints [uint32 (sel .cap )]
@@ -279,6 +303,7 @@ func (sel *AISessionSelector) Refresh(ctx context.Context) error {
279
303
280
304
sel .warmPool .Add (warmSessions )
281
305
sel .coldPool .Add (coldSessions )
306
+ sel .initialPoolSize = len (warmSessions ) + len (coldSessions ) + len (sel .suspender .list )
282
307
283
308
sel .lastRefreshTime = time .Now ()
284
309
0 commit comments