Skip to content

Commit

Permalink
Don't pile up successive full refreshes during AWS scaledowns
Browse files Browse the repository at this point in the history
Force refreshing everything at every DeleteNodes calls causes slow down
and throttling on large clusters with many ASGs (and lot of activity).

That function might be called several times in a row during scale-down
(once for each ASG having a node to be removed). Each time the forced
refresh will re-discover all ASGs, all LaunchConfigurations, then re-list all
instances from discovered ASGs.

That immediate refresh isn't required anyway, as the cache's DeleteInstances
concrete implementation will decrement the nodegroup size, and we can
schedule a grouped refresh for the next loop iteration.
  • Loading branch information
bpineau committed Feb 12, 2021
1 parent 8d1473a commit 46a4410
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ func TestDeleteNodes(t *testing.T) {
err = asgs[0].DeleteNodes([]*apiv1.Node{node})
assert.NoError(t, err)
service.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1)

newSize, err := asgs[0].TargetSize()
assert.NoError(t, err)
Expand Down Expand Up @@ -516,7 +516,7 @@ func TestDeleteNodesWithPlaceholder(t *testing.T) {
err = asgs[0].DeleteNodes([]*apiv1.Node{node})
assert.NoError(t, err)
service.AssertNumberOfCalls(t, "SetDesiredCapacity", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1)

newSize, err := asgs[0].TargetSize()
assert.NoError(t, err)
Expand Down
5 changes: 3 additions & 2 deletions cluster-autoscaler/cloudprovider/aws/aws_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,9 @@ func (m *AwsManager) DeleteInstances(instances []*AwsInstanceRef) error {
if err := m.asgCache.DeleteInstances(instances); err != nil {
return err
}
klog.V(2).Infof("Some ASG instances might have been deleted, forcing ASG list refresh")
return m.forceRefresh()
klog.V(2).Infof("Some ASG instances might have been deleted, scheduling an ASG list refresh")
m.lastRefresh = time.Now().Add(-refreshInterval)
return nil
}

// GetAsgNodes returns Asg nodes.
Expand Down

0 comments on commit 46a4410

Please sign in to comment.