Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ status:
type: ScaledUpBasedOnPreferredMaxReplicas
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
message: The recommendation is provided
status: "True"
type: HPATargetUtilizationUpdated
type: VerticalRecommendationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
message: The recommendation is provided
message: HPA target utilization is updated
reason: HPATargetUtilizationUpdated
status: "True"
type: VerticalRecommendationUpdated
type: HPATargetUtilizationUpdated
- lastTransitionTime: "2023-01-01T00:00:00Z"
lastUpdateTime: "2023-01-01T00:00:00Z"
status: "False"
Expand Down
8 changes: 4 additions & 4 deletions internal/controller/tortoise_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,15 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_
return ctrl.Result{RequeueAfter: r.Interval}, nil
}

_, tortoise, err = r.HpaService.UpdateHPAFromTortoiseRecommendation(ctx, tortoise, now)
tortoise, err = r.TortoiseService.UpdateResourceRequest(ctx, tortoise, currentDesiredReplicaNum, now)
if err != nil {
logger.Error(err, "update HPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
logger.Error(err, "update VPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
return ctrl.Result{}, err
}

tortoise, err = r.TortoiseService.UpdateResourceRequest(ctx, tortoise, currentDesiredReplicaNum, now)
_, tortoise, err = r.HpaService.UpdateHPAFromTortoiseRecommendation(ctx, tortoise, now)
if err != nil {
logger.Error(err, "update VPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
logger.Error(err, "update HPA based on the recommendation in tortoise", "tortoise", req.NamespacedName)
return ctrl.Result{}, err
}

Expand Down
33 changes: 24 additions & 9 deletions pkg/hpa/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet
recommendMax = c.maximumMaxReplica
}

oldMax := hpa.Spec.MaxReplicas
hpa.Spec.MaxReplicas = recommendMax

recommendMin, err := GetReplicasRecommendation(tortoise.Status.Recommendations.Horizontal.MinReplicas, now)
Expand Down Expand Up @@ -443,19 +444,33 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet
minToActuallyApply = recommendMin
}

oldMin := *hpa.Spec.MinReplicas
hpa.Spec.MinReplicas = &minToActuallyApply
if tortoise.Spec.UpdateMode != autoscalingv1beta3.UpdateModeOff && recordMetrics {
// We don't want to record applied* metric when UpdateMode is Off.
netChangeMaxReplicas := float64(hpa.Spec.MaxReplicas - recommendMax)
netChangeMinReplicas := float64(*hpa.Spec.MinReplicas) - float64(recommendMin)
if netChangeMaxReplicas > 0 || netChangeMinReplicas < 0 {
metrics.IncreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
}
if netChangeMaxReplicas < 0 || netChangeMinReplicas > 0 {
metrics.DecreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
netChangeMaxReplicas := float64(recommendMax - oldMax)
netChangeMinReplicas := float64(recommendMin - oldMin)
cpu := float64(0)
mem := float64(0)
for _, r := range tortoise.Status.Conditions.ContainerResourceRequests {
for resourcename, value := range r.Resource {
if resourcename == corev1.ResourceCPU {
cpu += value.AsApproximateFloat64()
}
if resourcename == corev1.ResourceMemory {
mem += value.AsApproximateFloat64()
}
}
}
metrics.NetHPAMinReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Name).Set(netChangeMinReplicas)
metrics.NetHPAMaxReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Name).Set(netChangeMaxReplicas)
netChangeMaxReplicasCpu := netChangeMaxReplicas * cpu
netChangeMinReplicasCpu := netChangeMinReplicas * cpu
netChangeMinReplicasMem := netChangeMinReplicas * mem
netChangeMaxReplicasMem := netChangeMaxReplicas * mem

metrics.NetHPAMinReplicasCPUCores.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMinReplicasCpu)
metrics.NetHPAMaxReplicasCPUCores.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMaxReplicasCpu)
metrics.NetHPAMinReplicasMemory.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMinReplicasMem)
metrics.NetHPAMaxReplicasMemory.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(netChangeMaxReplicasMem)
metrics.AppliedHPAMinReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(float64(*hpa.Spec.MinReplicas))
metrics.AppliedHPAMaxReplicas.WithLabelValues(tortoise.Name, tortoise.Namespace, hpa.Name).Set(float64(hpa.Spec.MaxReplicas))
}
Expand Down
46 changes: 23 additions & 23 deletions pkg/metrics/metrics.go
Copy link
Collaborator

@sanposhiho sanposhiho Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate about the use case a bit more? I mean, how would those metrics benefit you? because, for example, net_hpa_minreplicas_cpu_cores shows the difference of CPU allocation with an assumption that HPA always keeps the replicas at minReplicas, which occasionally happens. For another example, net_hpa_maxreplicas_cpu_cores shows the difference of CPU allocation _with an assumption that HPA always keeps the replicas at maxReplicas, which should never happens.
I'm not sure how worthwhile those values would be.

In the first place though, why are you trying to measure them within tortoise? Why not just directly checking the allocated CPU or memory on each service that adopts tortoise if you just want to see the cost reduction?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I do acknowledge the first point on the assumptions as this was something I brought up as well. However, it is difficult to calculate accurate cost savings from tortoise as compared to a service purely using HPA. Directly checking allocated CPU or Memory does not tell whether tortoise is actually helping to reduce costs or not. Therefore we need to show cost savings based on decisions made by tortoise such as changing max/min replicas. I do agree its not an accurate amount especially on the max replica side but with regards to min replica, it should only decrease when there is significant underutilization, in which case i do think it can be used for calculating cost savings by tortoise. Do you think we should remove net max replica changes?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is difficult to calculate accurate cost savings from tortoise as compared to a service purely using HPA

I mean, how would net_hpa_minreplicas_cpu_cores help then? Again, it's a super rare situation that HPA always keeps the replicas at minReplicas, and hence if you see -2 cores in net_hpa_minreplicas_cpu_cores, in most cases, that doesn't mean tortoise makes the cost saving of 2 cores. I'm not sure how this "2 cores" help you understand the cost saving.

Copy link
Collaborator

@sanposhiho sanposhiho Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this "net" value is simply old value - new value, right? Then, except very first reconciliation, old value is also the value calculated from tortoise. It's not the value that the service owner used within their HPA before adopting Tortoise.

Another thing, tortoise dynamically changes the minReplicas based on the time.
https://github.com/mercari/tortoise/blob/main/docs/horizontal.md#minreplicas
So, the graph of net_hpa_minreplicas_cpu_cores would just show increasing values during the time towards the peak time, and decreasing values during the time towards the off-peak time. Tortoise's principal is not just putting as low as possible value on minReplicas, but is putting the value to be a safe guard as this section describes.
How would those values benefit you for the cost saving calculation?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh my bad i thought that recommendation algo also made changes to hpa min/max replica based on utilization but looking at the code it seems it does not.. then it doesnt make sense to have cost saving metrics for min/max replica since its more about reliability it seems.. i will revert the changes on min/max replicas

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i was somehow under the impression that if utilization was low and min replica was at maybe 10, tortoise would decrease min replica then but thats not the case.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if utilization was low and min replica was at maybe 10, tortoise would decrease min replica then but thats not the case.

You could be right depending on the scenario.
If last week's same day's same time's replica is 20 and the utilization is low today, then tortoise keeps minReplica at 10 (1/2 * lastweek) because most likely something weird is happening now (e.g., the upstream service like gateway is down etc) and it might be dangerous to reduce minReplicas.
Although let's say, then, next week, it's again 10 with a low utilization. It means probably this is a new trend that this service gets smaller traffic now (e.g., one upstream service stopped calling this service, Mercari got unpopular somehow etc).
Tortoise checks the last week's value and it's 10. So, it changes minReplicas to 5 (again 1/2 * lastweek).

So, that's how tortoise deals with the scenario like too high minReplicas makes the service low utilized.

Original file line number Diff line number Diff line change
Expand Up @@ -46,25 +46,25 @@ var (
Help: "memory request (byte) that tortoises actually applys",
}, []string{"tortoise_name", "namespace", "container_name", "controller_name", "controller_kind"})

DecreaseApplyCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "decrease_apply_counter",
Help: "counter for number of resource decreases applied by tortoise",
}, []string{"tortoise_name", "namespace"})

IncreaseApplyCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "increase_apply_counter",
Help: "counter for number of resource increases applied by tortoise",
}, []string{"tortoise_name", "namespace"})

NetHPAMinReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_minreplicas",
Help: "net hpa minReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name", "kube_deployment"})

NetHPAMaxReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_maxreplicas",
Help: "net hpa maxReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name", "kube_deployment"})
NetHPAMinReplicasCPUCores = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_minreplicas_cpu_cores",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does "net" mean?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

overall change i.e. from 5 cores to 3 cores the change is -2 cores

Help: "net cpu cores changed by minReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetHPAMinReplicasMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_minreplicas_memory",
Help: "net memory changed by minReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetHPAMaxReplicasCPUCores = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_maxreplicas_cpu_cores",
Help: "net cpu cores changed by maxReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetHPAMaxReplicasMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_hpa_maxreplicas_memory",
Help: "net memory changed by maxReplicas that tortoises actually applys to hpa",
}, []string{"tortoise_name", "namespace", "hpa_name"})

NetCPURequest = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "net_cpu_request",
Expand Down Expand Up @@ -117,10 +117,10 @@ func init() {
AppliedHPAMinReplicas,
AppliedCPURequest,
AppliedMemoryRequest,
IncreaseApplyCounter,
DecreaseApplyCounter,
NetHPAMaxReplicas,
NetHPAMinReplicas,
NetHPAMinReplicasCPUCores,
NetHPAMinReplicasMemory,
NetHPAMaxReplicasCPUCores,
NetHPAMaxReplicasMemory,
NetCPURequest,
NetMemoryRequest,
ProposedHPATargetUtilization,
Expand Down
10 changes: 2 additions & 8 deletions pkg/tortoise/tortoise.go
Original file line number Diff line number Diff line change
Expand Up @@ -768,21 +768,15 @@ func (c *Service) UpdateResourceRequest(ctx context.Context, tortoise *v1beta3.T
// only record metrics once in every reconcile loop.
for resourcename, value := range r.Resource {
oldRequest := oldRequestMap[r.ContainerName][resourcename]
netChange := float64(oldRequest.MilliValue() - value.MilliValue())
netChange := float64(oldRequest.MilliValue()-value.MilliValue()) * float64(replica)
if resourcename == corev1.ResourceCPU {
// We don't want to record applied* metric when UpdateMode is Off.
metrics.AppliedCPURequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(float64(value.MilliValue()))
metrics.NetCPURequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(netChange)
}
if resourcename == corev1.ResourceMemory {
metrics.AppliedMemoryRequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(float64(value.Value()))
metrics.NetMemoryRequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(float64(netChange))
}
if netChange > 0 {
metrics.IncreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
}
if netChange < 0 {
metrics.DecreaseApplyCounter.WithLabelValues(tortoise.Name, tortoise.Namespace).Add(1)
metrics.NetMemoryRequest.WithLabelValues(tortoise.Name, tortoise.Namespace, r.ContainerName, tortoise.Spec.TargetRefs.ScaleTargetRef.Name, tortoise.Spec.TargetRefs.ScaleTargetRef.Kind).Set(netChange / float64(1000))
}
}
}
Expand Down
Loading