From fb11848a9e3513914562de544aad27f78fae6aae Mon Sep 17 00:00:00 2001
From: "Grot (@grafanabot)" <43478413+grafanabot@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:50:50 +0100
Subject: [PATCH] ruler: increase retries backoff limit to 1m (#10403) (#10404)

* ruler: increase retries backoff limit to 1m

the previous limit of 2s is too small and doesn't end up spreading out
retries for long enough

Signed-off-by: Dimitar Dimitrov

* Update CHANGELOG.md entry

Signed-off-by: Dimitar Dimitrov

---------

Signed-off-by: Dimitar Dimitrov
(cherry picked from commit 8bedb97dd18aa7b12863caa4c413b52c81cbc8ec)
Co-authored-by: Dimitar Dimitrov
---
 CHANGELOG.md               | 2 +-
 pkg/ruler/remotequerier.go | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88065da5c61..7b8560d19f6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@
 * [CHANGE] Distributor: OTLP and push handler replace all non-UTF8 characters with the unicode replacement character `\uFFFD` in error messages before propagating them. #10236
 * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256
 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220
-* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375
+* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375 #10403
 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103
 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168
 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145
diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go
index 4b0a4d7b1d7..b9bd51101dd 100644
--- a/pkg/ruler/remotequerier.go
+++ b/pkg/ruler/remotequerier.go
@@ -363,8 +363,9 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
 		return nil, fmt.Errorf("couldn't reserve a retry token")
 	}
 	// We want to wait at least the time for the backoff, but also don't want to exceed the rate limit.
-	// All of this is capped to the max backoff, so that we are less likely to overrun into the next evaluation.
-	retryDelay := max(retry.NextDelay(), min(retryConfig.MaxBackoff, retryReservation.Delay()))
+	// All of this is capped to 1m, so that we are less likely to overrun into the next evaluation.
+	// 1m was selected as giving enough time to spread out the retries.
+	retryDelay := max(retry.NextDelay(), min(time.Minute, retryReservation.Delay()))
 	level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err, "retry_delay", retryDelay)
 	select {
 	case <-time.After(retryDelay):
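
For illustration only, not part of the patch: a minimal, self-contained Go sketch of the delay computation introduced above. It assumes golang.org/x/time/rate as a stand-in for the ruler's internal retry rate limiter and a fixed value in place of retry.NextDelay(); the retry waits at least the exponential backoff, honors the rate limiter's reservation delay, and caps the rate-limit wait at one minute so a retry is less likely to overrun into the next rule evaluation.

// Sketch under assumptions stated above; requires Go 1.21+ for the builtin min/max.
package main

import (
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

func main() {
	// Stand-in for the ruler's retry rate limiter; 170 retries/sec matches the
	// cap mentioned in the CHANGELOG entry (-ruler.query-frontend.max-retries-rate).
	limiter := rate.NewLimiter(rate.Limit(170), 170)

	// Stand-in for retry.NextDelay() from the ruler's exponential backoff (assumed value).
	backoffDelay := 500 * time.Millisecond

	reservation := limiter.Reserve()
	if !reservation.OK() {
		fmt.Println("couldn't reserve a retry token")
		return
	}

	// Wait at least the backoff, but don't exceed the rate limit; the rate-limit
	// wait is capped to 1m so the retry is less likely to spill into the next evaluation.
	retryDelay := max(backoffDelay, min(time.Minute, reservation.Delay()))
	fmt.Println("retry delay:", retryDelay)
}

The only behavioral change in the patch is the cap: time.Minute replaces retryConfig.MaxBackoff, whose previous 2s value was, per the commit message, too short to spread retries out.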