From fb11848a9e3513914562de544aad27f78fae6aae Mon Sep 17 00:00:00 2001
From: "Grot (@grafanabot)" <43478413+grafanabot@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:50:50 +0100
Subject: [PATCH] ruler: increase retries backoff limit to 1m (#10403) (#10404)

* ruler: increase retries backoff limit to 1m

the previous limit of 2s is too small and doesn't end up spreading out
retries for long enough

Signed-off-by: Dimitar Dimitrov

* Update CHANGELOG.md entry

Signed-off-by: Dimitar Dimitrov

---------

Signed-off-by: Dimitar Dimitrov
(cherry picked from commit 8bedb97dd18aa7b12863caa4c413b52c81cbc8ec)
Co-authored-by: Dimitar Dimitrov
---
 CHANGELOG.md               | 2 +-
 pkg/ruler/remotequerier.go | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88065da5c61..7b8560d19f6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@
 * [CHANGE] Distributor: OTLP and push handler replace all non-UTF8 characters with the unicode replacement character `\uFFFD` in error messages before propagating them. #10236
 * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256
 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220
-* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375
+* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375 #10403
 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103
 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168
 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145
diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go
index 4b0a4d7b1d7..b9bd51101dd 100644
--- a/pkg/ruler/remotequerier.go
+++ b/pkg/ruler/remotequerier.go
@@ -363,8 +363,9 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
 		return nil, fmt.Errorf("couldn't reserve a retry token")
 	}
 	// We want to wait at least the time for the backoff, but also don't want to exceed the rate limit.
-	// All of this is capped to the max backoff, so that we are less likely to overrun into the next evaluation.
-	retryDelay := max(retry.NextDelay(), min(retryConfig.MaxBackoff, retryReservation.Delay()))
+	// All of this is capped to 1m, so that we are less likely to overrun into the next evaluation.
+	// 1m was selected as giving enough time to spread out the retries.
+	retryDelay := max(retry.NextDelay(), min(time.Minute, retryReservation.Delay()))
 	level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err, "retry_delay", retryDelay)
 	select {
 	case <-time.After(retryDelay):
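
For illustration only, not part of the patch: a minimal, self-contained Go sketch of the delay computation introduced above. It assumes golang.org/x/time/rate as a stand-in for the ruler's internal retry rate limiter and a fixed value in place of retry.NextDelay(); the retry waits at least the exponential backoff, honors the rate limiter's reservation delay, and caps the rate-limit wait at one minute so a retry is less likely to overrun into the next rule evaluation.

// Sketch under assumptions stated above; requires Go 1.21+ for the builtin min/max.
package main

import (
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

func main() {
	// Stand-in for the ruler's retry rate limiter; 170 retries/sec matches the
	// cap mentioned in the CHANGELOG entry (-ruler.query-frontend.max-retries-rate).
	limiter := rate.NewLimiter(rate.Limit(170), 170)

	// Stand-in for retry.NextDelay() from the ruler's exponential backoff (assumed value).
	backoffDelay := 500 * time.Millisecond

	reservation := limiter.Reserve()
	if !reservation.OK() {
		fmt.Println("couldn't reserve a retry token")
		return
	}

	// Wait at least the backoff, but don't exceed the rate limit; the rate-limit
	// wait is capped to 1m so the retry is less likely to spill into the next evaluation.
	retryDelay := max(backoffDelay, min(time.Minute, reservation.Delay()))
	fmt.Println("retry delay:", retryDelay)
}

The only behavioral change in the patch is the cap: time.Minute replaces retryConfig.MaxBackoff, whose previous 2s value was, per the commit message, too short to spread retries out.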