Skip to content

Commit

Permalink
Merge #134654
Browse files Browse the repository at this point in the history
134654: upgrade: corrupt descriptor repair does not handle GC timestamp errors r=fqazi a=fqazi

When detecting corrupt descriptors during an upgrade, we use an AOST query. While this works effectively in the production environment, in test environments, the batch timestamp can be too far in the past. This problem occurs because certain tests
(declarative_schema_changer/job-compatibility-mixed-version-V242-V243) intentionally use short garbage collection TTL's, which can cause this logic to fail. To address this, this patch disables the AOST when a BatchTimestampBeforeGCError occurs.

Fixes: #134024

Release note: None

Co-authored-by: Faizan Qazi <[email protected]>
  • Loading branch information
craig[bot] and fqazi committed Nov 8, 2024
2 parents 8b4514f + 48a8446 commit 1c51918
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
1 change: 1 addition & 0 deletions pkg/upgrade/upgrades/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ go_library(
"//pkg/keys",
"//pkg/keyvisualizer/keyvisjob",
"//pkg/kv",
"//pkg/kv/kvpb",
"//pkg/roachpb",
"//pkg/security/username",
"//pkg/settings",
Expand Down
29 changes: 21 additions & 8 deletions pkg/upgrade/upgrades/first_upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkeys"
Expand Down Expand Up @@ -156,15 +157,27 @@ func FirstUpgradeFromReleasePrecondition(
// without an AOST clause henceforth.
withAOST := firstUpgradePreconditionUsesAOST
diagnose := func(tbl redact.SafeString) (hasRows bool, err error) {
q := fmt.Sprintf("SELECT count(*) FROM \"\".crdb_internal.%s", tbl)
if withAOST {
q = q + " AS OF SYSTEM TIME '-10s'"
}
row, err := d.InternalExecutor.QueryRow(ctx, redact.Sprintf("query-%s", tbl), nil /* txn */, q)
if err == nil && row[0].String() != "0" {
hasRows = true
withAOST := withAOST
for {
q := fmt.Sprintf("SELECT count(*) FROM \"\".crdb_internal.%s", tbl)
if withAOST {
q = q + " AS OF SYSTEM TIME '-10s'"
}
row, err := d.InternalExecutor.QueryRow(ctx, redact.Sprintf("query-%s", tbl), nil /* txn */, q)
if err == nil && row[0].String() != "0" {
hasRows = true
}
// In tests like "declarative_schema_changer/job-compatibility-mixed-version", its
// possible to hit BatchTimestampBeforeGCError, because the GC interval is
// set to a second. If we ever see BatchTimestampBeforeGCError re-run without
// AOST.
if withAOST && errors.HasType(err, &kvpb.BatchTimestampBeforeGCError{}) {
// Retry with the AOST removed.
withAOST = false
continue
}
return hasRows, err
}
return hasRows, err
}
// Check for possibility of time travel.
if hasRows, err := diagnose("databases"); err != nil {
Expand Down

0 comments on commit 1c51918

Please sign in to comment.