Skip to content

Commit

Permalink
roachtest: fix network/authentication/nodes=4
Browse files Browse the repository at this point in the history
For reasons, the test starts a three node cluster, then immediately
stops it, then restarts n1 and {n2,n3} separately. It was previously
restarting n1 first, followed by `{n2,n3}`. Due to recent change
PR #138109, this no longer works since n1 doesn't have quorum and
so won't signal SQL readiness.

There's an ongoing discussion on whether this new behavior is desired,
but either way, this PR changes the test to restart {n2,n3} first (which
does have quorum, assuming we wait for full replication first, which we
now do as well), followed by n1.

Closes #138806.

Epic: none
Release note: None
  • Loading branch information
tbg committed Jan 10, 2025
1 parent 2d06980 commit a0206e0
Showing 1 changed file with 20 additions and 13 deletions.
33 changes: 20 additions & 13 deletions pkg/cmd/roachtest/tests/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
errors "github.com/cockroachdb/errors"
"github.com/cockroachdb/errors"
_ "github.com/lib/pq" // register postgres driver
"github.com/stretchr/testify/require"
)
Expand All @@ -50,8 +50,13 @@ func runNetworkAuthentication(ctx context.Context, t test.Test, c cluster.Cluste
// that they use coherent certs.
settings := install.MakeClusterSettings()

// Don't create a backup schedule as this test shuts the cluster down immediately.
c.Start(ctx, t.L(), option.NewStartOpts(option.NoBackupSchedule), settings, serverNodes)
// Don't create a backup schedule in this test as the cluster won't be up
// long and we'll inject network issues.
// We wait for replication so that we can safely restart the cluster in two
// steps next.
c.Start(
ctx, t.L(), option.NewStartOpts(option.NoBackupSchedule, option.WaitForReplication()), settings, serverNodes,
)
require.NoError(t, c.StopE(ctx, t.L(), option.DefaultStopOpts(), serverNodes))

t.L().Printf("restarting nodes...")
Expand All @@ -66,16 +71,18 @@ func runNetworkAuthentication(ctx context.Context, t test.Test, c cluster.Cluste
// Currently, creating a scheduled backup at start fails, potentially due to
// the induced network partition. Further investigation required to allow scheduled backups
// to run on this test.
startOpts := option.NewStartOpts(option.NoBackupSchedule)
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--locality=node=1", "--accept-sql-without-tls")
c.Start(ctx, t.L(), startOpts, settings, c.Node(1))

// See comment above about env vars.
// "--env=COCKROACH_SCAN_INTERVAL=200ms",
// "--env=COCKROACH_SCAN_MAX_IDLE_TIME=20ms",
startOpts = option.NewStartOpts(option.NoBackupSchedule)
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--locality=node=other", "--accept-sql-without-tls")
c.Start(ctx, t.L(), startOpts, settings, c.Range(2, n-1))
{
// We start n2+ first so that there's quorum.
startOpts := option.NewStartOpts(option.NoBackupSchedule)
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--locality=node=other", "--accept-sql-without-tls")
c.Start(ctx, t.L(), startOpts, settings, c.Range(2, n-1))
}
{
// Now start n1.
startOpts := option.NewStartOpts(option.NoBackupSchedule)
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--locality=node=1", "--accept-sql-without-tls")
c.Start(ctx, t.L(), startOpts, settings, c.Node(1))
}

t.L().Printf("retrieving server addresses...")
serverUrls, err := c.InternalPGUrl(ctx, t.L(), serverNodes, roachprod.PGURLOptions{Auth: install.AuthUserPassword})
Expand Down

0 comments on commit a0206e0

Please sign in to comment.