cli: fully and properly drain target node of decommission
With this patch, at the end of decommissioning, we call the drain step as we
would for `./cockroach node drain`:

```
[...]
.....
id	is_live	replicas	is_decommissioning	membership	is_draining	readiness	blocking_ranges
1	true	2	true	decommissioning	false	ready	0
.....
id	is_live	replicas	is_decommissioning	membership	is_draining	readiness	blocking_ranges
1	true	1	true	decommissioning	false	ready	0
......
id	is_live	replicas	is_decommissioning	membership	is_draining	readiness	blocking_ranges
1	true	0	true	decommissioning	false	ready	0
draining node n2
node is draining... remaining: 26
node is draining... remaining: 0 (complete)
node n2 drained successfully

No more data reported on target nodes. Please verify cluster health before removing the nodes.
```

In particular, note how the first drain invocation reports a RemainingIndicator of
26: before this patch we had merely initiated draining, but it had not fully completed.
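
To make "fully completed" concrete, here is a minimal sketch of draining to
completion at the RPC level, built from the `serverpb.DrainRequest` shape visible
in the diff below. The `serverpb.AdminClient` type name and the
`DrainRemainingIndicator` response field are assumed here rather than taken
verbatim from the API; the real CLI does this via its `doDrain` helper (see the
new code below), presumably along these lines:

```go
// Package drainsketch holds a minimal sketch, not the actual CLI implementation.
package drainsketch

import (
	"context"
	"io"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

// drainUntilComplete repeatedly issues Drain requests until the server reports
// no remaining work. A single round can close its stream while work is still
// outstanding (the "remaining: 26" above), so we loop.
// NOTE: DrainRemainingIndicator is an assumed field name based on the
// RemainingIndicator mentioned in the text.
func drainUntilComplete(ctx context.Context, c serverpb.AdminClient, nodeID string) error {
	for {
		stream, err := c.Drain(ctx, &serverpb.DrainRequest{
			NodeId:   nodeID,
			DoDrain:  true,
			Shutdown: false, // drain only; do not shut the process down
		})
		if err != nil {
			return err
		}
		var remaining uint64
		for {
			resp, err := stream.Recv()
			if err == io.EOF {
				break // this drain round is done
			}
			if err != nil {
				return err
			}
			remaining = resp.DrainRemainingIndicator
		}
		if remaining == 0 {
			return nil // nothing left to drain
		}
	}
}
```

The key point is the outer loop: a single drain round can end while work is still
outstanding, so we keep issuing rounds until the indicator reaches zero.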

I thought for a while that this could explain cockroachdb#140774, i.e. that cockroachdb#138732 was
insufficient as it did not guarantee that the node had actually drained fully
by the time it was marked as fully decommissioned and the `node decommission`
had returned. But I found that fully draining did not fix the test, and
ultimately tracked the issue down to a test infra problem. Still, this PR is
a good change that brings the drain experience during decommission on par with
that of the standalone CLI.

See cockroachdb#140774.

I verified that the modified decommission/drains roachtest passes via

```
./pkg/cmd/roachtest/roachstress.sh -l -c 1 decommission/drains/alive
```

Touches cockroachdb#140774.
Touches cockroachdb#139411.
Touches cockroachdb#139413.

PR cockroachdb#138732 already fixed most of the drain issues, but because the
decommissioning process still went ahead and shut the node out of the
cluster, SQL connections that drain was still waiting for would likely hit
errors: once the gateway node had been flipped to fully decommissioned, it
could no longer connect to the rest of the cluster. So there's a new release
note for the improvement in this PR, which avoids that.
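
To make the ordering explicit: drain now runs to completion before the node is
flipped to fully decommissioned and shut out of the cluster. Below is a
simplified, self-contained sketch of that per-node flow; every helper name is an
illustrative stand-in, not one of the real functions in
`cli.runDecommissionNodeImpl`:

```go
// Simplified sketch of the ordering this change enforces; all helpers are
// illustrative stand-ins, not real CLI functions.
package main

import "fmt"

func waitForZeroReplicas(node int)     {}               // pre-existing step: replicas are moved off
func drainFully(node int) error        { return nil }   // new in this PR: drain runs to completion
func markFullyDecommissioned(node int) {}               // only after draining is the node shut out

func decommissionNodes(targets []int) {
	for _, n := range targets {
		waitForZeroReplicas(n)
		if err := drainFully(n); err != nil {
			// Drain failures are tolerated so the command stays robust even if
			// the target node was terminated right before we tried to drain it.
			fmt.Printf("drain step for node n%d failed; decommissioning anyway\n", n)
		}
		markFullyDecommissioned(n)
	}
}

func main() {
	decommissionNodes([]int{2})
}
```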

Release note (bug fix): previously, draining a node as part of
decommissioning could interrupt SQL connections that were still active
during the drain (and for which the drain would have been expected to
wait).
Epic: None
tbg committed Feb 13, 2025
1 parent 004b3d4 commit 5319f88
Showing 2 changed files with 48 additions and 29 deletions.
41 changes: 17 additions & 24 deletions pkg/cli/node.go
```diff
@@ -8,7 +8,6 @@ package cli
 import (
 	"context"
 	"fmt"
-	"io"
 	"math"
 	"os"
 	"reflect"
@@ -610,29 +609,23 @@ func runDecommissionNodeImpl(
 			)
 			continue
 		}
-		drainReq := &serverpb.DrainRequest{
-			Shutdown: false,
-			DoDrain:  true,
-			NodeId:   targetNode.String(),
-		}
-		stream, err := c.Drain(ctx, drainReq)
-		if err != nil {
-			fmt.Fprintln(stderr)
-			return errors.Wrapf(err, "while trying to drain n%d", targetNode)
-		}
-
-		// Consume responses until the stream ends (which signals drain
-		// completion).
-		for {
-			_, err := stream.Recv()
-			if err == io.EOF {
-				// Stream gracefully closed by other side.
-				break
-			}
-			if err != nil {
-				fmt.Fprintln(stderr)
-				return errors.Wrapf(err, "while trying to drain n%d", targetNode)
-			}
+		_, _ = fmt.Fprintf(stderr, "draining node n%d\n", targetNode)
+
+		if _, _, err := doDrain(ctx, c, targetNode.String()); err != nil {
+			// NB: doDrain already prints to stdErr.
+			//
+			// Defense in depth: in decommission invocations that don't have to
+			// do much work, if the target node was _just_ shutdown prior to
+			// starting `node decommission`, the node may be absent but the liveness
+			// status sent us here anyway. We don't want to fail out on the drain
+			// step to make the decommissioning command more robust.
+			_, _ = fmt.Fprintf(stderr,
+				"drain step for node n%d failed; decommissioning anyway\n", targetNode,
+			)
+			_ = err // discard intentionally
+		} else {
+			// NB: this output is matched on in the decommission/drains roachtest.
+			_, _ = fmt.Fprintf(stderr, "node n%d drained successfully\n", targetNode)
 		}
 	}
 
```
36 changes: 31 additions & 5 deletions pkg/cmd/roachtest/tests/decommission.go
```diff
@@ -1056,7 +1056,7 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster,
 	}
 	t.Status(fmt.Sprintf("decommissioning node %d", decommNodeID))
 	e := retry.WithMaxAttempts(ctx, retryOpts, maxAttempts, func() error {
-		o, err := h.decommission(ctx, decommNode, pinnedNodeID, "--wait=none", "--format=csv")
+		o, e, err := h.decommissionExt(ctx, decommNode, pinnedNodeID, "--wait=none", "--format=csv")
 		require.NoError(t, errors.Wrapf(err, "decommission failed"))
 
 		// Check if all the replicas have been transferred.
@@ -1072,6 +1072,15 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster,
 			return nil
 		}
 
+		// Check that the cli printed the below message. We do this because the drain
+		// step does not error out the decommission command on failure (to make that
+		// command more resilient to situations in which a node is terminated right
+		// around the time we try to drain it).
+		// This message is emitted in `cli.runDecommissionNodeImpl` and has a
+		// back-referencing comment mentioning that this roachtest matches on
+		// it.
+		require.Contains(t, e, "drained successfully")
+
 		// Check to see if the node has been drained or decommissioned.
 		// If not, queries should not fail.
 		// Connect to node 4 (the target node of the decommission).
@@ -1235,11 +1244,21 @@ func (h *decommTestHelper) getLogicalNodeID(ctx context.Context, nodeIdx int) (i
 	return nodeID, nil
 }
 
-// decommission decommissions the given targetNodes, running the process
-// through the specified runNode.
 func (h *decommTestHelper) decommission(
 	ctx context.Context, targetNodes option.NodeListOption, runNode int, verbs ...string,
 ) (string, error) {
+	o, _, err := h.decommissionExt(ctx, targetNodes, runNode, verbs...)
+	return o, err
+}
+
+// decommission decommissions the given targetNodes, running the process
+// through the specified runNode.
+// Returns stdout, stderr, error.
+// Stdout has the tabular decommission process, stderr contains informational
+// updates.
+func (h *decommTestHelper) decommissionExt(
+	ctx context.Context, targetNodes option.NodeListOption, runNode int, verbs ...string,
+) (string, string, error) {
 	args := []string{"node", "decommission"}
 	args = append(args, verbs...)
 
@@ -1250,7 +1269,7 @@ func (h *decommTestHelper) decommission(
 			args = append(args, strconv.Itoa(target))
 		}
 	}
-	return execCLI(ctx, h.t, h.c, runNode, args...)
+	return execCLIExt(ctx, h.t, h.c, runNode, args...)
 }
 
 // recommission recommissions the given targetNodes, running the process
@@ -1479,13 +1498,20 @@ func (h *decommTestHelper) getRandNodeOtherThan(ids ...int) int {
 func execCLI(
 	ctx context.Context, t test.Test, c cluster.Cluster, runNode int, extraArgs ...string,
 ) (string, error) {
+	out, _, err := execCLIExt(ctx, t, c, runNode, extraArgs...)
+	return out, err
+}
+
+func execCLIExt(
+	ctx context.Context, t test.Test, c cluster.Cluster, runNode int, extraArgs ...string,
+) (string, string, error) {
 	args := []string{"./cockroach"}
 	args = append(args, extraArgs...)
 	args = append(args, fmt.Sprintf("--port={pgport:%d}", runNode))
 	args = append(args, fmt.Sprintf("--certs-dir=%s", install.CockroachNodeCertsDir))
 	result, err := c.RunWithDetailsSingleNode(ctx, t.L(), option.WithNodes(c.Node(runNode)), args...)
 	t.L().Printf("%s\n", result.Stdout)
-	return result.Stdout, err
+	return result.Stdout, result.Stderr, err
 }
 
 // Increase the logging verbosity for decommission tests to make life easier
```
