valkey-io · enjoy-binbin · Sep 30, 2024 · Sep 30, 2024 · Oct 6, 2024 · Oct 6, 2024
diff --git a/src/cluster.h b/src/cluster.h
@@ -7,7 +7,6 @@
 
 #define CLUSTER_SLOT_MASK_BITS 14                                   /* Number of bits used for slot id. */
 #define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS)                 /* Total number of slots in cluster mode, which is 16384. */
-#define CLUSTER_SLOT_MASK ((unsigned long long)(CLUSTER_SLOTS - 1)) /* Bit mask for slot id stored in LSB. */
 #define CLUSTER_OK 0                                                /* Everything looks ok */
 #define CLUSTER_FAIL 1                                              /* The cluster can't work */
 #define CLUSTER_NAMELEN 40                                          /* sha1 hex length */

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
@@ -1233,6 +1233,40 @@ void clusterInitLast(void) {
 
 /* Called when a cluster node receives SHUTDOWN. */
 void clusterHandleServerShutdown(void) {
+    if (nodeIsPrimary(myself) && server.auto_failover_on_shutdown) {
+        /* Find the first best replica, that is, the replica with the largest offset. */
+        client *best_replica = NULL;
+        listIter replicas_iter;
+        listNode *replicas_list_node;
+        listRewind(server.replicas, &replicas_iter);
+        while ((replicas_list_node = listNext(&replicas_iter)) != NULL) {
+            client *replica = listNodeValue(replicas_list_node);
+            /* This is done only when the replica offset is caught up, to avoid data loss.
+             * And 0x800ff is 8.0.255, we only support new versions for this feature. */
+            if (replica->repl_data->repl_state == REPLICA_STATE_ONLINE &&
+                // replica->repl_data->replica_version > 0x800ff &&
+                replica->name && sdslen(replica->name->ptr) == CLUSTER_NAMELEN &&
+                replica->repl_data->repl_ack_off == server.primary_repl_offset) {
+                best_replica = replica;
+                break;
+            }
+        }
+
+        if (best_replica) {
+            /* Send a CLUSTER FAILOVER FORCE to the best replica. */
+            char buf[128];
+            size_t buflen = snprintf(buf, sizeof(buf), "*5\r\n$7\r\nCLUSTER\r\n$8\r\nFAILOVER\r\n$5\r\nFORCE\r\n$9\r\nreplicaid\r\n$%d\r\n%s\r\n", CLUSTER_NAMELEN, (char *)best_replica->name->ptr);
+            if (connWrite(best_replica->conn, buf, buflen) == (int)strlen(buf)) {
+                serverLog(LL_NOTICE, "Sending CLUSTER FAILOVER FORCE to replica %s succeeded.",
+                          replicationGetReplicaName(best_replica));
+            } else {
+                serverLog(LL_WARNING, "Failed to send CLUSTER FAILOVER FORCE to replica: %s", strerror(errno));
+            }
+        } else {
+            serverLog(LL_NOTICE, "Unable to find a replica to perform an auto failover on shutdown.");
+        }
+    }
+
     /* The error logs have been logged in the save function if the save fails. */
     serverLog(LL_NOTICE, "Saving the cluster configuration file before exiting.");
     clusterSaveConfig(1);
@@ -6998,32 +7032,46 @@ int clusterCommandSpecial(client *c) {
         } else {
             addReplyLongLong(c, clusterNodeFailureReportsCount(n));
         }
-    } else if (!strcasecmp(c->argv[1]->ptr, "failover") && (c->argc == 2 || c->argc == 3)) {
-        /* CLUSTER FAILOVER [FORCE|TAKEOVER] */
+    } else if (!strcasecmp(c->argv[1]->ptr, "failover") && (c->argc >= 2)) {
+        /* CLUSTER FAILOVER [FORCE|TAKEOVER] [replicaid <node id>] */
         int force = 0, takeover = 0;
+        robj *replicaid = NULL;
 
-        if (c->argc == 3) {
-            if (!strcasecmp(c->argv[2]->ptr, "force")) {
+        for (int j = 2; j < c->argc; j++) {
+            int moreargs = (c->argc - 1) - j;
+            if (!strcasecmp(c->argv[j]->ptr, "force")) {
                 force = 1;
-            } else if (!strcasecmp(c->argv[2]->ptr, "takeover")) {
+            } else if (!strcasecmp(c->argv[j]->ptr, "takeover")) {
                 takeover = 1;
                 force = 1; /* Takeover also implies force. */
+            } else if (!strcasecmp(c->argv[j]->ptr, "replicaid") && moreargs) {
+                j++;
+                replicaid = c->argv[j];
             } else {
                 addReplyErrorObject(c, shared.syntaxerr);
                 return 1;
             }
         }
 
+        /* Check if it should be executed by myself. */
+        if (replicaid != NULL) {
+            clusterNode *n = clusterLookupNode(replicaid->ptr, sdslen(replicaid->ptr));
+            if (n != myself) {
+                /* Ignore this command, including the sanity check and the process. */
+                addReply(c, shared.ok);
+                return 1;
+            }
+        }
+
         /* Check preconditions. */
         if (clusterNodeIsPrimary(myself)) {
-            addReplyError(c, "You should send CLUSTER FAILOVER to a replica");
+            if (replicaid == NULL) addReplyError(c, "You should send CLUSTER FAILOVER to a replica");
             return 1;
         } else if (myself->replicaof == NULL) {
-            addReplyError(c, "I'm a replica but my master is unknown to me");
+            if (replicaid == NULL) addReplyError(c, "I'm a replica but my master is unknown to me");
             return 1;
         } else if (!force && (nodeFailed(myself->replicaof) || myself->replicaof->link == NULL)) {
-            addReplyError(c, "Master is down or failed, "
-                             "please use CLUSTER FAILOVER FORCE");
+            if (replicaid == NULL) addReplyError(c, "Master is down or failed, please use CLUSTER FAILOVER FORCE");
             return 1;
         }
         resetManualFailover();
@@ -7042,7 +7090,11 @@ int clusterCommandSpecial(client *c) {
             /* If this is a forced failover, we don't need to talk with our
              * primary to agree about the offset. We just failover taking over
              * it without coordination. */
-            serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client);
+            if (c == server.primary) {
+                serverLog(LL_NOTICE, "Forced failover primary request accepted (primary request from '%s').", client);
+            } else {
+                serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client);
+            }
             manualFailoverCanStart();
             /* We can start a manual failover as soon as possible, setting a flag
              * here so that we don't need to waiting for the cron to kick in. */

diff --git a/src/config.c b/src/config.c
@@ -3188,6 +3188,7 @@ standardConfig static_configs[] = {
     createBoolConfig("cluster-slot-stats-enabled", NULL, MODIFIABLE_CONFIG, server.cluster_slot_stats_enabled, 0, NULL, NULL),
     createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL),
     createBoolConfig("import-mode", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL),
+    createBoolConfig("auto-failover-on-shutdown", NULL, MODIFIABLE_CONFIG, server.auto_failover_on_shutdown, 0, NULL, NULL),
 
     /* String Configs */
     createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL),

diff --git a/src/replication.c b/src/replication.c
@@ -3594,6 +3594,14 @@ void syncWithPrimary(connection *conn) {
         err = sendCommand(conn, "REPLCONF", "version", VALKEY_VERSION, NULL);
         if (err) goto write_error;
 
+        /* Inform the primary of our (replica) node name. */
+        if (server.cluster_enabled) {
+            char *argv[] = {"CLIENT", "SETNAME", server.cluster->myself->name};
+            size_t lens[] = {6, 7, CLUSTER_NAMELEN};
+            err = sendCommandArgv(conn, 3, argv, lens);
+            if (err) goto write_error;
+        }
+
         server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY;
         return;
     }
@@ -3684,6 +3692,24 @@ void syncWithPrimary(connection *conn) {
         }
         sdsfree(err);
         err = NULL;
+        if (server.cluster_enabled) {
+            server.repl_state = REPL_STATE_RECEIVE_SETNAME_REPLY;
+            return;
+        } else {
+            server.repl_state = REPL_STATE_SEND_PSYNC;
+        }
+    }
+
+    /* Receive CLIENT SETNAME reply. */
+    if (server.repl_state == REPL_STATE_RECEIVE_SETNAME_REPLY) {
+        err = receiveSynchronousResponse(conn);
+        if (err == NULL) goto no_response_error;
+        /* Ignore the error if any. 8.1 introduced this logic and we don't care if it failed. */
+        if (err[0] == '-') {
+            serverLog(LL_NOTICE, "(Non critical) Primary does not understand CLIENT SETNAME: %s", err);
+        }
+        sdsfree(err);
+        err = NULL;
         server.repl_state = REPL_STATE_SEND_PSYNC;
     }
 

diff --git a/src/server.h b/src/server.h
@@ -396,6 +396,7 @@ typedef enum {
     REPL_STATE_RECEIVE_IP_REPLY,      /* Wait for REPLCONF reply */
     REPL_STATE_RECEIVE_CAPA_REPLY,    /* Wait for REPLCONF reply */
     REPL_STATE_RECEIVE_VERSION_REPLY, /* Wait for REPLCONF reply */
+    REPL_STATE_RECEIVE_SETNAME_REPLY, /* Wait for CLIENT SETNAME reply */
     REPL_STATE_SEND_PSYNC,            /* Send PSYNC */
     REPL_STATE_RECEIVE_PSYNC_REPLY,   /* Wait for PSYNC reply */
     /* --- End of handshake states --- */
@@ -2067,6 +2068,7 @@ struct valkeyServer {
     unsigned long cluster_blacklist_ttl;                   /* Duration in seconds that a node is denied re-entry into
                                                             * the cluster after it is forgotten with CLUSTER FORGET. */
     int cluster_slot_stats_enabled;                        /* Cluster slot usage statistics tracking enabled. */
+    int auto_failover_on_shutdown;                         /* Trigger manual failover on shutdown to primary. */
     /* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. */
     uint32_t debug_cluster_close_link_on_packet_drop : 1;
     /* Debug config to control the random ping. When set, we will disable the random ping in clusterCron. */

diff --git a/tests/support/util.tcl b/tests/support/util.tcl
@@ -126,6 +126,43 @@ proc wait_replica_online r {
     }
 }
 
+# Return 1 when INFO REPLICATION on $primary lists the replica at
+# replica_ip:replica_port with an offset equal to master_repl_offset,
+# and 0 otherwise (including when the replica is not listed at all).
+proc check_replica_acked_ofs {primary replica_ip replica_port} {
+    set infostr [$primary info replication]
+    set master_repl_offset [getInfoProperty $infostr master_repl_offset]
+    # Use -line rather than -lineanchor: without -linestop "." also matches
+    # newlines, so a greedy ".*" could run on into a later "slaveN:" line and
+    # capture another replica's offset.
+    set pattern "^slave\\d+:ip=$replica_ip,port=$replica_port,.*,offset=(\\d+)"
+    if {![regexp -line $pattern $infostr _ offset]} {
+        return 0
+    }
+    return [expr {$master_repl_offset == $offset}]
+}
+
+# Wait (via wait_for_condition 1000 50) until check_replica_acked_ofs reports
+# a match for the given replica, failing the test otherwise. While waiting,
+# the primary's repl-ping-replica-period is raised to 3600 and the replica's
+# hz to 500; both are put back to the values they had on entry.
+proc wait_replica_acked_ofs {primary replica replica_ip replica_port} {
+    set old_ping_period [lindex [$primary config get repl-ping-replica-period] 1]
+    set old_hz [lindex [$replica config get hz] 1]
+    $primary config set repl-ping-replica-period 3600
+    $replica config set hz 500
+    wait_for_condition 1000 50 {
+        [check_replica_acked_ofs $primary $replica_ip $replica_port] eq 1
+    } else {
+        puts "INFO REPLICATION: [$primary info replication]"
+        fail "replica acked offset didn't match in time"
+    }
+    # Restore the caller's settings instead of hard-coding 10, which would
+    # silently undo a value the test had configured on purpose.
+    $primary config set repl-ping-replica-period $old_ping_period
+    $replica config set hz $old_hz
+}
+
 proc wait_for_ofs_sync {r1 r2} {
     wait_for_condition 50 100 {
         [status $r1 master_repl_offset] eq [status $r2 master_repl_offset]

diff --git a/tests/unit/cluster/auto-failover-on-shutdown.tcl b/tests/unit/cluster/auto-failover-on-shutdown.tcl
@@ -0,0 +1,93 @@
+# Stop server $srv_id either with SHUTDOWN NOSAVE or by sending it SIGTERM.
+proc shutdown_how {srv_id how} {
+    switch -exact -- $how {
+        shutdown { catch {R $srv_id shutdown nosave} }
+        sigterm  { exec kill -SIGTERM [s -$srv_id process_id] }
+    }
+}
+
+# Runs against a cluster of 3 primaries and 4 replicas in which primary 1 has 2 replicas.
+# Replica 1 is paused and primary 1 is then shut down, so that replica 2 is expected to
+# become the new primary.
+proc test_main {how shutdown_timeout} {
+    test "auto-failover-on-shutdown will always pick a best replica and send CLUSTER FAILOVER - $how - shutdown-timeout: $shutdown_timeout" {
+        set primary [srv 0 client]
+        set replica1 [srv -3 client]
+        set replica1_pid [s -3 process_id]
+        set replica2 [srv -6 client]
+        set replica2_ip [srv -6 host]
+        set replica2_port [srv -6 port]
+
+        $primary config set auto-failover-on-shutdown yes
+        $primary config set shutdown-timeout $shutdown_timeout
+        $primary config set repl-ping-replica-period 3600
+
+        # Keep a failover from kicking in on its own.
+        $replica2 config set cluster-replica-no-failover yes
+
+        # Pause a replica so it has no chance to catch up with the offset.
+        pause_process $replica1_pid
+
+        # Write some data on the primary to increase the offset.
+        for {set i 0} {$i < 10} {incr i} {
+            $primary incr key_991803
+        }
+
+        if {$shutdown_timeout == 0} {
+            # Wait for replica2 to catch up with the offset.
+            wait_for_ofs_sync $primary $replica2
+            wait_replica_acked_ofs $primary $replica2 $replica2_ip $replica2_port
+        } else {
+            # If shutdown-timeout is enabled, we expect the primary to pause writes
+            # and wait for the replica to catch up with the offset.
+        }
+
+        # Shut down the primary.
+        shutdown_how 0 $how
+
+        # Wait for replica2 to become a primary.
+        wait_for_condition 1000 50 {
+            [s -6 role] eq {master}
+        } else {
+            puts "s -6 role: [s -6 role]"
+            fail "Failover does not happened"
+        }
+
+        # Make sure that the expected logs are printed.
+        verify_log_message 0 "*Sending CLUSTER FAILOVER FORCE to replica*" 0
+        verify_log_message -6 "*Forced failover primary request accepted*" 0
+
+        resume_process $replica1_pid
+    }
+
+    test "Unable to find a replica to perform an auto failover - $how" {
+        set primary [srv -6 client]
+        set replica1 [srv -3 client]
+        set replica1_pid [s -3 process_id]
+
+        pause_process $replica1_pid
+
+        $primary config set auto-failover-on-shutdown yes
+        $primary client kill type replica
+        shutdown_how 6 $how
+        wait_for_log_messages -6 {"*Unable to find a replica to perform an auto failover on shutdown*"} 0 1000 10
+
+        resume_process $replica1_pid
+    }
+}
+
+start_cluster 3 4 {tags {external:skip cluster}} {
+    test_main "shutdown" 0
+}
+
+start_cluster 3 4 {tags {external:skip cluster}} {
+    test_main "sigterm" 0
+}
+
+start_cluster 3 4 {tags {external:skip cluster}} {
+    test_main "shutdown" 10
+}
+
+start_cluster 3 4 {tags {external:skip cluster}} {
+    test_main "sigterm" 10
+}
diff --git a/valkey.conf b/valkey.conf
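@@ -1683,6 +1683,13 @@ aof-timestamp-enabled no
 # shutdown-on-sigint default
 # shutdown-on-sigterm default
 
+# When enabled, a primary in cluster mode that is shutting down looks for an
+# online replica whose acknowledged replication offset equals its own and
+# sends it CLUSTER FAILOVER FORCE so that the replica takes over. If no such
+# replica is found, a notice is logged and the shutdown proceeds as usual.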
+#
+# auto-failover-on-shutdown no
+
 ################ NON-DETERMINISTIC LONG BLOCKING COMMANDS #####################
 
 # Maximum time in milliseconds for EVAL scripts, functions and in some cases