-
Notifications
You must be signed in to change notification settings - Fork 703
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Trigger manual failover on SIGTERM / shutdown to cluster primary #1091
base: unstable
Are you sure you want to change the base?
Changes from 4 commits
6ab8888
4b49f03
f9ca731
df0ef8d
594fd5a
519eb2a
32043dd
e7b33fa
d6649e5
64831c9
b06a8c4
5f7b429
e56a360
0ccc4e4
c9bfd69
c8037a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1233,25 +1233,30 @@ void clusterInitLast(void) { | |
|
||
/* Called when a cluster node receives SHUTDOWN. */ | ||
void clusterHandleServerShutdown(void) { | ||
if (server.auto_failover_on_shutdown) { | ||
if (nodeIsPrimary(myself) && server.auto_failover_on_shutdown) { | ||
/* Find the first best replica, that is, the replica with the largest offset. */ | ||
client *best_replica = NULL; | ||
listIter replicas_iter; | ||
listNode *replicas_list_node; | ||
listRewind(server.replicas, &replicas_iter); | ||
while ((replicas_list_node = listNext(&replicas_iter)) != NULL) { | ||
client *replica = listNodeValue(replicas_list_node); | ||
/* This is done only when the replica offset is caught up, to avoid data loss */ | ||
if (replica->repl_state == REPLICA_STATE_ONLINE && replica->repl_ack_off == server.primary_repl_offset) { | ||
/* This is done only when the replica offset is caught up, to avoid data loss. | ||
* And 0x800ff is 8.0.255, we only support new versions for this feature. */ | ||
if (replica->repl_data->repl_state == REPLICA_STATE_ONLINE && | ||
// replica->repl_data->replica_version > 0x800ff && | ||
replica->name && sdslen(replica->name->ptr) == CLUSTER_NAMELEN && | ||
replica->repl_data->repl_ack_off == server.primary_repl_offset) { | ||
best_replica = replica; | ||
break; | ||
} | ||
} | ||
|
||
if (best_replica) { | ||
/* Send a CLUSTER FAILOVER FORCE to the best replica. */ | ||
const char *buf = "*3\r\n$7\r\nCLUSTER\r\n$8\r\nFAILOVER\r\n$5\r\nFORCE\r\n"; | ||
if (connWrite(best_replica->conn, buf, strlen(buf)) == (int)strlen(buf)) { | ||
char buf[128]; | ||
size_t buflen = snprintf(buf, sizeof(buf), "*5\r\n$7\r\nCLUSTER\r\n$8\r\nFAILOVER\r\n$5\r\nFORCE\r\n$9\r\nreplicaid\r\n$%d\r\n%s\r\n", CLUSTER_NAMELEN, (char *)best_replica->name->ptr); | ||
enjoy-binbin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (connWrite(best_replica->conn, buf, buflen) == (int)strlen(buf)) { | ||
serverLog(LL_NOTICE, "Sending CLUSTER FAILOVER FORCE to replica %s succeeded.", | ||
replicationGetReplicaName(best_replica)); | ||
} else { | ||
|
@@ -4821,8 +4826,9 @@ void clusterHandleReplicaFailover(void) { | |
if (server.cluster->mf_end) { | ||
server.cluster->failover_auth_time = now; | ||
server.cluster->failover_auth_rank = 0; | ||
server.cluster->failover_auth_count++; | ||
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); | ||
/* Reset auth_age since it is outdated now and we can bypass the auth_timeout | ||
* check in the next state and start the election ASAP. */ | ||
auth_age = 0; | ||
} | ||
serverLog(LL_NOTICE, | ||
"Start of election delayed for %lld milliseconds " | ||
|
@@ -7026,32 +7032,46 @@ int clusterCommandSpecial(client *c) { | |
} else { | ||
addReplyLongLong(c, clusterNodeFailureReportsCount(n)); | ||
} | ||
} else if (!strcasecmp(c->argv[1]->ptr, "failover") && (c->argc == 2 || c->argc == 3)) { | ||
/* CLUSTER FAILOVER [FORCE|TAKEOVER] */ | ||
} else if (!strcasecmp(c->argv[1]->ptr, "failover") && (c->argc >= 2)) { | ||
/* CLUSTER FAILOVER [FORCE|TAKEOVER] [replicaid <node id>] */ | ||
zuiderkwast marked this conversation as resolved.
Show resolved
Hide resolved
|
||
int force = 0, takeover = 0; | ||
robj *replicaid = NULL; | ||
|
||
if (c->argc == 3) { | ||
if (!strcasecmp(c->argv[2]->ptr, "force")) { | ||
for (int j = 2; j < c->argc; j++) { | ||
int moreargs = (c->argc - 1) - j; | ||
if (!strcasecmp(c->argv[j]->ptr, "force")) { | ||
force = 1; | ||
} else if (!strcasecmp(c->argv[2]->ptr, "takeover")) { | ||
} else if (!strcasecmp(c->argv[j]->ptr, "takeover")) { | ||
takeover = 1; | ||
force = 1; /* Takeover also implies force. */ | ||
} else if (!strcasecmp(c->argv[j]->ptr, "replicaid") && moreargs) { | ||
j++; | ||
replicaid = c->argv[j]; | ||
} else { | ||
addReplyErrorObject(c, shared.syntaxerr); | ||
return 1; | ||
} | ||
} | ||
|
||
/* Check if it should be executed by myself. */ | ||
if (replicaid != NULL) { | ||
clusterNode *n = clusterLookupNode(replicaid->ptr, sdslen(replicaid->ptr)); | ||
if (n != myself) { | ||
enjoy-binbin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/* Ignore this command, including the sanity check and the process. */ | ||
addReply(c, shared.ok); | ||
return 1; | ||
} | ||
} | ||
|
||
/* Check preconditions. */ | ||
if (clusterNodeIsPrimary(myself)) { | ||
addReplyError(c, "You should send CLUSTER FAILOVER to a replica"); | ||
if (replicaid == NULL) addReplyError(c, "You should send CLUSTER FAILOVER to a replica"); | ||
return 1; | ||
} else if (myself->replicaof == NULL) { | ||
addReplyError(c, "I'm a replica but my master is unknown to me"); | ||
if (replicaid == NULL) addReplyError(c, "I'm a replica but my master is unknown to me"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why add this? If the primary is unknown, the failover can't succeed, so I think we need to return an error even if REPLICAID is sent. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was wondering if there would be some races, where a replica returns an error to the primary. Or maybe we should always return OK if replicaid is passed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Replicas don't send the replies to primaries. The only problem is the config to panic on replication errors. But I can't see any races. Can you? We can return OK if you want. Maybe we should check that There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since we will write it to the backlog, I am worried that after some down and up, the psync will get the command and return an error. Though I haven't verified it specifically. |
||
return 1; | ||
} else if (!force && (nodeFailed(myself->replicaof) || myself->replicaof->link == NULL)) { | ||
addReplyError(c, "Master is down or failed, " | ||
"please use CLUSTER FAILOVER FORCE"); | ||
if (replicaid == NULL) addReplyError(c, "Master is down or failed, please use CLUSTER FAILOVER FORCE"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why add this? I think it should be an error even with REPLICAID. |
||
return 1; | ||
} | ||
resetManualFailover(); | ||
|
@@ -7075,7 +7095,7 @@ int clusterCommandSpecial(client *c) { | |
} else { | ||
serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client); | ||
} | ||
server.cluster->mf_can_start = 1; | ||
manualFailoverCanStart(); | ||
/* We can start a manual failover as soon as possible, setting a flag | ||
* here so that we don't need to waiting for the cron to kick in. */ | ||
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Forgot to uncomment the check?
In the comment, maybe we shall not say "new" because in a few years, this will not be new anymore.
Why not check `>= 0x80100` instead of `> 0x800ff`? It's the same but maybe easier to read?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took this from
just an easy way that I can test locally.