From b2f152695812fcf16065f966b557603bbe164a30 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Mon, 14 Oct 2019 15:03:08 +0500 Subject: [PATCH] Fix for miscellaneous watchdog issues. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The commit takes care of the following reports in watchdog -- pgpool-general: 6672 Query (watchdog split brain) -- 0000547: We need to do arping again after recovering from split-brain. Basically, in a bid to solve these issues, the commit makes the below-mentioned behavioral changes in watchdog 1-- If life-check reports a watchdog node is dead but the watchdog core is still able to connect and communicate with it without any problem, the watchdog core was getting into the dilemma of whether to consider it as a lost or an alive node. Fix: With this commit, the lost nodes reported by life-check (external or internal) are treated as hard failures even when the node is reachable from the watchdog core, and that node can only become alive again if it is restarted, or if life-check informs that the node has become alive again. The second type of node failure that can mark the watchdog node as lost occurs when a watchdog node fails to respond to the messages sent to it or frequent errors occur on its socket. These errors are detected by the watchdog core itself and get recovered automatically whenever that node becomes responsive again. Apart from this, the commit also makes some improvements in the area of detecting such internal errors. 2-- Standby nodes were too aggressive in reacting to a coup attempt (when another watchdog node tries to become a master while a valid master already exists in the cluster) or to the possibility of split-brain (when it receives an "I AM MASTER" message from a node that is not the master as per the standby node's record). In both these situations, the standby nodes used to re-join the master node in the hope of finding the true master. 
But that didn't prove to be a very good strategy, since it may influence the selection of the true master node when the true-master and fake-master nodes get into a fight to retain the master node status. One of the things the true and fake masters compare is the number of connected standby nodes, and if, during the fight, the standby leaves the current master to re-join the cluster, the standby node count on the true master becomes lower than actual, and that affects its chances of winning the election. Fix: The commit makes the standby nodes more laid-back: they actively reject the nodes that are trying to become a master in the presence of the true master, and avoid re-joining the cluster until it is absolutely necessary. 3-- The third problem was in the case of network partitioning or partial life-check failure (when node A thinks node B is lost but node B thinks node A is not lost). In this case the kicked-out standby node was too aggressive in trying to connect to the master or become a master itself, which potentially put an unnecessary burden on the network and the cluster nodes. Fix: The fix for this issue is to make the isolated node a bit calmer and wait between attempts to connect to the master or become a master. For that purpose, a new WD_NETWORK_ISOLATION state is added to the state machine, which just adds a delay between successive tries of becoming a master. The node can only go into this state when it learns that it is marked as lost on the current master, and it gets out of it when it gets an indication that life-check on the remote node has reported that the node is alive again, or otherwise after 10 seconds anyway. Finally, because creating partial or complete network failure scenarios is very difficult, the commit also adds a watchdog debug aid, similar to the health-check debug, to simulate different scenarios. 
To enable the watchdog-debug aid the Pgpool-II needs to be compiled with  WATCHDOG_DEBUG=1" flag ($make WATCHDOG_DEBUG=1) Once compiled with debug aid enabled, you can put the command in pgpool_logdir->watchdog_debug_requests file to test different scenarios. e.g. $ echo "KILL_ALL_COMMUNICATION" > logdir/watchdog_debug_requests $ echo "KILL_ALL_SENDERS" >> logdir/watchdog_debug_requests The current list of commands that are supported by watchdog debug aid are DO_NOT_REPLY_TO_BEACON <= Standby node stops replying master node beacon messages             while this line is present in the watchdog_debug_requests file DO_NOT_SEND_BEACON <= Master node stops sending beacon messages to standby nodes             while this line is present in the watchdog_debug_requests file KILL_ALL_COMMUNICATION <= Watchdog stops all communication with all nodes             while this line is present in the watchdog_debug_requests file KILL_ALL_RECEIVERS  <= Watchdog ignores messages of all node             while this line is present in the watchdog_debug_requests file KILL_ALL_SENDERS  <= Watchdog stops sending messages to all nodes             while this line is present in the watchdog_debug_requests file --- src/include/watchdog/watchdog.h | 31 +- src/test/regression/regress.sh | 2 +- src/watchdog/Makefile.am | 3 +- src/watchdog/Makefile.in | 3 +- src/watchdog/watchdog.c | 768 +++++++++++++++++++++++++++----- 5 files changed, 698 insertions(+), 109 deletions(-) diff --git a/src/include/watchdog/watchdog.h b/src/include/watchdog/watchdog.h index 1146a4aec..83f6651b2 100644 --- a/src/include/watchdog/watchdog.h +++ b/src/include/watchdog/watchdog.h @@ -79,7 +79,8 @@ typedef enum WD_IN_NW_TROUBLE, /* the following states are only valid on remote nodes */ WD_SHUTDOWN, - WD_ADD_MESSAGE_SENT + WD_ADD_MESSAGE_SENT, + WD_NETWORK_ISOLATION } WD_STATES; typedef enum @@ -114,9 +115,21 @@ typedef enum WD_EVENT_NODE_CON_LOST, WD_EVENT_NODE_CON_FOUND, WD_EVENT_CLUSTER_QUORUM_CHANGED, - 
WD_EVENT_WD_STATE_REQUIRE_RELOAD + WD_EVENT_WD_STATE_REQUIRE_RELOAD, + WD_EVENT_I_AM_APPEARING_LOST, + WD_EVENT_I_AM_APPEARING_FOUND } WD_EVENTS; +typedef enum { + NODE_LOST_UNKNOWN_REASON, + NODE_LOST_BY_LIFECHECK, + NODE_LOST_BY_SEND_FAILURE, + NODE_LOST_BY_MISSING_BEACON, + NODE_LOST_BY_RECEIVE_TIMEOUT, + NODE_LOST_BY_NOT_REACHABLE, + NODE_LOST_SHUTDOWN +} WD_NODE_LOST_REASONS; + typedef struct SocketConnection { int sock; /* socket descriptor */ @@ -135,6 +148,20 @@ typedef struct WatchdogNode * from the node */ struct timeval last_sent_time; /* timestamp when last packet was sent on * the node */ + bool has_lost_us; /* + * True when this remote node thinks + * we are lost + */ + int sending_failures_count; /* number of times we have failed + * to send message to the node. + * Gets reset after successfull sent + */ + int missed_beacon_count; /* number of times the node has + * failed to reply for beacon. + * message + */ + WD_NODE_LOST_REASONS node_lost_reason; + char pgp_version[MAX_VERSION_STR_LEN]; /* Pgpool-II version */ int wd_data_major_version; /* watchdog messaging version major*/ int wd_data_minor_version; /* watchdog messaging version minor*/ diff --git a/src/test/regression/regress.sh b/src/test/regression/regress.sh index a891c2ae4..37efee029 100755 --- a/src/test/regression/regress.sh +++ b/src/test/regression/regress.sh @@ -38,7 +38,7 @@ function install_pgpool test -d $log || mkdir $log - make install HEALTHCHECK_DEBUG=1 -C $dir/../../ -e prefix=${PGPOOL_PATH} >& regression.log 2>&1 + make install HEALTHCHECK_DEBUG=1 WATCHDOG_DEBUG=1 -C $dir/../../ -e prefix=${PGPOOL_PATH} >& regression.log 2>&1 if [ $? != 0 ];then echo "make install failed" diff --git a/src/watchdog/Makefile.am b/src/watchdog/Makefile.am index bb4c2204c..0400459f2 100644 --- a/src/watchdog/Makefile.am +++ b/src/watchdog/Makefile.am @@ -1,6 +1,6 @@ top_builddir = ../.. 
AM_CPPFLAGS = -D_GNU_SOURCE -I @PGSQL_INCLUDE_DIR@ - +WATCHDOG_DEBUG=0 noinst_LIBRARIES = lib-watchdog.a lib_watchdog_a_SOURCES = \ @@ -16,3 +16,4 @@ lib_watchdog_a_SOURCES = \ wd_utils.c \ wd_escalation.c +DEFS = @DEFS@ -DWATCHDOG_DEBUG_OPTS=$(WATCHDOG_DEBUG) diff --git a/src/watchdog/Makefile.in b/src/watchdog/Makefile.in index 891d6058b..103dd3189 100644 --- a/src/watchdog/Makefile.in +++ b/src/watchdog/Makefile.in @@ -189,7 +189,7 @@ COLLATEINDEX = @COLLATEINDEX@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ +DEFS = @DEFS@ -DWATCHDOG_DEBUG_OPTS=$(WATCHDOG_DEBUG) DLLTOOL = @DLLTOOL@ DOCBOOKSTYLE = @DOCBOOKSTYLE@ DSYMUTIL = @DSYMUTIL@ @@ -312,6 +312,7 @@ top_build_prefix = @top_build_prefix@ top_builddir = ../.. top_srcdir = @top_srcdir@ AM_CPPFLAGS = -D_GNU_SOURCE -I @PGSQL_INCLUDE_DIR@ +WATCHDOG_DEBUG = 0 noinst_LIBRARIES = lib-watchdog.a lib_watchdog_a_SOURCES = \ watchdog.c \ diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c index c240717da..625660ec6 100644 --- a/src/watchdog/watchdog.c +++ b/src/watchdog/watchdog.c @@ -89,10 +89,23 @@ typedef enum IPC_CMD_PREOCESS_RES #define MAX_SECS_WAIT_FOR_REPLY_FROM_NODE 5 /* time in seconds to wait for * the reply from remote * watchdog node */ + +#define MAX_ALLOWED_SEND_FAILURES 3 /* number of times sending message failure + * can be tolerated + */ +#define MAX_ALLOWED_BEACON_REPLY_MISS 3 /* number of times missing beacon message reply + * can be tolerated + */ + + #define FAILOVER_COMMAND_FINISH_TIMEOUT 15 /* timeout in seconds to wait * for Pgpool-II to build * consensus for failover */ +#define MIN_SECS_BETWEEN_BROADCAST_SRV_MSG 5 /* minimum amount of seconds to wait + * before broadcasting the same cluster + * service message */ + #define WD_NO_MESSAGE 0 #define WD_ADD_NODE_MESSAGE 'A' @@ -129,6 +142,8 @@ typedef enum IPC_CMD_PREOCESS_RES #define CLUSTER_IAM_RESIGNING_FROM_MASTER 'R' #define CLUSTER_NODE_INVALID_VERSION 'V' #define CLUSTER_NODE_REQUIRE_TO_RELOAD 'I' 
+#define CLUSTER_NODE_APPEARING_LOST 'Y' +#define CLUSTER_NODE_APPEARING_FOUND 'Z' #define WD_MASTER_NODE getMasterWatchdogNode() @@ -199,7 +214,8 @@ char *wd_event_name[] = "NODE CONNECTION LOST", "NODE CONNECTION FOUND", "CLUSTER QUORUM STATUS CHANGED", - "NODE REQUIRE TO RELOAD STATE" + "NODE REQUIRE TO RELOAD STATE", + "I AM APPEARING LOST" }; char *wd_state_names[] = { @@ -214,7 +230,18 @@ char *wd_state_names[] = { "LOST", "IN NETWORK TROUBLE", "SHUTDOWN", - "ADD MESSAGE SENT" + "ADD MESSAGE SENT", + "NETWORK ISOLATION" +}; + +char *wd_node_lost_reasons[] = { + "UNKNOWN REASON", + "REPORTED BY LIFECHECK", + "SEND MESSAGE FAILURES", + "MISSING BEACON REPLIES", + "RECEIVE TIMEOUT", + "NOT REACHABLE", + "SHUTDOWN" }; typedef struct WDPacketData @@ -329,6 +356,10 @@ typedef struct wd_cluster bool clusterInitialized; bool ipc_auth_needed; int current_failover_id; + struct timeval last_bcast_srv_msg_time; /* timestamp when last packet was + * boradcasted by the local node */ + char last_bcast_srv_msg; + List *unidentified_socks; List *notify_clients; List *ipc_command_socks; @@ -353,6 +384,22 @@ typedef struct WDFailoverObject int state; } WDFailoverObject; +#ifdef WATCHDOG_DEBUG_OPTS +#if WATCHDOG_DEBUG_OPTS > 0 +#define WATCHDOG_DEBUG +#endif +#endif + +static bool check_debug_request_do_not_send_beacon(void); +static bool check_debug_request_do_not_reply_beacon(void); +static bool check_debug_request_kill_all_communication(void); +static bool check_debug_request_kill_all_receivers(void); +static bool check_debug_request_kill_all_senders(void); + + +#ifdef WATCHDOG_DEBUG +static void load_watchdog_debug_test_option(void); +#endif static void process_remote_failover_command_on_coordinator(WatchdogNode * wdNode, WDPacketData * pkt); static WDFailoverObject * get_failover_object(POOL_REQUEST_KIND reqKind, int nodesCount, int *nodeList); @@ -422,7 +469,7 @@ static void service_internal_command(void); static unsigned int get_next_commandID(void); static WatchdogNode * 
parse_node_info_message(WDPacketData * pkt, char **authkey); static void update_quorum_status(void); -static int get_mimimum_remote_nodes_required_for_quorum(void); +static int get_minimum_remote_nodes_required_for_quorum(void); static int get_minimum_votes_to_resolve_consensus(void); static bool write_packet_to_socket(int sock, WDPacketData * pkt, bool ipcPacket); @@ -462,6 +509,7 @@ static int watchdog_state_machine_joining(WD_EVENTS event, WatchdogNode * wdNode static int watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand); static int watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand); static int watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand); +static int watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand); static int I_am_master_and_cluser_in_split_brain(WatchdogNode * otherMasterNode); static void handle_split_brain(WatchdogNode * otherMasterNode, WDPacketData * pkt); @@ -530,6 +578,7 @@ static void set_cluster_master_node(WatchdogNode * wdNode); static void clear_standby_nodes_list(void); static int standby_node_left_cluster(WatchdogNode * wdNode); static int standby_node_join_cluster(WatchdogNode * wdNode); +static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear); /* global variables */ wd_cluster g_cluster; @@ -1171,6 +1220,9 @@ watchdog_main(void) g_timeout_sec = 0; } } +#ifdef WATCHDOG_DEBUG + load_watchdog_debug_test_option(); +#endif if (select_ret > 0) { int processed_fds = 0; @@ -1410,10 +1462,14 @@ read_sockets(fd_set *rmask, int pending_fds_count) if (pkt) { - watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL); - /* since a packet is received reset last sent time */ - wdNode->last_sent_time.tv_sec = 0; - 
wdNode->last_sent_time.tv_usec = 0; + if (check_debug_request_kill_all_communication() == false && + check_debug_request_kill_all_receivers() == false) + { + watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL); + /* since a packet is received reset last sent time */ + wdNode->last_sent_time.tv_sec = 0; + wdNode->last_sent_time.tv_usec = 0; + } free_packet(pkt); } else @@ -1437,10 +1493,14 @@ read_sockets(fd_set *rmask, int pending_fds_count) if (pkt) { - watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL); - /* since a packet is received reset last sent time */ - wdNode->last_sent_time.tv_sec = 0; - wdNode->last_sent_time.tv_usec = 0; + if (check_debug_request_kill_all_communication() == false && + check_debug_request_kill_all_receivers() == false) + { + watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL); + /* since a packet is received reset last sent time */ + wdNode->last_sent_time.tv_sec = 0; + wdNode->last_sent_time.tv_usec = 0; + } free_packet(pkt); } else @@ -1470,6 +1530,7 @@ read_sockets(fd_set *rmask, int pending_fds_count) pkt = read_packet_of_type(conn, WD_ADD_NODE_MESSAGE); if (pkt) { + struct timeval previous_startup_time; char *authkey = NULL; WatchdogNode *tempNode = parse_node_info_message(pkt, &authkey); @@ -1486,6 +1547,7 @@ read_sockets(fd_set *rmask, int pending_fds_count) /* verify this node */ if (authenticated) { + WD_STATES oldNodeState = WD_DEAD; for (i = 0; i < g_cluster.remoteNodeCount; i++) { wdNode = &(g_cluster.remoteNodes[i]); @@ -1495,6 +1557,9 @@ read_sockets(fd_set *rmask, int pending_fds_count) { /* We have found the match */ found = true; + previous_startup_time.tv_sec = wdNode->startup_time.tv_sec; + oldNodeState = wdNode->state; + close_socket_connection(&wdNode->server_socket); strlcpy(wdNode->delegate_ip, tempNode->delegate_ip, WD_MAX_HOST_NAMELEN); strlcpy(wdNode->nodeName, tempNode->nodeName, WD_MAX_HOST_NAMELEN); @@ -1528,7 +1593,35 @@ read_sockets(fd_set *rmask, int pending_fds_count) 
wdNode->wd_data_major_version, wdNode->wd_data_minor_version))); - watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL); + if (oldNodeState == WD_SHUTDOWN) + { + ereport(LOG, + (errmsg("The newly joined node:\"%s\" had left the cluster because it was shutdown",wdNode->nodeName))); + watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL); + + } + else if (oldNodeState == WD_LOST) + { + ereport(LOG, + (errmsg("The newly joined node:\"%s\" had left the cluster because it was lost",wdNode->nodeName), + errdetail("lost reason was \"%s\" and startup time diff = %d", + wd_node_lost_reasons[wdNode->node_lost_reason], + abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec))))); + + if (abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)) <= 2 && + wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK) + { + ereport(LOG, + (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName), + errdetail("only lifecheck process can mark this node alive again"))); + /* restore the node's lost state */ + wdNode->state = oldNodeState; + } + else + watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL); + + } + } else ereport(NOTICE, @@ -1721,6 +1814,12 @@ write_ipc_command_with_result_data(WDCommandData * ipcCommand, char type, char * (errmsg("not replying to IPC, Invalid IPC command."))); return false; } + /* DEBUG AID */ + if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE && + (check_debug_request_kill_all_senders() || + check_debug_request_kill_all_communication())) + return false; + return write_packet_to_socket(ipcCommand->sourceIPCSocket, &pkt, true); } @@ -1932,7 +2031,7 @@ static IPC_CMD_PREOCESS_RES process_IPC_get_runtime_variable_value_request(WDCom json_value_free(root); ereport(NOTICE, (errmsg("failed to process get local variable IPC command"), - errdetail("unable to parse json data"))); + errdetail("unable to parse JSON data"))); return IPC_CMD_ERROR; } @@ -2031,7 +2130,7 @@ static 
IPC_CMD_PREOCESS_RES process_IPC_nodeStatusChange_command(WDCommandData * { ereport(NOTICE, (errmsg("failed to process NODE STATE CHANGE IPC command"), - errdetail("unable to parse json data"))); + errdetail("unable to parse JSON data"))); return IPC_CMD_ERROR; } @@ -2086,7 +2185,10 @@ fire_node_status_event(int nodeID, int nodeStatus) if (wdNode == g_cluster.localNode) watchdog_state_machine(WD_EVENT_LOCAL_NODE_LOST, wdNode, NULL, NULL); else + { + wdNode->node_lost_reason = NODE_LOST_BY_LIFECHECK; watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL); + } } else if (nodeStatus == WD_LIFECHECK_NODE_STATUS_ALIVE) { @@ -2188,7 +2290,7 @@ service_expired_failovers(void) BACKEND_INFO(node_id).role == ROLE_PRIMARY) { ereport(LOG, - (errmsg("We are not able to build consensus for our primary node failover request, got %d votesonly for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID), + (errmsg("We are not able to build consensus for our primary node failover request, got %d votes only for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID), errdetail("resigning from the coordinator"))); need_to_resign = true; } @@ -2797,7 +2899,7 @@ static IPC_CMD_PREOCESS_RES process_IPC_failover_indication(WDCommandData * ipcC { ereport(LOG, (errmsg("unable to process failover indication"), - errdetail("invalid json data in command packet"))); + errdetail("invalid JSON data in command packet"))); res = FAILOVER_RES_INVALID_FUNCTION; } if (root) @@ -2808,7 +2910,7 @@ static IPC_CMD_PREOCESS_RES process_IPC_failover_indication(WDCommandData * ipcC { ereport(LOG, (errmsg("unable to process failover indication"), - errdetail("invalid json data in command packet"))); + errdetail("invalid JSON data in command packet"))); res = FAILOVER_RES_INVALID_FUNCTION; } else if (failoverState == 0) /* start */ @@ -2851,7 +2953,7 @@ failover_start_indication(WDCommandData * ipcCommand) } else if (get_local_node_state() == 
WD_STANDBY) { - /* The node might be performing the locl quarantine opetaion */ + /* The node might be performing the local quarantine operation */ ereport(DEBUG1, (errmsg("main process is starting the local quarantine operation"))); return FAILOVER_RES_PROCEED; @@ -2878,7 +2980,7 @@ failover_end_indication(WDCommandData * ipcCommand) } else if (get_local_node_state() == WD_STANDBY) { - /* The node might be performing the locl quarantine opetaion */ + /* The node might be performing the local quarantine operation */ ereport(DEBUG1, (errmsg("main process is ending the local quarantine operation"))); return FAILOVER_RES_PROCEED; @@ -3257,7 +3359,7 @@ update_successful_outgoing_cons(fd_set *wmask, int pending_fds_count) { ereport(DEBUG1, (errmsg("error in outbound connection to %s:%d ", wdNode->hostname, wdNode->wd_port), - errdetail("getsockopt faile with error \"%s\"", strerror(errno)))); + errdetail("getsockopt failed with error \"%s\"", strerror(errno)))); close_socket_connection(&wdNode->client_socket); wdNode->client_socket.sock_state = WD_SOCK_ERROR; @@ -3721,9 +3823,9 @@ cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt) else { ereport(LOG, - (errmsg("master/coordinator node \"%s\" decided to resigning from master, probably because of split-brain", + (errmsg("master/coordinator node \"%s\" decided to resign from master, probably because of split-brain", wdNode->nodeName), - errdetail("but it was not our coordinator/master anyway. ignoring the message"))); + errdetail("It was not our coordinator/master anyway. 
ignoring the message"))); } } break; @@ -3769,7 +3871,7 @@ cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt) else { ereport(LOG, - (errmsg("master/coordinator node \"%s\" decided to resigning from master, probably because of split-brain", + (errmsg("master/coordinator node \"%s\" decided to resign from master, probably because of split-brain", wdNode->nodeName), errdetail("but it was not our coordinator/master anyway. ignoring the message"))); } @@ -3782,13 +3884,34 @@ cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt) } break; + case CLUSTER_NODE_APPEARING_LOST: + { + ereport(LOG, + (errmsg("remote node \"%s\" is reporting that it has lost us", + wdNode->nodeName))); + wdNode->has_lost_us = true; + watchdog_state_machine(WD_EVENT_I_AM_APPEARING_LOST, wdNode, NULL, NULL); + } + break; + + case CLUSTER_NODE_APPEARING_FOUND: + { + ereport(LOG, + (errmsg("remote node \"%s\" is reporting that it has found us again", + wdNode->nodeName))); + wdNode->has_lost_us = false; + watchdog_state_machine(WD_EVENT_I_AM_APPEARING_FOUND, wdNode, NULL, NULL); + } + break; + case CLUSTER_NODE_INVALID_VERSION: { /* * this should never happen means something is seriously wrong */ ereport(FATAL, - (errmsg("\"%s\" node has found serious issues in our watchdog messages", + (return_code(POOL_EXIT_FATAL), + errmsg("\"%s\" node has found serious issues in our watchdog messages", wdNode->nodeName), errdetail("shutting down"))); } @@ -3853,16 +3976,6 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt) } break; - case WD_EVENT_REMOTE_NODE_FOUND: - { - ereport(LOG, - (errmsg("remote node \"%s\" became reachable again", wdNode->nodeName), - errdetail("requesting the node info"))); - send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL); - break; - } - break; - case WD_ADD_NODE_MESSAGE: case WD_REQ_INFO_MESSAGE: replyPkt = get_mynode_info_message(pkt); @@ -3962,6 +4075,33 @@ standard_packet_processor(WatchdogNode * wdNode, 
WDPacketData * pkt) { standby_node_left_cluster(wdNode); } + if (oldNodeState == WD_LOST) + { + /* + * We have received the message from lost node + * add it back to cluster if it was not marked by + * life-check + * Node lost by life-check processes can only be + * added back when we get alive notification for the + * node from life-check + */ + ereport(LOG, + (errmsg("we have received the NODE INFO message from the node:\"%s\" that was lost",wdNode->nodeName), + errdetail("we had lost this node because of \"%s\"",wd_node_lost_reasons[wdNode->node_lost_reason]))); + + if (wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK) + { + ereport(LOG, + (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName), + errdetail("only life-check process can mark this node alive again"))); + /* restore the node's lost state */ + wdNode->state = oldNodeState; + } + else + { + watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL); + } + } } break; @@ -3996,11 +4136,16 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt) send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN); } - else + else if (WD_MASTER_NODE != NULL) { replyPkt = get_mynode_info_message(pkt); beacon_message_received_from_node(wdNode, pkt); } + /* + * if (WD_MASTER_NODE == NULL) + * do not reply to beacon if we are not connected to + * any master node + */ } break; @@ -4021,6 +4166,10 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt) static bool send_message_to_connection(SocketConnection * conn, WDPacketData * pkt) { + if (check_debug_request_kill_all_communication() == true || + check_debug_request_kill_all_senders() == true) + return false; + if (conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED) { if (write_packet_to_socket(conn->sock, pkt, false) == true) @@ -4047,6 +4196,8 @@ send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt) } if (ret) { + /* reset the sending error counter */ + 
wdNode->sending_failures_count = 0; /* we only update the last sent time if reply for packet is expected */ switch (pkt->type) { @@ -4061,6 +4212,7 @@ send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt) } else { + wdNode->sending_failures_count++; ereport(DEBUG1, (errmsg("sending packet %c to node \"%s\" failed", pkt->type, wdNode->nodeName))); } @@ -4130,7 +4282,7 @@ static IPC_CMD_PREOCESS_RES wd_command_processor_for_node_lost_event(WDCommandDa if (nodeResult->cmdState == COMMAND_STATE_SENT) { ereport(LOG, - (errmsg("remote node \"%s\" lost while ipc command was in progress ", wdLostNode->nodeName))); + (errmsg("remote node \"%s\" lost while IPC command was in progress ", wdLostNode->nodeName))); /* * since the node is lost and will be removed from the @@ -4353,15 +4505,36 @@ service_unreachable_nodes(void) (errmsg("remote node \"%s\" is not replying..", wdNode->nodeName), errdetail("marking the node as lost"))); /* mark the node as lost */ + wdNode->node_lost_reason = NODE_LOST_BY_RECEIVE_TIMEOUT; watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL); } } + else if (wdNode->sending_failures_count > MAX_ALLOWED_SEND_FAILURES) + { + ereport(LOG, + (errmsg("not able to send messages to remote node \"%s\"",wdNode->nodeName), + errdetail("marking the node as lost"))); + /* mark the node as lost */ + wdNode->node_lost_reason = NODE_LOST_BY_SEND_FAILURE; + watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL); + } + else if (wdNode->missed_beacon_count > MAX_ALLOWED_BEACON_REPLY_MISS) + { + ereport(LOG, + (errmsg("remote node \"%s\" is not responding to our beacon messages",wdNode->nodeName), + errdetail("marking the node as lost"))); + /* mark the node as lost */ + wdNode->node_lost_reason = NODE_LOST_BY_MISSING_BEACON; + wdNode->missed_beacon_count = 0; /* Reset the counter */ + watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL); + } } else { ereport(LOG, (errmsg("remote node \"%s\" is not reachable", 
wdNode->nodeName), errdetail("marking the node as lost"))); + wdNode->node_lost_reason = NODE_LOST_BY_NOT_REACHABLE; watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL); } } @@ -4390,8 +4563,6 @@ watchdog_internal_command_packet_processor(WatchdogNode * wdNode, WDPacketData * for (i = 0; i < g_cluster.remoteNodeCount; i++) { WDCommandNodeResult *nodeRes = &clusterCommand->nodeResults[i]; - - clear_command_node_result(nodeRes); if (nodeRes->wdNode == wdNode) { nodeResult = nodeRes; @@ -4536,7 +4707,7 @@ issue_watchdog_internal_command(WatchdogNode * wdNode, WDPacketData * pkt, int t if (send_message_to_node(nodeResult->wdNode, pkt) == false) { ereport(DEBUG1, - (errmsg("failed to send watchdog internla command packet %s", nodeResult->wdNode->nodeName), + (errmsg("failed to send watchdog internal command packet %s", nodeResult->wdNode->nodeName), errdetail("saving the packet. will try to resend it if connection recovers"))); /* failed to send. May be try again later */ @@ -4733,6 +4904,30 @@ reply_with_minimal_message(WatchdogNode * wdNode, char type, WDPacketData * repl static bool send_cluster_service_message(WatchdogNode * wdNode, WDPacketData * replyFor, char message) { + /* Check if its a broadcast message */ + if (wdNode == NULL) + { + /* see if we have already broadcasted the similar message recently */ + if (message == g_cluster.last_bcast_srv_msg) + { + struct timeval currTime; + gettimeofday(&currTime, NULL); + int last_bcast_sec = WD_TIME_DIFF_SEC(currTime, g_cluster.last_bcast_srv_msg_time); + if (last_bcast_sec < MIN_SECS_BETWEEN_BROADCAST_SRV_MSG) + { + /* + * do not broadcast this message + * to prevent flooding + */ + ereport(DEBUG4, + (errmsg("not broadcasting cluster service message %c to prevent flooding ",message), + errdetail("last time same message was sent %d seconds ago",last_bcast_sec))); + return true; + } + } + g_cluster.last_bcast_srv_msg = message; + gettimeofday(&g_cluster.last_bcast_srv_msg_time, NULL); + } return 
reply_with_message(wdNode, WD_CLUSTER_SERVICE_MESSAGE, &message, 1, replyFor); } @@ -4933,9 +5128,6 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk if (event == WD_EVENT_REMOTE_NODE_LOST) { - /* close all socket connections to the node */ - close_socket_connection(&wdNode->client_socket); - close_socket_connection(&wdNode->server_socket); if (wdNode->state == WD_SHUTDOWN) { @@ -4947,6 +5139,8 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk wdNode->state = WD_LOST; ereport(LOG, (errmsg("remote node \"%s\" is lost", wdNode->nodeName))); + /* Inform the node, that it is lost for us */ + send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_LOST); } if (wdNode == WD_MASTER_NODE) { @@ -4955,11 +5149,29 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk set_cluster_master_node(NULL); } + /* close all socket connections to the node */ + close_socket_connection(&wdNode->client_socket); + close_socket_connection(&wdNode->server_socket); + /* clear the wait timer on the node */ wdNode->last_sent_time.tv_sec = 0; wdNode->last_sent_time.tv_usec = 0; + wdNode->sending_failures_count = 0; node_lost_while_ipc_command(wdNode); } + else if (event == WD_EVENT_REMOTE_NODE_FOUND) + { + ereport(LOG, + (errmsg("remote node \"%s\" became reachable again", wdNode->nodeName), + errdetail("requesting the node info"))); + /* + * remove the lost state from the node + * and change it to joining for now + */ + wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON; + wdNode->state = WD_LOADING; + send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_FOUND); + } else if (event == WD_EVENT_PACKET_RCV) { print_packet_node_info(pkt, wdNode, false); @@ -4974,6 +5186,7 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk if (pkt->type == WD_INFORM_I_AM_GOING_DOWN) { wdNode->state = WD_SHUTDOWN; + wdNode->node_lost_reason = NODE_LOST_SHUTDOWN; 
return watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL); } @@ -4998,7 +5211,7 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk if (any_interface_available() == false) { ereport(WARNING, - (errmsg("network event has occured and all monitored interfaces are down"), + (errmsg("network event has occurred and all monitored interfaces are down"), errdetail("changing the state to in network trouble"))); set_state(WD_IN_NW_TROUBLE); @@ -5040,7 +5253,7 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk else if (event == WD_EVENT_LOCAL_NODE_LOST) { ereport(WARNING, - (errmsg("watchdog lifecheck reported, we are disconnected from the network"), + (errmsg("watchdog life-check reported, we are disconnected from the network"), errdetail("changing the state to LOST"))); set_state(WD_LOST); } @@ -5075,6 +5288,9 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk case WD_IN_NW_TROUBLE: watchdog_state_machine_nw_error(event, wdNode, pkt, clusterCommand); break; + case WD_NETWORK_ISOLATION: + watchdog_state_machine_nw_isolation(event, wdNode, pkt, clusterCommand); + break; default: /* Should never ever happen */ ereport(WARNING, @@ -5130,7 +5346,7 @@ watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD case WD_STAND_FOR_COORDINATOR_MESSAGE: { /* - * We are loading but a note is already contesting + * We are loading but a node is already contesting * for coordinator node well we can ignore it but * then this could eventually mean a lower * priority node can became a coordinator node. 
So @@ -5387,8 +5603,10 @@ watchdog_state_machine_standForCord(WD_EVENTS event, WatchdogNode * wdNode, WDPa if (pkt->type == WD_ERROR_MESSAGE) { ereport(LOG, - (errmsg("our stand for coordinator request is rejected by node \"%s\"", wdNode->nodeName))); - set_state(WD_JOINING); + (errmsg("our stand for coordinator request is rejected by node \"%s\"",wdNode->nodeName), + errdetail("we might be in partial network isolation and cluster already have a valid master"), + errhint("please verify the watchdog life-check and network is working properly"))); + set_state(WD_NETWORK_ISOLATION); } else if (pkt->type == WD_REJECT_MESSAGE) { @@ -5493,6 +5711,7 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4); set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE); + update_missed_beacon_count(NULL,true); ereport(LOG, (errmsg("I am announcing my self as master/coordinator watchdog node"))); @@ -5565,6 +5784,8 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac else if (clusterCommand->commandPacket.type == WD_IAM_COORDINATOR_MESSAGE) { + update_missed_beacon_count(clusterCommand,false); + if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED) { ereport(DEBUG1, @@ -5688,11 +5909,49 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac case WD_EVENT_TIMEOUT: { - send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5); + if (check_debug_request_do_not_send_beacon() == false) + send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5); set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS); } break; + case WD_EVENT_I_AM_APPEARING_LOST: + { + /* The remote node has lost us, It would have already marked + * us as lost, So remove it from standby*/ + standby_node_left_cluster(wdNode); + } + break; + + case WD_EVENT_I_AM_APPEARING_FOUND: + { + /* The remote node has found us again */ + if (wdNode->wd_data_major_version >= 1 && 
wdNode->wd_data_minor_version >= 1) + { + /* + * Since data version 1.1 we support CLUSTER_NODE_REQUIRE_TO_RELOAD + * which makes the standby nodes to re-send the join master node + */ + ereport(DEBUG1, + (errmsg("asking remote node \"%s\" to rejoin master", wdNode->nodeName), + errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION))); + + send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_REQUIRE_TO_RELOAD); + } + else + { + /* + * The node is on older version + * So ask it to re-join the cluster + */ + ereport(DEBUG1, + (errmsg("asking remote node \"%s\" to rejoin cluster", wdNode->nodeName), + errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION))); + send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION); + } + } + break; + case WD_EVENT_REMOTE_NODE_LOST: { standby_node_left_cluster(wdNode); @@ -5704,6 +5963,7 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac ereport(LOG, (errmsg("remote node \"%s\" is reachable again", wdNode->nodeName), errdetail("trying to add it back as a standby"))); + wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON; /* If I am the cluster master. 
Ask for the node info and to re-send the join message */ send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL); if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1) @@ -5736,12 +5996,21 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac { switch (pkt->type) { + case WD_ADD_NODE_MESSAGE: + /* In case we received the ADD node message from + * one of our standby, Remove that standby from + * the list + */ + standby_node_left_cluster(wdNode); + standard_packet_processor(wdNode, pkt); + break; + case WD_STAND_FOR_COORDINATOR_MESSAGE: reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt); break; case WD_DECLARE_COORDINATOR_MESSAGE: ereport(NOTICE, - (errmsg("We are corrdinator and another node tried a coup"))); + (errmsg("We are coordinator and another node tried a coup"))); reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt); break; @@ -5776,14 +6045,24 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac case WD_JOIN_COORDINATOR_MESSAGE: { - reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt); - /* - * Also get the configurations from the standby - * node + * If the node is marked as lost because of + * life-check, Do not let it join the cluster */ - send_message_of_type(wdNode, WD_ASK_FOR_POOL_CONFIG, NULL); - standby_node_join_cluster(wdNode); + if (wdNode->state == WD_LOST && wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK) + { + ereport(LOG, + (errmsg("lost remote node \"%s\" is requesting to join the cluster",wdNode->nodeName), + errdetail("rejecting the request until life-check inform us that it is reachable again"))); + reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt); + } + else + { + reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt); + /* Also get the configurations from the standby node */ + send_message_of_type(wdNode,WD_ASK_FOR_POOL_CONFIG,NULL); + standby_node_join_cluster(wdNode); + } } break; @@ -5814,7 +6093,7 @@ 
watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac * watchdog node when the network becomes reachable, but there is a problem. * * Once the cable on the system is unplugged or when the node gets isolated from the - * cluster there is every likelihood that the backend healthcheck of the isolated node + * cluster there is every likelihood that the backend health-check of the isolated node * start reporting the backend node failure and the pgpool-II proceeds to perform * the failover for all attached backend nodes. Since the pgpool-II is yet not * smart enough to figure out it is because of the network failure of its own @@ -5891,6 +6170,42 @@ watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacket return 0; } +/* + * we could end up in tis state if we were connected to the + * master node as standby and got lost on the master. + * Here we just wait for BEACON_MESSAGE_INTERVAL_SECONDS + * and retry to join the cluster. + */ +static int +watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand) +{ + switch (event) + { + case WD_EVENT_WD_STATE_CHANGED: + set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS); + break; + + case WD_EVENT_PACKET_RCV: + standard_packet_processor(wdNode, pkt); + break; + + case WD_EVENT_REMOTE_NODE_FOUND: + case WD_EVENT_WD_STATE_REQUIRE_RELOAD: + case WD_EVENT_I_AM_APPEARING_FOUND: + case WD_EVENT_TIMEOUT: + /* fall through */ + case WD_EVENT_NW_IP_IS_ASSIGNED: + ereport(LOG, + (errmsg("trying again to join the cluster"))); + set_state(WD_JOINING); + break; + + default: + break; + } + return 0; +} + static bool beacon_message_received_from_node(WatchdogNode * wdNode, WDPacketData * pkt) { @@ -6063,7 +6378,7 @@ handle_split_brain(WatchdogNode * otherMasterNode, WDPacketData * pkt) (errmsg("We are in split brain, and \"%s\" node is the best candidate for master/coordinator" ,otherMasterNode->nodeName), errdetail("re-initializing 
the local watchdog cluster state"))); - /* brodcast the message about I am not the true master node */ + /* broadcast the message about I am not the true master node */ send_cluster_service_message(NULL, pkt, CLUSTER_IAM_NOT_TRUE_MASTER); set_state(WD_JOINING); } @@ -6089,8 +6404,8 @@ start_escalated_node(void) while (g_cluster.de_escalation_pid > 0 && wait_secs-- > 0) { /* - * de_escalation proceess was already running and we are esclating - * again. give some time to de-escalation process to exit normaly + * de_escalation process was already running and we are escalating + * again. give some time to de-escalation process to exit normally */ ereport(LOG, (errmsg("waiting for de-escalation process to exit before starting escalation"))); @@ -6131,8 +6446,8 @@ resign_from_escalated_node(void) while (g_cluster.escalation_pid > 0 && wait_secs-- > 0) { /* - * escalation proceess was already running and we are resigning from - * it. wait for the escalation process to exit normaly + * escalation process was already running and we are resigning from + * it. 
wait for the escalation process to exit normally */ ereport(LOG, (errmsg("waiting for escalation process to exit before starting de-escalation"))); @@ -6228,10 +6543,11 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD send_cluster_command(WD_MASTER_NODE, WD_JOIN_COORDINATOR_MESSAGE, 5); /* Also reset my priority as per the original configuration */ g_cluster.localNode->wd_priority = pool_config->wd_priority; + set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS); break; case WD_EVENT_TIMEOUT: - set_timeout(5); + set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS); break; case WD_EVENT_WD_STATE_REQUIRE_RELOAD: @@ -6243,27 +6559,54 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD break; case WD_EVENT_COMMAND_FINISHED: + { + if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE) { - if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE) + if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED || + clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT) { - if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED || - clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT) - { - register_watchdog_state_change_interupt(); + register_watchdog_state_change_interupt(); + ereport(LOG, + (errmsg("successfully joined the watchdog cluster as standby node"), + errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_MASTER_NODE->nodeName))); + } + else + { + ereport(NOTICE, + (errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName), + errhint("rejoining the cluster."))); + + if (WD_MASTER_NODE->has_lost_us) + { ereport(LOG, - (errmsg("successfully joined the watchdog cluster as standby node"), - errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_MASTER_NODE->nodeName))); + (errmsg("master node \"%s\" thinks we are lost, and \"%s\" is not letting us 
join",WD_MASTER_NODE->nodeName,wdNode->nodeName), + errhint("please verify the watchdog life-check and network is working properly"))); + set_state(WD_NETWORK_ISOLATION); } else { - ereport(NOTICE, - (errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName), - errhint("rejoining the cluster."))); set_state(WD_JOINING); } } } + } + break; + + case WD_EVENT_I_AM_APPEARING_LOST: + { + /* The remote node has lost us, and if it + * was our coordinator we might already be + * removed from it's standby list + * So re-Join the cluster + */ + if (WD_MASTER_NODE == wdNode) + { + ereport(LOG, + (errmsg("we are lost on the master node \"%s\"",wdNode->nodeName))); + set_state(WD_JOINING); + } + } break; case WD_EVENT_REMOTE_NODE_LOST: @@ -6285,6 +6628,22 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD { switch (pkt->type) { + case WD_ADD_NODE_MESSAGE: + { + /* In case we received the ADD node message from + * our coordinator. Reset the cluster state + */ + if (wdNode == WD_MASTER_NODE) + { + ereport(LOG, + (errmsg("received ADD NODE message from the master node \"%s\"", wdNode->nodeName), + errdetail("re-joining the cluster"))); + set_state(WD_JOINING); + } + standard_packet_processor(wdNode, pkt); + } + break; + case WD_FAILOVER_END: { register_backend_state_sync_req_interupt(); @@ -6300,8 +6659,11 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD } else { + ereport(LOG, + (errmsg("We are connected to master node \"%s\" and another node \"%s\" is trying to become a master",WD_MASTER_NODE->nodeName, wdNode->nodeName))); reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt); - set_state(WD_JOINING); + /* Ask master to re-send its node info */ + send_message_of_type(WD_MASTER_NODE, WD_REQ_INFO_MESSAGE, NULL); } } break; @@ -6312,14 +6674,14 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD { /* * we already have a master node and we got a - * new node 
trying to be master re-initialize - * the cluster, something is wrong + * new node trying to be master */ + ereport(LOG, + (errmsg("We are connected to master node \"%s\" and another node \"%s\" is trying to declare itself as a master",WD_MASTER_NODE->nodeName, wdNode->nodeName))); reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt); - } - else - { - set_state(WD_JOINING); + /* Ask master to re-send its node info */ + send_message_of_type(WD_MASTER_NODE, WD_REQ_INFO_MESSAGE, NULL); + } } break; @@ -6339,7 +6701,7 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN); } - else + else if (check_debug_request_do_not_reply_beacon() == false) { send_message_of_type(wdNode, WD_INFO_MESSAGE, pkt); beacon_message_received_from_node(wdNode, pkt); @@ -6369,7 +6731,7 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD gettimeofday(&currTime, NULL); int last_rcv_sec = WD_TIME_DIFF_SEC(currTime, WD_MASTER_NODE->last_rcv_time); - if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS)) + if (last_rcv_sec >= (3 * BEACON_MESSAGE_INTERVAL_SECONDS)) { /* we have missed atleast two beacons from master node */ ereport(WARNING, @@ -6377,9 +6739,8 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD WD_MASTER_NODE->nodeName), errdetail("re-initializing the cluster"))); set_state(WD_JOINING); - } - else if (last_rcv_sec >= BEACON_MESSAGE_INTERVAL_SECONDS) + else if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS)) { /* * We have not received a last becacon from master ask for the @@ -6400,10 +6761,10 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD * The function identifies the current quorum state * quorum values: * -1: - * quorum is lost or does not exisits + * quorum is lost or does not exists * 0: * The quorum is on the edge. 
(when participating cluster is configured - * with even number of nodes, and we have exectly 50% nodes + * with even number of nodes, and we have exactly 50% nodes * 1: * quorum exists */ @@ -6412,11 +6773,11 @@ update_quorum_status(void) { int quorum_status = g_cluster.quorum_status; - if (g_cluster.clusterMasterInfo.standby_nodes_count > get_mimimum_remote_nodes_required_for_quorum()) + if (g_cluster.clusterMasterInfo.standby_nodes_count > get_minimum_remote_nodes_required_for_quorum()) { g_cluster.quorum_status = 1; } - else if (g_cluster.clusterMasterInfo.standby_nodes_count == get_mimimum_remote_nodes_required_for_quorum()) + else if (g_cluster.clusterMasterInfo.standby_nodes_count == get_minimum_remote_nodes_required_for_quorum()) { if (g_cluster.remoteNodeCount % 2 != 0) { @@ -6443,10 +6804,10 @@ update_quorum_status(void) * returns the minimum number of remote nodes required for quorum */ static int -get_mimimum_remote_nodes_required_for_quorum(void) +get_minimum_remote_nodes_required_for_quorum(void) { /* - * Even numner of remote nodes, That means total number of nodes are odd, + * Even number of remote nodes, That means total number of nodes are odd, * so minimum quorum is just remote/2. */ if (g_cluster.remoteNodeCount % 2 == 0) @@ -6466,11 +6827,11 @@ static int get_minimum_votes_to_resolve_consensus(void) { /* - * Since get_mimimum_remote_nodes_required_for_quorum() returns + * Since get_minimum_remote_nodes_required_for_quorum() returns * the number of remote nodes required to complete the quorum * that is always one less than the total number of nodes required * for the cluster to build quorum or consensus, reason being - * in get_mimimum_remote_nodes_required_for_quorum() + * in get_minimum_remote_nodes_required_for_quorum() * we always consider the local node as a valid pre-casted vote. 
* But when it comes to count the number of votes required to build * consensus for any type of decision, for example for building the @@ -6482,8 +6843,8 @@ get_minimum_votes_to_resolve_consensus(void) * For example * If Total nodes in cluster = 4 * remote node will be = 3 - * get_mimimum_remote_nodes_required_for_quorum() return = 1 - * Minimum number of votes required for consensu will be + * get_minimum_remote_nodes_required_for_quorum() return = 1 + * Minimum number of votes required for consensus will be * * if(pool_config->enable_consensus_with_half_votes = true) * (exact 50% n/2) ==> 4/2 = 2 @@ -6493,13 +6854,13 @@ get_minimum_votes_to_resolve_consensus(void) * */ - int required_node_count = get_mimimum_remote_nodes_required_for_quorum() + 1; + int required_node_count = get_minimum_remote_nodes_required_for_quorum() + 1; /* * When the total number of nodes in the watchdog cluster including the * local node are even, The number of votes required for the consensus * depends on the enable_consensus_with_half_votes. 
* So for even number of nodes when enable_consensus_with_half_votes is - * not allowed than we would nedd one more vote than exact 50% + * not allowed than we would add one more vote than exact 50% */ if (g_cluster.remoteNodeCount % 2 != 0) { @@ -6512,7 +6873,7 @@ get_minimum_votes_to_resolve_consensus(void) /* * sets the state of local watchdog node, and fires a state change event - * if the new and old state differes + * if the new and old state differs */ static int set_state(WD_STATES newState) @@ -6973,7 +7334,7 @@ check_IPC_client_authentication(json_value * rootObj, bool internal_client_only) if (json_get_int_value_for_key(rootObj, WD_IPC_SHARED_KEY, (int *) &packet_key)) { ereport(DEBUG2, - (errmsg("IPC json data packet does not contain shared key"))); + (errmsg("IPC JSON data packet does not contain shared key"))); has_shared_key = false; } else @@ -6994,15 +7355,15 @@ check_IPC_client_authentication(json_value * rootObj, bool internal_client_only) if (has_shared_key == false) { ereport(LOG, - (errmsg("invalid json data packet"), - errdetail("authentication shared key not found in json data"))); + (errmsg("invalid JSON data packet"), + errdetail("authentication shared key not found in JSON data"))); return false; } /* compare if shared keys match */ if (*shared_key != packet_key) return false; - /* providing a valid shared key for inetenal clients is enough */ + /* providing a valid shared key for internal clients is enough */ return true; } @@ -7014,14 +7375,14 @@ check_IPC_client_authentication(json_value * rootObj, bool internal_client_only) if (has_shared_key == true && *shared_key == packet_key) return true; - /* shared key is out of question validate the authKey valurs */ + /* shared key is out of question validate the authKey values */ packet_auth_key = json_get_string_value_for_key(rootObj, WD_IPC_AUTH_KEY); if (packet_auth_key == NULL) { ereport(DEBUG1, - (errmsg("invalid json data packet"), - errdetail("authentication key not found in json 
data"))); + (errmsg("invalid JSON data packet"), + errdetail("authentication key not found in JSON data"))); return false; } @@ -7265,7 +7626,7 @@ set_cluster_master_node(WatchdogNode * wdNode) { if (wdNode == NULL) ereport(LOG, - (errmsg("unassigning the %s node \"%s\" from watchdog cluster master", + (errmsg("removing the %s node \"%s\" from watchdog cluster master", (g_cluster.localNode == WD_MASTER_NODE) ? "local" : "remote", WD_MASTER_NODE->nodeName))); else @@ -7365,3 +7726,202 @@ clear_standby_nodes_list(void) g_cluster.clusterMasterInfo.standby_nodes_count = 0; g_cluster.localNode->standby_nodes_count = 0; } + +static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear) +{ + int i; + for (i=0; i< g_cluster.remoteNodeCount; i++) + { + if (clear) + { + WatchdogNode* wdNode = &(g_cluster.remoteNodes[i]); + wdNode->missed_beacon_count = 0; + } + else + { + WDCommandNodeResult* nodeResult = &ipcCommand->nodeResults[i]; + if (ipcCommand->commandStatus == COMMAND_IN_PROGRESS ) + return; + + if (nodeResult->cmdState == COMMAND_STATE_SENT) + { + if (nodeResult->wdNode->state == WD_STANDBY) + { + nodeResult->wdNode->missed_beacon_count++; + if (nodeResult->wdNode->missed_beacon_count > 1) + ereport(LOG, + (errmsg("remote node \"%s\" is not replying to our beacons",nodeResult->wdNode->nodeName), + errdetail("missed beacon reply count:%d",nodeResult->wdNode->missed_beacon_count))); + } + else + nodeResult->wdNode->missed_beacon_count = 0; + } + if (nodeResult->cmdState == COMMAND_STATE_REPLIED) + { + if (nodeResult->wdNode->missed_beacon_count > 0) + ereport(LOG, + (errmsg("remote node \"%s\" is replying again after missing %d beacons",nodeResult->wdNode->nodeName, + nodeResult->wdNode->missed_beacon_count))); + nodeResult->wdNode->missed_beacon_count = 0; + } + } + } +} + +#ifdef WATCHDOG_DEBUG +/* + * Node down request file. In the file, each line consists of watchdog + * debug command. 
The possible commands are same as the defines below + * for example to stop Pgpool-II from sending the reply to beacon messages + * from the master node write DO_NOT_REPLY_TO_BEACON in watchdog_debug_requests + * + * + * echo "DO_NOT_REPLY_TO_BEACON" > pgpool_logdir/watchdog_debug_requests + */ + +typedef struct watchdog_debug_commands +{ + char command[100]; + unsigned int code; +} watchdog_debug_commands; + +unsigned int watchdog_debug_command = 0; + + +#define WATCHDOG_DEBUG_FILE "watchdog_debug_requests" + +#define DO_NOT_REPLY_TO_BEACON 1 +#define DO_NOT_SEND_BEACON 2 +#define KILL_ALL_COMMUNICATION 4 +#define KILL_ALL_RECEIVERS 8 +#define KILL_ALL_SENDERS 16 + + +watchdog_debug_commands wd_debug_commands[] = { + {"DO_NOT_REPLY_TO_BEACON", DO_NOT_REPLY_TO_BEACON}, + {"DO_NOT_SEND_BEACON", DO_NOT_SEND_BEACON}, + {"KILL_ALL_COMMUNICATION", KILL_ALL_COMMUNICATION}, + {"KILL_ALL_RECEIVERS", KILL_ALL_RECEIVERS}, + {"KILL_ALL_SENDERS", KILL_ALL_SENDERS}, + {"", 0} +}; + +static bool +check_debug_request_kill_all_communication(void) +{ + return (watchdog_debug_command & KILL_ALL_COMMUNICATION); +} +static bool +check_debug_request_kill_all_receivers(void) +{ + return (watchdog_debug_command & KILL_ALL_RECEIVERS); +} +static bool +check_debug_request_kill_all_senders(void) +{ + return (watchdog_debug_command & KILL_ALL_SENDERS); +} + +static bool +check_debug_request_do_not_send_beacon(void) +{ + return (watchdog_debug_command & DO_NOT_SEND_BEACON); +} + +static bool +check_debug_request_do_not_reply_beacon(void) +{ + return (watchdog_debug_command & DO_NOT_REPLY_TO_BEACON); +} +/* + * Check watchdog debug request options file for debug commands + * each line should contain only one command + * + * Possible commands + * DO_NOT_REPLY_TO_BEACON + * DO_NOT_SEND_BEACON + * KILL_ALL_COMMUNICATION + * KILL_ALL_RECEIVERS + * KILL_ALL_SENDERS + */ + +static void +load_watchdog_debug_test_option(void) +{ + static char wd_debug_request_file[POOLMAXPATHLEN]; + FILE *fd; + int i; 
+#define MAXLINE 128 + char readbuf[MAXLINE]; + + watchdog_debug_command = 0; + + if (wd_debug_request_file[0] == '\0') + { + snprintf(wd_debug_request_file, sizeof(wd_debug_request_file), + "%s/%s", pool_config->logdir, WATCHDOG_DEBUG_FILE); + } + + fd = fopen(wd_debug_request_file, "r"); + if (!fd) + { + ereport(DEBUG3, + (errmsg("load_watchdog_debug_test_option: failed to open file %s", + wd_debug_request_file), + errdetail("\"%s\"", strerror(errno)))); + return; + } + + for (i = 0;; i++) + { + int cmd = 0; + bool valid_command = false; + readbuf[MAXLINE - 1] = '\0'; + if (fgets(readbuf, MAXLINE - 1, fd) == 0) + break; + for (cmd =0 ;; cmd++) + { + if (strlen(wd_debug_commands[cmd].command) == 0 || wd_debug_commands[cmd].code == 0) + break; + + if (strncasecmp(wd_debug_commands[cmd].command,readbuf,strlen(wd_debug_commands[cmd].command)) == 0) + { + ereport(DEBUG3, + (errmsg("Watchdog DEBUG COMMAND %d: \"%s\" request found", + cmd,wd_debug_commands[cmd].command))); + + watchdog_debug_command |= wd_debug_commands[cmd].code; + valid_command = true; + break; + } + } + if (!valid_command) + ereport(WARNING, + (errmsg("%s file contains invalid command", + wd_debug_request_file), + errdetail("\"%s\" not recognized", readbuf))); + } + + fclose(fd); +} +#else +/* + * All these command checks return false when WATCHDOG_DEBUG is + * not enabled + */ +static bool +check_debug_request_do_not_send_beacon(void) +{return false;} +static bool +check_debug_request_do_not_reply_beacon(void) +{return false;} +static bool +check_debug_request_kill_all_communication(void) +{return false;} +static bool +check_debug_request_kill_all_receivers(void) +{return false;} +static bool +check_debug_request_kill_all_senders(void) +{return false;} +#endif -- 2.39.5