int network_monitor_sock;
bool clusterInitialized;
bool ipc_auth_needed;
- int current_failover_id;
+ int current_failover_id;
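+ /* computed failover command timeout, used to expire stale failover objects */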
+ int failover_command_timeout;
struct timeval last_bcast_srv_msg_time; /* timestamp when last packet was
 * broadcast by the local node */
char last_bcast_srv_msg;
static int standby_node_left_cluster(WatchdogNode * wdNode);
static int standby_node_join_cluster(WatchdogNode * wdNode);
static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
+static void update_failover_timeout(WatchdogNode * wdNode, POOL_CONFIG *pool_config);
/* global variables */
wd_cluster g_cluster;
if (failoverObj)
{
- if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= FAILOVER_COMMAND_FINISH_TIMEOUT)
+ if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= g_cluster.failover_command_timeout)
{
failovers_to_del = lappend(failovers_to_del, failoverObj);
ereport(DEBUG1,
if (standby_config)
{
verify_pool_configurations(wdNode, standby_config);
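+ /* the standby's health check settings may require a longer failover command timeout */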
+ update_failover_timeout(wdNode, standby_config);
}
}
}
send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4);
set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
update_missed_beacon_count(NULL,true);
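+ /* recompute the failover command timeout from the local configuration */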
+ update_failover_timeout(g_cluster.localNode, pool_config);
+
ereport(LOG,
(errmsg("I am announcing my self as master/coordinator watchdog node")));
}
}
+static void
+update_failover_timeout(WatchdogNode * wdNode, POOL_CONFIG *pool_config)
+{
+ int failover_command_timeout;
+ if (get_local_node_state() != WD_COORDINATOR)
+ return;
+
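+ /*
+ * Base the timeout on the local health check settings: one health check
+ * period plus the retry delay for each allowed retry.
+ */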
+ failover_command_timeout = pool_config->health_check_period +
+ (pool_config->health_check_retry_delay * pool_config->health_check_max_retries);
+
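+ /* per node health check settings may need a larger value; keep the largest one */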
+ if (pool_config->health_check_params)
+ {
+ int i;
+ for (i = 0; i < pool_config->backend_desc->num_backends; i++)
+ {
+ int pn_failover_command_timeout = pool_config->health_check_params[i].health_check_period +
+ (pool_config->health_check_params[i].health_check_retry_delay *
+ pool_config->health_check_params[i].health_check_max_retries);
+
+ if (failover_command_timeout < pn_failover_command_timeout)
+ failover_command_timeout = pn_failover_command_timeout;
+ }
+ }
+
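+ /*
+ * The coordinator's own configuration resets the timeout, while a
+ * standby's configuration can only extend it.
+ */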
+ if (g_cluster.localNode == wdNode)
+ {
+ /* Reset */
+ g_cluster.failover_command_timeout = failover_command_timeout;
+ }
+ else if (g_cluster.failover_command_timeout < failover_command_timeout)
+ g_cluster.failover_command_timeout = failover_command_timeout;
+
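+ /* FAILOVER_COMMAND_FINISH_TIMEOUT remains the lower bound */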
+ if (g_cluster.failover_command_timeout < FAILOVER_COMMAND_FINISH_TIMEOUT)
+ g_cluster.failover_command_timeout = FAILOVER_COMMAND_FINISH_TIMEOUT;
+
+ ereport(LOG,(errmsg("Setting failover command timeout to %d",failover_command_timeout)));
+}
+
#ifdef WATCHDOG_DEBUG
/*
* Node down request file. In the file, each line consists of watchdog
goto ERROR_EXIT;
strncpy(config->backend_desc->backend_info[i].backend_hostname, ptr, sizeof(config->backend_desc->backend_info[i].backend_hostname) - 1);
}
+ /* per node health check parameters */
+ value = json_get_value_for_key(root, "health_check_params");
+ /* We don't get separate health check params from older versions,
+ * so be tolerant if the JSON does not contain them.
+ */
+ if (value != NULL && value->type == json_array)
+ {
+ int health_check_params_count = value->u.array.length;
+ if (health_check_params_count != config->backend_desc->num_backends)
+ {
+ ereport(LOG,
+ (errmsg("unexpected number of health check parameters received"),
+ errdetail("expected:%d got %d",config->backend_desc->num_backends,health_check_params_count)));
+ }
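+ /* allocate one entry per backend; palloc0 leaves any missing entries zero filled */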
+ config->health_check_params = palloc0(sizeof(HealthCheckParams) * config->backend_desc->num_backends);
+
+ if (health_check_params_count > config->backend_desc->num_backends)
+ health_check_params_count = config->backend_desc->num_backends;
+
+ for (i = 0; i < health_check_params_count; i++)
+ {
+ json_value *arr_value = value->u.array.values[i];
+
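+ /* fall back to 0 for any key missing from the JSON */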
+ if (json_get_int_value_for_key(arr_value, "health_check_timeout", &config->health_check_params[i].health_check_timeout))
+ config->health_check_params[i].health_check_timeout = 0;
+ if (json_get_int_value_for_key(arr_value, "health_check_period", &config->health_check_params[i].health_check_period))
+ config->health_check_params[i].health_check_period = 0;
+ if (json_get_int_value_for_key(arr_value, "health_check_max_retries", &config->health_check_params[i].health_check_max_retries))
+ config->health_check_params[i].health_check_max_retries = 0;
+ if (json_get_int_value_for_key(arr_value, "health_check_retry_delay", &config->health_check_params[i].health_check_retry_delay))
+ config->health_check_params[i].health_check_retry_delay = 0;
+ if (json_get_int_value_for_key(arr_value, "connect_timeout", &config->health_check_params[i].connect_timeout))
+ config->health_check_params[i].connect_timeout = 0;
+ }
+ }
+ else
+ config->health_check_params = NULL;
/* wd_remote_nodes array */
value = json_get_value_for_key(root, "wd_remote_nodes");
jw_put_bool(jNode, "failover_require_consensus", pool_config->failover_require_consensus);
jw_put_bool(jNode, "allow_multiple_failover_requests_from_node", pool_config->allow_multiple_failover_requests_from_node);
+ /* Array of per node health check parameters.
+ * We send one entry for each configured backend.
+ */
+ jw_start_array(jNode, "health_check_params");
+ for (i = 0; i < pool_config->backend_desc->num_backends; i++)
+ {
+ jw_start_object(jNode, "HealthCheckParams");
+
+ jw_put_int(jNode, "health_check_timeout", pool_config->health_check_params[i].health_check_timeout);
+ jw_put_int(jNode, "health_check_period", pool_config->health_check_params[i].health_check_period);
+ jw_put_int(jNode, "health_check_max_retries", pool_config->health_check_params[i].health_check_max_retries);
+ jw_put_int(jNode, "health_check_retry_delay", pool_config->health_check_params[i].health_check_retry_delay);
+ jw_put_int(jNode, "connect_timeout", pool_config->health_check_params[i].connect_timeout);
+ jw_end_element(jNode);
+ }
+ jw_end_element(jNode); /* health_check_params array end */
+
/* Array of backends */
jw_start_array(jNode, "backend_desc");
for (i = 0; i < pool_config->backend_desc->num_backends; i++)