Fix for [pgpool-hackers: 4227]: Issue with failover_require_consensus
author      Muhammad Usama <muhammad.usama@percona.com>
            Tue, 20 Dec 2022 08:37:46 +0000 (13:37 +0500)
committer   Muhammad Usama <muhammad.usama@percona.com>
            Tue, 20 Dec 2022 08:48:57 +0000 (13:48 +0500)
The fix is to set the failover command timeout dynamically, based on the maximum
health check duration (health_check_period plus health_check_retry_delay times
health_check_max_retries) configured across the watchdog cluster.

Reviewed and tested by Tatsuo Ishii

src/watchdog/watchdog.c
src/watchdog/wd_json_data.c
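For illustration, here is a minimal, self-contained sketch of the rule the patch implements:
each backend contributes health_check_period + health_check_retry_delay * health_check_max_retries,
the coordinator keeps the maximum across all nodes, and the previous fixed
FAILOVER_COMMAND_FINISH_TIMEOUT remains the lower bound. The sample settings and the
15-second lower bound below are assumptions for illustration, not values taken from the patch.

#include <stdio.h>

#define FAILOVER_COMMAND_FINISH_TIMEOUT 15	/* assumed lower bound, in seconds */

int
main(void)
{
	/* hypothetical per-backend health check settings, in seconds */
	int		health_check_period[] = {30, 10};
	int		health_check_retry_delay[] = {10, 1};
	int		health_check_max_retries[] = {3, 0};
	int		timeout = FAILOVER_COMMAND_FINISH_TIMEOUT;
	int		i;

	for (i = 0; i < 2; i++)
	{
		int		t = health_check_period[i] +
					health_check_retry_delay[i] * health_check_max_retries[i];

		if (t > timeout)
			timeout = t;
	}
	/* backend 0 dominates: 30 + 10 * 3 = 60 seconds */
	printf("failover command timeout = %d seconds\n", timeout);
	return 0;
}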

index 60b23e7219621daddbc7093691d1d7fb87a56719..3bc512165a8ff35f12a91fea00f82b5ac588e2f6 100644 (file)
@@ -349,7 +349,8 @@ typedef struct wd_cluster
        int                     network_monitor_sock;
        bool            clusterInitialized;
        bool            ipc_auth_needed;
-       int                     current_failover_id;
+       int             current_failover_id;
+       int             failover_command_timeout;
        struct timeval last_bcast_srv_msg_time; /* timestamp when last packet was
                                                                                         *  broadcasted by the local node */
        char            last_bcast_srv_msg;
@@ -572,6 +573,7 @@ static void clear_standby_nodes_list(void);
 static int     standby_node_left_cluster(WatchdogNode * wdNode);
 static int     standby_node_join_cluster(WatchdogNode * wdNode);
 static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
+static void update_failover_timeout(WatchdogNode * wdNode, POOL_CONFIG *pool_config);
 
 /* global variables */
 wd_cluster     g_cluster;
@@ -2239,7 +2241,7 @@ service_expired_failovers(void)
 
                if (failoverObj)
                {
-                       if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= FAILOVER_COMMAND_FINISH_TIMEOUT)
+                       if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= g_cluster.failover_command_timeout)
                        {
                                failovers_to_del = lappend(failovers_to_del, failoverObj);
                                ereport(DEBUG1,
@@ -3910,6 +3912,7 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
                                        if (standby_config)
                                        {
                                                verify_pool_configurations(wdNode, standby_config);
+                                               update_failover_timeout(wdNode, standby_config);
                                        }
                                }
                        }
@@ -5651,6 +5654,8 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
                                send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4);
                                set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
                                update_missed_beacon_count(NULL,true);
+                               update_failover_timeout(g_cluster.localNode, pool_config);
+
                                ereport(LOG,
                                                (errmsg("I am announcing my self as master/coordinator watchdog node")));
 
@@ -7659,6 +7664,43 @@ static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear)
        }
 }
 
+/*
+ * Recompute the failover command timeout from the health check settings in
+ * the given node's configuration and keep the cluster-wide maximum in
+ * g_cluster.failover_command_timeout. Only the coordinator maintains this.
+ */
+static void
+update_failover_timeout(WatchdogNode * wdNode, POOL_CONFIG *pool_config)
+{
+       int             failover_command_timeout;
+
+       if (get_local_node_state() != WD_COORDINATOR)
+               return;
+
+       failover_command_timeout = pool_config->health_check_period +
+               (pool_config->health_check_retry_delay * pool_config->health_check_max_retries);
+
+       if (pool_config->health_check_params)
+       {
+               int i;
+               for (i = 0 ; i < pool_config->backend_desc->num_backends; i++)
+               {
+                       int pn_failover_command_timeout = pool_config->health_check_params[i].health_check_period +
+                                                       (pool_config->health_check_params[i].health_check_retry_delay *
+                                                       pool_config->health_check_params[i].health_check_max_retries);
+
+                       if (failover_command_timeout < pn_failover_command_timeout)
+                               failover_command_timeout = pn_failover_command_timeout;
+               }
+       }
+
+       if (g_cluster.localNode == wdNode)
+       {
+               /* Reset to the value computed from the local configuration */
+               g_cluster.failover_command_timeout = failover_command_timeout;
+       }
+       else if (g_cluster.failover_command_timeout < failover_command_timeout)
+               g_cluster.failover_command_timeout = failover_command_timeout;
+
+       if (g_cluster.failover_command_timeout < FAILOVER_COMMAND_FINISH_TIMEOUT)
+               g_cluster.failover_command_timeout = FAILOVER_COMMAND_FINISH_TIMEOUT;
+
+       ereport(LOG,
+                       (errmsg("Setting failover command timeout to %d", g_cluster.failover_command_timeout)));
+}
+
 #ifdef WATCHDOG_DEBUG
 /*
  * Node down request file. In the file, each line consists of watchdog
index bd3e90574094222a12dbb26273c5801884fa1928..41fff012a95e6734009c43ff423791b4019ff3bc 100644 (file)
@@ -136,6 +136,44 @@ get_pool_config_from_json(char *json_data, int data_len)
                        goto ERROR_EXIT;
                strncpy(config->backend_desc->backend_info[i].backend_hostname, ptr, sizeof(config->backend_desc->backend_info[i].backend_hostname) - 1);
        }
+    /* per node health check parameters */
+    value = json_get_value_for_key(root, "health_check_params");
+    /* We don't get separate health check params from older versions,
+     * so be lenient if the JSON does not contain them.
+     */
+    if (value != NULL && value->type == json_array)
+    {
+        int health_check_params_count = value->u.array.length;
+        if (health_check_params_count != config->backend_desc->num_backends)
+        {
+            ereport(LOG,
+                    (errmsg("unexpected number of health check parameters received"),
+                     errdetail("expected: %d, got: %d", config->backend_desc->num_backends, health_check_params_count)));
+        }
+        config->health_check_params = palloc0(sizeof(HealthCheckParams) * config->backend_desc->num_backends);
+
+        if (health_check_params_count > config->backend_desc->num_backends)
+            health_check_params_count = config->backend_desc->num_backends;
+
+        for (i = 0; i < health_check_params_count; i++)
+        {
+            json_value *arr_value = value->u.array.values[i];
+            
+            if (json_get_int_value_for_key(arr_value, "health_check_timeout", &config->health_check_params[i].health_check_timeout))
+                config->health_check_params[i].health_check_timeout = 0;
+            if (json_get_int_value_for_key(arr_value, "health_check_period", &config->health_check_params[i].health_check_period))
+                config->health_check_params[i].health_check_period = 0;
+            if (json_get_int_value_for_key(arr_value, "health_check_max_retries", &config->health_check_params[i].health_check_max_retries))
+                config->health_check_params[i].health_check_max_retries = 0;
+            if (json_get_int_value_for_key(arr_value, "health_check_retry_delay", &config->health_check_params[i].health_check_retry_delay))
+                config->health_check_params[i].health_check_retry_delay = 0;
+            if (json_get_int_value_for_key(arr_value, "connect_timeout", &config->health_check_params[i].connect_timeout))
+                config->health_check_params[i].connect_timeout = 0;
+        }
+    }
+    else
+        config->health_check_params = NULL;
 
        /* wd_remote_nodes array */
        value = json_get_value_for_key(root, "wd_remote_nodes");
@@ -214,6 +252,23 @@ get_pool_config_json(void)
        jw_put_bool(jNode, "failover_require_consensus", pool_config->failover_require_consensus);
        jw_put_bool(jNode, "allow_multiple_failover_requests_from_node", pool_config->allow_multiple_failover_requests_from_node);
 
+       /* Array of per-node health check params.
+        * We transport at most num_backends entries.
+        */
+       jw_start_array(jNode, "health_check_params");
+       for (i = 0; i < pool_config->backend_desc->num_backends; i++)
+       {
+               jw_start_object(jNode, "HealthCheckParams");
+
+               jw_put_int(jNode, "health_check_timeout", pool_config->health_check_params[i].health_check_timeout);
+               jw_put_int(jNode, "health_check_period", pool_config->health_check_params[i].health_check_period);
+               jw_put_int(jNode, "health_check_max_retries", pool_config->health_check_params[i].health_check_max_retries);
+               jw_put_int(jNode, "health_check_retry_delay", pool_config->health_check_params[i].health_check_retry_delay);
+               jw_put_int(jNode, "connect_timeout", pool_config->health_check_params[i].connect_timeout);
+               jw_end_element(jNode);
+       }
+       jw_end_element(jNode);          /* health_check_params array end */
+
        /* Array of backends */
        jw_start_array(jNode, "backend_desc");
        for (i = 0; i < pool_config->backend_desc->num_backends; i++)