Fix for no primary on standby pgpool when primary is quarantined on master
author Muhammad Usama <m.usama@gmail.com>
Wed, 7 Aug 2019 15:22:01 +0000 (20:22 +0500)
committer Muhammad Usama <m.usama@gmail.com>
Thu, 8 Aug 2019 06:14:47 +0000 (11:14 +0500)
The master watchdog Pgpool sends primary_node_id = -1 in the backend status sync
message if the primary node is quarantined on it. So the standby watchdog Pgpool
must not update its primary_node_id if the primary backend node id in the sync
message is invalid_node_id (-1) while the same sync message reports the
backend status of the current primary node as "NOT DOWN".

The issue was reported by Tatsuo Ishii <ishii@sraoss.co.jp> and fixed by me.

src/main/pgpool_main.c
src/watchdog/wd_json_data.c

index 2eeeb40c1d5d5d82ff5472245513fc9eda9e354b..bd80f8b146bc1ee58f3fcb32c29c6a013842f196 100644 (file)
@@ -3715,19 +3715,8 @@ static void sync_backend_from_watchdog(void)
        ereport(DEBUG1,
                        (errmsg("primary node on master watchdog node \"%s\" is %d",backendStatus->nodeName,backendStatus->primary_node_id)));
 
-       if (Req_info->primary_node_id != backendStatus->primary_node_id)
-       {
-               /* Do not produce this log message if we are starting up the Pgpool-II*/
-               if (processState != INITIALIZING)
-                       ereport(LOG,
-                                       (errmsg("primary node:%d on master watchdog node \"%s\" is different from local primary node:%d",
-                                                       backendStatus->primary_node_id,backendStatus->nodeName,Req_info->primary_node_id)));
-
-               Req_info->primary_node_id = backendStatus->primary_node_id;
-               primary_changed = true;
-       }
-       /* update the local backend status
-        * Also remove quarantine flags
+       /*
+        * update the local backend status Also remove quarantine flags
         */
        for (i = 0; i < backendStatus->node_count; i++)
        {
@@ -3765,6 +3754,34 @@ static void sync_backend_from_watchdog(void)
                        }
                }
        }
+
+       if (Req_info->primary_node_id != backendStatus->primary_node_id)
+       {
+               /* Do not produce this log message if we are starting up the Pgpool-II */
+               if (processState != INITIALIZING)
+                       ereport(LOG,
+                                       (errmsg("primary node:%d on master watchdog node \"%s\" is different from local primary node:%d",
+                                                       backendStatus->primary_node_id, backendStatus->nodeName, Req_info->primary_node_id)));
+               /*
+                * master node returns primary_node_id = -1 when the node primary
+                * node is in  quarantine state on the master.
+                * So we will not update our primary node id when the status of current primary node
+                * is not CON_DOWN while primary_node_id sent by master watchdong node is -1
+                */
+               if (backendStatus->primary_node_id == -1 && BACKEND_INFO(Req_info->primary_node_id).backend_status != CON_DOWN)
+               {
+                       ereport(LOG,
+                (errmsg("primary node:%d on master watchdog node \"%s\" seems to be quarantined",
+                                       Req_info->primary_node_id, backendStatus->nodeName),
+                errdetail("keeping the current primary")));
+               }
+               else
+               {
+                       Req_info->primary_node_id = backendStatus->primary_node_id;
+                       primary_changed = true;
+               }
+       }
+
        pfree(backendStatus);
 
        if (reload_maste_node_id)
index 58465c49614f352508c73ecc055bcfd485b390b5..dc82ab7d34b5728d233c7951ab34725250a3d2d9 100644 (file)
@@ -305,13 +305,14 @@ char* get_backend_node_status_json(WatchdogNode* wdNode)
 
        jw_start_array(jNode, "BackendNodeStatusList");
 
-       for (i=0;i< pool_config->backend_desc->num_backends;i++)
+       for (i = 0; i < pool_config->backend_desc->num_backends; i++)
        {
                BACKEND_STATUS backend_status = pool_config->backend_desc->backend_info[i].backend_status;
                if (backend_status == CON_DOWN && pool_config->backend_desc->backend_info[i].quarantine)
                {
-                       /* since quarantine nodes are not cluster wide
-                        * so send CON_WATI status for quarantine nodes
+                       /*
+                        * since quarantine nodes are not cluster wide so send CON_WAIT
+                        * status for quarantine nodes
                         */
                        backend_status = CON_CONNECT_WAIT;
                }