Fix corner case bug in Pgpool-II starting up.
author Tatsuo Ishii <ishii@postgresql.org>
Tue, 9 May 2017 23:30:17 +0000 (08:30 +0900)
committer Tatsuo Ishii <ishii@postgresql.org>
Tue, 9 May 2017 23:30:17 +0000 (08:30 +0900)
It is possible for a failover request to be accepted before the primary
node has been searched for.  This leaves Pgpool-II in a strange state:
there is no primary node if the failed node was the primary node (even
if a new primary node exists as a result of promoting an existing standby).

See [pgpool-hackers: 2321] for more details.

src/main/pgpool_main.c
src/test/regression/tests/003.failover/test.sh
src/test/regression/tests/055.backend_all_down/test.sh

index df71128a2e56321ddcb48365f347c61c9f16053e..da94e8c756d7c8149b920e53357d06d15d5a4266 100644 (file)
@@ -209,6 +209,8 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 
        sigjmp_buf      local_sigjmp_buf;
 
+       bool first = true;
+
        /* For PostmasterRandom */
        gettimeofday(&random_start_time, NULL);
 
@@ -375,6 +377,15 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 
        MemoryContextSwitchTo(TopMemoryContext);
 
+       /*
+        * if the primary node id is not loaded by watchdog, search for it
+        */
+       if (Req_info->primary_node_id < 0)
+       {
+               /* Save primary node id */
+               Req_info->primary_node_id = find_primary_node_repeatedly();
+       }
+
        /* fork a child for PCP handling */
        pcp_unix_fd = create_unix_domain_socket(pcp_un_addr);
        /* Add onproc exit to clean up the unix domain socket at exit */
@@ -396,23 +407,6 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
                        health_check_pids[i] = worker_fork_a_child(PT_HEALTH_CHECK, do_health_check_child, &i);
        }
 
-       /*
-        * check for child signals to ensure child startup before reporting successfull start
-        */
-       CHECK_REQUEST;
-
-       ereport(LOG,
-                       (errmsg("%s successfully started. version %s (%s)", PACKAGE, VERSION, PGPOOLVERSION)));
-
-       /*
-        * if the primary node id is not loaded by watchdog, search for it
-        */
-       if (Req_info->primary_node_id < 0)
-       {
-               /* Save primary node id */
-               Req_info->primary_node_id = find_primary_node();
-       }
-
        if (sigsetjmp(local_sigjmp_buf, 1) != 0)
        {
                /* Since not using PG_TRY, must reset error stack by hand */
@@ -435,6 +429,15 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
        {
                CHECK_REQUEST;
 
+               /*
+                * check for child signals to ensure child startup before reporting
+                * successfull start.
+                */
+               if (first)
+                       ereport(LOG,
+                                       (errmsg("%s successfully started. version %s (%s)", PACKAGE, VERSION, PGPOOLVERSION)));
+               first = false;
+
                processState = SLEEPING;
                for (;;)
                {
index 4bbd3de423e4262e7f57faa56c7878a89767ae0d..ede8672a2183e4ea338fc16b52e0fb58071110b1 100755 (executable)
@@ -40,7 +40,7 @@ do
        ./shutdownall
 
        ./startall
-       wait_for_pgpool_startup
+#      wait_for_pgpool_startup
        # trigger failover on node 0
        $PG_CTL -D data0 -m f stop
        wait_for_failover_done
index d0ca580f6637ee93a8e8b2dcd6ad8261028ed268..fddbca026e8e235fefde3b4c58b43c91bad47513 100755 (executable)
@@ -17,6 +17,8 @@ echo -n "creating test environment..."
 $PGPOOL_SETUP -m s -n 2 || exit 1
 echo "done."
 
+echo "search_primary_node_timeout = 5" >> etc/pgpool.conf
+
 source ./bashrc.ports
 
 export PGPORT=$PGPOOL_PORT