Fix bug with health check when used with child_life_time reported in [pgpool-general...
author: Tatsuo Ishii <ishii@postgresql.org>
Sat, 20 Jul 2013 03:58:09 +0000 (12:58 +0900)
committer: Tatsuo Ishii <ishii@postgresql.org>
Sat, 20 Jul 2013 03:58:09 +0000 (12:58 +0900)
Here is the explanation why the problem occurs:

--------------------------------------------------------------------------------
Ok. I think I finally understand what's going on here.

Pgpool main process (14317) started health checking at Jul 12 09:17:04.

Jul 12 09:17:04 purple1-node1-ps pgpool[14317]: starting health checking

The pgpool main process set a timer to fire at 09:17:14 because you set
health_check_timeout to 10.  This time the health check completed
successfully. The timer for 09:17:14 is blocked by calling
signal(SIGALRM, SIG_IGN).

Unfortunately, the child life time expired at 09:17:14, and the pgpool
main process was busy at that time because of this.

Jul 12 09:17:14 purple1-node1-ps pgpool[16789]: child life 300 seconds expired
Jul 12 09:17:14 purple1-node1-ps pgpool[14317]: reap_handler called

Jul 12 09:17:14 purple1-node1-ps pgpool[14317]: starting health checking

The pgpool main process re-enabled the timer and reset the timer variable
(health_check_timer_expired = 0). But when the timer was re-enabled, the
signal handler for the timer set health_check_timer_expired to 1.  As
a result, pgpool thought that the health check timer had expired.

Jul 12 09:17:14 purple1-node1-ps pgpool[14317]: health_check: health check timer has been already expired before attempting to connect to 0 th backend

Thus failover happened even though the backend was running fine.
--------------------------------------------------------------------------------

To fix the problem, a new macro CLEAR_ALARM, which calls alarm(0) until
all pending alarms are cleared, is defined and used wherever necessary
to cancel the health check timer. Also, before forking off a child process,
health_check_timer_expired is explicitly cleared.

The same bug also causes the following error message:

Jul 12 09:32:14 purple1-node1-ps pgpool[11465]: connect_inet_domain_socket_by_port: health check timer expired

Process 11465 is a child process and is not supposed to run into this
situation. It happens because the global variable
"health_check_timer_expired" is already set to 1 when the new child is
forked off after child_life_time has expired. This can occur if a
SIGCHLD signal is received at the moment the bug described above happens.
To make sure this never happens, connect_inet_domain_socket_by_port
now checks health_check_timer_expired only if it is running in the main
process.

main.c
pool_connection_pool.c

diff --git a/main.c b/main.c
index d447a2175391518a4ac6c5fb6a2af7e48e7b40e3..160aaa12fc61afb073460c2cfe5164f8d344b4fc 100644 (file)
--- a/main.c
+++ b/main.c
                } \
     } while (0)
 
+#define CLEAR_ALARM \
+       do { \
+                       pool_debug("health check: clearing alarm"); \
+    } while (alarm(0) > 0)
 
 #define PGPOOLMAXLITSENQUEUELENGTH 10000
 static void daemonize(void);
@@ -718,6 +722,7 @@ int main(int argc, char **argv)
                                 * communication path failure much earlier before
                                 * TCP/IP stack detects it.
                                 */
+                               CLEAR_ALARM;
                                pool_signal(SIGALRM, health_check_timer_handler);
                                alarm(pool_config->health_check_timeout);
                        }
@@ -742,6 +747,7 @@ int main(int argc, char **argv)
 
                                        retrycnt++;
                                        pool_signal(SIGALRM, SIG_IGN);  /* Cancel timer */
+                                       CLEAR_ALARM;
 
                                        if (!pool_config->parallel_mode)
                                        {
@@ -797,6 +803,7 @@ int main(int argc, char **argv)
                                {
                                        sys_retrycnt++;
                                        pool_signal(SIGALRM, SIG_IGN);
+                                       CLEAR_ALARM;
 
                                        if (sys_retrycnt > NUM_BACKENDS)
                                        {
@@ -828,6 +835,7 @@ int main(int argc, char **argv)
                        {
                                /* seems OK. cancel health check timer */
                                pool_signal(SIGALRM, SIG_IGN);
+                               CLEAR_ALARM;
                        }
 
                        sleep_time = pool_config->health_check_period;
@@ -1187,6 +1195,7 @@ pid_t pcp_fork_a_child(int unix_fd, int inet_fd, char *pcp_conf_file)
 
                /* call PCP child main */
                POOL_SETMASK(&UnBlockSig);
+               health_check_timer_expired = 0;
                reload_config_request = 0;
                run_as_pcp_child = true;
                pcp_do_child(unix_fd, inet_fd, pcp_conf_file);
@@ -1228,6 +1237,7 @@ pid_t fork_a_child(int unix_fd, int inet_fd, int id)
 
                /* call child main */
                POOL_SETMASK(&UnBlockSig);
+               health_check_timer_expired = 0;
                reload_config_request = 0;
                my_proc_id = id;
                run_as_pcp_child = false;
@@ -1270,6 +1280,7 @@ pid_t worker_fork_a_child()
 
                /* call child main */
                POOL_SETMASK(&UnBlockSig);
+               health_check_timer_expired = 0;
                reload_config_request = 0;
                do_worker_child();
        }
index a6b6a06ef2a81aa1aac1cc6ad629487e96a4fbea..87d9832001d0e761bafe14af1cdc8e1b32ea3b95 100644 (file)
@@ -569,7 +569,7 @@ int connect_inet_domain_socket_by_port(char *host, int port, bool retry)
                        return -1;
                }
 
-               if (health_check_timer_expired)         /* has health check timer expired */
+               if (health_check_timer_expired && getpid() == mypid)            /* has health check timer expired */
                {
                        pool_log("connect_inet_domain_socket_by_port: health check timer expired");
                        close(fd);