Resume replay immediately on bdr_apply_resume()
authorCraig Ringer <craig@2ndquadrant.com>
Sat, 11 Jul 2015 13:56:06 +0000 (21:56 +0800)
committerCraig Ringer <craig@2ndquadrant.com>
Wed, 15 Jul 2015 04:47:32 +0000 (12:47 +0800)
Previously, resuming from bdr_apply_pause() using bdr_apply_resume()
could take up to a full latch-wait timeout in bdr_apply_work() to take
effect because we didn't ever set the apply worker's latch on resume.

bdr.c
bdr_apply.c
expected/pause.out
sql/pause.sql

diff --git a/bdr.c b/bdr.c
index fc3ed3270670ad9c968854a86b9d980ef19d287d..942c7f04828dae7b672743abb7a7cbb0bd4acfe5 100644 (file)
--- a/bdr.c
+++ b/bdr.c
@@ -909,6 +909,10 @@ bdr_maintain_schema(bool update_extensions)
 Datum
 bdr_apply_pause(PG_FUNCTION_ARGS)
 {
+   /*
+    * It's safe to pause without grabbing the segment lock;
+    * an overlapping resume won't do any harm.
+    */
    BdrWorkerCtl->pause_apply = true;
    PG_RETURN_VOID();
 }
@@ -916,7 +920,27 @@ bdr_apply_pause(PG_FUNCTION_ARGS)
 Datum
 bdr_apply_resume(PG_FUNCTION_ARGS)
 {
+   int i;
+
+   LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
    BdrWorkerCtl->pause_apply = false;
+
+   /*
+    * To get apply workers to notice immediately we have to set all their
+    * latches. This will also force config reloads, but that's cheap and
+    * harmless.
+    */
+   for (i = 0; i < bdr_max_workers; i++)
+   {
+       BdrWorker *w = &BdrWorkerCtl->slots[i];
+       if (w->worker_type == BDR_WORKER_APPLY)
+       {
+           BdrApplyWorker *apply = &w->data.apply;
+           SetLatch(apply->proclatch);
+       }
+   }
+
+   LWLockRelease(BdrWorkerCtl->lock);
    PG_RETURN_VOID();
 }
 
index f5be0bfef7ad3400e3a39fff466109b7c1d6a779..373a6591dc4db4920a88c3c0cde55f2b083ce1cf 100644 (file)
@@ -2495,18 +2495,16 @@ bdr_apply_work(PGconn* streamConn)
         * flag in shmem. We don't pause until the end of the current
         * transaction, to avoid sleeping with locks held.
         *
-        * XXX With the 1s timeout below, we don't risk delaying the
-        * resumption too much. But it would be better to use a global
-        * latch that can be set by pg_bdr_apply_resume(), and not have
-        * to wake up so often.
+        * Sleep for 5 minutes before re-checking. We shouldn't really
+        * need to since we set the proc latch on resume, but it doesn't
+        * hurt to be careful.
         */
-
        while (BdrWorkerCtl->pause_apply && !IsTransactionState())
        {
            ResetLatch(&MyProc->procLatch);
            rc = WaitLatch(&MyProc->procLatch,
                           WL_TIMEOUT | WL_LATCH_SET | WL_POSTMASTER_DEATH,
-                          1000L);
+                          300000L);
 
            if (rc & WL_POSTMASTER_DEATH)
                proc_exit(1);
index 85e1ba5b0f4cbd13c0538df62b89ec2c7bd3a2b0..ca6356196449b15533a32ded7961ab80fdc06f77 100644 (file)
@@ -84,12 +84,18 @@ SELECT bdr.bdr_apply_resume();
 \ccc regression
 invalid command \ccc
 INSERT INTO pause_test(x) VALUES ('after resume');
+-- The pause latch timeout is 5 minutes. To make sure that setting
+-- the latch is doing its job and unpausing before timeout, expect
+-- resume to take effect well before then.
+BEGIN;
+SET LOCAL statement_timeout = '60s';
 SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0);
  pg_xlog_wait_remote_apply 
 ---------------------------
  
 (1 row)
 
+COMMIT;
 \cc postgres
 invalid command \cc
 -- Must see all three rows
index 792a30d8ac0bf1499849584d155e935cb3a14bff..64dde4a83545842708c62fd238c5a772ad07456a 100644 (file)
@@ -36,7 +36,14 @@ SELECT bdr.bdr_apply_resume();
 \ccc regression
 
 INSERT INTO pause_test(x) VALUES ('after resume');
+
+-- The pause latch timeout is 5 minutes. To make sure that setting
+-- the latch is doing its job and unpausing before timeout, expect
+-- resume to take effect well before then.
+BEGIN;
+SET LOCAL statement_timeout = '60s';
 SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0);
+COMMIT;
 
 \cc postgres