From: Craig Ringer Date: Sat, 11 Jul 2015 13:56:06 +0000 (+0800) Subject: Resume replay immediately on bdr_apply_resume() X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=72eb773e5fe14dd2e0da810914606ae172a1c9ed;p=2ndquadrant_bdr.git Resume replay immediately on bdr_apply_resume() Previously, resuming from bdr_apply_pause() using bdr_apply_resume() could take up to a full latch-wait timeout in bdr_apply_work() to take effect because we didn't ever set the apply worker's latch on resume. --- diff --git a/bdr.c b/bdr.c index fc3ed32706..942c7f0482 100644 --- a/bdr.c +++ b/bdr.c @@ -909,6 +909,10 @@ bdr_maintain_schema(bool update_extensions) Datum bdr_apply_pause(PG_FUNCTION_ARGS) { + /* + * It's safe to pause without grabbing the segment lock; + * an overlapping resume won't do any harm. + */ BdrWorkerCtl->pause_apply = true; PG_RETURN_VOID(); } @@ -916,7 +920,27 @@ bdr_apply_pause(PG_FUNCTION_ARGS) Datum bdr_apply_resume(PG_FUNCTION_ARGS) { + int i; + + LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED); BdrWorkerCtl->pause_apply = false; + + /* + * To get apply workers to notice immediately we have to set all their + * latches. This will also force config reloads, but that's cheap and + * harmless. + */ + for (i = 0; i < bdr_max_workers; i++) + { + BdrWorker *w = &BdrWorkerCtl->slots[i]; + if (w->worker_type == BDR_WORKER_APPLY) + { + BdrApplyWorker *apply = &w->data.apply; + SetLatch(apply->proclatch); + } + } + + LWLockRelease(BdrWorkerCtl->lock); PG_RETURN_VOID(); } diff --git a/bdr_apply.c b/bdr_apply.c index f5be0bfef7..373a6591dc 100644 --- a/bdr_apply.c +++ b/bdr_apply.c @@ -2495,18 +2495,16 @@ bdr_apply_work(PGconn* streamConn) * flag in shmem. We don't pause until the end of the current * transaction, to avoid sleeping with locks held. * - * XXX With the 1s timeout below, we don't risk delaying the - * resumption too much. But it would be better to use a global - * latch that can be set by pg_bdr_apply_resume(), and not have - * to wake up so often. + * Sleep for 5 minutes before re-checking. We shouldn't really + * need to since we set the proc latch on resume, but it doesn't + * hurt to be careful. */ - while (BdrWorkerCtl->pause_apply && !IsTransactionState()) { ResetLatch(&MyProc->procLatch); rc = WaitLatch(&MyProc->procLatch, WL_TIMEOUT | WL_LATCH_SET | WL_POSTMASTER_DEATH, - 1000L); + 300000L); if (rc & WL_POSTMASTER_DEATH) proc_exit(1); diff --git a/expected/pause.out b/expected/pause.out index 85e1ba5b0f..ca63561964 100644 --- a/expected/pause.out +++ b/expected/pause.out @@ -84,12 +84,18 @@ SELECT bdr.bdr_apply_resume(); \ccc regression invalid command \ccc INSERT INTO pause_test(x) VALUES ('after resume'); +-- The pause latch timeout is 5 minutes. To make sure that setting +-- the latch is doing its job and unpausing before timeout, expect +-- resume to take effect well before then. +BEGIN; +SET LOCAL statement_timeout = '60s'; SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0); pg_xlog_wait_remote_apply --------------------------- (1 row) +COMMIT; \cc postgres invalid command \cc -- Must see all three rows diff --git a/sql/pause.sql b/sql/pause.sql index 792a30d8ac..64dde4a835 100644 --- a/sql/pause.sql +++ b/sql/pause.sql @@ -36,7 +36,14 @@ SELECT bdr.bdr_apply_resume(); \ccc regression INSERT INTO pause_test(x) VALUES ('after resume'); + +-- The pause latch timeout is 5 minutes. To make sure that setting +-- the latch is doing its job and unpausing before timeout, expect +-- resume to take effect well before then. +BEGIN; +SET LOCAL statement_timeout = '60s'; SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0); +COMMIT; \cc postgres