Rewrite background writer. bgwriter-rewrite
author: Andres Freund <andres@anarazel.de>
Fri, 19 Feb 2016 20:07:51 +0000 (12:07 -0800)
committer: Andres Freund <andres@anarazel.de>
Tue, 11 Jun 2019 02:01:00 +0000 (19:01 -0700)
This currently consists of two major parts:

1) Add more statistics, to be able to even evaluate the effects of
   bgwriter changes / problems. This should probably be split into a
   separate commit.

   It's remarkable how odd the set of current measurements is, and how
   many different mechanisms for transporting those values we
   currently have. The patch adds and replaces a few measurements, but
   doesn't yet do enough cleanup (have fewer transport mechanisms,
   split into different views).

2) A new bgwriter implementation (that can be turned on by setting the
   bgwriter_legacy GUC to false). There's a few major differences:

   a) bgwriter performs the clock sweep - that makes it a lot easier
      to actually find buffers worthwhile to clean. It's quite
      possible to get into situations where the old bgwriter can't do
      anything for a while because all buffers have a usagecount > 0.
   b) When a buffer is encountered by bgwriter, after performing clock
      sweep, is clean and has usage/pin count of 0 (i.e. it can be
      reclaimed), then we also push it onto the queue.
   c) It just has a ringbuffer of clean buffers, that backends can
      drain. Bgwriter pushes (without any locks) entries onto the
      queue, backends can pop them off.
   d) The pacing logic is a lot simpler. There's a ringbuffer that
      bgwriter tries to fill. There's a low watermark that causes
      backends to wake up bgwriter.

16 files changed:
src/backend/access/transam/xlog.c
src/backend/catalog/system_views.sql
src/backend/postmaster/bgwriter.c
src/backend/postmaster/checkpointer.c
src/backend/postmaster/pgstat.c
src/backend/storage/buffer/buf_init.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/freelist.c
src/backend/utils/adt/pgstatfuncs.c
src/backend/utils/misc/guc.c
src/include/catalog/pg_proc.dat
src/include/pgstat.h
src/include/postmaster/bgwriter.h
src/include/storage/buf_internals.h
src/include/storage/bufmgr.h
src/test/regress/expected/rules.out

index 1c7dd51b9f1c0b81d7235587fcf9eddfbc2dee81..78c1d786fa4107a6aebe7c6b46f1020de420f196 100644 (file)
@@ -8376,6 +8376,8 @@ LogCheckpointEnd(bool restartpoint)
    BgWriterStats.m_checkpoint_sync_time +=
        sync_secs * 1000 + sync_usecs / 1000;
 
+   BgWriterStats.m_buf_fsync_checkpointer += CheckpointStats.ckpt_sync_rels;
+
    /*
     * All of the published timing statistics are accounted for.  Only
     * continue if a log message is to be written.
index 78a103cdb95a426af9db4bf3aa271ae86cc5c477..d15aed10ad285c2a4226528e5eb197e9c318be47 100644 (file)
@@ -898,12 +898,27 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req,
         pg_stat_get_checkpoint_write_time() AS checkpoint_write_time,
         pg_stat_get_checkpoint_sync_time() AS checkpoint_sync_time,
-        pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint,
-        pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean,
+
+        pg_stat_get_buf_written_checkpoints() AS buffers_written_checkpoint,
+        pg_stat_get_buf_written_bgwriter() AS buffers_written_bgwriter,
+        pg_stat_get_buf_written_backend() AS buffers_written_backend,
+        pg_stat_get_buf_written_ring() AS buffers_written_ring,
+
+        pg_stat_get_buf_fsync_checkpointer() AS buffers_fsync_checkpointer,
+        pg_stat_get_buf_fsync_bgwriter() AS buffers_fsync_bgwriter,
+        pg_stat_get_buf_fsync_backend() AS buffers_fsync_backend,
+
+        pg_stat_get_buf_bgwriter_clean() AS buffers_bgwriter_clean,
+
+        pg_stat_get_buf_alloc_preclean() AS buffers_alloc_preclean,
+        pg_stat_get_buf_alloc_free() AS buffers_alloc_free,
+        pg_stat_get_buf_alloc_sweep() AS buffers_alloc_sweep,
+        pg_stat_get_buf_alloc_ring() AS buffers_alloc_ring,
+
+        pg_stat_get_buf_ticks_bgwriter() AS buffers_ticks_bgwriter,
+        pg_stat_get_buf_ticks_backend() AS buffers_ticks_backend,
+
         pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean,
-        pg_stat_get_buf_written_backend() AS buffers_backend,
-        pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
-        pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
 CREATE VIEW pg_stat_progress_vacuum AS
index e6b6c549de54ab226e6cae322cc9848f0214b319..526304fefc9ab485a8fe3e6098ade74fb44d4f0d 100644 (file)
@@ -65,6 +65,7 @@
  * GUC parameters
  */
 int            BgWriterDelay = 200;
+bool       BgWriterLegacy = true;
 
 /*
  * Multiplier to apply to BgWriterDelay when we decide to hibernate.
@@ -264,7 +265,10 @@ BackgroundWriterMain(void)
        /*
         * Do one cycle of dirty-buffer writing.
         */
-       can_hibernate = BgBufferSync(&wb_context);
+       if (BgWriterLegacy)
+           can_hibernate = BgBufferSyncLegacy(&wb_context);
+       else
+           can_hibernate = BgBufferSyncNew(&wb_context);
 
        /*
         * Send off activity statistics to the stats collector
@@ -366,7 +370,8 @@ BackgroundWriterMain(void)
                             BgWriterDelay * HIBERNATE_FACTOR,
                             WAIT_EVENT_BGWRITER_HIBERNATE);
            /* Reset the notification request in case we timed out */
-           StrategyNotifyBgWriter(-1);
+           if (BgWriterLegacy)
+               StrategyNotifyBgWriter(-1);
        }
 
        prev_hibernate = can_hibernate;
index 13f152b4731e7f7338dba7a73af1cbd9e4bea5c8..e5ecca1e3db0505ee2f411bb0124b9e149bd30b3 100644 (file)
  * The requests array holds fsync requests sent by backends and not yet
  * absorbed by the checkpointer.
  *
- * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and
+ * Unlike the checkpoint fields, num_written_*, num_fsync_*, and
  * the requests fields are protected by CheckpointerCommLock.
  *----------
  */
@@ -127,8 +127,11 @@ typedef struct
    ConditionVariable start_cv; /* signaled when ckpt_started advances */
    ConditionVariable done_cv;  /* signaled when ckpt_done advances */
 
-   uint32      num_backend_writes; /* counts user backend buffer writes */
-   uint32      num_backend_fsync;  /* counts user backend fsync calls */
+   uint32      num_written_backend; /* counts user backend buffer writes */
+   uint32      num_written_ring; /* counts ring buffer writes */
+
+   uint32      num_fsync_bgwriter; /* counts bgwriter fsync calls */
+   uint32      num_fsync_backend;  /* counts user backend fsync calls */
 
    int         num_requests;   /* current # of requests */
    int         max_requests;   /* allocated array size */
@@ -1119,7 +1122,7 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
 
    /* Count all backend writes regardless of if they fit in the queue */
    if (!AmBackgroundWriterProcess())
-       CheckpointerShmem->num_backend_writes++;
+       CheckpointerShmem->num_written_backend++;
 
    /*
     * If the checkpointer isn't running or the request queue is full, the
@@ -1134,8 +1137,10 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
         * Count the subset of writes where backends have to do their own
         * fsync
         */
-       if (!AmBackgroundWriterProcess())
-           CheckpointerShmem->num_backend_fsync++;
+       if (AmBackgroundWriterProcess())
+           CheckpointerShmem->num_fsync_bgwriter++;
+       else
+           CheckpointerShmem->num_fsync_backend++;
        LWLockRelease(CheckpointerCommLock);
        return false;
    }
@@ -1295,11 +1300,15 @@ AbsorbSyncRequests(void)
    LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
 
    /* Transfer stats counts into pending pgstats message */
-   BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes;
-   BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync;
+   BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_written_backend;
+   BgWriterStats.m_buf_written_ring += CheckpointerShmem->num_written_ring;
+   BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_fsync_backend;
+   BgWriterStats.m_buf_fsync_bgwriter += CheckpointerShmem->num_fsync_bgwriter;
 
-   CheckpointerShmem->num_backend_writes = 0;
-   CheckpointerShmem->num_backend_fsync = 0;
+   CheckpointerShmem->num_written_backend = 0;
+   CheckpointerShmem->num_written_ring = 0;
+   CheckpointerShmem->num_fsync_backend = 0;
+   CheckpointerShmem->num_fsync_bgwriter = 0;
 
    /*
     * We try to avoid holding the lock for a long time by copying the request
@@ -1373,3 +1382,12 @@ FirstCallSinceLastCheckpoint(void)
 
    return FirstCall;
 }
+
+// FIXME: crappy API
+void
+ReportRingWrite(void)
+{
+   LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
+   CheckpointerShmem->num_written_ring++;
+   LWLockRelease(CheckpointerCommLock);
+}
index b4f2b28b517fbe58410d215398b938ae7fc7bb55..9aa7b9b81391d06205be9299d36491d792cc47b6 100644 (file)
@@ -6313,12 +6313,26 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
    globalStats.requested_checkpoints += msg->m_requested_checkpoints;
    globalStats.checkpoint_write_time += msg->m_checkpoint_write_time;
    globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time;
+
    globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
-   globalStats.buf_written_clean += msg->m_buf_written_clean;
-   globalStats.maxwritten_clean += msg->m_maxwritten_clean;
+   globalStats.buf_written_bgwriter += msg->m_buf_written_bgwriter;
    globalStats.buf_written_backend += msg->m_buf_written_backend;
+   globalStats.buf_written_ring += msg->m_buf_written_ring;
+
+   globalStats.buf_fsync_checkpointer += msg->m_buf_fsync_checkpointer;
+   globalStats.buf_fsync_bgwriter += msg->m_buf_fsync_bgwriter;
    globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
-   globalStats.buf_alloc += msg->m_buf_alloc;
+
+   globalStats.buf_alloc_preclean += msg->m_buf_alloc_preclean;
+   globalStats.buf_alloc_free += msg->m_buf_alloc_free;
+   globalStats.buf_alloc_sweep += msg->m_buf_alloc_sweep;
+   globalStats.buf_alloc_ring += msg->m_buf_alloc_ring;
+
+   globalStats.buf_ticks_bgwriter += msg->m_buf_ticks_bgwriter;
+   globalStats.buf_ticks_backend += msg->m_buf_ticks_backend;
+
+   globalStats.buf_clean_bgwriter += msg->m_buf_clean_bgwriter;
+   globalStats.maxwritten_clean += msg->m_maxwritten_clean;
 }
 
 /* ----------
index ccd2c31c0b39af34228b9f5c4eb2f4d8239399b8..6154f75714f260043d67421fa957a56246892320 100644 (file)
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "lib/ringbuf.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
 
@@ -23,6 +24,7 @@ char     *BufferBlocks;
 LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
 WritebackContext BackendWritebackContext;
 CkptSortItem *CkptBufferIds;
+ringbuf *VictimBuffers = NULL;
 
 
 /*
@@ -70,7 +72,8 @@ InitBufferPool(void)
    bool        foundBufs,
                foundDescs,
                foundIOLocks,
-               foundBufCkpt;
+               foundBufCkpt,
+               foundFreeBufs;
 
    /* Align descriptors to a cacheline boundary. */
    BufferDescriptors = (BufferDescPadded *)
@@ -91,6 +94,10 @@ InitBufferPool(void)
    LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS, "buffer_io");
    LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT, "buffer_content");
 
+   VictimBuffers = ShmemInitStruct("Free Buffers",
+                                   ringbuf_size(VICTIM_BUFFER_PRECLEAN_SIZE),
+                                   &foundFreeBufs);
+
    /*
     * The array used to sort to-be-checkpointed buffer ids is located in
     * shared memory, to avoid having to allocate significant amounts of
@@ -102,10 +109,11 @@ InitBufferPool(void)
        ShmemInitStruct("Checkpoint BufferIds",
                        NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
 
-   if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
+   if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt || foundFreeBufs)
    {
        /* should find all of these, or none of them */
-       Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
+       Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt && foundFreeBufs);
+
        /* note: this path is only taken in EXEC_BACKEND case */
    }
    else
@@ -129,6 +137,7 @@ InitBufferPool(void)
            /*
             * Initially link all the buffers together as unused. Subsequent
             * management of this list is done by freelist.c.
+            * FIXME: remove once legacy bgwriter is removed
             */
            buf->freeNext = i + 1;
 
@@ -139,8 +148,10 @@ InitBufferPool(void)
                             LWTRANCHE_BUFFER_IO_IN_PROGRESS);
        }
 
-       /* Correct last entry of linked list */
+       /* Correct last entry of linked list: FIXME: remove */
        GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
+       /* FIXME: could fill the first few free buffers? */
+       VictimBuffers = ringbuf_create(VictimBuffers, VICTIM_BUFFER_PRECLEAN_SIZE);
    }
 
    /* Init other shared buffer-management stuff */
@@ -189,5 +200,8 @@ BufferShmemSize(void)
    /* size of checkpoint sort array in bufmgr.c */
    size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
 
+   /* FIXME: better ringbuffer size */
+   size = add_size(size, ringbuf_size(VICTIM_BUFFER_PRECLEAN_SIZE));
+
    return size;
 }
index 7332e6b59034ce3b1a06c4aba91f00e7237c788f..9d63244ba084b871ff84c820782343df403ffe83 100644 (file)
@@ -39,6 +39,7 @@
 #include "catalog/storage.h"
 #include "executor/instrument.h"
 #include "lib/binaryheap.h"
+#include "lib/ringbuf.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "pgstat.h"
@@ -101,7 +102,7 @@ typedef struct CkptTsStatus
    /* already processed pages in this tablespace */
    int         num_scanned;
 
-   /* current offset in CkptBufferIds for this tablespace */
+   /* current offset in CkptBufferIds for this tablespace */
    int         index;
 } CkptTsStatus;
 
@@ -866,11 +867,29 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
    if (isExtend)
    {
+       instr_time  io_start,
+           io_time;
+
        /* new buffers are zero-filled */
        MemSet((char *) bufBlock, 0, BLCKSZ);
+
+       if (track_io_timing)
+           INSTR_TIME_SET_CURRENT(io_start);
+
        /* don't set checksum for all-zero page */
        smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
 
+       if (track_io_timing)
+       {
+           INSTR_TIME_SET_CURRENT(io_time);
+           INSTR_TIME_SUBTRACT(io_time, io_start);
+           pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+           INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+       }
+
        /*
         * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
         * although we're essentially performing a write. At least on linux
@@ -1136,6 +1155,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                        UnpinBuffer(buf, true);
                        continue;
                    }
+
+                   // FIXME: crappy API
+                   StrategyReportWrite(strategy, buf);
                }
 
                /* OK, do the I/O */
@@ -1352,6 +1374,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  * trying to write it out.  We have to let them finish before we can
  * reclaim the buffer.
  *
+ * FIXME: ^^^
+ *
  * The buffer could get reclaimed by someone else while we are waiting
  * to acquire the necessary locks; if so, don't mess it up.
  */
@@ -2038,7 +2062,119 @@ BufferSync(int flags)
 }
 
 /*
- * BgBufferSync -- Write out some dirty buffers in the pool.
+ * BgBufferSyncNew -- Write out some dirty buffers in the pool.
+ *
+ * This is called periodically by the background writer process.
+ *
+ * Returns true if it's appropriate for the bgwriter process to go into
+ * low-power hibernation mode.
+ */
+bool
+BgBufferSyncNew(WritebackContext *wb_context)
+{
+   uint32      recent_alloc_preclean;
+   uint32      recent_alloc_free;
+   uint32      recent_alloc_sweep;
+   uint32      recent_alloc_ring;
+   uint32      strategy_passes;
+   uint64      nticks;
+   uint64      nticks_sum = 0;
+
+   /* Make sure we can handle the pin inside SyncOneBuffer */
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+   /* Know where to start, and report buffer alloc counts to pgstat */
+   StrategySyncStart(&strategy_passes,
+                     &recent_alloc_preclean,
+                     &recent_alloc_free,
+                     &recent_alloc_sweep,
+                     &recent_alloc_ring,
+                     &nticks);
+
+   /* Report buffer alloc counts to pgstat */
+   BgWriterStats.m_buf_alloc_preclean += recent_alloc_preclean;
+   BgWriterStats.m_buf_alloc_free += recent_alloc_free;
+   BgWriterStats.m_buf_alloc_sweep += recent_alloc_sweep;
+   BgWriterStats.m_buf_alloc_ring += recent_alloc_ring;
+   BgWriterStats.m_buf_ticks_backend += nticks;
+
+   /* go and populate freelist */
+   while (!ringbuf_full(VictimBuffers))
+   {
+       BufferDesc *bufHdr;
+       bool pushed;
+       bool dirty;
+       uint32      buf_state;
+
+       ReservePrivateRefCountEntry();
+
+       bufHdr = ClockSweep(NULL, &buf_state, &nticks);
+       nticks_sum += nticks;
+
+       dirty = buf_state & BM_DIRTY;
+
+       if (dirty)
+       {
+           SMgrRelation reln;
+           BufferTag tag;
+           LWLock *content_lock;
+
+
+           /*
+            * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
+            * buffer is clean by the time we've locked it.)
+            */
+           PinBuffer_Locked(bufHdr);
+
+           /* open relation before locking the page */
+           reln = smgropen(bufHdr->tag.rnode, InvalidBackendId);
+
+           content_lock = BufferDescriptorGetContentLock(bufHdr);
+
+           LWLockAcquire(content_lock, LW_SHARED);
+           FlushBuffer(bufHdr, reln);
+           LWLockRelease(content_lock);
+
+           /* copy tag before releasing pin */
+           tag = bufHdr->tag;
+
+           UnpinBuffer(bufHdr, true);
+
+           pushed = ringbuf_push(VictimBuffers, bufHdr);
+
+           Assert(wb_context);
+           ScheduleBufferTagForWriteback(wb_context, &tag);
+
+           BgWriterStats.m_buf_written_bgwriter++;
+       }
+       else
+       {
+           UnlockBufHdr(bufHdr, buf_state);
+           pushed = ringbuf_push(VictimBuffers, bufHdr);
+
+           BgWriterStats.m_buf_clean_bgwriter++;
+       }
+
+       /* full, shouldn't normally happen, we're the only writer  */
+       if (!pushed)
+           break;
+
+       /* so we occasionally sleep, even if continually busy */
+       if (BgWriterStats.m_buf_written_bgwriter >= bgwriter_lru_maxpages)
+       {
+           BgWriterStats.m_maxwritten_clean++;
+           break;
+       }
+   }
+
+   BgWriterStats.m_buf_ticks_bgwriter += nticks_sum;
+
+   return BgWriterStats.m_buf_written_bgwriter == 0 &&
+       BgWriterStats.m_buf_clean_bgwriter == 0;
+}
+
+/*
+ * BgBufferSyncLegacy -- Write out some dirty buffers in the pool.
  *
  * This is called periodically by the background writer process.
  *
@@ -2049,12 +2185,16 @@ BufferSync(int flags)
  * bgwriter_lru_maxpages to 0.)
  */
 bool
-BgBufferSync(WritebackContext *wb_context)
+BgBufferSyncLegacy(WritebackContext *wb_context)
 {
    /* info obtained from freelist.c */
    int         strategy_buf_id;
    uint32      strategy_passes;
-   uint32      recent_alloc;
+   uint32      recent_alloc_preclean;
+   uint32      recent_alloc_free;
+   uint32      recent_alloc_sweep;
+   uint32      recent_alloc_ring;
+   uint64      recent_ticks;
 
    /*
     * Information saved between calls so we can determine the strategy
@@ -2090,16 +2230,25 @@ BgBufferSync(WritebackContext *wb_context)
 
    /* Variables for final smoothed_density update */
    long        new_strategy_delta;
-   uint32      new_recent_alloc;
+   uint32      new_recent_alloc_sweep;
 
    /*
     * Find out where the freelist clock sweep currently is, and how many
     * buffer allocations have happened since our last call.
     */
-   strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+   strategy_buf_id = StrategySyncStart(&strategy_passes,
+                                       &recent_alloc_preclean,
+                                       &recent_alloc_free,
+                                       &recent_alloc_sweep,
+                                       &recent_alloc_ring,
+                                       &recent_ticks);
 
    /* Report buffer alloc counts to pgstat */
-   BgWriterStats.m_buf_alloc += recent_alloc;
+   BgWriterStats.m_buf_alloc_preclean += recent_alloc_preclean;
+   BgWriterStats.m_buf_alloc_free += recent_alloc_free;
+   BgWriterStats.m_buf_alloc_sweep += recent_alloc_sweep;
+   BgWriterStats.m_buf_alloc_ring += recent_alloc_ring;
+   BgWriterStats.m_buf_ticks_backend += recent_ticks;
 
    /*
     * If we're not running the LRU scan, just stop after doing the stats
@@ -2196,9 +2345,9 @@ BgBufferSync(WritebackContext *wb_context)
     *
     * If the strategy point didn't move, we don't update the density estimate
     */
-   if (strategy_delta > 0 && recent_alloc > 0)
+   if (strategy_delta > 0 && recent_alloc_sweep > 0)
    {
-       scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+       scans_per_alloc = (float) strategy_delta / (float) recent_alloc_sweep;
        smoothed_density += (scans_per_alloc - smoothed_density) /
            smoothing_samples;
    }
@@ -2216,10 +2365,10 @@ BgBufferSync(WritebackContext *wb_context)
     * a true average we want a fast-attack, slow-decline behavior: we
     * immediately follow any increase.
     */
-   if (smoothed_alloc <= (float) recent_alloc)
-       smoothed_alloc = recent_alloc;
+   if (smoothed_alloc <= (float) recent_alloc_sweep)
+       smoothed_alloc = recent_alloc_sweep;
    else
-       smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
+       smoothed_alloc += ((float) recent_alloc_sweep - smoothed_alloc) /
            smoothing_samples;
 
    /* Scale the estimate by a GUC to allow more aggressive tuning. */
@@ -2297,7 +2446,7 @@ BgBufferSync(WritebackContext *wb_context)
            reusable_buffers++;
    }
 
-   BgWriterStats.m_buf_written_clean += num_written;
+   BgWriterStats.m_buf_written_bgwriter += num_written;
 
 #ifdef BGW_DEBUG
    elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
@@ -2317,22 +2466,22 @@ BgBufferSync(WritebackContext *wb_context)
     * density estimates.
     */
    new_strategy_delta = bufs_to_lap - num_to_scan;
-   new_recent_alloc = reusable_buffers - reusable_buffers_est;
-   if (new_strategy_delta > 0 && new_recent_alloc > 0)
+   new_recent_alloc_sweep = reusable_buffers - reusable_buffers_est;
+   if (new_strategy_delta > 0 && new_recent_alloc_sweep > 0)
    {
-       scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
+       scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc_sweep;
        smoothed_density += (scans_per_alloc - smoothed_density) /
            smoothing_samples;
 
 #ifdef BGW_DEBUG
        elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
-            new_recent_alloc, new_strategy_delta,
+            new_recent_alloc_sweep, new_strategy_delta,
             scans_per_alloc, smoothed_density);
 #endif
    }
 
    /* Return true if OK to hibernate */
-   return (bufs_to_lap == 0 && recent_alloc == 0);
+   return (bufs_to_lap == 0 && recent_alloc_sweep == 0);
 }
 
 /*
@@ -4321,6 +4470,8 @@ void
 IssuePendingWritebacks(WritebackContext *context)
 {
    int         i;
+   instr_time  io_start,
+               io_time;
 
    if (context->nr_pending == 0)
        return;
@@ -4332,6 +4483,9 @@ IssuePendingWritebacks(WritebackContext *context)
    qsort(&context->pending_writebacks, context->nr_pending,
          sizeof(PendingWriteback), buffertag_comparator);
 
+   if (track_io_timing)
+       INSTR_TIME_SET_CURRENT(io_start);
+
    /*
     * Coalesce neighbouring writes, but nothing else. For that we iterate
     * through the, now sorted, array of pending flushes, and look forward to
@@ -4381,6 +4535,14 @@ IssuePendingWritebacks(WritebackContext *context)
        smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
    }
 
+   if (track_io_timing)
+   {
+       INSTR_TIME_SET_CURRENT(io_time);
+       INSTR_TIME_SUBTRACT(io_time, io_start);
+       pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+       INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+   }
+
    context->nr_pending = 0;
 }
 
index 06659ab265336beb15c46a1e3e514f70f2b7a917..6583f1c3815efa974ae687d787575ab053b1b755 100644 (file)
@@ -15,7 +15,9 @@
  */
 #include "postgres.h"
 
+#include "lib/ringbuf.h"
 #include "port/atomics.h"
+#include "postmaster/bgwriter.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
@@ -51,7 +53,14 @@ typedef struct
     * overflow during a single bgwriter cycle.
     */
    uint32      completePasses; /* Complete cycles of the clock sweep */
-   pg_atomic_uint32 numBufferAllocs;   /* Buffers allocated since last reset */
+
+   /* Buffers allocated since last reset */
+   pg_atomic_uint32 numBufferAllocsPreclean;
+   pg_atomic_uint32 numBufferAllocsFree;
+   pg_atomic_uint32 numBufferAllocsSweep;
+   pg_atomic_uint32 numBufferAllocsRing;
+
+   pg_atomic_uint64 numBufferTicksBackend;
 
    /*
     * Bgworker process to be notified upon activity or -1 if none. See
@@ -168,6 +177,62 @@ ClockSweepTick(void)
    return victim;
 }
 
+BufferDesc *
+ClockSweep(BufferAccessStrategy strategy, uint32 *buf_state, uint64 *nticks)
+{
+   BufferDesc *buf;
+   int         trycounter;
+   uint32      local_buf_state;    /* to avoid repeated (de-)referencing */
+   uint64      local_nticks = 0;
+
+   trycounter = NBuffers;
+   for (;;)
+   {
+
+       buf = GetBufferDescriptor(ClockSweepTick());
+       local_nticks++;
+
+       /*
+        * If the buffer is pinned or has a nonzero usage_count, we cannot use
+        * it; decrement the usage_count (unless pinned) and keep scanning.
+        */
+       local_buf_state = LockBufHdr(buf);
+
+       if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+       {
+           if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
+           {
+               local_buf_state -= BUF_USAGECOUNT_ONE;
+
+               trycounter = NBuffers;
+           }
+           else
+           {
+               /* Found a usable buffer */
+               if (strategy != NULL)
+                   AddBufferToRing(strategy, buf);
+               *buf_state = local_buf_state;
+               *nticks = local_nticks;
+
+               return buf;
+           }
+       }
+       else if (--trycounter == 0)
+       {
+           /*
+            * We've scanned all the buffers without making any state changes,
+            * so all the buffers are pinned (or were when we looked at them).
+            * We could hope that someone will free one eventually, but it's
+            * probably better to fail than to risk getting stuck in an
+            * infinite loop.
+            */
+           UnlockBufHdr(buf, local_buf_state);
+           elog(ERROR, "no unpinned buffers available");
+       }
+       UnlockBufHdr(buf, local_buf_state);
+   }
+}
+
 /*
  * have_free_buffer -- a lockless check to see if there is a free buffer in
  *                    buffer pool.
@@ -202,8 +267,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
 {
    BufferDesc *buf;
    int         bgwprocno;
-   int         trycounter;
    uint32      local_buf_state;    /* to avoid repeated (de-)referencing */
+   uint64      nticks;
 
    /*
     * If given a strategy object, see whether it can select a buffer. We
@@ -229,26 +294,22 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
     * some arbitrary process.
     */
    bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
-   if (bgwprocno != -1)
+   if (BgWriterLegacy)
    {
-       /* reset bgwprocno first, before setting the latch */
-       StrategyControl->bgwprocno = -1;
+       if (bgwprocno != -1)
+       {
+           /* reset bgwprocno first, before setting the latch */
+           StrategyControl->bgwprocno = -1;
 
-       /*
-        * Not acquiring ProcArrayLock here which is slightly icky. It's
-        * actually fine because procLatch isn't ever freed, so we just can
-        * potentially set the wrong process' (or no process') latch.
-        */
-       SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+           /*
+            * Not acquiring ProcArrayLock here which is slightly icky. It's
+            * actually fine because procLatch isn't ever freed, so we just can
+            * potentially set the wrong process' (or no process') latch.
+            */
+           SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+       }
    }
 
-   /*
-    * We count buffer allocation requests so that the bgwriter can estimate
-    * the rate of buffer consumption.  Note that buffers recycled by a
-    * strategy object are intentionally not counted here.
-    */
-   pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
-
    /*
     * First check, without acquiring the lock, whether there's buffers in the
     * freelist. Since we otherwise don't require the spinlock in every
@@ -302,6 +363,9 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
            if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
                && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
            {
+               // FIXME: possible to do outside of lock?
+               pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsFree, 1);
+
                if (strategy != NULL)
                    AddBufferToRing(strategy, buf);
                *buf_state = local_buf_state;
@@ -312,51 +376,81 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
        }
    }
 
-   /* Nothing on the freelist, so run the "clock sweep" algorithm */
-   trycounter = NBuffers;
-   for (;;)
+   if (!BgWriterLegacy)
    {
-       buf = GetBufferDescriptor(ClockSweepTick());
+       int i = 0;
 
        /*
-        * If the buffer is pinned or has a nonzero usage_count, we cannot use
-        * it; decrement the usage_count (unless pinned) and keep scanning.
+        * Try to get a buffer from the clean buffer list.
         */
-       local_buf_state = LockBufHdr(buf);
-
-       if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+       while (!ringbuf_empty(VictimBuffers))
        {
-           if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
-           {
-               local_buf_state -= BUF_USAGECOUNT_ONE;
+           BufferDesc *buf;
+           bool found;
+           uint32 elements;
 
-               trycounter = NBuffers;
+           found = ringbuf_pop(VictimBuffers, (void *)&buf);
+
+           /* If the ringbuffer is sufficiently depleted, wakeup the bgwriter. */
+           if (bgwprocno != -1 &&
+               (!found ||
+                (elements = ringbuf_elements(VictimBuffers)) < VICTIM_BUFFER_PRECLEAN_SIZE / 4))
+           {
+#if 0
+               if (!found)
+                   elog(LOG, "signalling bgwriter: empty");
+               else
+                   elog(LOG, "signalling bgwriter: watermark: %u %u/%u",
+                        elements, VICTIM_BUFFER_PRECLEAN_SIZE / 4, VICTIM_BUFFER_PRECLEAN_SIZE);
+#endif
+               SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
            }
-           else
+
+           if (!found)
+               break;
+
+           /* check if the buffer is still unused, done if so */
+           local_buf_state = LockBufHdr(buf);
+           if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+               && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
            {
-               /* Found a usable buffer */
+               // FIXME: possible to do outside of lock?
+               pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsPreclean, 1);
+
                if (strategy != NULL)
                    AddBufferToRing(strategy, buf);
                *buf_state = local_buf_state;
                return buf;
            }
+           else
+           {
+               UnlockBufHdr(buf, local_buf_state);
+               //ereport(LOG, (errmsg("buffer %u since reused (hand at %u)",
+               //                   buf->buf_id,
+               //                   pg_atomic_read_u32(&StrategyControl->nextVictimBuffer) % NBuffers),
+               //            errhidestmt(true)));
+           }
+
+           i++;
        }
-       else if (--trycounter == 0)
-       {
-           /*
-            * We've scanned all the buffers without making any state changes,
-            * so all the buffers are pinned (or were when we looked at them).
-            * We could hope that someone will free one eventually, but it's
-            * probably better to fail than to risk getting stuck in an
-            * infinite loop.
-            */
-           UnlockBufHdr(buf, local_buf_state);
-           elog(ERROR, "no unpinned buffers available");
-       }
-       UnlockBufHdr(buf, local_buf_state);
+
+#if 0
+       ereport(LOG, (errmsg("ringbuf empty after %u cycles", i),
+                     errhidestmt(true)));
+#endif
+
    }
+
+   /* Nothing on the freelist, so run the "clock sweep" algorithm */
+   buf = ClockSweep(strategy, buf_state, &nticks);
+
+   pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsSweep, 1);
+   pg_atomic_fetch_add_u64(&StrategyControl->numBufferTicksBackend, nticks);
+
+   return buf;
 }
 
+
 /*
  * StrategyFreeBuffer: put a buffer on the freelist
  */
@@ -381,18 +475,22 @@ StrategyFreeBuffer(BufferDesc *buf)
 }
 
 /*
- * StrategySyncStart -- tell BufferSync where to start syncing
+ * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
- * The result is the buffer index of the best buffer to sync first.
- * BufferSync() will proceed circularly around the buffer array from there.
+ * The result is the buffer index below the current clock-hand. BgBufferSync()
+ * will proceed circularly around the buffer array from there.
  *
- * In addition, we return the completed-pass count (which is effectively
- * the higher-order bits of nextVictimBuffer) and the count of recent buffer
- * allocs if non-NULL pointers are passed.  The alloc count is reset after
- * being read.
+ * In addition, we return the completed-pass count (which is effectively the
+ * higher-order bits of nextVictimBuffer) and the counts of recent buffer
+ * allocations.  The allocation counts are reset after being read.
  */
 int
-StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
+StrategySyncStart(uint32 *complete_passes,
+                 uint32 *alloc_preclean,
+                 uint32 *alloc_free,
+                 uint32 *alloc_sweep,
+                 uint32 *alloc_ring,
+                 uint64 *ticks_backend)
 {
    uint32      nextVictimBuffer;
    int         result;
@@ -410,13 +508,16 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
         * completePasses could be incremented. C.f. ClockSweepTick().
         */
        *complete_passes += nextVictimBuffer / NBuffers;
-   }
 
-   if (num_buf_alloc)
-   {
-       *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
    }
    SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+
+   *alloc_preclean = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsPreclean, 0);
+   *alloc_free = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsFree, 0);
+   *alloc_sweep = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsSweep, 0);
+   *alloc_ring = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocsRing, 0);
+   *ticks_backend = pg_atomic_exchange_u64(&StrategyControl->numBufferTicksBackend, 0);
+
    return result;
 }
 
@@ -517,7 +618,11 @@ StrategyInitialize(bool init)
 
        /* Clear statistics */
        StrategyControl->completePasses = 0;
-       pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+       pg_atomic_init_u32(&StrategyControl->numBufferAllocsPreclean, 0);
+       pg_atomic_init_u32(&StrategyControl->numBufferAllocsFree, 0);
+       pg_atomic_init_u32(&StrategyControl->numBufferAllocsSweep, 0);
+       pg_atomic_init_u32(&StrategyControl->numBufferAllocsRing, 0);
+       pg_atomic_init_u64(&StrategyControl->numBufferTicksBackend, 0);
 
        /* No pending notification */
        StrategyControl->bgwprocno = -1;
@@ -645,6 +750,9 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
    if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
        && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
    {
+       // FIXME: possible to do outside of lock?
+       pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocsRing, 1);
+
        strategy->current_was_in_ring = true;
        *buf_state = local_buf_state;
        return buf;
@@ -702,3 +810,11 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
 
    return true;
 }
+
+void
+StrategyReportWrite(BufferAccessStrategy strategy,
+                   BufferDesc *buf)
+{
+   if (strategy->current_was_in_ring)
+       ReportRingWrite();
+}
index 05240bfd142c3b0176d254640e81dba4c6cb58bb..d0d163ea35a8bcab226ddafad5de6ef6ffe891c2 100644 (file)
@@ -1604,15 +1604,45 @@ pg_stat_get_bgwriter_requested_checkpoints(PG_FUNCTION_ARGS)
 }
 
 Datum
-pg_stat_get_bgwriter_buf_written_checkpoints(PG_FUNCTION_ARGS)
+pg_stat_get_buf_written_checkpoints(PG_FUNCTION_ARGS)
 {
    PG_RETURN_INT64(pgstat_fetch_global()->buf_written_checkpoints);
 }
 
 Datum
-pg_stat_get_bgwriter_buf_written_clean(PG_FUNCTION_ARGS)
+pg_stat_get_buf_written_bgwriter(PG_FUNCTION_ARGS)
 {
-   PG_RETURN_INT64(pgstat_fetch_global()->buf_written_clean);
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_written_bgwriter);
+}
+
+Datum
+pg_stat_get_buf_written_backend(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_written_backend);
+}
+
+Datum
+pg_stat_get_buf_written_ring(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_written_ring);
+}
+
+Datum
+pg_stat_get_buf_ticks_bgwriter(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_ticks_bgwriter);
+}
+
+Datum
+pg_stat_get_buf_ticks_backend(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_ticks_backend);
+}
+
+Datum
+pg_stat_get_buf_bgwriter_clean(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_clean_bgwriter);
 }
 
 Datum
@@ -1641,10 +1671,17 @@ pg_stat_get_bgwriter_stat_reset_time(PG_FUNCTION_ARGS)
    PG_RETURN_TIMESTAMPTZ(pgstat_fetch_global()->stat_reset_timestamp);
 }
 
+// FIXME: name
 Datum
-pg_stat_get_buf_written_backend(PG_FUNCTION_ARGS)
+pg_stat_get_buf_fsync_checkpointer(PG_FUNCTION_ARGS)
 {
-   PG_RETURN_INT64(pgstat_fetch_global()->buf_written_backend);
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_fsync_checkpointer);
+}
+
+Datum
+pg_stat_get_buf_fsync_bgwriter(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_fsync_bgwriter);
+}
 
 Datum
@@ -1654,9 +1691,27 @@ pg_stat_get_buf_fsync_backend(PG_FUNCTION_ARGS)
 }
 
 Datum
-pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
+pg_stat_get_buf_alloc_preclean(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_preclean);
+}
+
+Datum
+pg_stat_get_buf_alloc_free(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_free);
+}
+
+Datum
+pg_stat_get_buf_alloc_sweep(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_sweep);
+}
+
+Datum
+pg_stat_get_buf_alloc_ring(PG_FUNCTION_ARGS)
 {
-   PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc);
+   PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc_ring);
 }
 
 Datum
index 1208eb9a6836033a8dd8ee991799d00a204c6f0f..425d057a4758b831d73c8668e4f62ac1cddc89c1 100644 (file)
@@ -1434,6 +1434,17 @@ static struct config_bool ConfigureNamesBool[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"bgwriter_legacy", PGC_SIGHUP, RESOURCES_BGWRITER,
+           gettext_noop("Use legacy bgwriter algorithm."),
+           NULL
+       },
+       &BgWriterLegacy,
+       true,
+       NULL, NULL, NULL
+   },
+
    {
        {"trace_notify", PGC_USERSET, DEVELOPER_OPTIONS,
            gettext_noop("Generates debugging output for LISTEN and NOTIFY."),
@@ -2734,7 +2745,7 @@ static struct config_int ConfigureNamesInt[] =
            GUC_UNIT_MS
        },
        &BgWriterDelay,
-       200, 10, 10000,
+       200, 1, 10000,
        NULL, NULL, NULL
    },
 
index 87335248a03d5e1bb21a1bd0d8fa59ddc1e22cb7..464e088c3468ca703166a9b8067406883c6ba3b4 100644 (file)
   proname => 'pg_stat_get_bgwriter_requested_checkpoints', provolatile => 's',
   proparallel => 'r', prorettype => 'int8', proargtypes => '',
   prosrc => 'pg_stat_get_bgwriter_requested_checkpoints' },
+
 { oid => '2771',
   descr => 'statistics: number of buffers written by the bgwriter during checkpoints',
-  proname => 'pg_stat_get_bgwriter_buf_written_checkpoints', provolatile => 's',
+  proname => 'pg_stat_get_buf_written_checkpoints', provolatile => 's',
   proparallel => 'r', prorettype => 'int8', proargtypes => '',
-  prosrc => 'pg_stat_get_bgwriter_buf_written_checkpoints' },
+  prosrc => 'pg_stat_get_buf_written_checkpoints' },
 { oid => '2772',
   descr => 'statistics: number of buffers written by the bgwriter for cleaning dirty buffers',
-  proname => 'pg_stat_get_bgwriter_buf_written_clean', provolatile => 's',
+  proname => 'pg_stat_get_buf_written_bgwriter', provolatile => 's',
+  proparallel => 'r', prorettype => 'int8', proargtypes => '',
+  prosrc => 'pg_stat_get_buf_written_bgwriter' },
+
+{ oid => '2775',
+  descr => 'statistics: number of buffers written by backends while cleaning dirty buffers',
+  proname => 'pg_stat_get_buf_written_backend', provolatile => 's',
+  proparallel => 'r', prorettype => 'int8', proargtypes => '',
+  prosrc => 'pg_stat_get_buf_written_backend' },
+{ oid => '270',
+  descr => 'statistics: number of buffers written by backends when recycling ring entries',
+  proname => 'pg_stat_get_buf_written_ring', provolatile => 's',
+  proparallel => 'r', prorettype => 'int8', proargtypes => '',
+  prosrc => 'pg_stat_get_buf_written_ring' },
+
+{ oid => '271',
+  descr => 'statistics: number of fsync requests processed by checkpointer',
+  proname => 'pg_stat_get_buf_fsync_checkpointer', provolatile => 's',
+  proparallel => 'r', prorettype => 'int8', proargtypes => '',
+  prosrc => 'pg_stat_get_buf_fsync_checkpointer' },
+{ oid => '272',
+  descr => 'statistics: number of bgwriter buffer writes that did their own fsync',
+  proname => 'pg_stat_get_buf_fsync_bgwriter', provolatile => 's',
+  proparallel => 'r', prorettype => 'int8', proargtypes => '',
+  prosrc => 'pg_stat_get_buf_fsync_bgwriter' },
+{ oid => '3063',
+  descr => 'statistics: number of backend writes that did their own fsync',
+  proname => 'pg_stat_get_buf_fsync_backend', provolatile => 's',
   proparallel => 'r', prorettype => 'int8', proargtypes => '',
-  prosrc => 'pg_stat_get_bgwriter_buf_written_clean' },
+  prosrc => 'pg_stat_get_buf_fsync_backend' },
+
+{ oid => '273', descr => 'statistics: number of reusable clean buffers discovered by bgwriter',
+  proname => 'pg_stat_get_buf_bgwriter_clean', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_bgwriter_clean' },
+
+{ oid => '380', descr => 'statistics: number of backend buffer allocations via preclean list',
+  proname => 'pg_stat_get_buf_alloc_preclean', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_preclean' },
+{ oid => '2859', descr => 'statistics: number of backend buffer allocations via backend clock sweep',
+  proname => 'pg_stat_get_buf_alloc_sweep', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_sweep' },
+{ oid => '381', descr => 'statistics: number of backend buffer allocations via ring buffer',
+  proname => 'pg_stat_get_buf_alloc_ring', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_ring' },
+{ oid => '421', descr => 'statistics: number of backend buffer allocations via free list',
+  proname => 'pg_stat_get_buf_alloc_free', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc_free' },
+
+{ oid => '560', descr => 'statistics: number of clock sweep ticks by bgwriter',
+  proname => 'pg_stat_get_buf_ticks_bgwriter', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_ticks_bgwriter' },
+{ oid => '561', descr => 'statistics: number of clock sweep ticks by backend',
+  proname => 'pg_stat_get_buf_ticks_backend', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_ticks_backend' },
+
+
 { oid => '2773',
   descr => 'statistics: number of times the bgwriter stopped processing when it had written too many buffers while cleaning',
   proname => 'pg_stat_get_bgwriter_maxwritten_clean', provolatile => 's',
   proname => 'pg_stat_get_checkpoint_sync_time', provolatile => 's',
   proparallel => 'r', prorettype => 'float8', proargtypes => '',
   prosrc => 'pg_stat_get_checkpoint_sync_time' },
-{ oid => '2775', descr => 'statistics: number of buffers written by backends',
-  proname => 'pg_stat_get_buf_written_backend', provolatile => 's',
-  proparallel => 'r', prorettype => 'int8', proargtypes => '',
-  prosrc => 'pg_stat_get_buf_written_backend' },
-{ oid => '3063',
-  descr => 'statistics: number of backend buffer writes that did their own fsync',
-  proname => 'pg_stat_get_buf_fsync_backend', provolatile => 's',
-  proparallel => 'r', prorettype => 'int8', proargtypes => '',
-  prosrc => 'pg_stat_get_buf_fsync_backend' },
-{ oid => '2859', descr => 'statistics: number of buffer allocations',
-  proname => 'pg_stat_get_buf_alloc', provolatile => 's', proparallel => 'r',
-  prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc' },
 
 { oid => '2978', descr => 'statistics: number of function calls',
   proname => 'pg_stat_get_function_calls', provolatile => 's',
index 0a3ad3a1883b270ba8b44396a4121b7a154f6958..54c4765fb11974eb64aaf3482f00364ad7663c57 100644 (file)
@@ -413,14 +413,30 @@ typedef struct PgStat_MsgBgWriter
 
    PgStat_Counter m_timed_checkpoints;
    PgStat_Counter m_requested_checkpoints;
+   PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */
+   PgStat_Counter m_checkpoint_sync_time;
+
    PgStat_Counter m_buf_written_checkpoints;
-   PgStat_Counter m_buf_written_clean;
-   PgStat_Counter m_maxwritten_clean;
+   PgStat_Counter m_buf_written_bgwriter;
    PgStat_Counter m_buf_written_backend;
+   PgStat_Counter m_buf_written_ring;
+
+   PgStat_Counter m_buf_fsync_checkpointer;
+   PgStat_Counter m_buf_fsync_bgwriter;
    PgStat_Counter m_buf_fsync_backend;
-   PgStat_Counter m_buf_alloc;
-   PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */
-   PgStat_Counter m_checkpoint_sync_time;
+
+   PgStat_Counter m_buf_clean_bgwriter;
+
+   PgStat_Counter m_buf_alloc_preclean;
+   PgStat_Counter m_buf_alloc_free;
+   PgStat_Counter m_buf_alloc_sweep;
+   PgStat_Counter m_buf_alloc_ring;
+
+   PgStat_Counter m_buf_ticks_bgwriter;
+   PgStat_Counter m_buf_ticks_backend;
+
+   PgStat_Counter m_maxwritten_clean;
+
 } PgStat_MsgBgWriter;
 
 /* ----------
@@ -699,16 +715,33 @@ typedef struct PgStat_ArchiverStats
 typedef struct PgStat_GlobalStats
 {
    TimestampTz stats_timestamp;    /* time of stats file update */
+
    PgStat_Counter timed_checkpoints;
    PgStat_Counter requested_checkpoints;
    PgStat_Counter checkpoint_write_time;   /* times in milliseconds */
    PgStat_Counter checkpoint_sync_time;
+
    PgStat_Counter buf_written_checkpoints;
-   PgStat_Counter buf_written_clean;
-   PgStat_Counter maxwritten_clean;
+   PgStat_Counter buf_written_bgwriter;
    PgStat_Counter buf_written_backend;
+   PgStat_Counter buf_written_ring;
+
+   PgStat_Counter buf_fsync_checkpointer;
+   PgStat_Counter buf_fsync_bgwriter;
    PgStat_Counter buf_fsync_backend;
-   PgStat_Counter buf_alloc;
+
+   PgStat_Counter buf_clean_bgwriter;
+
+   PgStat_Counter buf_alloc_preclean;
+   PgStat_Counter buf_alloc_free;
+   PgStat_Counter buf_alloc_sweep;
+   PgStat_Counter buf_alloc_ring;
+
+   PgStat_Counter buf_ticks_bgwriter;
+   PgStat_Counter buf_ticks_backend;
+
+   PgStat_Counter maxwritten_clean;
+
    TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
index 630366f49efd241750b57e10e649aa7718d34424..892e24e0832de0a72cf1e45617fabeb84862c20a 100644 (file)
@@ -26,6 +26,7 @@ extern int    BgWriterDelay;
 extern int CheckPointTimeout;
 extern int CheckPointWarning;
 extern double CheckPointCompletionTarget;
+extern bool    BgWriterLegacy;
 
 extern void BackgroundWriterMain(void) pg_attribute_noreturn();
 extern void CheckpointerMain(void) pg_attribute_noreturn();
@@ -40,6 +41,8 @@ extern void AbsorbSyncRequests(void);
 extern Size CheckpointerShmemSize(void);
 extern void CheckpointerShmemInit(void);
 
+extern void ReportRingWrite(void);
+
 extern bool FirstCallSinceLastCheckpoint(void);
 
 #endif                         /* _BGWRITER_H */
index df2dda7e7e73e3454f9fad89ed6087348fb5f5a1..1b58b1db0df50e3dc3a599c1ccb6f9eb386a2c8f 100644 (file)
@@ -142,7 +142,7 @@ typedef struct buftag
  * single atomic operation, without actually acquiring and releasing spinlock;
  * for instance, increase or decrease refcount.  buf_id field never changes
  * after initialization, so does not need locking.  freeNext is protected by
- * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
+ * the buffer_strategy_lock not buffer header lock (XXX: remove).  The LWLock can take care
  * of itself.  The buffer header lock is *not* used to control access to the
  * data in the buffer!
  *
@@ -184,7 +184,9 @@ typedef struct BufferDesc
    pg_atomic_uint32 state;
 
    int         wait_backend_pid;   /* backend PID of pin-count waiter */
-   int         freeNext;       /* link in freelist chain */
+
+   /* link in freelist chain: only used with legacy bgwriter */
+   int         freeNext;
 
    LWLock      content_lock;   /* to lock access to buffer contents */
 } BufferDesc;
@@ -232,10 +234,18 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
 /*
  * The freeNext field is either the index of the next freelist entry,
  * or one of these special values:
+ * XXX: Remove when removing legacy bgwriter
  */
 #define FREENEXT_END_OF_LIST   (-1)
 #define FREENEXT_NOT_IN_LIST   (-2)
 
+/*
+ * FIXME: Probably needs to depend on NBuffers or such.
+ */
+
+/* size of buffer free list */
+#define VICTIM_BUFFER_PRECLEAN_SIZE 4096
+
 /*
  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
  * not apply these to local buffers!
@@ -274,6 +284,7 @@ typedef struct WritebackContext
 /* in buf_init.c */
 extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
 extern PGDLLIMPORT WritebackContext BackendWritebackContext;
+extern PGDLLIMPORT struct ringbuf *VictimBuffers;
 
 /* in localbuf.c */
 extern BufferDesc *LocalBufferDescriptors;
@@ -306,13 +317,24 @@ extern void IssuePendingWritebacks(WritebackContext *context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
 
 /* freelist.c */
-extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
                                     uint32 *buf_state);
+extern BufferDesc *ClockSweep(BufferAccessStrategy strategy,
+                             uint32 *buf_state, uint64 *nticks);
+
 extern void StrategyFreeBuffer(BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
                                 BufferDesc *buf);
 
-extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
+extern void StrategyReportWrite(BufferAccessStrategy strategy,
+                               BufferDesc *buf);
+
+extern int StrategySyncStart(uint32 *complete_passes,
+                             uint32 *alloc_preclean,
+                             uint32 *alloc_free,
+                             uint32 *alloc_sweep,
+                             uint32 *alloc_ring,
+                             uint64 *ticks_backend);
 extern void StrategyNotifyBgWriter(int bgwprocno);
 
 extern Size StrategyShmemSize(void);
index 509f4b7ef1c474fc259348dc4d05c69d239830ac..9957b9c8c2717f222ab4bba74818e845010ee253 100644 (file)
@@ -221,7 +221,9 @@ extern bool HoldingBufferPinThatDelaysRecovery(void);
 extern void AbortBufferIO(void);
 
 extern void BufmgrCommit(void);
-extern bool BgBufferSync(struct WritebackContext *wb_context);
+
+extern bool BgBufferSyncNew(struct WritebackContext *wb_context);
+extern bool BgBufferSyncLegacy(struct WritebackContext *wb_context);
 
 extern void AtProcExit_LocalBuffers(void);
 
index 7d365c48d1298de1cc8699d6cbf6b2313100ec90..da436d982ab830ea81560c56ab7892d811bbd7e6 100644 (file)
@@ -1796,12 +1796,21 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints
     pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req,
     pg_stat_get_checkpoint_write_time() AS checkpoint_write_time,
     pg_stat_get_checkpoint_sync_time() AS checkpoint_sync_time,
-    pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint,
-    pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean,
+    pg_stat_get_buf_written_checkpoints() AS buffers_written_checkpoint,
+    pg_stat_get_buf_written_bgwriter() AS buffers_written_bgwriter,
+    pg_stat_get_buf_written_backend() AS buffers_written_backend,
+    pg_stat_get_buf_written_ring() AS buffers_written_ring,
+    pg_stat_get_buf_fsync_checkpointer() AS buffers_fsync_checkpointer,
+    pg_stat_get_buf_fsync_bgwriter() AS buffers_fsync_bgwriter,
+    pg_stat_get_buf_fsync_backend() AS buffers_fsync_backend,
+    pg_stat_get_buf_bgwriter_clean() AS buffers_bgwriter_clean,
+    pg_stat_get_buf_alloc_preclean() AS buffers_alloc_preclean,
+    pg_stat_get_buf_alloc_free() AS buffers_alloc_free,
+    pg_stat_get_buf_alloc_sweep() AS buffers_alloc_sweep,
+    pg_stat_get_buf_alloc_ring() AS buffers_alloc_ring,
+    pg_stat_get_buf_ticks_bgwriter() AS buffers_ticks_bgwriter,
+    pg_stat_get_buf_ticks_backend() AS buffers_ticks_backend,
     pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean,
-    pg_stat_get_buf_written_backend() AS buffers_backend,
-    pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
-    pg_stat_get_buf_alloc() AS buffers_alloc,
     pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 pg_stat_database| SELECT d.oid AS datid,
     d.datname,