Improve coding around the fsync request queue.
author		Tom Lane <tgl@sss.pgh.pa.us>
		Tue, 17 Jul 2012 20:55:56 +0000 (16:55 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
		Tue, 17 Jul 2012 20:55:56 +0000 (16:55 -0400)
In all branches back to 8.3, this patch fixes a questionable assumption in
CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue that there are
no uninitialized pad bytes in the request queue structs.  This would only
cause trouble if (a) there were such pad bytes, which could happen in 8.4
and up if the compiler makes enum ForkNumber narrower than 32 bits, but
otherwise would require not-currently-planned changes in the widths of
other typedefs; and (b) the kernel has not uniformly initialized the
contents of shared memory to zeroes.  Still, it seems a tad risky, and we
can easily remove any risk by pre-zeroing the request array for ourselves.
In addition to that, we need to establish a coding rule that struct
RelFileNode can't contain any padding bytes, since such structs are copied
into the request array verbatim.  (There are other places that are assuming
this anyway, it turns out.)
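
To see why pad bytes in a hash-table key are dangerous: tag_hash and
memcmp-style comparisons consume every byte of the struct, padding
included.  A minimal standalone sketch of the hazard (illustrative
only, not PostgreSQL code):

#include <stdio.h>
#include <string.h>

typedef struct
{
	char		kind;			/* typically followed by 3 pad bytes */
	int			value;
} PaddedKey;

int
main(void)
{
	PaddedKey	a;
	PaddedKey	b;

	memset(&a, 0xAA, sizeof(a));	/* garbage in a's pad bytes */
	memset(&b, 0x00, sizeof(b));	/* zeroes in b's pad bytes */
	a.kind = b.kind = 'x';
	a.value = b.value = 42;

	/* Logically equal, but a byte-wise comparison can disagree */
	printf("%s\n", memcmp(&a, &b, sizeof(a)) == 0 ? "equal" : "DIFFERENT");
	return 0;
}

On common ABIs this prints DIFFERENT, which is exactly how two logically
identical queue entries could fail to match in the request hash table.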

In 9.1 and up, the risk was a bit larger because we were also effectively
assuming that struct RelFileNodeBackend contained no pad bytes, and with
fields of different types in there, that would be much easier to break.
However, there is no good reason to ever transmit fsync or delete requests
for temp files to the bgwriter/checkpointer, so we can revert the request
structs to plain RelFileNode, getting rid of the padding risk and saving
some marginal number of bytes and cycles in fsync queue manipulation while
we are at it.  The savings might be more than marginal during deletion of
a temp relation, because the old code transmitted an entirely useless but
nonetheless expensive-to-process ForgetRelationFsync request to the
background process, and also had the background process perform the file
deletion even though that can safely be done immediately.
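
The shape of that change can be sketched with stand-in types (simplified
assumptions, not the real declarations): a RelFileNodeBackend is "temp"
exactly when its backend field is set, and such requests are handled on
the spot instead of being forwarded.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int Oid;
#define InvalidBackendId (-1)

typedef struct
{
	Oid			spcNode;
	Oid			dbNode;
	Oid			relNode;
} RelFileNode;

typedef struct
{
	RelFileNode node;
	int			backend;		/* owning backend, or InvalidBackendId */
} RelFileNodeBackend;

/* Temp relations belong to a specific backend */
static bool
rnode_is_temp(const RelFileNodeBackend *rnode)
{
	return rnode->backend != InvalidBackendId;
}

/*
 * Toy dispatch: temp relations are handled immediately in the calling
 * backend; only non-temp requests travel (as plain RelFileNode) to the
 * background process.
 */
static void
register_unlink_toy(const RelFileNodeBackend *rnode)
{
	if (rnode_is_temp(rnode))
		printf("rel %u: unlink temp file immediately\n", rnode->node.relNode);
	else
		printf("rel %u: forward plain RelFileNode to queue\n", rnode->node.relNode);
}

int
main(void)
{
	RelFileNodeBackend shared = {{1663, 1, 16384}, InvalidBackendId};
	RelFileNodeBackend temp = {{1663, 1, 16385}, 7};

	register_unlink_toy(&shared);
	register_unlink_toy(&temp);
	return 0;
}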

In addition, make some cleanup of nearby comments and small improvements to
the code in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue.

src/backend/postmaster/bgwriter.c
src/backend/storage/smgr/md.c
src/include/storage/relfilenode.h

src/backend/postmaster/bgwriter.c
index ea66acc0518727642e8be7973b96fe90c4493638..50e987a006e9ef1cff230108a9479d45e8880e24 100644
@@ -885,17 +885,22 @@ BgWriterShmemSize(void)
 void
 BgWriterShmemInit(void)
 {
+       Size            size = BgWriterShmemSize();
        bool            found;
 
        BgWriterShmem = (BgWriterShmemStruct *)
                ShmemInitStruct("Background Writer Data",
-                                               BgWriterShmemSize(),
+                                               size,
                                                &found);
 
        if (!found)
        {
-               /* First time through, so initialize */
-               MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
+               /*
+                * First time through, so initialize.  Note that we zero the whole
+                * requests array; this is so that CompactBgwriterRequestQueue
+                * can assume that any pad bytes in the request structs are zeroes.
+                */
+               MemSet(BgWriterShmem, 0, size);
                SpinLockInit(&BgWriterShmem->ckpt_lck);
                BgWriterShmem->max_requests = NBuffers;
        }
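
For context on why the patch zeroes "size" bytes rather than
sizeof(BgWriterShmemStruct): the struct ends in a variable-length
requests[] array, and BgWriterShmemSize() accounts for all of its
elements.  A self-contained model of the sizing-plus-zeroing pattern
(stand-in types, not the real declarations):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct
{
	unsigned int spc, db, rel;	/* RelFileNode-like key */
	int			forknum;
	unsigned int segno;
} Request;

typedef struct
{
	int			num_requests;	/* current # of requests */
	int			max_requests;	/* allocated array size */
	Request		requests[1];	/* VARIABLE LENGTH ARRAY */
} SharedQueue;

/* size of the header plus a full requests[] array of n entries */
static size_t
queue_size(int n)
{
	return offsetof(SharedQueue, requests) + n * sizeof(Request);
}

int
main(void)
{
	int			n = 8;
	size_t		size = queue_size(n);
	SharedQueue *q = malloc(size);

	/*
	 * Zero the whole allocation, pad bytes included; sizeof(SharedQueue)
	 * would cover only the header plus one array element.
	 */
	memset(q, 0, size);
	q->max_requests = n;
	printf("sizeof = %zu, full size = %zu\n", sizeof(SharedQueue), size);
	free(q);
	return 0;
}
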
@@ -1111,25 +1116,27 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 
 /*
  * CompactBgwriterRequestQueue
- *             Remove duplicates from the request queue to avoid backend fsyncs.
+ *             Remove duplicates from the request queue to avoid backend fsyncs.
+ *             Returns "true" if any entries were removed.
  *
  * Although a full fsync request queue is not common, it can lead to severe
  * performance problems when it does happen.  So far, this situation has
  * only been observed to occur when the system is under heavy write load,
- * and especially during the "sync" phase of a checkpoint.     Without this
+ * and especially during the "sync" phase of a checkpoint.  Without this
  * logic, each backend begins doing an fsync for every block written, which
  * gets very expensive and can slow down the whole system.
  *
  * Trying to do this every time the queue is full could lose if there
- * aren't any removable entries.  But should be vanishingly rare in
+ * aren't any removable entries.  But that should be vanishingly rare in
  * practice: there's one queue entry per shared buffer.
  */
 static bool
-CompactBgwriterRequestQueue()
+CompactBgwriterRequestQueue(void)
 {
-       struct BgWriterSlotMapping {
-               BgWriterRequest request;
-               int             slot;
+       struct BgWriterSlotMapping
+       {
+               BgWriterRequest request;
+               int                     slot;
        };
 
        int                     n,
@@ -1142,20 +1149,22 @@ CompactBgwriterRequestQueue()
        /* must hold BgWriterCommLock in exclusive mode */
        Assert(LWLockHeldByMe(BgWriterCommLock));
 
+       /* Initialize skip_slot array */
+       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
        /* Initialize temporary hash table */
        MemSet(&ctl, 0, sizeof(ctl));
        ctl.keysize = sizeof(BgWriterRequest);
        ctl.entrysize = sizeof(struct BgWriterSlotMapping);
        ctl.hash = tag_hash;
+       ctl.hcxt = CurrentMemoryContext;
+
        htab = hash_create("CompactBgwriterRequestQueue",
                                           BgWriterShmem->num_requests,
                                           &ctl,
-                                          HASH_ELEM | HASH_FUNCTION);
+                                          HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
-       /* Initialize skip_slot array */
-       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
-
-       /* 
+       /*
         * The basic idea here is that a request can be skipped if it's followed
         * by a later, identical request.  It might seem more sensible to work
         * backwards from the end of the queue and check whether a request is
@@ -1164,23 +1173,32 @@ CompactBgwriterRequestQueue()
         * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
         * we do it this way.  It would be possible to be even smarter if we made
         * the code below understand the specific semantics of such requests (it
-        * could blow away preceding entries that would end up being cancelled
+        * could blow away preceding entries that would end up being canceled
         * anyhow), but it's not clear that the extra complexity would buy us
         * anything.
         */
-       for (n = 0; n < BgWriterShmem->num_requests; ++n)
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
        {
                BgWriterRequest *request;
                struct BgWriterSlotMapping *slotmap;
-               bool    found;
+               bool            found;
 
+               /*
+                * We use the request struct directly as a hashtable key.  This
+                * assumes that any padding bytes in the structs are consistently the
+                * same, which should be okay because we zeroed them in
+                * BgWriterShmemInit.  Note also that RelFileNode had better
+                * contain no pad bytes.
+                */
                request = &BgWriterShmem->requests[n];
                slotmap = hash_search(htab, request, HASH_ENTER, &found);
                if (found)
                {
+                       /* Duplicate, so mark the previous occurrence as skippable */
                        skip_slot[slotmap->slot] = true;
-                       ++num_skipped;
+                       num_skipped++;
                }
+               /* Remember slot containing latest occurrence of this request value */
                slotmap->slot = n;
        }
 
@@ -1195,15 +1213,16 @@ CompactBgwriterRequestQueue()
        }
 
        /* We found some duplicates; remove them. */
-       for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+       preserve_count = 0;
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
        {
                if (skip_slot[n])
                        continue;
                BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
        }
        ereport(DEBUG1,
-                       (errmsg("compacted fsync request queue from %d entries to %d entries",
-                               BgWriterShmem->num_requests, preserve_count)));
+          (errmsg("compacted fsync request queue from %d entries to %d entries",
+                          BgWriterShmem->num_requests, preserve_count)));
        BgWriterShmem->num_requests = preserve_count;
 
        /* Cleanup. */
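
Pulling the new function together: scan front to back, remember the slot
of the latest occurrence of each request value, mark every earlier
duplicate as skippable, then slide the survivors down.  A toy version of
that control flow, with a linear search standing in for the hash table
(illustrative only):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Returns the new queue length; relative order of survivors is kept */
static int
compact_toy(int *queue, int n)
{
	bool	   *skip = calloc(n, sizeof(bool));
	int			preserve = 0;

	for (int i = 0; i < n; i++)
		for (int j = i + 1; j < n; j++)
			if (queue[i] == queue[j])
			{
				skip[i] = true; /* a later, identical request exists */
				break;
			}

	for (int i = 0; i < n; i++)
		if (!skip[i])
			queue[preserve++] = queue[i];

	free(skip);
	return preserve;
}

int
main(void)
{
	int			queue[] = {1, 2, 1, 3, 2, 2};
	int			n = compact_toy(queue, 6);

	for (int i = 0; i < n; i++)
		printf("%d ", queue[i]);	/* prints: 1 3 2 */
	printf("\n");
	return 0;
}
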
src/backend/storage/smgr/md.c
index c0e41c7053b5761aee4457a0b46255d266536639..340611003bd28e54f9275617fa52d7633b20e5eb 100644
@@ -126,7 +126,7 @@ static MemoryContext MdCxt;         /* context for all md.c allocations */
 typedef struct
 {
        RelFileNode rnode;                      /* the targeted relation */
-       ForkNumber      forknum;
+       ForkNumber      forknum;                /* which fork */
        BlockNumber segno;                      /* which segment */
 } PendingOperationTag;
 
@@ -1212,7 +1212,7 @@ mdpostckpt(void)
  * If there is a local pending-ops table, just make an entry in it for
  * mdsync to process later.  Otherwise, try to pass off the fsync request
  * to the background writer process.  If that fails, just do the fsync
- * locally before returning (we expect this will not happen often enough
+ * locally before returning (we hope this will not happen often enough
  * to be a performance problem).
  */
 static void
@@ -1239,6 +1239,9 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 /*
  * register_unlink() -- Schedule a file to be deleted after next checkpoint
  *
+ * We don't bother passing in the fork number, because this is only used
+ * with main forks.
+ *
  * As with register_dirty_segment, this could involve either a local or
  * a remote pending-ops table.
  */
@@ -1349,6 +1352,9 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
                MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
                PendingUnlinkEntry *entry;
 
+               /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+               Assert(forknum == MAIN_FORKNUM);
+
                entry = palloc(sizeof(PendingUnlinkEntry));
                entry->rnode = rnode;
                entry->cycle_ctr = mdckpt_cycle_ctr;
@@ -1398,7 +1404,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 }
 
 /*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
  */
 void
 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
src/include/storage/relfilenode.h
index b5e4e1134db2115d26709972b9304d2470f8d6f3..9a9df6d11e88027fe6753c2ec641bc8d1a6eb1a1 100644
@@ -65,6 +65,10 @@ typedef enum ForkNumber
  * Note: in pg_class, relfilenode can be zero to denote that the relation
  * is a "mapped" relation, whose current true filenode number is available
  * from relmapper.c.  Again, this case is NOT allowed in RelFileNodes.
+ *
+ * Note: various places use RelFileNode in hashtable keys.  Therefore,
+ * there *must not* be any unused padding bytes in this struct.  That
+ * should be safe as long as all the fields are of type Oid.
  */
 typedef struct RelFileNode
 {
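
Given the new rule that RelFileNode must stay padding-free, one way a
reader could verify it at compile time is a C11 static assertion (an
illustration, not something this patch adds):

#include <assert.h>

typedef unsigned int Oid;		/* stand-in for the real typedef */

typedef struct RelFileNode
{
	Oid			spcNode;		/* tablespace */
	Oid			dbNode;			/* database */
	Oid			relNode;		/* relation */
} RelFileNode;

/* any padding would make the struct larger than the sum of its fields */
static_assert(sizeof(RelFileNode) == 3 * sizeof(Oid),
			  "RelFileNode must not contain padding bytes");

int
main(void)
{
	return 0;
}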