Improve coding around the fsync request queue.
author		Tom Lane <tgl@sss.pgh.pa.us>
		Tue, 17 Jul 2012 20:55:56 +0000 (16:55 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
		Tue, 17 Jul 2012 20:55:56 +0000 (16:55 -0400)
In all branches back to 8.3, this patch fixes a questionable assumption in
CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue that there are
no uninitialized pad bytes in the request queue structs.  This would only
cause trouble if (a) there were such pad bytes, which could happen in 8.4
and up if the compiler makes enum ForkNumber narrower than 32 bits, but
otherwise would require not-currently-planned changes in the widths of
other typedefs; and (b) the kernel has not uniformly initialized the
contents of shared memory to zeroes.  Still, it seems a tad risky, and we
can easily remove any risk by pre-zeroing the request array for ourselves.
In addition to that, we need to establish a coding rule that struct
RelFileNode can't contain any padding bytes, since such structs are copied
into the request array verbatim.  (There are other places that are assuming
this anyway, it turns out.)
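
To see why pad bytes in a hash-table key are dangerous: tag_hash and
memcmp-style comparisons consume every byte of the struct, padding
included.  A minimal standalone sketch of the hazard (illustrative
only, not PostgreSQL code):

#include <stdio.h>
#include <string.h>

typedef struct
{
	char		kind;			/* typically followed by 3 pad bytes */
	int			value;
} PaddedKey;

int
main(void)
{
	PaddedKey	a;
	PaddedKey	b;

	memset(&a, 0xAA, sizeof(a));	/* garbage in a's pad bytes */
	memset(&b, 0x00, sizeof(b));	/* zeroes in b's pad bytes */
	a.kind = b.kind = 'x';
	a.value = b.value = 42;

	/* Logically equal, but a byte-wise comparison can disagree */
	printf("%s\n", memcmp(&a, &b, sizeof(a)) == 0 ? "equal" : "DIFFERENT");
	return 0;
}

On common ABIs this prints DIFFERENT, which is exactly how two logically
identical queue entries could fail to match in the request hash table.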

In 9.1 and up, the risk was a bit larger because we were also effectively
assuming that struct RelFileNodeBackend contained no pad bytes, and with
fields of different types in there, that would be much easier to break.
However, there is no good reason to ever transmit fsync or delete requests
for temp files to the bgwriter/checkpointer, so we can revert the request
structs to plain RelFileNode, getting rid of the padding risk and saving
some marginal number of bytes and cycles in fsync queue manipulation while
we are at it.  The savings might be more than marginal during deletion of
a temp relation, because the old code transmitted an entirely useless but
nonetheless expensive-to-process ForgetRelationFsync request to the
background process, and also had the background process perform the file
deletion even though that can safely be done immediately.
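
The shape of that change can be sketched with stand-in types (simplified
assumptions, not the real declarations): a RelFileNodeBackend is "temp"
exactly when its backend field is set, and such requests are handled on
the spot instead of being forwarded.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int Oid;
#define InvalidBackendId (-1)

typedef struct
{
	Oid			spcNode;
	Oid			dbNode;
	Oid			relNode;
} RelFileNode;

typedef struct
{
	RelFileNode node;
	int			backend;		/* owning backend, or InvalidBackendId */
} RelFileNodeBackend;

/* Temp relations belong to a specific backend */
static bool
rnode_is_temp(const RelFileNodeBackend *rnode)
{
	return rnode->backend != InvalidBackendId;
}

/*
 * Toy dispatch: temp relations are handled immediately in the calling
 * backend; only non-temp requests travel (as plain RelFileNode) to the
 * background process.
 */
static void
register_unlink_toy(const RelFileNodeBackend *rnode)
{
	if (rnode_is_temp(rnode))
		printf("rel %u: unlink temp file immediately\n", rnode->node.relNode);
	else
		printf("rel %u: forward plain RelFileNode to queue\n", rnode->node.relNode);
}

int
main(void)
{
	RelFileNodeBackend shared = {{1663, 1, 16384}, InvalidBackendId};
	RelFileNodeBackend temp = {{1663, 1, 16385}, 7};

	register_unlink_toy(&shared);
	register_unlink_toy(&temp);
	return 0;
}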

In addition, make some cleanup of nearby comments and small improvements to
the code in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue.

src/backend/postmaster/bgwriter.c
src/backend/storage/smgr/md.c
src/include/storage/relfilenode.h

src/backend/postmaster/bgwriter.c
index ea66acc0518727642e8be7973b96fe90c4493638..50e987a006e9ef1cff230108a9479d45e8880e24 100644
@@ -885,17 +885,22 @@ BgWriterShmemSize(void)
 void
 BgWriterShmemInit(void)
 {
+       Size            size = BgWriterShmemSize();
        bool            found;
 
        BgWriterShmem = (BgWriterShmemStruct *)
                ShmemInitStruct("Background Writer Data",
-                                               BgWriterShmemSize(),
+                                               size,
                                                &found);
 
        if (!found)
        {
-               /* First time through, so initialize */
-               MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
+               /*
+                * First time through, so initialize.  Note that we zero the whole
+                * requests array; this is so that CompactBgwriterRequestQueue
+                * can assume that any pad bytes in the request structs are zeroes.
+                */
+               MemSet(BgWriterShmem, 0, size);
                SpinLockInit(&BgWriterShmem->ckpt_lck);
                BgWriterShmem->max_requests = NBuffers;
        }
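
For context on why the patch zeroes "size" bytes rather than
sizeof(BgWriterShmemStruct): the struct ends in a variable-length
requests[] array, and BgWriterShmemSize() accounts for all of its
elements.  A self-contained model of the sizing-plus-zeroing pattern
(stand-in types, not the real declarations):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct
{
	unsigned int spc, db, rel;	/* RelFileNode-like key */
	int			forknum;
	unsigned int segno;
} Request;

typedef struct
{
	int			num_requests;	/* current # of requests */
	int			max_requests;	/* allocated array size */
	Request		requests[1];	/* VARIABLE LENGTH ARRAY */
} SharedQueue;

/* size of the header plus a full requests[] array of n entries */
static size_t
queue_size(int n)
{
	return offsetof(SharedQueue, requests) + n * sizeof(Request);
}

int
main(void)
{
	int			n = 8;
	size_t		size = queue_size(n);
	SharedQueue *q = malloc(size);

	/*
	 * Zero the whole allocation, pad bytes included; sizeof(SharedQueue)
	 * would cover only the header plus one array element.
	 */
	memset(q, 0, size);
	q->max_requests = n;
	printf("sizeof = %zu, full size = %zu\n", sizeof(SharedQueue), size);
	free(q);
	return 0;
}
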
@@ -1111,25 +1116,27 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 
 /*
  * CompactBgwriterRequestQueue
- *             Remove duplicates from the request queue to avoid backend fsyncs.
+ *             Remove duplicates from the request queue to avoid backend fsyncs.
+ *             Returns "true" if any entries were removed.
  *
  * Although a full fsync request queue is not common, it can lead to severe
  * performance problems when it does happen.  So far, this situation has
  * only been observed to occur when the system is under heavy write load,
- * and especially during the "sync" phase of a checkpoint.     Without this
+ * and especially during the "sync" phase of a checkpoint.  Without this
  * logic, each backend begins doing an fsync for every block written, which
  * gets very expensive and can slow down the whole system.
  *
  * Trying to do this every time the queue is full could lose if there
- * aren't any removable entries.  But should be vanishingly rare in
+ * aren't any removable entries.  But that should be vanishingly rare in
  * practice: there's one queue entry per shared buffer.
  */
 static bool
-CompactBgwriterRequestQueue()
+CompactBgwriterRequestQueue(void)
 {
-       struct BgWriterSlotMapping {
-               BgWriterRequest request;
-               int             slot;
+       struct BgWriterSlotMapping
+       {
+               BgWriterRequest request;
+               int                     slot;
        };
 
        int                     n,
@@ -1142,20 +1149,22 @@ CompactBgwriterRequestQueue()
        /* must hold BgWriterCommLock in exclusive mode */
        Assert(LWLockHeldByMe(BgWriterCommLock));
 
+       /* Initialize skip_slot array */
+       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
        /* Initialize temporary hash table */
        MemSet(&ctl, 0, sizeof(ctl));
        ctl.keysize = sizeof(BgWriterRequest);
        ctl.entrysize = sizeof(struct BgWriterSlotMapping);
        ctl.hash = tag_hash;
+       ctl.hcxt = CurrentMemoryContext;
+
        htab = hash_create("CompactBgwriterRequestQueue",
                                           BgWriterShmem->num_requests,
                                           &ctl,
-                                          HASH_ELEM | HASH_FUNCTION);
+                                          HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
-       /* Initialize skip_slot array */
-       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
-
-       /* 
+       /*
         * The basic idea here is that a request can be skipped if it's followed
         * by a later, identical request.  It might seem more sensible to work
         * backwards from the end of the queue and check whether a request is
@@ -1164,23 +1173,32 @@ CompactBgwriterRequestQueue()
         * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
         * we do it this way.  It would be possible to be even smarter if we made
         * the code below understand the specific semantics of such requests (it
-        * could blow away preceding entries that would end up being cancelled
+        * could blow away preceding entries that would end up being canceled
         * anyhow), but it's not clear that the extra complexity would buy us
         * anything.
         */
-       for (n = 0; n < BgWriterShmem->num_requests; ++n)
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
        {
                BgWriterRequest *request;
                struct BgWriterSlotMapping *slotmap;
-               bool    found;
+               bool            found;
 
+               /*
+                * We use the request struct directly as a hashtable key.  This
+                * assumes that any padding bytes in the structs are consistently the
+                * same, which should be okay because we zeroed them in
+                * BgWriterShmemInit.  Note also that RelFileNode had better
+                * contain no pad bytes.
+                */
                request = &BgWriterShmem->requests[n];
                slotmap = hash_search(htab, request, HASH_ENTER, &found);
                if (found)
                {
+                       /* Duplicate, so mark the previous occurrence as skippable */
                        skip_slot[slotmap->slot] = true;
-                       ++num_skipped;
+                       num_skipped++;
                }
+               /* Remember slot containing latest occurrence of this request value */
                slotmap->slot = n;
        }
 
@@ -1195,15 +1213,16 @@ CompactBgwriterRequestQueue()
        }
 
        /* We found some duplicates; remove them. */
-       for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+       preserve_count = 0;
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
        {
                if (skip_slot[n])
                        continue;
                BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
        }
        ereport(DEBUG1,
-                       (errmsg("compacted fsync request queue from %d entries to %d entries",
-                               BgWriterShmem->num_requests, preserve_count)));
+          (errmsg("compacted fsync request queue from %d entries to %d entries",
+                          BgWriterShmem->num_requests, preserve_count)));
        BgWriterShmem->num_requests = preserve_count;
 
        /* Cleanup. */
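
Pulling the new function together: scan front to back, remember the slot
of the latest occurrence of each request value, mark every earlier
duplicate as skippable, then slide the survivors down.  A toy version of
that control flow, with a linear search standing in for the hash table
(illustrative only):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Returns the new queue length; relative order of survivors is kept */
static int
compact_toy(int *queue, int n)
{
	bool	   *skip = calloc(n, sizeof(bool));
	int			preserve = 0;

	for (int i = 0; i < n; i++)
		for (int j = i + 1; j < n; j++)
			if (queue[i] == queue[j])
			{
				skip[i] = true; /* a later, identical request exists */
				break;
			}

	for (int i = 0; i < n; i++)
		if (!skip[i])
			queue[preserve++] = queue[i];

	free(skip);
	return preserve;
}

int
main(void)
{
	int			queue[] = {1, 2, 1, 3, 2, 2};
	int			n = compact_toy(queue, 6);

	for (int i = 0; i < n; i++)
		printf("%d ", queue[i]);	/* prints: 1 3 2 */
	printf("\n");
	return 0;
}
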
src/backend/storage/smgr/md.c
index c0e41c7053b5761aee4457a0b46255d266536639..340611003bd28e54f9275617fa52d7633b20e5eb 100644
@@ -126,7 +126,7 @@ static MemoryContext MdCxt;         /* context for all md.c allocations */
 typedef struct
 {
        RelFileNode rnode;                      /* the targeted relation */
-       ForkNumber      forknum;
+       ForkNumber      forknum;                /* which fork */
        BlockNumber segno;                      /* which segment */
 } PendingOperationTag;
 
@@ -1212,7 +1212,7 @@ mdpostckpt(void)
  * If there is a local pending-ops table, just make an entry in it for
  * mdsync to process later.  Otherwise, try to pass off the fsync request
  * to the background writer process.  If that fails, just do the fsync
- * locally before returning (we expect this will not happen often enough
+ * locally before returning (we hope this will not happen often enough
  * to be a performance problem).
  */
 static void
@@ -1239,6 +1239,9 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 /*
  * register_unlink() -- Schedule a file to be deleted after next checkpoint
  *
+ * We don't bother passing in the fork number, because this is only used
+ * with main forks.
+ *
  * As with register_dirty_segment, this could involve either a local or
  * a remote pending-ops table.
  */
@@ -1349,6 +1352,9 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
                MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
                PendingUnlinkEntry *entry;
 
+               /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+               Assert(forknum == MAIN_FORKNUM);
+
                entry = palloc(sizeof(PendingUnlinkEntry));
                entry->rnode = rnode;
                entry->cycle_ctr = mdckpt_cycle_ctr;
@@ -1398,7 +1404,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 }
 
 /*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
  */
 void
 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
src/include/storage/relfilenode.h
index b5e4e1134db2115d26709972b9304d2470f8d6f3..9a9df6d11e88027fe6753c2ec641bc8d1a6eb1a1 100644
@@ -65,6 +65,10 @@ typedef enum ForkNumber
  * Note: in pg_class, relfilenode can be zero to denote that the relation
  * is a "mapped" relation, whose current true filenode number is available
  * from relmapper.c.  Again, this case is NOT allowed in RelFileNodes.
+ *
+ * Note: various places use RelFileNode in hashtable keys.  Therefore,
+ * there *must not* be any unused padding bytes in this struct.  That
+ * should be safe as long as all the fields are of type Oid.
  */
 typedef struct RelFileNode
 {
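
Given the new rule that RelFileNode must stay padding-free, one way a
reader could verify it at compile time is a C11 static assertion (an
illustration, not something this patch adds):

#include <assert.h>

typedef unsigned int Oid;		/* stand-in for the real typedef */

typedef struct RelFileNode
{
	Oid			spcNode;		/* tablespace */
	Oid			dbNode;			/* database */
	Oid			relNode;		/* relation */
} RelFileNode;

/* any padding would make the struct larger than the sum of its fields */
static_assert(sizeof(RelFileNode) == 3 * sizeof(Oid),
			  "RelFileNode must not contain padding bytes");

int
main(void)
{
	return 0;
}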