Improve coding around the fsync request queue.
author    Tom Lane <tgl@sss.pgh.pa.us>    Tue, 17 Jul 2012 20:56:08 +0000 (16:56 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>    Tue, 17 Jul 2012 20:56:08 +0000 (16:56 -0400)
In all branches back to 8.3, this patch fixes a questionable assumption in
CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue that there are
no uninitialized pad bytes in the request queue structs.  This would only
cause trouble if (a) there were such pad bytes, which could happen in 8.4
and up if the compiler makes enum ForkNumber narrower than 32 bits, but
otherwise would require not-currently-planned changes in the widths of
other typedefs; and (b) the kernel has not uniformly initialized the
contents of shared memory to zeroes.  Still, it seems a tad risky, and we
can easily remove any risk by pre-zeroing the request array for ourselves.
In addition to that, we need to establish a coding rule that struct
RelFileNode can't contain any padding bytes, since such structs are copied
into the request array verbatim.  (There are other places that are assuming
this anyway, it turns out.)
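
To see why uninitialized pad bytes matter, consider a standalone sketch (not PostgreSQL code; the struct and values are invented for illustration). A struct with an alignment hole can compare unequal byte-for-byte even when every named field matches, which is exactly the failure mode for code that hashes or memcmp()s whole structs as lookup keys:

#include <stdio.h>
#include <string.h>

/*
 * Hypothetical request struct with an alignment hole: on typical ABIs
 * the compiler inserts 3 pad bytes after "forknum" so that "segno" is
 * 4-byte aligned.
 */
struct Request
{
	char		forknum;	/* imagine an enum narrowed to 1 byte */
	unsigned int segno;
};

int
main(void)
{
	struct Request a;
	struct Request b;

	memset(&a, 0, sizeof(a));	/* the commit's fix: pre-zero everything */
	memset(&b, 0xFF, sizeof(b));	/* simulate uninitialized memory */

	/* The named fields are identical... */
	a.forknum = b.forknum = 0;
	a.segno = b.segno = 42;

	/* ...but a byte-wise comparison, as a hash key would get, differs. */
	printf("structs compare %s\n",
		   memcmp(&a, &b, sizeof(a)) == 0 ? "equal" : "NOT equal");
	return 0;
}

With typical padding this prints "NOT equal"; pre-zeroing both structs, as the patch does for the request array, makes byte-wise comparison reliable.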

In 9.1 and up, the risk was a bit larger because we were also effectively
assuming that struct RelFileNodeBackend contained no pad bytes, and with
fields of different types in there, that would be much easier to break.
However, there is no good reason to ever transmit fsync or delete requests
for temp files to the bgwriter/checkpointer, so we can revert the request
structs to plain RelFileNode, getting rid of the padding risk and saving
some marginal number of bytes and cycles in fsync queue manipulation while
we are at it.  The savings might be more than marginal during deletion of
a temp relation, because the old code transmitted an entirely useless but
nonetheless expensive-to-process ForgetRelationFsync request to the
background process, and also had the background process perform the file
deletion even though that can safely be done immediately.
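
For reference, a minimal sketch of the two struct shapes involved (typedefs simplified; the exact 9.1-era definitions are reconstructed from the commit message, so treat the details as assumptions):

typedef unsigned int Oid;		/* simplified stand-in */
typedef int BackendId;			/* simplified stand-in */

/* Every field is an Oid, so there is no room for padding. */
typedef struct RelFileNode
{
	Oid		spcNode;	/* tablespace */
	Oid		dbNode;		/* database */
	Oid		relNode;	/* relation */
} RelFileNode;

/*
 * Mixing field types makes padding much easier to introduce: here
 * "backend" happens to pack tightly after the three Oids, but if it
 * were ever widened to an 8-byte-aligned type, a 4-byte hole would
 * appear and silently break byte-wise comparisons of these structs.
 */
typedef struct RelFileNodeBackend
{
	RelFileNode node;
	BackendId	backend;
} RelFileNodeBackend;

Reverting the request structs to plain RelFileNode sidesteps that hazard entirely.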

In addition, clean up some nearby comments and make small improvements to
the code in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue.

src/backend/postmaster/bgwriter.c
src/include/storage/relfilenode.h

index 8d6687584eaa71e61bfccca41d7b7db6a2803edf..77970eead0436c7862ab751f08181a4f618e7c90 100644 (file)
@@ -827,22 +827,29 @@ BgWriterShmemSize(void)
 void
 BgWriterShmemInit(void)
 {
+       Size            size = BgWriterShmemSize();
        bool            found;
 
        BgWriterShmem = (BgWriterShmemStruct *)
                ShmemInitStruct("Background Writer Data",
-                                               BgWriterShmemSize(),
+                                               size,
                                                &found);
        if (BgWriterShmem == NULL)
                ereport(FATAL,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 errmsg("not enough shared memory for background writer")));
-       if (found)
-               return;                                 /* already initialized */
 
-       MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
-       SpinLockInit(&BgWriterShmem->ckpt_lck);
-       BgWriterShmem->max_requests = NBuffers;
+       if (!found)
+       {
+               /*
+                * First time through, so initialize.  Note that we zero the whole
+                * requests array; this is so that CompactBgwriterRequestQueue
+                * can assume that any pad bytes in the request structs are zeroes.
+                */
+               MemSet(BgWriterShmem, 0, size);
+               SpinLockInit(&BgWriterShmem->ckpt_lck);
+               BgWriterShmem->max_requests = NBuffers;
+       }
 }
 
 /*
@@ -1028,25 +1035,27 @@ ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
 
 /*
  * CompactBgwriterRequestQueue
- *             Remove duplicates from the request queue to avoid backend fsyncs.
+ *             Remove duplicates from the request queue to avoid backend fsyncs.
+ *             Returns "true" if any entries were removed.
  *
  * Although a full fsync request queue is not common, it can lead to severe
  * performance problems when it does happen.  So far, this situation has
  * only been observed to occur when the system is under heavy write load,
- * and especially during the "sync" phase of a checkpoint.  Without this
+ * and especially during the "sync" phase of a checkpoint.     Without this
  * logic, each backend begins doing an fsync for every block written, which
  * gets very expensive and can slow down the whole system.
  *
  * Trying to do this every time the queue is full could lose if there
- * aren't any removable entries.  But should be vanishingly rare in
+ * aren't any removable entries.  But that should be vanishingly rare in
  * practice: there's one queue entry per shared buffer.
  */
 static bool
-CompactBgwriterRequestQueue()
+CompactBgwriterRequestQueue(void)
 {
-       struct BgWriterSlotMapping {
-               BgWriterRequest request;
-               int             slot;
+       struct BgWriterSlotMapping
+       {
+               BgWriterRequest request;
+               int                     slot;
        };
 
        int                     n,
@@ -1059,20 +1068,22 @@ CompactBgwriterRequestQueue()
        /* must hold BgWriterCommLock in exclusive mode */
        Assert(LWLockHeldByMe(BgWriterCommLock));
 
+       /* Initialize skip_slot array */
+       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
        /* Initialize temporary hash table */
        MemSet(&ctl, 0, sizeof(ctl));
        ctl.keysize = sizeof(BgWriterRequest);
        ctl.entrysize = sizeof(struct BgWriterSlotMapping);
        ctl.hash = tag_hash;
+       ctl.hcxt = CurrentMemoryContext;
+
        htab = hash_create("CompactBgwriterRequestQueue",
                                           BgWriterShmem->num_requests,
                                           &ctl,
-                                          HASH_ELEM | HASH_FUNCTION);
-
-       /* Initialize skip_slot array */
-       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+                                          HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
-       /* 
+       /*
         * The basic idea here is that a request can be skipped if it's followed
         * by a later, identical request.  It might seem more sensible to work
         * backwards from the end of the queue and check whether a request is
@@ -1081,23 +1092,32 @@ CompactBgwriterRequestQueue()
         * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
         * we do it this way.  It would be possible to be even smarter if we made
         * the code below understand the specific semantics of such requests (it
-        * could blow away preceding entries that would end up being cancelled
+        * could blow away preceding entries that would end up being canceled
         * anyhow), but it's not clear that the extra complexity would buy us
         * anything.
         */
-       for (n = 0; n < BgWriterShmem->num_requests; ++n)
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
        {
                BgWriterRequest *request;
                struct BgWriterSlotMapping *slotmap;
-               bool    found;
+               bool            found;
 
+               /*
+                * We use the request struct directly as a hashtable key.  This
+                * assumes that any padding bytes in the structs are consistently the
+                * same, which should be okay because we zeroed them in
+                * BgWriterShmemInit.  Note also that RelFileNode had better
+                * contain no pad bytes.
+                */
                request = &BgWriterShmem->requests[n];
                slotmap = hash_search(htab, request, HASH_ENTER, &found);
                if (found)
                {
+                       /* Duplicate, so mark the previous occurrence as skippable */
                        skip_slot[slotmap->slot] = true;
-                       ++num_skipped;
+                       num_skipped++;
                }
+               /* Remember slot containing latest occurrence of this request value */
                slotmap->slot = n;
        }
 
@@ -1112,15 +1132,16 @@ CompactBgwriterRequestQueue()
        }
 
        /* We found some duplicates; remove them. */
-       for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+       preserve_count = 0;
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
        {
                if (skip_slot[n])
                        continue;
                BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
        }
        ereport(DEBUG1,
-                       (errmsg("compacted fsync request queue from %d entries to %d entries",
-                               BgWriterShmem->num_requests, preserve_count)));
+          (errmsg("compacted fsync request queue from %d entries to %d entries",
+                          BgWriterShmem->num_requests, preserve_count)));
        BgWriterShmem->num_requests = preserve_count;
 
        /* Cleanup. */
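
The compaction idea is easy to see in miniature. Below is a standalone sketch (not PostgreSQL code: the real function keys a dynahash table on the whole request struct, whereas this toy uses a direct-address table over small ints). Scan forward, and whenever a value repeats, mark the earlier slot skippable, so the last occurrence of each distinct request survives in its original relative position:

#include <stdbool.h>
#include <stdio.h>

#define MAXVAL 100

static int
compact(int *req, int n)
{
	bool	skip[16] = {false};
	int		last[MAXVAL];	/* last[value] = most recent slot seen */
	int		i;
	int		kept = 0;

	for (i = 0; i < MAXVAL; i++)
		last[i] = -1;

	/* First pass: mark earlier duplicates as skippable. */
	for (i = 0; i < n; i++)
	{
		if (last[req[i]] >= 0)
			skip[last[req[i]]] = true;
		last[req[i]] = i;
	}

	/* Second pass: sweep the survivors toward the front. */
	for (i = 0; i < n; i++)
	{
		if (!skip[i])
			req[kept++] = req[i];
	}
	return kept;
}

int
main(void)
{
	int		q[] = {7, 3, 7, 5, 3};
	int		n = compact(q, 5);
	int		i;

	for (i = 0; i < n; i++)
		printf("%d ", q[i]);	/* prints: 7 5 3 */
	printf("\n");
	return 0;
}
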
index 9638294b4a48c2b068c760f29008029b9fb34bd6..48f36e050e029044bc407f589a25e8452c0aa9e5 100644 (file)
  * identified by pg_database.dattablespace).  However this shorthand
  * is NOT allowed in RelFileNode structs --- the real tablespace ID
  * must be supplied when setting spcNode.
+ *
+ * Note: various places use RelFileNode in hashtable keys.  Therefore,
+ * there *must not* be any unused padding bytes in this struct.  That
+ * should be safe as long as all the fields are of type Oid.
  */
 typedef struct RelFileNode
 {
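
The new comment also suggests an easy sanity check. Here is a hedged sketch of a compile-time guard (not part of this commit; later PostgreSQL versions grew StaticAssertStmt for this kind of thing, and the C89-portable negative-array-size trick below works anywhere):

typedef unsigned int Oid;	/* simplified stand-in */

typedef struct RelFileNode
{
	Oid		spcNode;	/* tablespace */
	Oid		dbNode;		/* database */
	Oid		relNode;	/* relation */
} RelFileNode;

/*
 * Fails to compile (negative array size) if the struct ever becomes
 * larger than the sum of its fields, i.e. if padding sneaks in.
 */
typedef char relfilenode_has_no_padding
	[sizeof(RelFileNode) == 3 * sizeof(Oid) ? 1 : -1];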