Improve coding around the fsync request queue.

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 17 Jul 2012 20:55:51 +0000 (16:55 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 17 Jul 2012 20:57:22 +0000 (16:57 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 17 Jul 2012 20:55:51 +0000 (16:55 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 17 Jul 2012 20:57:22 +0000 (16:57 -0400)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c

index 5643ec821af2d5e3d67dc378e565f8626e537998..c850033537195269bb306aa8846d38d2ea507ec1 100644 (file)
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -116,7 +116,7 @@
   */
  typedef struct
  {
-       RelFileNodeBackend rnode;
+       RelFileNode     rnode;
         ForkNumber      forknum;
         BlockNumber segno;                      /* see md.c for special values */
         /* might add a real request-type field later; not needed yet */
@@ -896,17 +896,22 @@ BgWriterShmemSize(void)
  void
  BgWriterShmemInit(void)
  {
+       Size            size = BgWriterShmemSize();
         bool            found;
  
         BgWriterShmem = (BgWriterShmemStruct *)
                 ShmemInitStruct("Background Writer Data",
-                                               BgWriterShmemSize(),
+                                               size,
                                                 &found);
  
         if (!found)
         {
-               /* First time through, so initialize */
-               MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
+               /*
+                * First time through, so initialize.  Note that we zero the whole
+                * requests array; this is so that CompactBgwriterRequestQueue
+                * can assume that any pad bytes in the request structs are zeroes.
+                */
+               MemSet(BgWriterShmem, 0, size);
                 SpinLockInit(&BgWriterShmem->ckpt_lck);
                 BgWriterShmem->max_requests = NBuffers;
         }
@@ -1068,6 +1073,10 @@ RequestCheckpoint(int flags)
   * is dirty and must be fsync'd before next checkpoint.  We also use this
   * opportunity to count such writes for statistical purposes.
   *
+ * This functionality is only supported for regular (not backend-local)
+ * relations, so the rnode argument is intentionally RelFileNode not
+ * RelFileNodeBackend.
+ *
   * segno specifies which segment (not block!) of the relation needs to be
   * fsync'd.  (Since the valid range is much less than BlockNumber, we can
   * use high values for special flags; that's all internal to md.c, which
@@ -1084,8 +1093,7 @@ RequestCheckpoint(int flags)
   * let the backend know by returning false.
   */
  bool
-ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
-                                       BlockNumber segno)
+ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
  {
         BgWriterRequest *request;
  
@@ -1129,6 +1137,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
  /*
   * CompactBgwriterRequestQueue
   *             Remove duplicates from the request queue to avoid backend fsyncs.
+ *             Returns "true" if any entries were removed.
   *
   * Although a full fsync request queue is not common, it can lead to severe
   * performance problems when it does happen.  So far, this situation has
@@ -1138,11 +1147,11 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
   * gets very expensive and can slow down the whole system.
   *
   * Trying to do this every time the queue is full could lose if there
- * aren't any removable entries.  But should be vanishingly rare in
+ * aren't any removable entries.  But that should be vanishingly rare in
   * practice: there's one queue entry per shared buffer.
   */
  static bool
-CompactBgwriterRequestQueue()
+CompactBgwriterRequestQueue(void)
  {
         struct BgWriterSlotMapping
         {
@@ -1160,18 +1169,20 @@ CompactBgwriterRequestQueue()
         /* must hold BgWriterCommLock in exclusive mode */
         Assert(LWLockHeldByMe(BgWriterCommLock));
  
+       /* Initialize skip_slot array */
+       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
         /* Initialize temporary hash table */
         MemSet(&ctl, 0, sizeof(ctl));
         ctl.keysize = sizeof(BgWriterRequest);
         ctl.entrysize = sizeof(struct BgWriterSlotMapping);
         ctl.hash = tag_hash;
+       ctl.hcxt = CurrentMemoryContext;
+
         htab = hash_create("CompactBgwriterRequestQueue",
                                            BgWriterShmem->num_requests,
                                            &ctl,
-                                          HASH_ELEM | HASH_FUNCTION);
-
-       /* Initialize skip_slot array */
-       skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+                                          HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
  
         /*
          * The basic idea here is that a request can be skipped if it's followed
@@ -1186,19 +1197,28 @@ CompactBgwriterRequestQueue()
          * anyhow), but it's not clear that the extra complexity would buy us
          * anything.
          */
-       for (n = 0; n < BgWriterShmem->num_requests; ++n)
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
         {
                 BgWriterRequest *request;
                 struct BgWriterSlotMapping *slotmap;
                 bool            found;
  
+               /*
+                * We use the request struct directly as a hashtable key.  This
+                * assumes that any padding bytes in the structs are consistently the
+                * same, which should be okay because we zeroed them in
+                * BgWriterShmemInit.  Note also that RelFileNode had better
+                * contain no pad bytes.
+                */
                 request = &BgWriterShmem->requests[n];
                 slotmap = hash_search(htab, request, HASH_ENTER, &found);
                 if (found)
                 {
+                       /* Duplicate, so mark the previous occurrence as skippable */
                         skip_slot[slotmap->slot] = true;
-                       ++num_skipped;
+                       num_skipped++;
                 }
+               /* Remember slot containing latest occurrence of this request value */
                 slotmap->slot = n;
         }
  
@@ -1213,7 +1233,8 @@ CompactBgwriterRequestQueue()
         }
  
         /* We found some duplicates; remove them. */
-       for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+       preserve_count = 0;
+       for (n = 0; n < BgWriterShmem->num_requests; n++)
         {
                 if (skip_slot[n])
                         continue;
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c

index 0e5d4c568ba48d46d9fb72422a2ba3fa515f5985..5eff617ed7adf7fc41cd55bfe792862ae0d93719 100644 (file)
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1971,7 +1971,8 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
  {
         int                     i;
  
-       if (rnode.backend != InvalidBackendId)
+       /* If it's a local relation, it's localbuf.c's problem. */
+       if (RelFileNodeBackendIsTemp(rnode))
         {
                 if (rnode.backend == MyBackendId)
                         DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index 1d86e34aac9fb1add74edfd21f08918bfad4d84d..14fbc01260d9c948790e926c93b5434aedc65086 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -121,13 +121,17 @@ static MemoryContext MdCxt;               /* context for all md.c allocations */
   * be deleted after the next checkpoint, but we use a linked list instead of
   * a hash table, because we don't expect there to be any duplicate requests.
   *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
   * (Regular backends do not track pending operations locally, but forward
   * them to the bgwriter.)
   */
  typedef struct
  {
-       RelFileNodeBackend rnode;       /* the targeted relation */
-       ForkNumber      forknum;
+       RelFileNode     rnode;                  /* the targeted relation */
+       ForkNumber      forknum;                /* which fork */
         BlockNumber segno;                      /* which segment */
  } PendingOperationTag;
  
@@ -142,7 +146,7 @@ typedef struct
  
  typedef struct
  {
-       RelFileNodeBackend rnode;       /* the dead relation to delete */
+       RelFileNode     rnode;                  /* the dead relation to delete */
         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
  } PendingUnlinkEntry;
  
@@ -301,11 +305,11 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
  /*
   *     mdunlink() -- Unlink a relation.
   *
- * Note that we're passed a RelFileNode --- by the time this is called,
+ * Note that we're passed a RelFileNodeBackend --- by the time this is called,
   * there won't be an SMgrRelation hashtable entry anymore.
   *
- * Actually, we don't unlink the first segment file of the relation, but
- * just truncate it to zero length, and record a request to unlink it after
+ * For regular relations, we don't unlink the first segment file of the rel,
+ * but just truncate it to zero length, and record a request to unlink it after
   * the next checkpoint.  Additional segments can be unlinked immediately,
   * however.  Leaving the empty file in place prevents that relfilenode
   * number from being reused.  The scenario this protects us from is:
@@ -322,6 +326,12 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
   * number until it's safe, because relfilenode assignment skips over any
   * existing file.
   *
+ * We do not need to go through this dance for temp relations, though, because
+ * we never make WAL entries for temp rels, and so a temp rel poses no threat
+ * to the health of a regular rel that has taken over its relfilenode number.
+ * The fact that temp rels and regular rels have different file naming
+ * patterns provides additional safety.
+ *
   * All the above applies only to the relation's main fork; other forks can
   * just be removed immediately, since they are not needed to prevent the
   * relfilenode number from being recycled.  Also, we do not carefully
@@ -344,16 +354,18 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
  
         /*
          * We have to clean out any pending fsync requests for the doomed
-        * relation, else the next mdsync() will fail.
+        * relation, else the next mdsync() will fail.  There can't be any such
+        * requests for a temp relation, though.
          */
-       ForgetRelationFsyncRequests(rnode, forkNum);
+       if (!RelFileNodeBackendIsTemp(rnode))
+               ForgetRelationFsyncRequests(rnode.node, forkNum);
  
         path = relpath(rnode, forkNum);
  
         /*
          * Delete or truncate the first segment.
          */
-       if (isRedo || forkNum != MAIN_FORKNUM)
+       if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
         {
                 ret = unlink(path);
                 if (ret < 0 && errno != ENOENT)
@@ -1078,8 +1090,7 @@ mdsync(void)
                                  * the relation will have been dirtied through this same smgr
                                  * relation, and so we can save a file open/close cycle.
                                  */
-                               reln = smgropen(entry->tag.rnode.node,
-                                                               entry->tag.rnode.backend);
+                               reln = smgropen(entry->tag.rnode, InvalidBackendId);
  
                                 /*
                                  * It is possible that the relation has been dropped or
@@ -1230,7 +1241,7 @@ mdpostckpt(void)
                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
  
                 /* Unlink the file */
-               path = relpath(entry->rnode, MAIN_FORKNUM);
+               path = relpathperm(entry->rnode, MAIN_FORKNUM);
                 if (unlink(path) < 0)
                 {
                         /*
@@ -1258,20 +1269,23 @@ mdpostckpt(void)
   * If there is a local pending-ops table, just make an entry in it for
   * mdsync to process later.  Otherwise, try to pass off the fsync request
   * to the background writer process.  If that fails, just do the fsync
- * locally before returning (we expect this will not happen often enough
+ * locally before returning (we hope this will not happen often enough
   * to be a performance problem).
   */
  static void
  register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
  {
+       /* Temp relations should never be fsync'd */
+       Assert(!SmgrIsTemp(reln));
+
         if (pendingOpsTable)
         {
                 /* push it into local pending-ops table */
-               RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
+               RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
         }
         else
         {
-               if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
+               if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
                         return;                         /* passed it off successfully */
  
                 ereport(DEBUG1,
@@ -1288,16 +1302,23 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
  /*
   * register_unlink() -- Schedule a file to be deleted after next checkpoint
   *
+ * We don't bother passing in the fork number, because this is only used
+ * with main forks.
+ *
   * As with register_dirty_segment, this could involve either a local or
   * a remote pending-ops table.
   */
  static void
  register_unlink(RelFileNodeBackend rnode)
  {
+       /* Should never be used with temp relations */
+       Assert(!RelFileNodeBackendIsTemp(rnode));
+
         if (pendingOpsTable)
         {
                 /* push it into local pending-ops table */
-               RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
+               RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
+                                                        UNLINK_RELATION_REQUEST);
         }
         else
         {
@@ -1309,7 +1330,7 @@ register_unlink(RelFileNodeBackend rnode)
                  * XXX should we just leave the file orphaned instead?
                  */
                 Assert(IsUnderPostmaster);
-               while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
+               while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
                                                                         UNLINK_RELATION_REQUEST))
                         pg_usleep(10000L);      /* 10 msec seems a good number */
         }
@@ -1335,8 +1356,7 @@ register_unlink(RelFileNodeBackend rnode)
   * structure for them.)
   */
  void
-RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
-                                        BlockNumber segno)
+RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
  {
         Assert(pendingOpsTable);
  
@@ -1349,7 +1369,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
                 hash_seq_init(&hstat, pendingOpsTable);
                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
                 {
-                       if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) &&
+                       if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
                                 entry->tag.forknum == forknum)
                         {
                                 /* Okay, cancel this entry */
@@ -1370,7 +1390,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
                 hash_seq_init(&hstat, pendingOpsTable);
                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
                 {
-                       if (entry->tag.rnode.node.dbNode == rnode.node.dbNode)
+                       if (entry->tag.rnode.dbNode == rnode.dbNode)
                         {
                                 /* Okay, cancel this entry */
                                 entry->canceled = true;
@@ -1384,7 +1404,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
                         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
  
                         next = lnext(cell);
-                       if (entry->rnode.node.dbNode == rnode.node.dbNode)
+                       if (entry->rnode.dbNode == rnode.dbNode)
                         {
                                 pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
                                 pfree(entry);
@@ -1399,6 +1419,9 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
                 PendingUnlinkEntry *entry;
  
+               /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+               Assert(forknum == MAIN_FORKNUM);
+
                 entry = palloc(sizeof(PendingUnlinkEntry));
                 entry->rnode = rnode;
                 entry->cycle_ctr = mdckpt_cycle_ctr;
@@ -1448,10 +1471,10 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
  }
  
  /*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
   */
  void
-ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
+ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
  {
         if (pendingOpsTable)
         {
@@ -1486,12 +1509,11 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
  void
  ForgetDatabaseFsyncRequests(Oid dbid)
  {
-       RelFileNodeBackend rnode;
+       RelFileNode rnode;
  
-       rnode.node.dbNode = dbid;
-       rnode.node.spcNode = 0;
-       rnode.node.relNode = 0;
-       rnode.backend = InvalidBackendId;
+       rnode.dbNode = dbid;
+       rnode.spcNode = 0;
+       rnode.relNode = 0;
  
         if (pendingOpsTable)
         {
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h

index eaf2206f5e2f2d9d3d0d1da00000d8f5bce4f372..447bc8ffa17da32a9d4dd9b2437d8baba7b43366 100644 (file)
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -27,7 +27,7 @@ extern void BackgroundWriterMain(void);
  extern void RequestCheckpoint(int flags);
  extern void CheckpointWriteDelay(int flags, double progress);
  
-extern bool ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
+extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
                                         BlockNumber segno);
  extern void AbsorbFsyncRequests(void);
  
diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h

index 659a339e61eb4b522cc0b372a332974db53b1edc..5f07e57c9d138fef293adc9399e8d295e95e8d0e 100644 (file)
--- a/src/include/storage/relfilenode.h
+++ b/src/include/storage/relfilenode.h
@@ -69,6 +69,10 @@ typedef enum ForkNumber
   * Note: in pg_class, relfilenode can be zero to denote that the relation
   * is a "mapped" relation, whose current true filenode number is available
   * from relmapper.c.  Again, this case is NOT allowed in RelFileNodes.
+ *
+ * Note: various places use RelFileNode in hashtable keys.  Therefore,
+ * there *must not* be any unused padding bytes in this struct.  That
+ * should be safe as long as all the fields are of type Oid.
   */
  typedef struct RelFileNode
  {
@@ -79,7 +83,11 @@ typedef struct RelFileNode
  
  /*
   * Augmenting a relfilenode with the backend ID provides all the information
- * we need to locate the physical storage.
+ * we need to locate the physical storage.  The backend ID is InvalidBackendId
+ * for regular relations (those accessible to more than one backend), or the
+ * owning backend's ID for backend-local relations.  Backend-local relations
+ * are always transient and removed in case of a database crash; they are
+ * never WAL-logged or fsync'd.
   */
  typedef struct RelFileNodeBackend
  {
@@ -87,11 +95,15 @@ typedef struct RelFileNodeBackend
         BackendId       backend;
  } RelFileNodeBackend;
  
+#define RelFileNodeBackendIsTemp(rnode) \
+       ((rnode).backend != InvalidBackendId)
+
  /*
   * Note: RelFileNodeEquals and RelFileNodeBackendEquals compare relNode first
   * since that is most likely to be different in two unequal RelFileNodes.  It
   * is probably redundant to compare spcNode if the other fields are found equal,
- * but do it anyway to be sure.
+ * but do it anyway to be sure.  Likewise for checking the backend ID in
+ * RelFileNodeBackendEquals.
   */
  #define RelFileNodeEquals(node1, node2) \
         ((node1).relNode == (node2).relNode && \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h

index e4792668c27452ca56e959ebf4e1d7b9454a7891..48ee70f4187b37b907112d9074b7b6a7cfe85416 100644 (file)
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -71,7 +71,7 @@ typedef struct SMgrRelationData
  typedef SMgrRelationData *SMgrRelation;
  
  #define SmgrIsTemp(smgr) \
-       ((smgr)->smgr_rnode.backend != InvalidBackendId)
+       RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
  
  extern void smgrinit(void);
  extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
@@ -126,10 +126,9 @@ extern void mdsync(void);
  extern void mdpostckpt(void);
  
  extern void SetForwardFsyncRequests(void);
-extern void RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
+extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
                                          BlockNumber segno);
-extern void ForgetRelationFsyncRequests(RelFileNodeBackend rnode,
-                                                       ForkNumber forknum);
+extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
  extern void ForgetDatabaseFsyncRequests(Oid dbid);
  
  /* smgrtype.c */
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 17 Jul 2012 20:55:51 +0000 (16:55 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 17 Jul 2012 20:57:22 +0000 (16:57 -0400)
src/backend/postmaster/bgwriter.c		patch \| blob \| history
src/backend/storage/buffer/bufmgr.c		patch \| blob \| history
src/backend/storage/smgr/md.c		patch \| blob \| history
src/include/postmaster/bgwriter.h		patch \| blob \| history
src/include/storage/relfilenode.h		patch \| blob \| history
src/include/storage/smgr.h		patch \| blob \| history