]> granicus.if.org Git - postgresql/commitdiff
Rethink checkpointer's fsync-request table representation.
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 19 Jul 2012 23:28:27 +0000 (19:28 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 19 Jul 2012 23:28:27 +0000 (19:28 -0400)
Instead of having one hash table entry per relation/fork/segment, just have
one per relation, and use bitmapsets to represent which specific segments
need to be fsync'd.  This eliminates the need to scan the whole hash table
to implement FORGET_RELATION_FSYNC, which fixes the O(N^2) behavior
recently demonstrated by Jeff Janes for cases involving lots of TRUNCATE or
DROP TABLE operations during a single checkpoint cycle.  Per an idea from
Robert Haas.

(FORGET_DATABASE_FSYNC still sucks, but since dropping a database is a
pretty expensive operation anyway, we'll live with that.)

In passing, improve the delayed-unlink code: remove the pass over the list
in mdpreckpt, since it wasn't doing anything for us except supporting a
useless Assert in mdpostckpt, and fix mdpostckpt so that it will absorb
fsync requests every so often when clearing a large backlog of deletion
requests.

src/backend/storage/smgr/md.c

index 6b3c3ee0e75a8752c5c98f771107027507365f94..97742b92bb4ffa33db3554658511911955f47550 100644 (file)
@@ -32,8 +32,9 @@
 #include "pg_trace.h"
 
 
-/* interval for calling AbsorbFsyncRequests in mdsync */
+/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
 #define FSYNCS_PER_ABSORB              10
+#define UNLINKS_PER_ABSORB             10
 
 /*
  * Special values for the segno arg to RememberFsyncRequest.
@@ -51,7 +52,7 @@
  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  * that's what you get.  Ugh.  This code is designed so that we don't
  * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting revoked ... see mdsync).
+ * a pending fsync request getting canceled ... see mdsync).
  */
 #ifndef WIN32
 #define FILE_POSSIBLY_DELETED(err)     ((err) == ENOENT)
@@ -111,12 +112,12 @@ static MemoryContext MdCxt;               /* context for all md.c allocations */
 
 
 /*
- * In some contexts (currently, standalone backends and the checkpointer process)
+ * In some contexts (currently, standalone backends and the checkpointer)
  * we keep track of pending fsync operations: we need to remember all relation
  * segments that have been written since the last checkpoint, so that we can
  * fsync them down to disk before completing the next checkpoint.  This hash
  * table remembers the pending operations.     We use a hash table mostly as
- * a convenient way of eliminating duplicate requests.
+ * a convenient way of merging duplicate requests.
  *
  * We use a similar mechanism to remember no-longer-needed files that can
  * be deleted after the next checkpoint, but we use a linked list instead of
@@ -129,25 +130,21 @@ static MemoryContext MdCxt;               /* context for all md.c allocations */
  * (Regular backends do not track pending operations locally, but forward
  * them to the checkpointer.)
  */
-typedef struct
-{
-       RelFileNode     rnode;                  /* the targeted relation */
-       ForkNumber      forknum;                /* which fork */
-       BlockNumber segno;                      /* which segment */
-} PendingOperationTag;
-
 typedef uint16 CycleCtr;               /* can be any convenient integer size */
 
 typedef struct
 {
-       PendingOperationTag tag;        /* hash table key (must be first!) */
-       bool            canceled;               /* T => request canceled, not yet removed */
-       CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
+       RelFileNode rnode;                      /* hash table key (must be first!) */
+       CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr of oldest request */
+       /* requests[f] has bit n set if we need to fsync segment n of fork f */
+       Bitmapset  *requests[MAX_FORKNUM + 1];
+       /* canceled[f] is true if we canceled fsyncs for fork "recently" */
+       bool            canceled[MAX_FORKNUM + 1];
 } PendingOperationEntry;
 
 typedef struct
 {
-       RelFileNode     rnode;                  /* the dead relation to delete */
+       RelFileNode rnode;                      /* the dead relation to delete */
        CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
 } PendingUnlinkEntry;
 
@@ -167,7 +164,7 @@ typedef enum                                        /* behavior for mdopen & _mdfd_getseg */
 
 /* local routines */
 static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
-                                                bool isRedo);
+                        bool isRedo);
 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
           ExtensionBehavior behavior);
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
@@ -206,7 +203,7 @@ mdinit(void)
                HASHCTL         hash_ctl;
 
                MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-               hash_ctl.keysize = sizeof(PendingOperationTag);
+               hash_ctl.keysize = sizeof(RelFileNode);
                hash_ctl.entrysize = sizeof(PendingOperationEntry);
                hash_ctl.hash = tag_hash;
                hash_ctl.hcxt = MdCxt;
@@ -227,10 +224,19 @@ mdinit(void)
 void
 SetForwardFsyncRequests(void)
 {
-       /* Perform any pending ops we may have queued up */
+       /* Perform any pending fsyncs we may have queued up, then drop table */
        if (pendingOpsTable)
+       {
                mdsync();
+               hash_destroy(pendingOpsTable);
+       }
        pendingOpsTable = NULL;
+
+       /*
+        * We should not have any pending unlink requests, since mdunlink doesn't
+        * queue unlink requests when isRedo.
+        */
+       Assert(pendingUnlinks == NIL);
 }
 
 /*
@@ -356,7 +362,7 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 {
        /*
         * We have to clean out any pending fsync requests for the doomed
-        * relation, else the next mdsync() will fail.  There can't be any such
+        * relation, else the next mdsync() will fail.  There can't be any such
         * requests for a temp relation, though.  We can send just one request
         * even when deleting multiple forks, since the fsync queuing code accepts
         * the "InvalidForkNumber = all forks" convention.
@@ -1046,8 +1052,11 @@ mdsync(void)
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
+               ForkNumber      forknum;
+
                /*
-                * If the entry is new then don't process it this time.  Note that
+                * If the entry is new then don't process it this time; it might
+                * contain multiple fsync-request bits, but they are all new.  Note
                 * "continue" bypasses the hash-remove call at the bottom of the loop.
                 */
                if (entry->cycle_ctr == mdsync_cycle_ctr)
@@ -1057,130 +1066,176 @@ mdsync(void)
                Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
 
                /*
-                * If fsync is off then we don't have to bother opening the file at
-                * all.  (We delay checking until this point so that changing fsync on
-                * the fly behaves sensibly.)  Also, if the entry is marked canceled,
-                * fall through to delete it.
+                * Scan over the forks and segments represented by the entry.
+                *
+                * The bitmap manipulations are slightly tricky, because we can call
+                * AbsorbFsyncRequests() inside the loop and that could result in
+                * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
+                * This is okay because we unlink each bitmapset from the hashtable
+                * entry before scanning it.  That means that any incoming fsync
+                * requests will be processed now if they reach the table before we
+                * begin to scan their fork.
                 */
-               if (enableFsync && !entry->canceled)
+               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
                {
-                       int                     failures;
+                       Bitmapset  *requests = entry->requests[forknum];
+                       int                     segno;
 
-                       /*
-                        * If in checkpointer, we want to absorb pending requests every so
-                        * often to prevent overflow of the fsync request queue.  It is
-                        * unspecified whether newly-added entries will be visited by
-                        * hash_seq_search, but we don't care since we don't need to
-                        * process them anyway.
-                        */
-                       if (--absorb_counter <= 0)
-                       {
-                               AbsorbFsyncRequests();
-                               absorb_counter = FSYNCS_PER_ABSORB;
-                       }
+                       entry->requests[forknum] = NULL;
+                       entry->canceled[forknum] = false;
 
-                       /*
-                        * The fsync table could contain requests to fsync segments that
-                        * have been deleted (unlinked) by the time we get to them. Rather
-                        * than just hoping an ENOENT (or EACCES on Windows) error can be
-                        * ignored, what we do on error is absorb pending requests and
-                        * then retry.  Since mdunlink() queues a "revoke" message before
-                        * actually unlinking, the fsync request is guaranteed to be
-                        * marked canceled after the absorb if it really was this case.
-                        * DROP DATABASE likewise has to tell us to forget fsync requests
-                        * before it starts deletions.
-                        */
-                       for (failures = 0;; failures++)         /* loop exits at "break" */
+                       while ((segno = bms_first_member(requests)) >= 0)
                        {
-                               SMgrRelation reln;
-                               MdfdVec    *seg;
-                               char       *path;
+                               int                     failures;
 
                                /*
-                                * Find or create an smgr hash entry for this relation. This
-                                * may seem a bit unclean -- md calling smgr?  But it's really
-                                * the best solution.  It ensures that the open file reference
-                                * isn't permanently leaked if we get an error here. (You may
-                                * say "but an unreferenced SMgrRelation is still a leak!" Not
-                                * really, because the only case in which a checkpoint is done
-                                * by a process that isn't about to shut down is in the
-                                * checkpointer, and it will periodically do smgrcloseall().
-                                * This fact justifies our not closing the reln in the success
-                                * path either, which is a good thing since in
-                                * non-checkpointer cases we couldn't safely do that.)
+                                * If fsync is off then we don't have to bother opening the
+                                * file at all.  (We delay checking until this point so that
+                                * changing fsync on the fly behaves sensibly.)
                                 */
-                               reln = smgropen(entry->tag.rnode, InvalidBackendId);
+                               if (!enableFsync)
+                                       continue;
 
                                /*
-                                * It is possible that the relation has been dropped or
-                                * truncated since the fsync request was entered.  Therefore,
-                                * allow ENOENT, but only if we didn't fail already on this
-                                * file.  This applies both during _mdfd_getseg() and during
-                                * FileSync, since fd.c might have closed the file behind our
-                                * back.
+                                * If in checkpointer, we want to absorb pending requests
+                                * every so often to prevent overflow of the fsync request
+                                * queue.  It is unspecified whether newly-added entries will
+                                * be visited by hash_seq_search, but we don't care since we
+                                * don't need to process them anyway.
                                 */
-                               seg = _mdfd_getseg(reln, entry->tag.forknum,
-                                                         entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
-                                                                  false, EXTENSION_RETURN_NULL);
-
-                               INSTR_TIME_SET_CURRENT(sync_start);
-
-                               if (seg != NULL &&
-                                       FileSync(seg->mdfd_vfd) >= 0)
+                               if (--absorb_counter <= 0)
                                {
-                                       INSTR_TIME_SET_CURRENT(sync_end);
-                                       sync_diff = sync_end;
-                                       INSTR_TIME_SUBTRACT(sync_diff, sync_start);
-                                       elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
-                                       if (elapsed > longest)
-                                               longest = elapsed;
-                                       total_elapsed += elapsed;
-                                       processed++;
-                                       if (log_checkpoints)
-                                               elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-                                                        processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);
-
-                                       break;          /* success; break out of retry loop */
+                                       AbsorbFsyncRequests();
+                                       absorb_counter = FSYNCS_PER_ABSORB;
                                }
 
                                /*
-                                * XXX is there any point in allowing more than one retry?
-                                * Don't see one at the moment, but easy to change the test
-                                * here if so.
+                                * The fsync table could contain requests to fsync segments
+                                * that have been deleted (unlinked) by the time we get to
+                                * them. Rather than just hoping an ENOENT (or EACCES on
+                                * Windows) error can be ignored, what we do on error is
+                                * absorb pending requests and then retry.      Since mdunlink()
+                                * queues a "cancel" message before actually unlinking, the
+                                * fsync request is guaranteed to be marked canceled after the
+                                * absorb if it really was this case. DROP DATABASE likewise
+                                * has to tell us to forget fsync requests before it starts
+                                * deletions.
                                 */
-                               path = _mdfd_segpath(reln, entry->tag.forknum,
-                                                                        entry->tag.segno);
-                               if (!FILE_POSSIBLY_DELETED(errno) ||
-                                       failures > 0)
-                                       ereport(ERROR,
-                                                       (errcode_for_file_access(),
-                                                  errmsg("could not fsync file \"%s\": %m", path)));
-                               else
-                                       ereport(DEBUG1,
-                                                       (errcode_for_file_access(),
-                                          errmsg("could not fsync file \"%s\" but retrying: %m",
-                                                         path)));
-                               pfree(path);
-
-                               /*
-                                * Absorb incoming requests and check to see if canceled.
-                                */
-                               AbsorbFsyncRequests();
-                               absorb_counter = FSYNCS_PER_ABSORB;             /* might as well... */
-
-                               if (entry->canceled)
-                                       break;
-                       }                                       /* end retry loop */
+                               for (failures = 0;; failures++) /* loop exits at "break" */
+                               {
+                                       SMgrRelation reln;
+                                       MdfdVec    *seg;
+                                       char       *path;
+                                       int                     save_errno;
+
+                                       /*
+                                        * Find or create an smgr hash entry for this relation.
+                                        * This may seem a bit unclean -- md calling smgr?      But
+                                        * it's really the best solution.  It ensures that the
+                                        * open file reference isn't permanently leaked if we get
+                                        * an error here. (You may say "but an unreferenced
+                                        * SMgrRelation is still a leak!" Not really, because the
+                                        * only case in which a checkpoint is done by a process
+                                        * that isn't about to shut down is in the checkpointer,
+                                        * and it will periodically do smgrcloseall(). This fact
+                                        * justifies our not closing the reln in the success path
+                                        * either, which is a good thing since in non-checkpointer
+                                        * cases we couldn't safely do that.)
+                                        */
+                                       reln = smgropen(entry->rnode, InvalidBackendId);
+
+                                       /* Attempt to open and fsync the target segment */
+                                       seg = _mdfd_getseg(reln, forknum,
+                                                        (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
+                                                                          false, EXTENSION_RETURN_NULL);
+
+                                       INSTR_TIME_SET_CURRENT(sync_start);
+
+                                       if (seg != NULL &&
+                                               FileSync(seg->mdfd_vfd) >= 0)
+                                       {
+                                               /* Success; update statistics about sync timing */
+                                               INSTR_TIME_SET_CURRENT(sync_end);
+                                               sync_diff = sync_end;
+                                               INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+                                               elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+                                               if (elapsed > longest)
+                                                       longest = elapsed;
+                                               total_elapsed += elapsed;
+                                               processed++;
+                                               if (log_checkpoints)
+                                                       elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
+                                                                processed,
+                                                                FilePathName(seg->mdfd_vfd),
+                                                                (double) elapsed / 1000);
+
+                                               break;  /* out of retry loop */
+                                       }
+
+                                       /* Compute file name for use in message */
+                                       save_errno = errno;
+                                       path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
+                                       errno = save_errno;
+
+                                       /*
+                                        * It is possible that the relation has been dropped or
+                                        * truncated since the fsync request was entered.
+                                        * Therefore, allow ENOENT, but only if we didn't fail
+                                        * already on this file.  This applies both for
+                                        * _mdfd_getseg() and for FileSync, since fd.c might have
+                                        * closed the file behind our back.
+                                        *
+                                        * XXX is there any point in allowing more than one retry?
+                                        * Don't see one at the moment, but easy to change the
+                                        * test here if so.
+                                        */
+                                       if (!FILE_POSSIBLY_DELETED(errno) ||
+                                               failures > 0)
+                                               ereport(ERROR,
+                                                               (errcode_for_file_access(),
+                                                                errmsg("could not fsync file \"%s\": %m",
+                                                                               path)));
+                                       else
+                                               ereport(DEBUG1,
+                                                               (errcode_for_file_access(),
+                                               errmsg("could not fsync file \"%s\" but retrying: %m",
+                                                          path)));
+                                       pfree(path);
+
+                                       /*
+                                        * Absorb incoming requests and check to see if a cancel
+                                        * arrived for this relation fork.
+                                        */
+                                       AbsorbFsyncRequests();
+                                       absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
+
+                                       if (entry->canceled[forknum])
+                                               break;
+                               }                               /* end retry loop */
+                       }
+                       bms_free(requests);
                }
 
                /*
-                * If we get here, either we fsync'd successfully, or we don't have to
-                * because enableFsync is off, or the entry is (now) marked canceled.
-                * Okay to delete it.
+                * We've finished everything that was requested before we started to
+                * scan the entry.      If no new requests have been inserted meanwhile,
+                * remove the entry.  Otherwise, update its cycle counter, as all the
+                * requests now in it must have arrived during this cycle.
                 */
-               if (hash_search(pendingOpsTable, &entry->tag,
-                                               HASH_REMOVE, NULL) == NULL)
-                       elog(ERROR, "pendingOpsTable corrupted");
+               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+               {
+                       if (entry->requests[forknum] != NULL)
+                               break;
+               }
+               if (forknum <= MAX_FORKNUM)
+                       entry->cycle_ctr = mdsync_cycle_ctr;
+               else
+               {
+                       /* Okay to remove it */
+                       if (hash_search(pendingOpsTable, &entry->rnode,
+                                                       HASH_REMOVE, NULL) == NULL)
+                               elog(ERROR, "pendingOpsTable corrupted");
+               }
        }                                                       /* end loop over hashtable entries */
 
        /* Return sync performance metrics for report at checkpoint end */
@@ -1209,21 +1264,6 @@ mdsync(void)
 void
 mdpreckpt(void)
 {
-       ListCell   *cell;
-
-       /*
-        * In case the prior checkpoint wasn't completed, stamp all entries in the
-        * list with the current cycle counter.  Anything that's in the list at
-        * the start of checkpoint can surely be deleted after the checkpoint is
-        * finished, regardless of when the request was made.
-        */
-       foreach(cell, pendingUnlinks)
-       {
-               PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
-               entry->cycle_ctr = mdckpt_cycle_ctr;
-       }
-
        /*
         * Any unlink requests arriving after this point will be assigned the next
         * cycle counter, and won't be unlinked until next checkpoint.
@@ -1239,6 +1279,9 @@ mdpreckpt(void)
 void
 mdpostckpt(void)
 {
+       int                     absorb_counter;
+
+       absorb_counter = UNLINKS_PER_ABSORB;
        while (pendingUnlinks != NIL)
        {
                PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
@@ -1247,13 +1290,15 @@ mdpostckpt(void)
                /*
                 * New entries are appended to the end, so if the entry is new we've
                 * reached the end of old entries.
+                *
+                * Note: if just the right number of consecutive checkpoints fail, we
+                * could be fooled here by cycle_ctr wraparound.  However, the only
+                * consequence is that we'd delay unlinking for one more checkpoint,
+                * which is perfectly tolerable.
                 */
                if (entry->cycle_ctr == mdckpt_cycle_ctr)
                        break;
 
-               /* Else assert we haven't missed it */
-               Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
-
                /* Unlink the file */
                path = relpathperm(entry->rnode, MAIN_FORKNUM);
                if (unlink(path) < 0)
@@ -1272,8 +1317,21 @@ mdpostckpt(void)
                }
                pfree(path);
 
+               /* And remove the list entry */
                pendingUnlinks = list_delete_first(pendingUnlinks);
                pfree(entry);
+
+               /*
+                * As in mdsync, we don't want to stop absorbing fsync requests for a
+                * long time when there are many deletions to be done.  We can safely
+                * call AbsorbFsyncRequests() at this point in the loop (note it might
+                * try to delete list entries).
+                */
+               if (--absorb_counter <= 0)
+               {
+                       AbsorbFsyncRequests();
+                       absorb_counter = UNLINKS_PER_ABSORB;
+               }
        }
 }
 
@@ -1353,7 +1411,7 @@ register_unlink(RelFileNodeBackend rnode)
 /*
  * RememberFsyncRequest() -- callback from checkpointer side of fsync request
  *
- * We stuff most fsync requests into the local hash table for execution
+ * We stuff fsync requests into the local hash table for execution
  * during the checkpointer's next checkpoint.  UNLINK requests go into a
  * separate linked list, however, because they get processed separately.
  *
@@ -1361,14 +1419,15 @@ register_unlink(RelFileNodeBackend rnode)
  * BlockNumber, so we can reserve high values of segno for special purposes.
  * We define three:
  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
- *   either for one fork, or all forks if forknum is InvalidForkNumber
+ *      either for one fork, or all forks if forknum is InvalidForkNumber
  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
  *      checkpoint.
+ * Note also that we're assuming real segment numbers don't exceed INT_MAX.
  *
- * (Handling the FORGET_* requests is a tad slow because the hash table has
- * to be searched linearly, but it doesn't seem worth rethinking the table
- * structure for them.)
+ * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
+ * table has to be searched linearly, but dropping a database is a pretty
+ * heavyweight operation anyhow, so we'll live with it.)
  */
 void
 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1378,18 +1437,37 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
        if (segno == FORGET_RELATION_FSYNC)
        {
                /* Remove any pending requests for the relation (one or all forks) */
-               HASH_SEQ_STATUS hstat;
                PendingOperationEntry *entry;
 
-               hash_seq_init(&hstat, pendingOpsTable);
-               while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+               entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+                                                                                                         &rnode,
+                                                                                                         HASH_FIND,
+                                                                                                         NULL);
+               if (entry)
                {
-                       if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
-                               (entry->tag.forknum == forknum ||
-                                forknum == InvalidForkNumber))
+                       /*
+                        * We can't just delete the entry since mdsync could have an
+                        * active hashtable scan.  Instead we delete the bitmapsets; this
+                        * is safe because of the way mdsync is coded.  We also set the
+                        * "canceled" flags so that mdsync can tell that a cancel arrived
+                        * for the fork(s).
+                        */
+                       if (forknum == InvalidForkNumber)
+                       {
+                               /* remove requests for all forks */
+                               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                               {
+                                       bms_free(entry->requests[forknum]);
+                                       entry->requests[forknum] = NULL;
+                                       entry->canceled[forknum] = true;
+                               }
+                       }
+                       else
                        {
-                               /* Okay, cancel this entry */
-                               entry->canceled = true;
+                               /* remove requests for single fork */
+                               bms_free(entry->requests[forknum]);
+                               entry->requests[forknum] = NULL;
+                               entry->canceled[forknum] = true;
                        }
                }
        }
@@ -1406,10 +1484,15 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
                hash_seq_init(&hstat, pendingOpsTable);
                while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
                {
-                       if (entry->tag.rnode.dbNode == rnode.dbNode)
+                       if (entry->rnode.dbNode == rnode.dbNode)
                        {
-                               /* Okay, cancel this entry */
-                               entry->canceled = true;
+                               /* remove requests for all forks */
+                               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                               {
+                                       bms_free(entry->requests[forknum]);
+                                       entry->requests[forknum] = NULL;
+                                       entry->canceled[forknum] = true;
+                               }
                        }
                }
 
@@ -1449,40 +1532,32 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
        else
        {
                /* Normal case: enter a request to fsync this segment */
-               PendingOperationTag key;
+               MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
                PendingOperationEntry *entry;
                bool            found;
 
-               /* ensure any pad bytes in the hash key are zeroed */
-               MemSet(&key, 0, sizeof(key));
-               key.rnode = rnode;
-               key.forknum = forknum;
-               key.segno = segno;
-
                entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-                                                                                                         &key,
+                                                                                                         &rnode,
                                                                                                          HASH_ENTER,
                                                                                                          &found);
-               /* if new or previously canceled entry, initialize it */
-               if (!found || entry->canceled)
+               /* if new entry, initialize it */
+               if (!found)
                {
-                       entry->canceled = false;
                        entry->cycle_ctr = mdsync_cycle_ctr;
+                       MemSet(entry->requests, 0, sizeof(entry->requests));
+                       MemSet(entry->canceled, 0, sizeof(entry->canceled));
                }
 
                /*
                 * NB: it's intentional that we don't change cycle_ctr if the entry
-                * already exists.      The fsync request must be treated as old, even
-                * though the new request will be satisfied too by any subsequent
-                * fsync.
-                *
-                * However, if the entry is present but is marked canceled, we should
-                * act just as though it wasn't there.  The only case where this could
-                * happen would be if a file had been deleted, we received but did not
-                * yet act on the cancel request, and the same relfilenode was then
-                * assigned to a new file.      We mustn't lose the new request, but it
-                * should be considered new not old.
+                * already exists.      The cycle_ctr must represent the oldest fsync
+                * request that could be in the entry.
                 */
+
+               entry->requests[forknum] = bms_add_member(entry->requests[forknum],
+                                                                                                 (int) segno);
+
+               MemoryContextSwitchTo(oldcxt);
        }
 }
 
@@ -1503,7 +1578,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
        else if (IsUnderPostmaster)
        {
                /*
-                * Notify the checkpointer about it.  If we fail to queue the revoke
+                * Notify the checkpointer about it.  If we fail to queue the cancel
                 * message, we have to sleep and try again ... ugly, but hopefully
                 * won't happen often.
                 *
@@ -1517,7 +1592,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
 
                /*
                 * Note we don't wait for the checkpointer to actually absorb the
-                * revoke message; see mdsync() for the implications.
+                * cancel message; see mdsync() for the implications.
                 */
        }
 }