]> granicus.if.org Git - postgresql/commitdiff
Rearrange mdsync() looping logic to avoid the problem that a sufficiently
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 12 Apr 2007 17:10:55 +0000 (17:10 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 12 Apr 2007 17:10:55 +0000 (17:10 +0000)
fast flow of new fsync requests can prevent mdsync() from ever completing.
This was an unforeseen consequence of a patch added in Mar 2006 to prevent
the fsync request queue from overflowing.  Problem identified by Heikki
Linnakangas and independently by ITAGAKI Takahiro; fix based on ideas from
Takahiro-san, Heikki, and Tom.

Back-patch as far as 8.1 because a previous back-patch introduced the problem
into 8.1 ...

src/backend/storage/smgr/md.c

index e8396b698d4036fcfc8dbbbcdfe99a3aaa31cd12..3ac3e8f4e73c7eee882563e2fa3242f0f79e8d27 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.128 2007/04/12 17:10:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -122,14 +122,19 @@ typedef struct
        BlockNumber segno;                      /* which segment */
 } PendingOperationTag;
 
+typedef uint16 CycleCtr;               /* can be any convenient integer size */
+
 typedef struct
 {
        PendingOperationTag tag;        /* hash table key (must be first!) */
-       int                     failures;               /* number of failed attempts to fsync */
+       bool            canceled;               /* T => request canceled, not yet removed */
+       CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
 } PendingOperationEntry;
 
 static HTAB *pendingOpsTable = NULL;
 
+static CycleCtr mdsync_cycle_ctr = 0;
+
 
 typedef enum                                   /* behavior for mdopen & _mdfd_getseg */
 {
@@ -856,70 +861,125 @@ mdimmedsync(SMgrRelation reln)
 
 /*
  *     mdsync() -- Sync previous writes to stable storage.
- *
- * This is only called during checkpoints, and checkpoints should only
- * occur in processes that have created a pendingOpsTable.
  */
 void
 mdsync(void)
 {
-       bool            need_retry;
+       static bool mdsync_in_progress = false;
+
+       HASH_SEQ_STATUS hstat;
+       PendingOperationEntry *entry;
+       int                     absorb_counter;
 
+       /*
+        * This is only called during checkpoints, and checkpoints should only
+        * occur in processes that have created a pendingOpsTable.
+        */
        if (!pendingOpsTable)
                elog(ERROR, "cannot sync without a pendingOpsTable");
 
        /*
-        * The fsync table could contain requests to fsync relations that have
-        * been deleted (unlinked) by the time we get to them.  Rather than
-        * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
-        * what we will do is retry the whole process after absorbing fsync
-        * request messages again.  Since mdunlink() queues a "revoke" message
-        * before actually unlinking, the fsync request is guaranteed to be gone
-        * the second time if it really was this case.  DROP DATABASE likewise
-        * has to tell us to forget fsync requests before it starts deletions.
+        * If we are in the bgwriter, the sync had better include all fsync
+        * requests that were queued by backends before the checkpoint REDO
+        * point was determined.  We go that a little better by accepting all
+        * requests queued up to the point where we start fsync'ing.
         */
-       do {
-               HASH_SEQ_STATUS hstat;
-               PendingOperationEntry *entry;
-               int                     absorb_counter;
+       AbsorbFsyncRequests();
+
+       /*
+        * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+        * checkpoint), we want to ignore fsync requests that are entered into the
+        * hashtable after this point --- they should be processed next time,
+        * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
+        * ones: new ones will have cycle_ctr equal to the incremented value of
+        * mdsync_cycle_ctr.
+        *
+        * In normal circumstances, all entries present in the table at this
+        * point will have cycle_ctr exactly equal to the current (about to be old)
+        * value of mdsync_cycle_ctr.  However, if we fail partway through the
+        * fsync'ing loop, then older values of cycle_ctr might remain when we
+        * come back here to try again.  Repeated checkpoint failures would
+        * eventually wrap the counter around to the point where an old entry
+        * might appear new, causing us to skip it, possibly allowing a checkpoint
+        * to succeed that should not have.  To forestall wraparound, any time
+        * the previous mdsync() failed to complete, run through the table and
+        * forcibly set cycle_ctr = mdsync_cycle_ctr.
+        *
+        * Think not to merge this loop with the main loop, as the problem is
+        * exactly that that loop may fail before having visited all the entries.
+        * From a performance point of view it doesn't matter anyway, as this
+        * path will never be taken in a system that's functioning normally.
+        */
+       if (mdsync_in_progress)
+       {
+               /* prior try failed, so update any stale cycle_ctr values */
+               hash_seq_init(&hstat, pendingOpsTable);
+               while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+               {
+                       entry->cycle_ctr = mdsync_cycle_ctr;
+               }
+       }
 
-               need_retry = false;
+       /* Advance counter so that new hashtable entries are distinguishable */
+       mdsync_cycle_ctr++;
 
+       /* Set flag to detect failure if we don't reach the end of the loop */
+       mdsync_in_progress = true;
+
+       /* Now scan the hashtable for fsync requests to process */
+       absorb_counter = FSYNCS_PER_ABSORB;
+       hash_seq_init(&hstat, pendingOpsTable);
+       while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+       {
                /*
-                * If we are in the bgwriter, the sync had better include all fsync
-                * requests that were queued by backends before the checkpoint REDO
-                * point was determined. We go that a little better by accepting all
-                * requests queued up to the point where we start fsync'ing.
+                * If the entry is new then don't process it this time.  Note that
+                * "continue" bypasses the hash-remove call at the bottom of the loop.
                 */
-               AbsorbFsyncRequests();
+               if (entry->cycle_ctr == mdsync_cycle_ctr)
+                       continue;
 
-               absorb_counter = FSYNCS_PER_ABSORB;
-               hash_seq_init(&hstat, pendingOpsTable);
-               while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+               /* Else assert we haven't missed it */
+               Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
+
+               /*
+                * If fsync is off then we don't have to bother opening the file
+                * at all.  (We delay checking until this point so that changing
+                * fsync on the fly behaves sensibly.)  Also, if the entry is
+                * marked canceled, fall through to delete it.
+                */
+               if (enableFsync && !entry->canceled)
                {
+                       int                     failures;
+
                        /*
-                        * If fsync is off then we don't have to bother opening the file
-                        * at all.  (We delay checking until this point so that changing
-                        * fsync on the fly behaves sensibly.)
+                        * If in bgwriter, we want to absorb pending requests every so
+                        * often to prevent overflow of the fsync request queue.  It is
+                        * unspecified whether newly-added entries will be visited by
+                        * hash_seq_search, but we don't care since we don't need to
+                        * process them anyway.
                         */
-                       if (enableFsync)
+                       if (--absorb_counter <= 0)
+                       {
+                               AbsorbFsyncRequests();
+                               absorb_counter = FSYNCS_PER_ABSORB;
+                       }
+
+                       /*
+                        * The fsync table could contain requests to fsync segments that
+                        * have been deleted (unlinked) by the time we get to them.
+                        * Rather than just hoping an ENOENT (or EACCES on Windows) error
+                        * can be ignored, what we do on error is absorb pending requests
+                        * and then retry.  Since mdunlink() queues a "revoke" message
+                        * before actually unlinking, the fsync request is guaranteed to
+                        * be marked canceled after the absorb if it really was this case.
+                        * DROP DATABASE likewise has to tell us to forget fsync requests
+                        * before it starts deletions.
+                        */
+                       for (failures = 0; ; failures++)        /* loop exits at "break" */
                        {
                                SMgrRelation reln;
                                MdfdVec    *seg;
 
-                               /*
-                                * If in bgwriter, we want to absorb pending requests every so
-                                * often to prevent overflow of the fsync request queue.  This
-                                * could result in deleting the current entry out from under
-                                * our hashtable scan, so the procedure is to fall out of the
-                                * scan and start over from the top of the function.
-                                */
-                               if (--absorb_counter <= 0)
-                               {
-                                       need_retry = true;
-                                       break;
-                               }
-
                                /*
                                 * Find or create an smgr hash entry for this relation. This
                                 * may seem a bit unclean -- md calling smgr?  But it's really
@@ -940,7 +1000,7 @@ mdsync(void)
                                /*
                                 * It is possible that the relation has been dropped or
                                 * truncated since the fsync request was entered.  Therefore,
-                                * allow ENOENT, but only if we didn't fail once already on
+                                * allow ENOENT, but only if we didn't fail already on
                                 * this file.  This applies both during _mdfd_getseg() and
                                 * during FileSync, since fd.c might have closed the file
                                 * behind our back.
@@ -948,42 +1008,56 @@ mdsync(void)
                                seg = _mdfd_getseg(reln,
                                                                   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
                                                                   false, EXTENSION_RETURN_NULL);
-                               if (seg == NULL ||
-                                       FileSync(seg->mdfd_vfd) < 0)
-                               {
-                                       /*
-                                        * XXX is there any point in allowing more than one try?
-                                        * Don't see one at the moment, but easy to change the
-                                        * test here if so.
-                                        */
-                                       if (!FILE_POSSIBLY_DELETED(errno) ||
-                                               ++(entry->failures) > 1)
-                                               ereport(ERROR,
-                                                               (errcode_for_file_access(),
-                                                                errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
-                                                                               entry->tag.segno,
-                                                                               entry->tag.rnode.spcNode,
-                                                                               entry->tag.rnode.dbNode,
-                                                                               entry->tag.rnode.relNode)));
-                                       else
-                                               ereport(DEBUG1,
-                                                               (errcode_for_file_access(),
-                                                                errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
-                                                                               entry->tag.segno,
-                                                                               entry->tag.rnode.spcNode,
-                                                                               entry->tag.rnode.dbNode,
-                                                                               entry->tag.rnode.relNode)));
-                                       need_retry = true;
-                                       continue;       /* don't delete the hashtable entry */
-                               }
-                       }
+                               if (seg != NULL &&
+                                       FileSync(seg->mdfd_vfd) >= 0)
+                                       break;          /* success; break out of retry loop */
 
-                       /* Okay, delete this entry */
-                       if (hash_search(pendingOpsTable, &entry->tag,
-                                                       HASH_REMOVE, NULL) == NULL)
-                               elog(ERROR, "pendingOpsTable corrupted");
+                               /*
+                                * XXX is there any point in allowing more than one retry?
+                                * Don't see one at the moment, but easy to change the
+                                * test here if so.
+                                */
+                               if (!FILE_POSSIBLY_DELETED(errno) ||
+                                       failures > 0)
+                                       ereport(ERROR,
+                                                       (errcode_for_file_access(),
+                                                        errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+                                                                       entry->tag.segno,
+                                                                       entry->tag.rnode.spcNode,
+                                                                       entry->tag.rnode.dbNode,
+                                                                       entry->tag.rnode.relNode)));
+                               else
+                                       ereport(DEBUG1,
+                                                       (errcode_for_file_access(),
+                                                        errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
+                                                                       entry->tag.segno,
+                                                                       entry->tag.rnode.spcNode,
+                                                                       entry->tag.rnode.dbNode,
+                                                                       entry->tag.rnode.relNode)));
+
+                               /*
+                                * Absorb incoming requests and check to see if canceled.
+                                */
+                               AbsorbFsyncRequests();
+                               absorb_counter = FSYNCS_PER_ABSORB;     /* might as well... */
+
+                               if (entry->canceled)
+                                       break;
+                       }       /* end retry loop */
                }
-       } while (need_retry);
+
+               /*
+                * If we get here, either we fsync'd successfully, or we don't have
+                * to because enableFsync is off, or the entry is (now) marked
+                * canceled.  Okay to delete it.
+                */
+               if (hash_search(pendingOpsTable, &entry->tag,
+                                               HASH_REMOVE, NULL) == NULL)
+                       elog(ERROR, "pendingOpsTable corrupted");
+       }       /* end loop over hashtable entries */
+
+       /* Flag successful completion of mdsync */
+       mdsync_in_progress = false;
 }
 
 /*
@@ -1027,8 +1101,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
  *
  * The range of possible segment numbers is way less than the range of
  * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
- * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
+ * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
+ * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
  * a whole database.  (These are a tad slow because the hash table has to be
  * searched linearly, but it doesn't seem worth rethinking the table structure
  * for them.)
@@ -1049,10 +1123,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
                {
                        if (RelFileNodeEquals(entry->tag.rnode, rnode))
                        {
-                               /* Okay, delete this entry */
-                               if (hash_search(pendingOpsTable, &entry->tag,
-                                                               HASH_REMOVE, NULL) == NULL)
-                                       elog(ERROR, "pendingOpsTable corrupted");
+                               /* Okay, cancel this entry */
+                               entry->canceled = true;
                        }
                }
        }
@@ -1067,10 +1139,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
                {
                        if (entry->tag.rnode.dbNode == rnode.dbNode)
                        {
-                               /* Okay, delete this entry */
-                               if (hash_search(pendingOpsTable, &entry->tag,
-                                                               HASH_REMOVE, NULL) == NULL)
-                                       elog(ERROR, "pendingOpsTable corrupted");
+                               /* Okay, cancel this entry */
+                               entry->canceled = true;
                        }
                }
        }
@@ -1090,8 +1160,25 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
                                                                                                          &key,
                                                                                                          HASH_ENTER,
                                                                                                          &found);
-               if (!found)                             /* new entry, so initialize it */
-                       entry->failures = 0;
+               /* if new or previously canceled entry, initialize it */
+               if (!found || entry->canceled)
+               {
+                       entry->canceled = false;
+                       entry->cycle_ctr = mdsync_cycle_ctr;
+               }
+               /*
+                * NB: it's intentional that we don't change cycle_ctr if the entry
+                * already exists.  The fsync request must be treated as old, even
+                * though the new request will be satisfied too by any subsequent
+                * fsync.
+                *
+                * However, if the entry is present but is marked canceled, we should
+                * act just as though it wasn't there.  The only case where this could
+                * happen would be if a file had been deleted, we received but did not
+                * yet act on the cancel request, and the same relfilenode was then
+                * assigned to a new file.  We mustn't lose the new request, but
+                * it should be considered new not old.
+                */
        }
 }