Revise bgwriter fsync-request mechanism to improve robustness when a table

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 17 Jan 2007 00:17:21 +0000 (00:17 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 17 Jan 2007 00:17:21 +0000 (00:17 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 17 Jan 2007 00:17:21 +0000 (00:17 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 17 Jan 2007 00:17:21 +0000 (00:17 +0000)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c

index 7d19bf9f424aab5c2910a59cb37703fad28faeef..26650253361333d132b1e6cf8a5f8f1ae04b4f86 100644 (file)
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -2,7 +2,7 @@
   *
   * bgwriter.c
   *
- * The background writer (bgwriter) is new in Postgres 8.0.  It attempts
+ * The background writer (bgwriter) is new as of Postgres 8.0.  It attempts
   * to keep regular backends from having to write out dirty shared buffers
   * (which they would only do when needing to free a shared buffer to read in
   * another page).  In the best scenario all writes from shared buffers will
@@ -37,7 +37,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.34 2007/01/05 22:19:36 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.35 2007/01/17 00:17:20 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -103,8 +103,8 @@
  typedef struct
  {
         RelFileNode rnode;
-       BlockNumber segno;
-       /* might add a request-type field later */
+       BlockNumber segno;                      /* InvalidBlockNumber means "revoke" */
+       /* might add a real request-type field later; not needed yet */
  } BgWriterRequest;
  
  typedef struct
@@ -695,6 +695,11 @@ RequestCheckpoint(bool waitforit, bool warnontime)
   * ForwardFsyncRequest
   *             Forward a file-fsync request from a backend to the bgwriter
   *
+ * segno specifies which segment (not block!) of the relation needs to be
+ * fsync'd.  If segno == InvalidBlockNumber, the meaning is to revoke any
+ * pending fsync requests for the entire relation (this message is sent
+ * when the relation is about to be deleted).
+ *
   * Whenever a backend is compelled to write directly to a relation
   * (which should be seldom, if the bgwriter is getting its job done),
   * the backend calls this routine to pass over knowledge that the relation
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index 27a7c3d19a7f335068c6b4fb851f62ba2c9c4804..e3cc00ca1a92fc1633267741f2315d2d341e4eed 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.125 2007/01/05 22:19:38 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.126 2007/01/17 00:17:21 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -31,6 +31,19 @@
  /* interval for calling AbsorbFsyncRequests in mdsync */
  #define FSYNCS_PER_ABSORB              10
  
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get.  Ugh.  This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting revoked ... see mdsync).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
+#endif
+
  /*
   *     The magnetic disk storage manager keeps track of open file
   *     descriptors in its own descriptor pool.  This is done to make it
@@ -93,9 +106,8 @@ static MemoryContext MdCxt;          /* context for all md.c allocations */
   * we keep track of pending fsync operations: we need to remember all relation
   * segments that have been written since the last checkpoint, so that we can
   * fsync them down to disk before completing the next checkpoint.  This hash
- * table remembers the pending operations.     We use a hash table not because
- * we want to look up individual operations, but simply as a convenient way
- * of eliminating duplicate requests.
+ * table remembers the pending operations.     We use a hash table mostly as
+ * a convenient way of eliminating duplicate requests.
   *
   * (Regular backends do not track pending operations locally, but forward
   * them to the bgwriter.)
@@ -104,6 +116,12 @@ typedef struct
  {
         RelFileNode rnode;                      /* the targeted relation */
         BlockNumber segno;                      /* which segment */
+} PendingOperationTag;
+
+typedef struct
+{
+       PendingOperationTag tag;        /* hash table key (must be first!) */
+       int                     failures;               /* number of failed attempts to fsync */
  } PendingOperationEntry;
  
  static HTAB *pendingOpsTable = NULL;
@@ -153,7 +171,7 @@ mdinit(void)
                 HASHCTL         hash_ctl;
  
                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-               hash_ctl.keysize = sizeof(PendingOperationEntry);
+               hash_ctl.keysize = sizeof(PendingOperationTag);
                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
                 hash_ctl.hash = tag_hash;
                 hash_ctl.hcxt = MdCxt;
@@ -236,6 +254,35 @@ mdunlink(RelFileNode rnode, bool isRedo)
  {
         char       *path;
  
+       /*
+        * We have to clean out any pending fsync requests for the doomed relation,
+        * else the next mdsync() will fail.
+        */
+       if (pendingOpsTable)
+       {
+               /* standalone backend or startup process: fsync state is local */
+               RememberFsyncRequest(rnode, InvalidBlockNumber);
+       }
+       else if (IsUnderPostmaster)
+       {
+               /*
+                * Notify the bgwriter about it.  If we fail to queue the revoke
+                * message, we have to sleep and try again ... ugly, but hopefully
+                * won't happen often.
+                *
+                * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
+                * an error would leave the no-longer-used file still present on
+                * disk, which would be bad, so I'm inclined to assume that the
+                * bgwriter will always empty the queue soon.
+                */
+               while (!ForwardFsyncRequest(rnode, InvalidBlockNumber))
+                       pg_usleep(10000L);      /* 10 msec seems a good number */
+               /*
+                * Note we don't wait for the bgwriter to actually absorb the
+                * revoke message; see mdsync() for the implications.
+                */
+       }
+
         path = relpath(rnode);
  
         /* Delete the first segment, or only segment if not doing segmenting */
@@ -414,7 +461,8 @@ mdopen(SMgrRelation reln, ExtensionBehavior behavior)
                 if (fd < 0)
                 {
                         pfree(path);
-                       if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+                       if (behavior == EXTENSION_RETURN_NULL &&
+                               FILE_POSSIBLY_DELETED(errno))
                                 return NULL;
                         ereport(ERROR,
                                         (errcode_for_file_access(),
@@ -834,94 +882,126 @@ mdimmedsync(SMgrRelation reln)
  void
  mdsync(void)
  {
-       HASH_SEQ_STATUS hstat;
-       PendingOperationEntry *entry;
-       int                     absorb_counter;
+       bool            need_retry;
  
         if (!pendingOpsTable)
                 elog(ERROR, "cannot sync without a pendingOpsTable");
  
         /*
-        * If we are in the bgwriter, the sync had better include all fsync
-        * requests that were queued by backends before the checkpoint REDO point
-        * was determined.      We go that a little better by accepting all requests
-        * queued up to the point where we start fsync'ing.
+        * The fsync table could contain requests to fsync relations that have
+        * been deleted (unlinked) by the time we get to them.  Rather than
+        * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
+        * what we will do is retry the whole process after absorbing fsync
+        * request messages again.  Since mdunlink() queues a "revoke" message
+        * before actually unlinking, the fsync request is guaranteed to be gone
+        * the second time if it really was this case.
          */
-       AbsorbFsyncRequests();
+       do {
+               HASH_SEQ_STATUS hstat;
+               PendingOperationEntry *entry;
+               int                     absorb_counter;
+
+               need_retry = false;
  
-       absorb_counter = FSYNCS_PER_ABSORB;
-       hash_seq_init(&hstat, pendingOpsTable);
-       while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-       {
                 /*
-                * If fsync is off then we don't have to bother opening the file at
-                * all.  (We delay checking until this point so that changing fsync on
-                * the fly behaves sensibly.)
+                * If we are in the bgwriter, the sync had better include all fsync
+                * requests that were queued by backends before the checkpoint REDO
+                * point was determined. We go that a little better by accepting all
+                * requests queued up to the point where we start fsync'ing.
                  */
-               if (enableFsync)
-               {
-                       SMgrRelation reln;
-                       MdfdVec    *seg;
+               AbsorbFsyncRequests();
  
+               absorb_counter = FSYNCS_PER_ABSORB;
+               hash_seq_init(&hstat, pendingOpsTable);
+               while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+               {
                         /*
-                        * If in bgwriter, absorb pending requests every so often to
-                        * prevent overflow of the fsync request queue.  The hashtable
-                        * code does not specify whether entries added by this will be
-                        * visited by our search, but we don't really care: it's OK if we
-                        * do, and OK if we don't.
+                        * If fsync is off then we don't have to bother opening the file
+                        * at all.  (We delay checking until this point so that changing
+                        * fsync on the fly behaves sensibly.)
                          */
-                       if (--absorb_counter <= 0)
+                       if (enableFsync)
                         {
-                               AbsorbFsyncRequests();
-                               absorb_counter = FSYNCS_PER_ABSORB;
-                       }
-
-                       /*
-                        * Find or create an smgr hash entry for this relation. This may
-                        * seem a bit unclean -- md calling smgr?  But it's really the
-                        * best solution.  It ensures that the open file reference isn't
-                        * permanently leaked if we get an error here. (You may say "but
-                        * an unreferenced SMgrRelation is still a leak!" Not really,
-                        * because the only case in which a checkpoint is done by a
-                        * process that isn't about to shut down is in the bgwriter, and
-                        * it will periodically do smgrcloseall().      This fact justifies
-                        * our not closing the reln in the success path either, which is a
-                        * good thing since in non-bgwriter cases we couldn't safely do
-                        * that.)  Furthermore, in many cases the relation will have been
-                        * dirtied through this same smgr relation, and so we can save a
-                        * file open/close cycle.
-                        */
-                       reln = smgropen(entry->rnode);
+                               SMgrRelation reln;
+                               MdfdVec    *seg;
+
+                               /*
+                                * If in bgwriter, we want to absorb pending requests every so
+                                * often to prevent overflow of the fsync request queue.  This
+                                * could result in deleting the current entry out from under
+                                * our hashtable scan, so the procedure is to fall out of the
+                                * scan and start over from the top of the function.
+                                */
+                               if (--absorb_counter <= 0)
+                               {
+                                       need_retry = true;
+                                       break;
+                               }
  
-                       /*
-                        * It is possible that the relation has been dropped or truncated
-                        * since the fsync request was entered.  Therefore, we have to
-                        * allow file-not-found errors.  This applies both during
-                        * _mdfd_getseg() and during FileSync, since fd.c might have
-                        * closed the file behind our back.
-                        */
-                       seg = _mdfd_getseg(reln,
-                                                          entry->segno * ((BlockNumber) RELSEG_SIZE),
-                                                          false, EXTENSION_RETURN_NULL);
-                       if (seg)
-                       {
-                               if (FileSync(seg->mdfd_vfd) < 0 &&
-                                       errno != ENOENT)
-                                       ereport(ERROR,
-                                                       (errcode_for_file_access(),
-                                                        errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
-                                                                       entry->segno,
-                                                                       entry->rnode.spcNode,
-                                                                       entry->rnode.dbNode,
-                                                                       entry->rnode.relNode)));
+                               /*
+                                * Find or create an smgr hash entry for this relation. This
+                                * may seem a bit unclean -- md calling smgr?  But it's really
+                                * the best solution.  It ensures that the open file reference
+                                * isn't permanently leaked if we get an error here. (You may
+                                * say "but an unreferenced SMgrRelation is still a leak!" Not
+                                * really, because the only case in which a checkpoint is done
+                                * by a process that isn't about to shut down is in the
+                                * bgwriter, and it will periodically do smgrcloseall(). This
+                                * fact justifies our not closing the reln in the success path
+                                * either, which is a good thing since in non-bgwriter cases
+                                * we couldn't safely do that.)  Furthermore, in many cases
+                                * the relation will have been dirtied through this same smgr
+                                * relation, and so we can save a file open/close cycle.
+                                */
+                               reln = smgropen(entry->tag.rnode);
+
+                               /*
+                                * It is possible that the relation has been dropped or
+                                * truncated since the fsync request was entered.  Therefore,
+                                * allow ENOENT, but only if we didn't fail once already on
+                                * this file.  This applies both during _mdfd_getseg() and
+                                * during FileSync, since fd.c might have closed the file
+                                * behind our back.
+                                */
+                               seg = _mdfd_getseg(reln,
+                                                                  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+                                                                  false, EXTENSION_RETURN_NULL);
+                               if (seg == NULL ||
+                                       FileSync(seg->mdfd_vfd) < 0)
+                               {
+                                       /*
+                                        * XXX is there any point in allowing more than one try?
+                                        * Don't see one at the moment, but easy to change the
+                                        * test here if so.
+                                        */
+                                       if (!FILE_POSSIBLY_DELETED(errno) ||
+                                               ++(entry->failures) > 1)
+                                               ereport(ERROR,
+                                                               (errcode_for_file_access(),
+                                                                errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+                                                                               entry->tag.segno,
+                                                                               entry->tag.rnode.spcNode,
+                                                                               entry->tag.rnode.dbNode,
+                                                                               entry->tag.rnode.relNode)));
+                                       else
+                                               ereport(DEBUG1,
+                                                               (errcode_for_file_access(),
+                                                                errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
+                                                                               entry->tag.segno,
+                                                                               entry->tag.rnode.spcNode,
+                                                                               entry->tag.rnode.dbNode,
+                                                                               entry->tag.rnode.relNode)));
+                                       need_retry = true;
+                                       continue;       /* don't delete the hashtable entry */
+                               }
                         }
-               }
  
-               /* Okay, delete this entry */
-               if (hash_search(pendingOpsTable, entry,
-                                               HASH_REMOVE, NULL) == NULL)
-                       elog(ERROR, "pendingOpsTable corrupted");
-       }
+                       /* Okay, delete this entry */
+                       if (hash_search(pendingOpsTable, &entry->tag,
+                                                       HASH_REMOVE, NULL) == NULL)
+                               elog(ERROR, "pendingOpsTable corrupted");
+               }
+       } while (need_retry);
  }
  
  /*
@@ -938,14 +1018,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
  {
         if (pendingOpsTable)
         {
-               PendingOperationEntry entry;
-
-               /* ensure any pad bytes in the struct are zeroed */
-               MemSet(&entry, 0, sizeof(entry));
-               entry.rnode = reln->smgr_rnode;
-               entry.segno = seg->mdfd_segno;
-
-               (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+               /* push it into local pending-ops table */
+               RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
         }
         else
         {
@@ -968,20 +1042,55 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
   *
   * We stuff the fsync request into the local hash table for execution
   * during the bgwriter's next checkpoint.
+ *
+ * segno == InvalidBlockNumber is a "revoke" request: remove any pending
+ * fsync requests for the whole relation.
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
  {
-       PendingOperationEntry entry;
-
         Assert(pendingOpsTable);
  
-       /* ensure any pad bytes in the struct are zeroed */
-       MemSet(&entry, 0, sizeof(entry));
-       entry.rnode = rnode;
-       entry.segno = segno;
+       if (segno != InvalidBlockNumber)
+       {
+               /* Enter a request to fsync this segment */
+               PendingOperationTag key;
+               PendingOperationEntry *entry;
+               bool            found;
+
+               /* ensure any pad bytes in the hash key are zeroed */
+               MemSet(&key, 0, sizeof(key));
+               key.rnode = rnode;
+               key.segno = segno;
+
+               entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+                                                                                                         &key,
+                                                                                                         HASH_ENTER,
+                                                                                                         &found);
+               if (!found)                             /* new entry, so initialize it */
+                       entry->failures = 0;
+       }
+       else
+       {
+               /*
+                * Remove any pending requests for the entire relation.  (This is a
+                * tad slow but it doesn't seem worth rethinking the table structure.)
+                */
+               HASH_SEQ_STATUS hstat;
+               PendingOperationEntry *entry;
  
-       (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+               hash_seq_init(&hstat, pendingOpsTable);
+               while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+               {
+                       if (RelFileNodeEquals(entry->tag.rnode, rnode))
+                       {
+                               /* Okay, delete this entry */
+                               if (hash_search(pendingOpsTable, &entry->tag,
+                                                               HASH_REMOVE, NULL) == NULL)
+                                       elog(ERROR, "pendingOpsTable corrupted");
+                       }
+               }
+       }
  }
  
  /*
@@ -1102,7 +1211,8 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
                         }
                         if (v->mdfd_chain == NULL)
                         {
-                               if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+                               if (behavior == EXTENSION_RETURN_NULL &&
+                                       FILE_POSSIBLY_DELETED(errno))
                                         return NULL;
                                 ereport(ERROR,
                                                 (errcode_for_file_access(),
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 17 Jan 2007 00:17:21 +0000 (00:17 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 17 Jan 2007 00:17:21 +0000 (00:17 +0000)
src/backend/postmaster/bgwriter.c		patch \| blob \| history
src/backend/storage/smgr/md.c		patch \| blob \| history