Per previous discussions, get rid of use of sync(2) in favor of explicitly fsync'ing the files written since the last checkpoint

author Tom Lane <tgl@sss.pgh.pa.us>

Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c

index 88e1f1256ad80a8f007341325e83fa25db75b9c1..97f887d0a06ce234e256dda5bf204333ec7033b0 100644 (file)
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -13,7 +13,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.19 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -97,7 +97,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
         Assert(status == TRANSACTION_STATUS_COMMITTED ||
                    status == TRANSACTION_STATUS_ABORTED);
  
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
  
         byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
         byteptr += byteno;
@@ -110,7 +110,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
  
         /* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */
  
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
  }
  
  /*
@@ -128,14 +128,14 @@ TransactionIdGetStatus(TransactionId xid)
         char       *byteptr;
         XidStatus       status;
  
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
  
         byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false);
         byteptr += byteno;
  
         status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
  
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
  
         return status;
  }
@@ -169,16 +169,16 @@ BootStrapCLOG(void)
  {
         int                     slotno;
  
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
  
         /* Create and zero the first page of the commit log */
         slotno = ZeroCLOGPage(0, false);
  
         /* Make sure it's written out */
-       SimpleLruWritePage(ClogCtl, slotno);
+       SimpleLruWritePage(ClogCtl, slotno, NULL);
         /* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */
  
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
  }
  
  /*
@@ -256,12 +256,12 @@ ExtendCLOG(TransactionId newestXact)
  
         pageno = TransactionIdToPage(newestXact);
  
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
  
         /* Zero the page and make an XLOG entry about it */
         ZeroCLOGPage(pageno, true);
  
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
  }
  
  
@@ -351,13 +351,13 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
  
                 memcpy(&pageno, XLogRecGetData(record), sizeof(int));
  
-               LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+               LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
  
                 slotno = ZeroCLOGPage(pageno, false);
-               SimpleLruWritePage(ClogCtl, slotno);
+               SimpleLruWritePage(ClogCtl, slotno, NULL);
                 /* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
  
-               LWLockRelease(ClogCtl->locks->ControlLock);
+               LWLockRelease(ClogCtl->ControlLock);
         }
  }
  
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c

index 57dcd2b33798c3e9230681bb5cfaee593903aa5a..58798d0f07fcb56d230f886258c94bc07e5a4f4a 100644 (file)
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.15 2004/05/29 22:48:18 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.16 2004/05/31 03:47:54 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -17,6 +17,7 @@
  #include <unistd.h>
  
  #include "access/slru.h"
+#include "access/clog.h"               /* only for NUM_CLOG_BUFFERS */
  #include "postmaster/bgwriter.h"
  #include "storage/fd.h"
  #include "storage/lwlock.h"
@@ -100,6 +101,8 @@ typedef enum
   */
  typedef struct SlruSharedData
  {
+       LWLockId        ControlLock;
+
         /*
          * Info for each buffer slot.  Page number is undefined when status is
          * EMPTY.  lru_count is essentially the number of page switches since
@@ -110,6 +113,7 @@ typedef struct SlruSharedData
         SlruPageStatus page_status[NUM_CLOG_BUFFERS];
         int                     page_number[NUM_CLOG_BUFFERS];
         unsigned int page_lru_count[NUM_CLOG_BUFFERS];
+       LWLockId        BufferLocks[NUM_CLOG_BUFFERS];  /* Per-buffer I/O locks */
  
         /*
          * latest_page_number is the page number of the current end of the
@@ -118,12 +122,24 @@ typedef struct SlruSharedData
          */
         int                     latest_page_number;
  } SlruSharedData;
-typedef SlruSharedData *SlruShared;
-
  
  #define SlruFileName(ctl, path, seg) \
         snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
  
+/*
+ * During SimpleLruFlush(), we will usually not need to write/fsync more
+ * than one or two physical files, but we may need to write several pages
+ * per file.  We can consolidate the I/O requests by leaving files open
+ * until control returns to SimpleLruFlush().  This data structure remembers
+ * which files are open.
+ */
+typedef struct SlruFlushData
+{
+       int                     num_files;                                      /* # files actually open */
+       int                     fd[NUM_CLOG_BUFFERS];           /* their FD's */
+       int                     segno[NUM_CLOG_BUFFERS];        /* their clog seg#s */
+} SlruFlushData;
+
  /*
   * Macro to mark a buffer slot "most recently used".
   */
@@ -145,14 +161,17 @@ typedef enum
         SLRU_SEEK_FAILED,
         SLRU_READ_FAILED,
         SLRU_WRITE_FAILED,
+       SLRU_FSYNC_FAILED,
         SLRU_CLOSE_FAILED
  } SlruErrorCause;
+
  static SlruErrorCause slru_errcause;
  static int     slru_errno;
  
  
  static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
-static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno);
+static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
+                                                                 SlruFlush fdata);
  static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
  static int     SlruSelectLRUPage(SlruCtl ctl, int pageno);
  static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
@@ -165,24 +184,16 @@ static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
  int
  SimpleLruShmemSize(void)
  {
-       return MAXALIGN(sizeof(SlruSharedData))
-               + BLCKSZ * NUM_CLOG_BUFFERS
-               + MAXALIGN(sizeof(SlruLockData))
-               ;
+       return MAXALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_CLOG_BUFFERS;
  }
  
  void
  SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
  {
-       bool            found;
-       char       *ptr;
         SlruShared      shared;
-       SlruLock        locks;
+       bool            found;
  
-       ptr = ShmemInitStruct(name, SimpleLruShmemSize(), &found);
-       shared = (SlruShared) ptr;
-       locks = (SlruLock) (ptr + MAXALIGN(sizeof(SlruSharedData)) +
-                                               BLCKSZ * NUM_CLOG_BUFFERS);
+       shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(), &found);
  
         if (!IsUnderPostmaster)
         {
@@ -192,18 +203,18 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
  
                 Assert(!found);
  
-               locks->ControlLock = LWLockAssign();
-
                 memset(shared, 0, sizeof(SlruSharedData));
  
+               shared->ControlLock = LWLockAssign();
+
                 bufptr = (char *) shared + MAXALIGN(sizeof(SlruSharedData));
  
                 for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
                 {
-                       locks->BufferLocks[slotno] = LWLockAssign();
                         shared->page_buffer[slotno] = bufptr;
                         shared->page_status[slotno] = SLRU_PAGE_EMPTY;
                         shared->page_lru_count[slotno] = 1;
+                       shared->BufferLocks[slotno] = LWLockAssign();
                         bufptr += BLCKSZ;
                 }
  
@@ -213,10 +224,10 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
                 Assert(found);
  
         /* Initialize the unshared control struct */
-       ctl->locks = locks;
         ctl->shared = shared;
+       ctl->ControlLock = shared->ControlLock;
  
-       /* Init directory path */
+       /* Initialize unshared copy of directory path */
         snprintf(ctl->Dir, MAXPGPATH, "%s/%s", DataDir, subdir);
  }
  
@@ -232,7 +243,7 @@ int
  SimpleLruZeroPage(SlruCtl ctl, int pageno)
  {
         int                     slotno;
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
  
         /* Find a suitable buffer slot for the page */
         slotno = SlruSelectLRUPage(ctl, pageno);
@@ -270,7 +281,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
  char *
  SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
  {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
  
         /* Outer loop handles restart if we lose the buffer to someone else */
         for (;;)
@@ -313,8 +324,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
                 SlruRecentlyUsed(shared, slotno);
  
                 /* Release shared lock, grab per-buffer lock instead */
-               LWLockRelease(ctl->locks->ControlLock);
-               LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+               LWLockRelease(shared->ControlLock);
+               LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
  
                 /*
                  * Check to see if someone else already did the read, or took the
@@ -323,8 +334,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
                 if (shared->page_number[slotno] != pageno ||
                         shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
                 {
-                       LWLockRelease(ctl->locks->BufferLocks[slotno]);
-                       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+                       LWLockRelease(shared->BufferLocks[slotno]);
+                       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
                         continue;
                 }
  
@@ -332,14 +343,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
                 ok = SlruPhysicalReadPage(ctl, pageno, slotno);
  
                 /* Re-acquire shared control lock and update page state */
-               LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+               LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
  
                 Assert(shared->page_number[slotno] == pageno &&
                            shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS);
  
                 shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY;
  
-               LWLockRelease(ctl->locks->BufferLocks[slotno]);
+               LWLockRelease(shared->BufferLocks[slotno]);
  
                 /* Now it's okay to ereport if we failed */
                 if (!ok)
@@ -364,11 +375,11 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
   * Control lock must be held at entry, and will be held at exit.
   */
  void
-SimpleLruWritePage(SlruCtl ctl, int slotno)
+SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
  {
         int                     pageno;
         bool            ok;
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
  
         /* Do nothing if page does not need writing */
         if (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
@@ -378,8 +389,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
         pageno = shared->page_number[slotno];
  
         /* Release shared lock, grab per-buffer lock instead */
-       LWLockRelease(ctl->locks->ControlLock);
-       LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+       LWLockRelease(shared->ControlLock);
+       LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
  
         /*
          * Check to see if someone else already did the write, or took the
@@ -392,8 +403,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
                 (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
                  shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS))
         {
-               LWLockRelease(ctl->locks->BufferLocks[slotno]);
-               LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+               LWLockRelease(shared->BufferLocks[slotno]);
+               LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
                 return;
         }
  
@@ -412,10 +423,19 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
         shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
  
         /* Okay, do the write */
-       ok = SlruPhysicalWritePage(ctl, pageno, slotno);
+       ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
+
+       /* If we failed, and we're in a flush, better close the files */
+       if (!ok && fdata)
+       {
+               int             i;
+
+               for (i = 0; i < fdata->num_files; i++)
+                       close(fdata->fd[i]);
+       }
  
         /* Re-acquire shared control lock and update page state */
-       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
  
         Assert(shared->page_number[slotno] == pageno &&
                    (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS ||
@@ -425,7 +445,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
         if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
                 shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY;
  
-       LWLockRelease(ctl->locks->BufferLocks[slotno]);
+       LWLockRelease(shared->BufferLocks[slotno]);
  
         /* Now it's okay to ereport if we failed */
         if (!ok)
@@ -445,7 +465,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
  static bool
  SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
  {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
         int                     segno = pageno / SLRU_PAGES_PER_SEGMENT;
         int                     rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
         int                     offset = rpageno * BLCKSZ;
@@ -482,6 +502,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
         {
                 slru_errcause = SLRU_SEEK_FAILED;
                 slru_errno = errno;
+               close(fd);
                 return false;
         }
  
@@ -490,6 +511,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
         {
                 slru_errcause = SLRU_READ_FAILED;
                 slru_errno = errno;
+               close(fd);
                 return false;
         }
  
@@ -511,50 +533,80 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
   * info in static variables to let SlruReportIOError make the report.
   *
   * For now, assume it's not worth keeping a file pointer open across
- * read/write operations.  We could cache one virtual file pointer ...
+ * independent read/write operations.  We do batch operations during
+ * SimpleLruFlush, though.
+ *
+ * fdata is NULL for a standalone write, pointer to open-file info during
+ * SimpleLruFlush.
   */
  static bool
-SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
+SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
  {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
         int                     segno = pageno / SLRU_PAGES_PER_SEGMENT;
         int                     rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
         int                     offset = rpageno * BLCKSZ;
         char            path[MAXPGPATH];
-       int                     fd;
-
-       SlruFileName(ctl, path, segno);
+       int                     fd = -1;
  
         /*
-        * If the file doesn't already exist, we should create it.  It is
-        * possible for this to need to happen when writing a page that's not
-        * first in its segment; we assume the OS can cope with that.  (Note:
-        * it might seem that it'd be okay to create files only when
-        * SimpleLruZeroPage is called for the first page of a segment.
-        * However, if after a crash and restart the REDO logic elects to
-        * replay the log from a checkpoint before the latest one, then it's
-        * possible that we will get commands to set transaction status of
-        * transactions that have already been truncated from the commit log.
-        * Easiest way to deal with that is to accept references to
-        * nonexistent files here and in SlruPhysicalReadPage.)
+        * During a Flush, we may already have the desired file open.
          */
-       fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
-       if (fd < 0)
+       if (fdata)
         {
-               if (errno != ENOENT)
+               int             i;
+
+               for (i = 0; i < fdata->num_files; i++)
                 {
-                       slru_errcause = SLRU_OPEN_FAILED;
-                       slru_errno = errno;
-                       return false;
+                       if (fdata->segno[i] == segno)
+                       {
+                               fd = fdata->fd[i];
+                               break;
+                       }
                 }
+       }
  
-               fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
-                                                  S_IRUSR | S_IWUSR);
+       if (fd < 0)
+       {
+               /*
+                * If the file doesn't already exist, we should create it.  It is
+                * possible for this to need to happen when writing a page that's not
+                * first in its segment; we assume the OS can cope with that.
+                * (Note: it might seem that it'd be okay to create files only when
+                * SimpleLruZeroPage is called for the first page of a segment.
+                * However, if after a crash and restart the REDO logic elects to
+                * replay the log from a checkpoint before the latest one, then it's
+                * possible that we will get commands to set transaction status of
+                * transactions that have already been truncated from the commit log.
+                * Easiest way to deal with that is to accept references to
+                * nonexistent files here and in SlruPhysicalReadPage.)
+                */
+               SlruFileName(ctl, path, segno);
+               fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
                 if (fd < 0)
                 {
-                       slru_errcause = SLRU_CREATE_FAILED;
-                       slru_errno = errno;
-                       return false;
+                       if (errno != ENOENT)
+                       {
+                               slru_errcause = SLRU_OPEN_FAILED;
+                               slru_errno = errno;
+                               return false;
+                       }
+
+                       fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+                                                          S_IRUSR | S_IWUSR);
+                       if (fd < 0)
+                       {
+                               slru_errcause = SLRU_CREATE_FAILED;
+                               slru_errno = errno;
+                               return false;
+                       }
+               }
+
+               if (fdata)
+               {
+                       fdata->fd[fdata->num_files] = fd;
+                       fdata->segno[fdata->num_files] = segno;
+                       fdata->num_files++;
                 }
         }
  
@@ -562,6 +614,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
         {
                 slru_errcause = SLRU_SEEK_FAILED;
                 slru_errno = errno;
+               if (!fdata)
+                       close(fd);
                 return false;
         }
  
@@ -573,14 +627,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
                         errno = ENOSPC;
                 slru_errcause = SLRU_WRITE_FAILED;
                 slru_errno = errno;
+               if (!fdata)
+                       close(fd);
                 return false;
         }
  
-       if (close(fd))
+       /*
+        * If not part of Flush, need to fsync now.  We assume this happens
+        * infrequently enough that it's not a performance issue.
+        */
+       if (!fdata)
         {
-               slru_errcause = SLRU_CLOSE_FAILED;
-               slru_errno = errno;
-               return false;
+               if (pg_fsync(fd))
+               {
+                       slru_errcause = SLRU_FSYNC_FAILED;
+                       slru_errno = errno;
+                       close(fd);
+                       return false;
+               }
+
+               if (close(fd))
+               {
+                       slru_errcause = SLRU_CLOSE_FAILED;
+                       slru_errno = errno;
+                       return false;
+               }
         }
  
         return true;
@@ -637,6 +708,13 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
                                   errdetail("could not write to file \"%s\" at offset %u: %m",
                                                         path, offset)));
                         break;
+               case SLRU_FSYNC_FAILED:
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                               errmsg("could not access status of transaction %u", xid),
+                                 errdetail("could not fsync file \"%s\": %m",
+                                                       path)));
+                       break;
                 case SLRU_CLOSE_FAILED:
                         ereport(ERROR,
                                         (errcode_for_file_access(),
@@ -668,7 +746,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
  static int
  SlruSelectLRUPage(SlruCtl ctl, int pageno)
  {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
  
         /* Outer loop handles restart after I/O */
         for (;;)
@@ -717,7 +795,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
                         (void) SimpleLruReadPage(ctl, shared->page_number[bestslot],
                                                                          InvalidTransactionId, false);
                 else
-                       SimpleLruWritePage(ctl, bestslot);
+                       SimpleLruWritePage(ctl, bestslot, NULL);
  
                 /*
                  * Now loop back and try again.  This is the easiest way of
@@ -733,7 +811,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
  void
  SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
  {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
  
         shared->latest_page_number = pageno;
  }
@@ -744,16 +822,20 @@ SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
  void
  SimpleLruFlush(SlruCtl ctl, bool checkpoint)
  {
-#ifdef USE_ASSERT_CHECKING             /* only used in Assert() */
-       SlruShared      shared = (SlruShared) ctl->shared;
-#endif
+       SlruShared      shared = ctl->shared;
+       SlruFlushData fdata;
         int                     slotno;
+       int                     pageno = 0;
+       int                     i;
+       bool            ok;
+
+       fdata.num_files = 0;
  
-       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
  
         for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
         {
-               SimpleLruWritePage(ctl, slotno);
+               SimpleLruWritePage(ctl, slotno, &fdata);
  
                 /*
                  * When called during a checkpoint, we cannot assert that the slot
@@ -765,7 +847,32 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
                            shared->page_status[slotno] == SLRU_PAGE_CLEAN);
         }
  
-       LWLockRelease(ctl->locks->ControlLock);
+       LWLockRelease(shared->ControlLock);
+
+       /*
+        * Now fsync and close any files that were open
+        */
+       ok = true;
+       for (i = 0; i < fdata.num_files; i++)
+       {
+               if (pg_fsync(fdata.fd[i]))
+               {
+                       slru_errcause = SLRU_FSYNC_FAILED;
+                       slru_errno = errno;
+                       pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+                       ok = false;
+               }
+
+               if (close(fdata.fd[i]))
+               {
+                       slru_errcause = SLRU_CLOSE_FAILED;
+                       slru_errno = errno;
+                       pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+                       ok = false;
+               }
+       }
+       if (!ok)
+               SlruReportIOError(ctl, pageno, InvalidTransactionId);
  }
  
  /*
@@ -786,7 +893,7 @@ void
  SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
  {
         int                     slotno;
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
  
         /*
          * The cutoff point is the start of the segment containing cutoffPage.
@@ -805,7 +912,7 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
          * have been flushed already during the checkpoint, we're just being
          * extra careful here.)
          */
-       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
  
  restart:;
  
@@ -817,7 +924,7 @@ restart:;
          */
         if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
         {
-               LWLockRelease(ctl->locks->ControlLock);
+               LWLockRelease(shared->ControlLock);
                 ereport(LOG,
                                 (errmsg("could not truncate directory \"%s\": apparent wraparound",
                                                 ctl->Dir)));
@@ -849,11 +956,11 @@ restart:;
                         (void) SimpleLruReadPage(ctl, shared->page_number[slotno],
                                                                          InvalidTransactionId, false);
                 else
-                       SimpleLruWritePage(ctl, slotno);
+                       SimpleLruWritePage(ctl, slotno, NULL);
                 goto restart;
         }
  
-       LWLockRelease(ctl->locks->ControlLock);
+       LWLockRelease(shared->ControlLock);
  
         /* Now we can remove the old segment(s) */
         (void) SlruScanDirectory(ctl, cutoffPage, true);
@@ -878,7 +985,8 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
         if (cldir == NULL)
                 ereport(ERROR,
                                 (errcode_for_file_access(),
-                          errmsg("could not open directory \"%s\": %m", ctl->Dir)));
+                                errmsg("could not open directory \"%s\": %m",
+                                               ctl->Dir)));
  
         errno = 0;
         while ((clde = readdir(cldir)) != NULL)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c

index ce80a4feff7a6454152d6c154cca5c50296c975d..6bb683386a2f109d2b413f73d76a4b860044dd10 100644 (file)
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -27,14 +27,17 @@
   *
   * If the bgwriter exits unexpectedly, the postmaster treats that the same
   * as a backend crash: shared memory may be corrupted, so remaining backends
- * should be killed by SIGQUIT and then a recovery cycle started.
+ * should be killed by SIGQUIT and then a recovery cycle started.  (Even if
+ * shared memory isn't corrupted, we have lost information about which
+ * files need to be fsync'd for the next checkpoint, and so a system
+ * restart needs to be forced.)
   *
   *
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.1 2004/05/29 22:48:19 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.2 2004/05/31 03:47:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -55,13 +58,54 @@
  #include "utils/guc.h"
  
  
-/*
+/*----------
   * Shared memory area for communication between bgwriter and backends
+ *
+ * The ckpt counters allow backends to watch for completion of a checkpoint
+ * request they send.  Here's how it works:
+ *     * At start of a checkpoint, bgwriter increments ckpt_started.
+ *     * On completion of a checkpoint, bgwriter sets ckpt_done to
+ *       equal ckpt_started.
+ *     * On failure of a checkpoint, bgwriter first increments ckpt_failed,
+ *       then sets ckpt_done to equal ckpt_started.
+ * All three fields are declared sig_atomic_t to ensure they can be read
+ * and written without explicit locking.  The algorithm for backends is:
+ *     1. Record current values of ckpt_failed and ckpt_started (in that
+ *        order!).
+ *     2. Send signal to request checkpoint.
+ *     3. Sleep until ckpt_started changes.  Now you know a checkpoint has
+ *        begun since you started this algorithm (although *not* that it was
+ *        specifically initiated by your signal).
+ *     4. Record new value of ckpt_started.
+ *     5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
+ *        arithmetic here in case counters wrap around.)  Now you know a
+ *        checkpoint has started and completed, but not whether it was
+ *        successful.
+ *     6. If ckpt_failed is different from the originally saved value,
+ *        assume request failed; otherwise it was definitely successful.
+ *
+ * The requests array holds fsync requests sent by backends and not yet
+ * absorbed by the bgwriter.
+ *----------
   */
+typedef struct
+{
+       RelFileNode             rnode;
+       BlockNumber             segno;
+       /* might add a request-type field later */
+} BgWriterRequest;
+
  typedef struct
  {
         pid_t   bgwriter_pid;           /* PID of bgwriter (0 if not started) */
-       sig_atomic_t    checkpoint_count; /* advances when checkpoint done */
+
+       sig_atomic_t    ckpt_started;   /* advances when checkpoint starts */
+       sig_atomic_t    ckpt_done;              /* advances when checkpoint done */
+       sig_atomic_t    ckpt_failed;    /* advances when checkpoint fails */
+
+       int                             num_requests;   /* current # of requests */
+       int                             max_requests;   /* allocated array size */
+       BgWriterRequest requests[1];    /* VARIABLE LENGTH ARRAY */
  } BgWriterShmemStruct;
  
  static BgWriterShmemStruct *BgWriterShmem;
@@ -86,6 +130,10 @@ static volatile sig_atomic_t shutdown_requested = false;
  /*
   * Private state
   */
+static bool            am_bg_writer = false;
+
+static bool            ckpt_active = false;
+
  static time_t  last_checkpoint_time;
  
  
@@ -106,6 +154,7 @@ BackgroundWriterMain(void)
  {
         Assert(BgWriterShmem != NULL);
         BgWriterShmem->bgwriter_pid = MyProcPid;
+       am_bg_writer = true;
  
         /*
          * Properly accept or ignore signals the postmaster might send us
@@ -180,6 +229,17 @@ BackgroundWriterMain(void)
                  */
                 InError = false;
  
+               /* Warn any waiting backends that the checkpoint failed. */
+               if (ckpt_active)
+               {
+                       /* use volatile pointer to prevent code rearrangement */
+                       volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+
+                       bgs->ckpt_failed++;
+                       bgs->ckpt_done = bgs->ckpt_started;
+                       ckpt_active = false;
+               }
+
                 /*
                  * Exit interrupt holdoff section we implicitly established above.
                  */
@@ -214,8 +274,17 @@ BackgroundWriterMain(void)
                 long            udelay;
  
                 /*
-                * Process any signals received recently.
+                * Emergency bailout if postmaster has died.  This is to avoid the
+                * necessity for manual cleanup of all postmaster children.
                  */
+               if (!PostmasterIsAlive(true))
+                       exit(1);
+
+               /*
+                * Process any requests or signals received recently.
+                */
+               AbsorbFsyncRequests();
+
                 if (got_SIGHUP)
                 {
                         got_SIGHUP = false;
@@ -265,8 +334,20 @@ BackgroundWriterMain(void)
                                                          errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
                         }
  
+                       /*
+                        * Indicate checkpoint start to any waiting backends.
+                        */
+                       ckpt_active = true;
+                       BgWriterShmem->ckpt_started++;
+
                         CreateCheckPoint(false, force_checkpoint);
  
+                       /*
+                        * Indicate checkpoint completion to any waiting backends.
+                        */
+                       BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
+                       ckpt_active = false;
+
                         /*
                          * Note we record the checkpoint start time not end time as
                          * last_checkpoint_time.  This is so that time-driven checkpoints
@@ -274,14 +355,11 @@ BackgroundWriterMain(void)
                          */
                         last_checkpoint_time = now;
  
-                       /*
-                        * Indicate checkpoint completion to any waiting backends.
-                        */
-                       BgWriterShmem->checkpoint_count++;
-
                         /*
                          * After any checkpoint, close all smgr files.  This is so we
                          * won't hang onto smgr references to deleted files indefinitely.
+                        * (It is safe to do this because this process does not have a
+                        * relcache, and so no dangling references could remain.)
                          */
                         smgrcloseall();
  
@@ -301,6 +379,8 @@ BackgroundWriterMain(void)
                  * we respond reasonably promptly when someone signals us,
                  * break down the sleep into 1-second increments, and check for
                  * interrupts after each nap.
+                *
+                * We absorb pending requests after each short sleep.
                  */
                 udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L;
                 while (udelay > 1000000L)
@@ -308,17 +388,11 @@ BackgroundWriterMain(void)
                         if (got_SIGHUP || checkpoint_requested || shutdown_requested)
                                 break;
                         pg_usleep(1000000L);
+                       AbsorbFsyncRequests();
                         udelay -= 1000000L;
                 }
                 if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
                         pg_usleep(udelay);
-
-               /*
-                * Emergency bailout if postmaster has died.  This is to avoid the
-                * necessity for manual cleanup of all postmaster children.
-                */
-               if (!PostmasterIsAlive(true))
-                       exit(1);
         }
  }
  
@@ -387,10 +461,11 @@ int
  BgWriterShmemSize(void)
  {
         /*
-        * This is not worth measuring right now, but may become so after we
-        * add fsync signaling ...
+        * Currently, the size of the requests[] array is arbitrarily set
+        * equal to NBuffers.  This may prove too large or small ...
          */
-       return MAXALIGN(sizeof(BgWriterShmemStruct));
+       return MAXALIGN(sizeof(BgWriterShmemStruct) +
+                                       (NBuffers - 1) * sizeof(BgWriterRequest));
  }
  
  /*
@@ -404,7 +479,7 @@ BgWriterShmemInit(void)
  
         BgWriterShmem = (BgWriterShmemStruct *)
                 ShmemInitStruct("Background Writer Data",
-                                               sizeof(BgWriterShmemStruct),
+                                               BgWriterShmemSize(),
                                                 &found);
         if (BgWriterShmem == NULL)
                 ereport(FATAL,
@@ -414,6 +489,7 @@ BgWriterShmemInit(void)
                 return;                                 /* already initialized */
  
         MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
+       BgWriterShmem->max_requests = NBuffers;
  }
  
  /*
@@ -427,8 +503,10 @@ BgWriterShmemInit(void)
  void
  RequestCheckpoint(bool waitforit)
  {
-       volatile sig_atomic_t *count_ptr = &BgWriterShmem->checkpoint_count;
-       sig_atomic_t    old_count = *count_ptr;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+       sig_atomic_t    old_failed = bgs->ckpt_failed;
+       sig_atomic_t    old_started = bgs->ckpt_started;
  
         /*
          * Send signal to request checkpoint.  When waitforit is false,
@@ -442,15 +520,119 @@ RequestCheckpoint(bool waitforit)
                          "could not signal for checkpoint: %m");
  
         /*
-        * If requested, wait for completion.  We detect completion by
-        * observing a change in checkpoint_count in shared memory.
+        * If requested, wait for completion.  We detect completion according
+        * to the algorithm given above.
          */
         if (waitforit)
         {
-               while (*count_ptr == old_count)
+               while (bgs->ckpt_started == old_started)
                 {
                         CHECK_FOR_INTERRUPTS();
-                       pg_usleep(1000000L);
+                       pg_usleep(100000L);
+               }
+               old_started = bgs->ckpt_started;
+               /*
+                * We are waiting for ckpt_done >= old_started, in a modulo
+                * sense.  This is a little tricky since we don't know the
+                * width or signedness of sig_atomic_t.  We make the lowest
+                * common denominator assumption that it is only as wide
+                * as "char".  This means that this algorithm will cope
+                * correctly as long as we don't sleep for more than 127
+                * completed checkpoints.  (If we do, we will get another
+                * chance to exit after 128 more checkpoints...)
+                */
+               while (((signed char) (bgs->ckpt_done - old_started)) < 0)
+               {
+                       CHECK_FOR_INTERRUPTS();
+                       pg_usleep(100000L);
                 }
+               if (bgs->ckpt_failed != old_failed)
+                       ereport(ERROR,
+                                       (errmsg("checkpoint request failed"),
+                                        errhint("Consult the postmaster log for details.")));
+       }
+}
+
+/*
+ * ForwardFsyncRequest
+ *             Forward a file-fsync request from a backend to the bgwriter
+ *
+ * Whenever a backend is compelled to write directly to a relation
+ * (which should be seldom, if the bgwriter is getting its job done),
+ * the backend calls this routine to pass over knowledge that the relation
+ * is dirty and must be fsync'd before next checkpoint.
+ *
+ * If we are unable to pass over the request (at present, this can happen
+ * if the shared memory queue is full), we return false.  That forces
+ * the backend to do its own fsync.  We hope that will be even more seldom.
+ *
+ * Note: we presently make no attempt to eliminate duplicate requests
+ * in the requests[] queue.  The bgwriter will have to eliminate dups
+ * internally anyway, so we may as well avoid holding the lock longer
+ * than we have to here.
+ */
+bool
+ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
+{
+       BgWriterRequest *request;
+
+       if (!IsUnderPostmaster)
+               return false;                   /* probably shouldn't even get here */
+       Assert(BgWriterShmem != NULL);
+
+       LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);  /* held while we touch the queue */
+       if (BgWriterShmem->bgwriter_pid == 0 ||
+               BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+       {
+               LWLockRelease(BgWriterCommLock);
+               return false;                   /* bgwriter not started, or queue is full */
+       }
+       request = &BgWriterShmem->requests[BgWriterShmem->num_requests++];      /* take next free slot */
+       request->rnode = rnode;
+       request->segno = segno;
+       LWLockRelease(BgWriterCommLock);
+       return true;                            /* request is now queued */
+}
+
+/*
+ * AbsorbFsyncRequests
+ *             Retrieve queued fsync requests and pass them to local smgr.
+ *
+ * This is exported because it must be called during CreateCheckPoint;
+ * we have to be sure we have accepted all pending requests *after* we
+ * establish the checkpoint redo pointer.  Since CreateCheckPoint
+ * sometimes runs in non-bgwriter processes, do nothing if not bgwriter.
+ */
+void
+AbsorbFsyncRequests(void)
+{
+       BgWriterRequest *requests = NULL;       /* local copy of the shared queue */
+       BgWriterRequest *request;
+       int                     n;                      /* # of requests copied out */
+
+       if (!am_bg_writer)
+               return;                                 /* flag is set only in BackgroundWriterMain */
+
+       /*
+        * We try to avoid holding the lock for a long time by copying the
+        * request array and processing the copy after releasing the lock.
+        */
+       LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+
+       n = BgWriterShmem->num_requests;
+       if (n > 0)
+       {
+               requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest));
+               memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest));
+       }
+       BgWriterShmem->num_requests = 0;        /* shared queue is now empty */
+
+       LWLockRelease(BgWriterCommLock);
+
+       for (request = requests; n > 0; request++, n--)
+       {
+               RememberFsyncRequest(request->rnode, request->segno);
         }
+       if (requests)
+               pfree(requests);
  }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c

index f718e33cd598beddb09c76f8467e18f16017c162..2386bc89bf3b7eaf22be77b5448924e89529412f 100644 (file)
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1044,6 +1044,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
   *             bothering to write them out first.      This is NOT rollback-able,
   *             and so should be used only with extreme caution!
   *
+ *             There is no particularly good reason why this doesn't have a
+ *             firstDelBlock parameter, except that current callers don't need it.
+ *
   *             We assume that the caller holds an exclusive lock on the relation,
   *             which should assure that no new buffers will be acquired for the rel
   *             meanwhile.
@@ -1052,14 +1055,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
  void
  DropRelationBuffers(Relation rel)
  {
-       DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
+       DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);        /* firstDelBlock = 0: no block is excluded */
  }
  
  /* ---------------------------------------------------------------------
   *             DropRelFileNodeBuffers
   *
   *             This is the same as DropRelationBuffers, except that the target
- *             relation is specified by RelFileNode and temp status.
+ *             relation is specified by RelFileNode and temp status, and one
+ *             may specify the first block to drop.
   *
   *             This is NOT rollback-able.      One legitimate use is to clear the
   *             buffer cache of buffers for a relation that is being deleted
@@ -1067,7 +1071,8 @@ DropRelationBuffers(Relation rel)
   * --------------------------------------------------------------------
   */
  void
-DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
+DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+                                          BlockNumber firstDelBlock)
  {
         int                     i;
         BufferDesc *bufHdr;
@@ -1077,7 +1082,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
                 for (i = 0; i < NLocBuffer; i++)
                 {
                         bufHdr = &LocalBufferDescriptors[i];
-                       if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+                       if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+                               bufHdr->tag.blockNum >= firstDelBlock)
                         {
                                 bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
                                 bufHdr->cntxDirty = false;
@@ -1094,7 +1100,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
         {
                 bufHdr = &BufferDescriptors[i - 1];
  recheck:
-               if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+               if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+                       bufHdr->tag.blockNum >= firstDelBlock)
                 {
                         /*
                          * If there is I/O in progress, better wait till it's done;
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c

index 5ef12de949518be73314b6341308886733f6b730..96de54110cfaab2f21d42120c63f3c09fbb6d961 100644 (file)
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
   *
   * NOTES:
   *
@@ -484,6 +484,7 @@ Insert(File file)
         DO_DB(_dump_lru());
  }
  
+/* returns 0 on success, -1 on re-open failure (with errno set) */
  static int
  LruInsert(File file)
  {
@@ -685,6 +686,7 @@ filepath(const char *filename)
         return buf;
  }
  
+/* returns 0 on success, -1 on re-open failure (with errno set) */
  static int
  FileAccess(File file)
  {
@@ -954,7 +956,10 @@ FileRead(File file, char *buffer, int amount)
                            file, VfdCache[file].fileName,
                            VfdCache[file].seekPos, amount, buffer));
  
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
         returnCode = read(VfdCache[file].fd, buffer, amount);
         if (returnCode > 0)
                 VfdCache[file].seekPos += returnCode;
@@ -975,7 +980,9 @@ FileWrite(File file, char *buffer, int amount)
                            file, VfdCache[file].fileName,
                            VfdCache[file].seekPos, amount, buffer));
  
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
  
         errno = 0;
         returnCode = write(VfdCache[file].fd, buffer, amount);
@@ -992,9 +999,28 @@ FileWrite(File file, char *buffer, int amount)
         return returnCode;
  }
  
+int
+FileSync(File file)
+{
+       int                     returnCode;
+
+       Assert(FileIsValid(file));
+
+       DO_DB(elog(LOG, "FileSync: %d (%s)",
+                          file, VfdCache[file].fileName));
+
+       returnCode = FileAccess(file);          /* may have to re-open the file */
+       if (returnCode < 0)
+               return returnCode;                      /* re-open failed; errno is set */
+
+       return pg_fsync(VfdCache[file].fd);
+}
+
  long
  FileSeek(File file, long offset, int whence)
  {
+       int                     returnCode;
+
         Assert(FileIsValid(file));
  
         DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
@@ -1014,8 +1040,11 @@ FileSeek(File file, long offset, int whence)
                                 VfdCache[file].seekPos += offset;
                                 break;
                         case SEEK_END:
-                               FileAccess(file);
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               returnCode = FileAccess(file);
+                               if (returnCode < 0)
+                                       return returnCode;
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                 break;
                         default:
                                 elog(ERROR, "invalid whence: %d", whence);
@@ -1030,14 +1059,17 @@ FileSeek(File file, long offset, int whence)
                                 if (offset < 0)
                                         elog(ERROR, "invalid seek offset: %ld", offset);
                                 if (VfdCache[file].seekPos != offset)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                 break;
                         case SEEK_CUR:
                                 if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                 break;
                         case SEEK_END:
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                 break;
                         default:
                                 elog(ERROR, "invalid whence: %d", whence);
@@ -1071,7 +1103,10 @@ FileTruncate(File file, long offset)
         DO_DB(elog(LOG, "FileTruncate %d (%s)",
                            file, VfdCache[file].fileName));
  
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
         returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
         return returnCode;
  }
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index 2122a243207b12049b5d655e3a01305e70b3aade..5ac5868f690b32196f7eca791674d2de3c6ca4b0 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -21,8 +21,10 @@
  
  #include "catalog/catalog.h"
  #include "miscadmin.h"
+#include "postmaster/bgwriter.h"
  #include "storage/fd.h"
  #include "storage/smgr.h"
+#include "utils/hsearch.h"
  #include "utils/memutils.h"
  
  
@@ -33,37 +35,68 @@
   *     system's file size limit (often 2GBytes).  In order to do that,
   *     we break relations up into chunks of < 2GBytes and store one chunk
   *     in each of several files that represent the relation.  See the
- *     BLCKSZ and RELSEG_SIZE configuration constants in
- *     include/pg_config.h.  All chunks except the last MUST have size exactly
- *     equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
+ *     BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
+ *     All chunks except the last MUST have size exactly equal to RELSEG_SIZE
+ *     blocks --- see mdnblocks() and mdtruncate().
   *
   *     The file descriptor pointer (md_fd field) stored in the SMgrRelation
   *     cache is, therefore, just the head of a list of MdfdVec objects.
   *     But note the md_fd pointer can be NULL, indicating relation not open.
   *
+ *     Note that mdfd_chain == NULL does not necessarily mean the relation
+ *     doesn't have another segment after this one; we may just not have
+ *     opened the next segment yet.  (We could not have "all segments are
+ *     in the chain" as an invariant anyway, since another backend could
+ *     extend the relation when we weren't looking.)
+ *
   *     All MdfdVec objects are palloc'd in the MdCxt memory context.
   */
  
  typedef struct _MdfdVec
  {
         File            mdfd_vfd;                       /* fd number in fd.c's pool */
-
-#ifndef LET_OS_MANAGE_FILESIZE
-       struct _MdfdVec *mdfd_chain;    /* for large relations */
+       BlockNumber     mdfd_segno;                     /* segment number, from 0 */
+#ifndef LET_OS_MANAGE_FILESIZE         /* for large relations */
+       struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
  #endif
  } MdfdVec;
  
  static MemoryContext MdCxt;            /* context for all md.c allocations */
  
  
-/* routines declared here */
-static MdfdVec *mdopen(SMgrRelation reln);
+/*
+ * In some contexts (currently, standalone backends and the bgwriter process)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint.  This hash
+ * table remembers the pending operations.  We use a hash table not because
+ * we want to look up individual operations, but simply as a convenient way
+ * of eliminating duplicate requests.
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the bgwriter.)
+ *
+ * XXX for WIN32, may want to expand this to track pending deletes, too.
+ */
+typedef struct
+{
+       RelFileNode     rnode;                  /* the targeted relation */
+       BlockNumber     segno;                  /* which segment */
+} PendingOperationEntry;
+
+static HTAB *pendingOpsTable = NULL;
+
+
+/* local routines */
+static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
+static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
  static MdfdVec *_fdvec_alloc(void);
  #ifndef LET_OS_MANAGE_FILESIZE
  static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
                                                           int oflags);
  #endif
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
+                                                        bool allowNotFound);
  static BlockNumber _mdnblocks(File file, Size blcksz);
  
  
@@ -79,6 +112,31 @@ mdinit(void)
                                                                   ALLOCSET_DEFAULT_INITSIZE,
                                                                   ALLOCSET_DEFAULT_MAXSIZE);
  
+       /*
+        * Create pending-operations hashtable if we need it.  Currently,
+        * we need it if we are standalone (not under a postmaster) OR
+        * if we are a bootstrap-mode subprocess of a postmaster (that is,
+        * a startup or bgwriter process).
+        */
+       if (!IsUnderPostmaster || IsBootstrapProcessingMode())
+       {
+               HASHCTL         hash_ctl;
+
+               MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+               hash_ctl.keysize = sizeof(PendingOperationEntry);
+               hash_ctl.entrysize = sizeof(PendingOperationEntry);
+               hash_ctl.hash = tag_hash;
+               hash_ctl.hcxt = MdCxt;
+               pendingOpsTable = hash_create("Pending Ops Table",
+                                                                         100L,
+                                                                         &hash_ctl,
+                                                                         HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+               if (pendingOpsTable == NULL)
+                       ereport(FATAL,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+       }
+
         return true;
  }
  
@@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo)
         reln->md_fd = _fdvec_alloc();
  
         reln->md_fd->mdfd_vfd = fd;
+       reln->md_fd->mdfd_segno = 0;
  #ifndef LET_OS_MANAGE_FILESIZE
         reln->md_fd->mdfd_chain = NULL;
  #endif
@@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
         int                     nbytes;
         MdfdVec    *v;
  
-       v = _mdfd_getseg(reln, blocknum);
+       v = _mdfd_getseg(reln, blocknum, false);
  
  #ifndef LET_OS_MANAGE_FILESIZE
         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
                 return false;
         }
  
+       if (!register_dirty_segment(reln, v))
+               return false;
+
  #ifndef LET_OS_MANAGE_FILESIZE
         Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
  #endif
@@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
  
  /*
   *     mdopen() -- Open the specified relation.  ereport's on failure.
+ *             (Optionally, can return NULL instead of ereport for ENOENT.)
   *
   * Note we only open the first segment, when there are multiple segments.
   */
  static MdfdVec *
-mdopen(SMgrRelation reln)
+mdopen(SMgrRelation reln, bool allowNotFound)
  {
+       MdfdVec    *mdfd;
         char       *path;
         File            fd;
  
@@ -292,6 +356,8 @@ mdopen(SMgrRelation reln)
                 if (fd < 0)
                 {
                         pfree(path);
+                       if (allowNotFound && errno == ENOENT)
+                               return NULL;
                         ereport(ERROR,
                                         (errcode_for_file_access(),
                                          errmsg("could not open relation %u/%u: %m",
@@ -302,15 +368,16 @@ mdopen(SMgrRelation reln)
  
         pfree(path);
  
-       reln->md_fd = _fdvec_alloc();
+       reln->md_fd = mdfd = _fdvec_alloc();
  
-       reln->md_fd->mdfd_vfd = fd;
+       mdfd->mdfd_vfd = fd;
+       mdfd->mdfd_segno = 0;
  #ifndef LET_OS_MANAGE_FILESIZE
-       reln->md_fd->mdfd_chain = NULL;
+       mdfd->mdfd_chain = NULL;
         Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
  #endif
  
-       return reln->md_fd;
+       return mdfd;
  }
  
  /*
@@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
         int                     nbytes;
         MdfdVec    *v;
  
-       v = _mdfd_getseg(reln, blocknum);
+       v = _mdfd_getseg(reln, blocknum, false);
  
  #ifndef LET_OS_MANAGE_FILESIZE
         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
         long            seekpos;
         MdfdVec    *v;
  
-       v = _mdfd_getseg(reln, blocknum);
+       v = _mdfd_getseg(reln, blocknum, false);
  
  #ifndef LET_OS_MANAGE_FILESIZE
         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
         if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
                 return false;
  
+       if (!register_dirty_segment(reln, v))
+               return false;
+
         return true;
  }
  
@@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
  BlockNumber
  mdnblocks(SMgrRelation reln)
  {
-       MdfdVec    *v = mdopen(reln);
+       MdfdVec    *v = mdopen(reln, false);
  
  #ifndef LET_OS_MANAGE_FILESIZE
         BlockNumber nblocks;
@@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
         if (nblocks == curnblk)
                 return nblocks;                 /* no work */
  
-       v = mdopen(reln);
+       v = mdopen(reln, false);
  
  #ifndef LET_OS_MANAGE_FILESIZE
         priorblocks = 0;
@@ -576,40 +646,154 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
  }
  
  /*
- *     mdcommit() -- Commit a transaction.
+ *     mdsync() -- Sync previous writes to stable storage.
+ *
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOpsTable.
   */
  bool
-mdcommit(void)
+mdsync(void)
  {
+       HASH_SEQ_STATUS hstat;
+       PendingOperationEntry *entry;
+
+       if (!pendingOpsTable)
+               return false;
+
         /*
-        * We don't actually have to do anything here...
+        * If we are in the bgwriter, the sync had better include all fsync
+        * requests that were queued by backends before the checkpoint REDO
+        * point was determined.  We do slightly better than that by
+        * accepting all requests queued up to the point where we start fsync'ing.
          */
+       AbsorbFsyncRequests();
+
+       hash_seq_init(&hstat, pendingOpsTable);
+       while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+       {
+               /*
+                * If fsync is off then we don't have to bother opening the file
+                * at all.  (We delay checking until this point so that changing
+                * fsync on the fly behaves sensibly.)
+                */
+               if (enableFsync)
+               {
+                       SMgrRelation reln;
+                       MdfdVec *seg;
+
+                       /*
+                        * Find or create an smgr hash entry for this relation.
+                        * This may seem a bit unclean -- md calling smgr?  But it's
+                        * really the best solution.  It ensures that the open file
+                        * reference isn't permanently leaked if we get an error here.
+                        * (You may say "but an unreferenced SMgrRelation is still a
+                        * leak!"  Not really, because the only case in which a checkpoint
+                        * is done by a process that isn't about to shut down is in the
+                        * bgwriter, and it will periodically do smgrcloseall().  This
+                        * fact justifies our not closing the reln in the success path
+                        * either, which is a good thing since in non-bgwriter cases
+                        * we couldn't safely do that.)  Furthermore, in many cases
+                        * the relation will have been dirtied through this same smgr
+                        * relation, and so we can save a file open/close cycle.
+                        */
+                       reln = smgropen(entry->rnode);
+
+                       /*
+                        * It is possible that the relation has been dropped or truncated
+                        * since the fsync request was entered.  Therefore, we have to
+                        * allow file-not-found errors.  This applies both during
+                        * _mdfd_getseg() and during FileSync, since fd.c might have
+                        * closed the file behind our back.
+                        */
+                       seg = _mdfd_getseg(reln,
+                                                          entry->segno * ((BlockNumber) RELSEG_SIZE),
+                                                          true);
+                       if (seg)
+                       {
+                               if (FileSync(seg->mdfd_vfd) < 0 &&
+                                       errno != ENOENT)
+                               {
+                                       ereport(LOG,
+                                                       (errcode_for_file_access(),
+                                                        errmsg("could not fsync segment %u of relation %u/%u: %m",
+                                                                       entry->segno,
+                                                                       entry->rnode.tblNode,
+                                                                       entry->rnode.relNode)));
+                                       return false;
+                               }
+                       }
+               }
+
+               /* Okay, delete this entry */
+               if (hash_search(pendingOpsTable, entry,
+                                               HASH_REMOVE, NULL) == NULL)
+                       elog(ERROR, "pendingOpsTable corrupted");
+       }
+
         return true;
  }
  
  /*
- *     mdabort() -- Abort a transaction.
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * mdsync to process later.  Otherwise, try to pass off the fsync request
+ * to the background writer process.  If that fails, just do the fsync
+ * locally before returning (we expect this will not happen often enough
+ * to be a performance problem).
+ *
+ * A false result implies I/O failure during local fsync.  errno will be
+ * valid for error reporting.
   */
-bool
-mdabort(void)
+static bool
+register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
  {
-       /*
-        * We don't actually have to do anything here...
-        */
+       if (pendingOpsTable)
+       {
+               PendingOperationEntry entry;
+
+               /* ensure any pad bytes in the struct are zeroed */
+               MemSet(&entry, 0, sizeof(entry));
+               entry.rnode = reln->smgr_rnode;
+               entry.segno = seg->mdfd_segno;
+
+               if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL)
+                       return true;
+               /* out of memory: fall through to do it locally */
+       }
+       else
+       {
+               if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
+                       return true;
+       }
+
+       if (FileSync(seg->mdfd_vfd) < 0)
+               return false;
         return true;
  }
  
  /*
- *     mdsync() -- Sync previous writes to stable storage.
+ * RememberFsyncRequest() -- callback from bgwriter side of fsync request
+ *
+ * We stuff the fsync request into the local hash table for execution
+ * during the bgwriter's next checkpoint.
   */
-bool
-mdsync(void)
+void
+RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
  {
-       sync();
-       if (IsUnderPostmaster)
-               pg_usleep(2000000L);
-       sync();
-       return true;
+       PendingOperationEntry entry;
+
+       Assert(pendingOpsTable);
+
+       /* ensure any pad bytes in the struct are zeroed */
+       MemSet(&entry, 0, sizeof(entry));
+       entry.rnode = rnode;
+       entry.segno = segno;
+
+       if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL)
+               ereport(FATAL,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
  }
  
  /*
@@ -618,18 +802,11 @@ mdsync(void)
  static MdfdVec *
  _fdvec_alloc(void)
  {
-       MdfdVec *v;
-
-       v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
-       v->mdfd_vfd = -1;
-#ifndef LET_OS_MANAGE_FILESIZE
-       v->mdfd_chain = NULL;
-#endif
-
-       return v;
+       return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
  }
  
  #ifndef LET_OS_MANAGE_FILESIZE
+
  /*
   * Open the specified segment of the relation,
   * and make a MdfdVec object for it.  Returns NULL on failure.
@@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
         char       *path,
                            *fullpath;
  
-       /* be sure we have enough space for the '.segno', if any */
         path = relpath(reln->smgr_rnode);
  
         if (segno > 0)
         {
+               /* be sure we have enough space for the '.segno' */
                 fullpath = (char *) palloc(strlen(path) + 12);
                 sprintf(fullpath, "%s.%u", path, segno);
                 pfree(path);
@@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
  
         /* fill the entry */
         v->mdfd_vfd = fd;
+       v->mdfd_segno = segno;
         v->mdfd_chain = NULL;
         Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
  
         /* all done */
         return v;
  }
-#endif
+
+#endif /* LET_OS_MANAGE_FILESIZE */
  
  /*
   *     _mdfd_getseg() -- Find the segment of the relation holding the
- *                                       specified block.  ereport's on failure.
+ *             specified block.  ereport's on failure.
+ *             (Optionally, can return NULL instead of ereport for ENOENT.)
   */
  static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
  {
-       MdfdVec    *v = mdopen(reln);
-
+       MdfdVec    *v = mdopen(reln, allowNotFound);
  #ifndef LET_OS_MANAGE_FILESIZE
-       BlockNumber segno;
-       BlockNumber i;
+       BlockNumber segstogo;
+       BlockNumber nextsegno;
  
-       for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
-                segno > 0;
-                i++, segno--)
-       {
+       if (!v)
+               return NULL;                    /* only possible if allowNotFound */
  
+       for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
+                segstogo > 0;
+                nextsegno++, segstogo--)
+       {
                 if (v->mdfd_chain == NULL)
                 {
                         /*
@@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
                          * one new segment per call, so this restriction seems
                          * reasonable.
                          */
-                       v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
-
+                       v->mdfd_chain = _mdfd_openseg(reln,
+                                                                                 nextsegno,
+                                                                                 (segstogo == 1) ? O_CREAT : 0);
                         if (v->mdfd_chain == NULL)
+                       {
+                               if (allowNotFound && errno == ENOENT)
+                                       return NULL;
                                 ereport(ERROR,
                                                 (errcode_for_file_access(),
                                                  errmsg("could not open segment %u of relation %u/%u (target block %u): %m",
-                                                               i,
+                                                               nextsegno,
                                                                 reln->smgr_rnode.tblNode,
                                                                 reln->smgr_rnode.relNode,
                                                                 blkno)));
+                       }
                 }
                 v = v->mdfd_chain;
         }
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c

index d242744a4d7c47d2cc6700e1c3af8ecc23ddc522..c204e2796c4b2125b5f83a1a3fb574e2a9bd85a1 100644 (file)
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -56,7 +56,7 @@ typedef struct f_smgr
  static const f_smgr smgrsw[] = {
         /* magnetic disk */
         {mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
-        mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
+        mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync
         }
  };
  
@@ -407,7 +407,7 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
          * Get rid of any leftover buffers for the rel (shouldn't be any in the
          * commit case, but there can be in the abort case).
          */
-       DropRelFileNodeBuffers(rnode, isTemp);
+       DropRelFileNodeBuffers(rnode, isTemp, 0);
  
         /*
          * Tell the free space map to forget this relation.  It won't be accessed
@@ -638,7 +638,7 @@ smgrcommit(void)
                 if (smgrsw[i].smgr_commit)
                 {
                         if (! (*(smgrsw[i].smgr_commit)) ())
-                               elog(FATAL, "transaction commit failed on %s: %m",
+                               elog(ERROR, "transaction commit failed on %s: %m",
                                          DatumGetCString(DirectFunctionCall1(smgrout,
                                                                                                          Int16GetDatum(i))));
                 }
@@ -658,7 +658,7 @@ smgrabort(void)
                 if (smgrsw[i].smgr_abort)
                 {
                         if (! (*(smgrsw[i].smgr_abort)) ())
-                               elog(FATAL, "transaction abort failed on %s: %m",
+                               elog(ERROR, "transaction abort failed on %s: %m",
                                          DatumGetCString(DirectFunctionCall1(smgrout,
                                                                                                          Int16GetDatum(i))));
                 }
@@ -678,7 +678,7 @@ smgrsync(void)
                 if (smgrsw[i].smgr_sync)
                 {
                         if (! (*(smgrsw[i].smgr_sync)) ())
-                               elog(PANIC, "storage sync failed on %s: %m",
+                               elog(ERROR, "storage sync failed on %s: %m",
                                          DatumGetCString(DirectFunctionCall1(smgrout,
                                                                                                          Int16GetDatum(i))));
                 }
@@ -707,6 +707,13 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
  
                 reln = smgropen(xlrec->rnode);
  
+               /*
+                * First, force bufmgr to drop any buffers it has for the to-be-
+                * truncated blocks.  We must do this, else subsequent XLogReadBuffer
+                * operations will not re-extend the file properly.
+                */
+               DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);
+
                 /* Can't use smgrtruncate because it would try to xlog */
  
                 /*
diff --git a/src/include/access/slru.h b/src/include/access/slru.h

index fec968e7a202c4b53146e0e451c64afdfe9d8fdf..213cca5c21654510ba3f78ed90f20ed981bc6e7a 100644 (file)
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -6,26 +6,17 @@
   * Portions Copyright (c) 2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.5 2004/05/28 05:13:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $
   */
  #ifndef SLRU_H
  #define SLRU_H
  
-#include "access/xlog.h"
+#include "storage/lwlock.h"
  
-/* exported because lwlock.c needs it */
-#define NUM_CLOG_BUFFERS       8
  
-/*
- * Note: the separation between SlruLockData and SlruSharedData is purely
- * historical; the structs could be combined.
- */
-typedef struct SlruLockData
-{
-       LWLockId        ControlLock;
-       LWLockId        BufferLocks[NUM_CLOG_BUFFERS];  /* Per-buffer I/O locks */
-} SlruLockData;
-typedef SlruLockData *SlruLock;
+/* Opaque structs known only in slru.c */
+typedef struct SlruSharedData *SlruShared;
+typedef struct SlruFlushData *SlruFlush;
  
  /*
   * SlruCtlData is an unshared structure that points to the active information
@@ -33,13 +24,13 @@ typedef SlruLockData *SlruLock;
   */
  typedef struct SlruCtlData
  {
-       void       *shared;                     /* pointer to SlruSharedData */
-       SlruLock        locks;
+       SlruShared      shared;
+
+       LWLockId        ControlLock;
  
         /*
-        * Dir is set during SimpleLruShmemInit and does not change thereafter.
-        * The value is automatically inherited by backends via fork, and
-        * doesn't need to be in shared memory.
+        * Dir is set during SimpleLruInit and does not change thereafter.
+        * Since it's always the same, it doesn't need to be in shared memory.
          */
         char            Dir[MAXPGPATH];
  
@@ -51,13 +42,16 @@ typedef struct SlruCtlData
         bool            (*PagePrecedes) (int, int);
  
  } SlruCtlData;
+
  typedef SlruCtlData *SlruCtl;
  
+
  extern int     SimpleLruShmemSize(void);
  extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir);
  extern int     SimpleLruZeroPage(SlruCtl ctl, int pageno);
-extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite);
-extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
+extern char *SimpleLruReadPage(SlruCtl ctl, int pageno,
+                                                          TransactionId xid, bool forwrite);
+extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
  extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
  extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
  extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h

index c11af72e78945f91dfe76bfab00eeab7eabea886..ed56e9639e889ec64fe0dd2c45476561d9b0dcaf 100644 (file)
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -5,13 +5,17 @@
   *
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.1 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.2 2004/05/31 03:48:09 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
  #ifndef _BGWRITER_H
  #define _BGWRITER_H
  
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+
  /* GUC options */
  extern int     BgWriterDelay;
  extern int     BgWriterPercent;
@@ -23,6 +27,9 @@ extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(bool waitforit);
  
+extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
+extern void AbsorbFsyncRequests(void);
+
  extern int     BgWriterShmemSize(void);
  extern void BgWriterShmemInit(void);
  
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h

index 27752d412b56435ddbe61bf77adcd83554d1df92..95b426bb8b93f21faa93c2069167eb8fe048ea49 100644 (file)
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.80 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.81 2004/05/31 03:48:10 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -154,7 +154,8 @@ extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
  extern int     FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
  extern void DropRelationBuffers(Relation rel);
-extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp);
+extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+                                                                  BlockNumber firstDelBlock);
  extern void DropBuffers(Oid dbid);
  
  #ifdef NOT_USED
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h

index 177925cf3e80776dbc34dd543ca8e36b5fcbea76..430ed5d8c74181f11b10a41428453f123159f06b 100644 (file)
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.44 2004/02/23 23:03:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -15,7 +15,7 @@
  /*
   * calls:
   *
- *     File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
+ *     File {Close, Read, Write, Seek, Tell, Sync}
   *     {File Name Open, Allocate, Free} File
   *
   * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
@@ -66,6 +66,7 @@ extern void FileClose(File file);
  extern void FileUnlink(File file);
  extern int     FileRead(File file, char *buffer, int amount);
  extern int     FileWrite(File file, char *buffer, int amount);
+extern int     FileSync(File file);
  extern long FileSeek(File file, long offset, int whence);
  extern int     FileTruncate(File file, long offset);
  
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h

index 34f9c6613c709f9122a054dfb37deab756bede29..e06d9a4bf77f43af92bdb216ad5b009c75a868c1 100644 (file)
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.10 2003/12/20 17:31:21 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.11 2004/05/31 03:48:10 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -37,6 +37,7 @@ typedef enum LWLockId
         ControlFileLock,
         CheckpointLock,
         RelCacheInitLock,
+       BgWriterCommLock,
  
         NumFixedLWLocks,                        /* must be last except for
                                                                  * MaxDynamicLWLock */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h

index 41367d35e819b92ff1a5b889753e1f193006366a..6a28c3824fad8e6b5f1a90cabb455e8219b8b53f 100644 (file)
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.41 2004/02/11 22:55:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.42 2004/05/31 03:48:10 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -83,10 +83,10 @@ extern bool mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer);
  extern bool mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer);
  extern BlockNumber mdnblocks(SMgrRelation reln);
  extern BlockNumber mdtruncate(SMgrRelation reln, BlockNumber nblocks);
-extern bool mdcommit(void);
-extern bool mdabort(void);
  extern bool mdsync(void);
  
+extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
+
  /* smgrtype.c */
  extern Datum smgrout(PG_FUNCTION_ARGS);
  extern Datum smgrin(PG_FUNCTION_ARGS);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)
src/backend/access/transam/clog.c		patch \| blob \| history
src/backend/access/transam/slru.c		patch \| blob \| history
src/backend/postmaster/bgwriter.c		patch \| blob \| history
src/backend/storage/buffer/bufmgr.c		patch \| blob \| history
src/backend/storage/file/fd.c		patch \| blob \| history
src/backend/storage/smgr/md.c		patch \| blob \| history
src/backend/storage/smgr/smgr.c		patch \| blob \| history
src/include/access/slru.h		patch \| blob \| history
src/include/postmaster/bgwriter.h		patch \| blob \| history
src/include/storage/bufmgr.h		patch \| blob \| history
src/include/storage/fd.h		patch \| blob \| history
src/include/storage/lwlock.h		patch \| blob \| history
src/include/storage/smgr.h		patch \| blob \| history