granicus.if.org Git - postgresql/commitdiff
Per previous discussions, get rid of use of sync(2) in favor of
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 31 May 2004 03:48:10 +0000 (03:48 +0000)
explicitly fsync'ing every (non-temp) file we have written since the
last checkpoint.  In the vast majority of cases, the burden of the
fsyncs should fall on the bgwriter process not on backends.  (To this
end, we assume that an fsync issued by the bgwriter will force out
blocks written to the same file by other processes using other file
descriptors.  Anyone have a problem with that?)  This makes the world
safe for WIN32, which ain't even got sync(2), and really makes the world
safe for Unixen as well, because sync(2) never had the semantics we need:
it offers no way to wait for the requested I/O to finish.

Along the way, fix a bug I recently introduced in xlog recovery:
file truncation replay failed to clear bufmgr buffers for the dropped
blocks, which could result in 'PANIC:  heap_delete_redo: no block'
later on in xlog replay.

13 files changed:
src/backend/access/transam/clog.c
src/backend/access/transam/slru.c
src/backend/postmaster/bgwriter.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/file/fd.c
src/backend/storage/smgr/md.c
src/backend/storage/smgr/smgr.c
src/include/access/slru.h
src/include/postmaster/bgwriter.h
src/include/storage/bufmgr.h
src/include/storage/fd.h
src/include/storage/lwlock.h
src/include/storage/smgr.h

index 88e1f1256ad80a8f007341325e83fa25db75b9c1..97f887d0a06ce234e256dda5bf204333ec7033b0 100644 (file)
@@ -13,7 +13,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.19 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -97,7 +97,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
        Assert(status == TRANSACTION_STATUS_COMMITTED ||
                   status == TRANSACTION_STATUS_ABORTED);
 
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
        byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
        byteptr += byteno;
@@ -110,7 +110,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 
        /* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */
 
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
 }
 
 /*
@@ -128,14 +128,14 @@ TransactionIdGetStatus(TransactionId xid)
        char       *byteptr;
        XidStatus       status;
 
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
        byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false);
        byteptr += byteno;
 
        status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
 
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
 
        return status;
 }
@@ -169,16 +169,16 @@ BootStrapCLOG(void)
 {
        int                     slotno;
 
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
        /* Create and zero the first page of the commit log */
        slotno = ZeroCLOGPage(0, false);
 
        /* Make sure it's written out */
-       SimpleLruWritePage(ClogCtl, slotno);
+       SimpleLruWritePage(ClogCtl, slotno, NULL);
        /* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */
 
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
 }
 
 /*
@@ -256,12 +256,12 @@ ExtendCLOG(TransactionId newestXact)
 
        pageno = TransactionIdToPage(newestXact);
 
-       LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
        /* Zero the page and make an XLOG entry about it */
        ZeroCLOGPage(pageno, true);
 
-       LWLockRelease(ClogCtl->locks->ControlLock);
+       LWLockRelease(ClogCtl->ControlLock);
 }
 
 
@@ -351,13 +351,13 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
 
                memcpy(&pageno, XLogRecGetData(record), sizeof(int));
 
-               LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+               LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
                slotno = ZeroCLOGPage(pageno, false);
-               SimpleLruWritePage(ClogCtl, slotno);
+               SimpleLruWritePage(ClogCtl, slotno, NULL);
                /* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
 
-               LWLockRelease(ClogCtl->locks->ControlLock);
+               LWLockRelease(ClogCtl->ControlLock);
        }
 }
 
index 57dcd2b33798c3e9230681bb5cfaee593903aa5a..58798d0f07fcb56d230f886258c94bc07e5a4f4a 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.15 2004/05/29 22:48:18 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.16 2004/05/31 03:47:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -17,6 +17,7 @@
 #include <unistd.h>
 
 #include "access/slru.h"
+#include "access/clog.h"               /* only for NUM_CLOG_BUFFERS */
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/lwlock.h"
@@ -100,6 +101,8 @@ typedef enum
  */
 typedef struct SlruSharedData
 {
+       LWLockId        ControlLock;
+
        /*
         * Info for each buffer slot.  Page number is undefined when status is
         * EMPTY.  lru_count is essentially the number of page switches since
@@ -110,6 +113,7 @@ typedef struct SlruSharedData
        SlruPageStatus page_status[NUM_CLOG_BUFFERS];
        int                     page_number[NUM_CLOG_BUFFERS];
        unsigned int page_lru_count[NUM_CLOG_BUFFERS];
+       LWLockId        BufferLocks[NUM_CLOG_BUFFERS];  /* Per-buffer I/O locks */
 
        /*
         * latest_page_number is the page number of the current end of the
@@ -118,12 +122,24 @@ typedef struct SlruSharedData
         */
        int                     latest_page_number;
 } SlruSharedData;
-typedef SlruSharedData *SlruShared;
-
 
 #define SlruFileName(ctl, path, seg) \
        snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
 
+/*
+ * During SimpleLruFlush(), we will usually not need to write/fsync more
+ * than one or two physical files, but we may need to write several pages
+ * per file.  We can consolidate the I/O requests by leaving files open
+ * until control returns to SimpleLruFlush().  This data structure remembers
+ * which files are open.
+ */
+typedef struct SlruFlushData
+{
+       int                     num_files;                                      /* # files actually open */
+       int                     fd[NUM_CLOG_BUFFERS];           /* their FD's */
+       int                     segno[NUM_CLOG_BUFFERS];        /* their clog seg#s */
+} SlruFlushData;
+
 /*
  * Macro to mark a buffer slot "most recently used".
  */
@@ -145,14 +161,17 @@ typedef enum
        SLRU_SEEK_FAILED,
        SLRU_READ_FAILED,
        SLRU_WRITE_FAILED,
+       SLRU_FSYNC_FAILED,
        SLRU_CLOSE_FAILED
 } SlruErrorCause;
+
 static SlruErrorCause slru_errcause;
 static int     slru_errno;
 
 
 static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
-static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno);
+static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
+                                                                 SlruFlush fdata);
 static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
 static int     SlruSelectLRUPage(SlruCtl ctl, int pageno);
 static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
@@ -165,24 +184,16 @@ static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
 int
 SimpleLruShmemSize(void)
 {
-       return MAXALIGN(sizeof(SlruSharedData))
-               + BLCKSZ * NUM_CLOG_BUFFERS
-               + MAXALIGN(sizeof(SlruLockData))
-               ;
+       return MAXALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_CLOG_BUFFERS;
 }
 
 void
 SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 {
-       bool            found;
-       char       *ptr;
        SlruShared      shared;
-       SlruLock        locks;
+       bool            found;
 
-       ptr = ShmemInitStruct(name, SimpleLruShmemSize(), &found);
-       shared = (SlruShared) ptr;
-       locks = (SlruLock) (ptr + MAXALIGN(sizeof(SlruSharedData)) +
-                                               BLCKSZ * NUM_CLOG_BUFFERS);
+       shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(), &found);
 
        if (!IsUnderPostmaster)
        {
@@ -192,18 +203,18 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 
                Assert(!found);
 
-               locks->ControlLock = LWLockAssign();
-
                memset(shared, 0, sizeof(SlruSharedData));
 
+               shared->ControlLock = LWLockAssign();
+
                bufptr = (char *) shared + MAXALIGN(sizeof(SlruSharedData));
 
                for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
                {
-                       locks->BufferLocks[slotno] = LWLockAssign();
                        shared->page_buffer[slotno] = bufptr;
                        shared->page_status[slotno] = SLRU_PAGE_EMPTY;
                        shared->page_lru_count[slotno] = 1;
+                       shared->BufferLocks[slotno] = LWLockAssign();
                        bufptr += BLCKSZ;
                }
 
@@ -213,10 +224,10 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
                Assert(found);
 
        /* Initialize the unshared control struct */
-       ctl->locks = locks;
        ctl->shared = shared;
+       ctl->ControlLock = shared->ControlLock;
 
-       /* Init directory path */
+       /* Initialize unshared copy of directory path */
        snprintf(ctl->Dir, MAXPGPATH, "%s/%s", DataDir, subdir);
 }
 
@@ -232,7 +243,7 @@ int
 SimpleLruZeroPage(SlruCtl ctl, int pageno)
 {
        int                     slotno;
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
 
        /* Find a suitable buffer slot for the page */
        slotno = SlruSelectLRUPage(ctl, pageno);
@@ -270,7 +281,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
 char *
 SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
 
        /* Outer loop handles restart if we lose the buffer to someone else */
        for (;;)
@@ -313,8 +324,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
                SlruRecentlyUsed(shared, slotno);
 
                /* Release shared lock, grab per-buffer lock instead */
-               LWLockRelease(ctl->locks->ControlLock);
-               LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+               LWLockRelease(shared->ControlLock);
+               LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
 
                /*
                 * Check to see if someone else already did the read, or took the
@@ -323,8 +334,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
                if (shared->page_number[slotno] != pageno ||
                        shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
                {
-                       LWLockRelease(ctl->locks->BufferLocks[slotno]);
-                       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+                       LWLockRelease(shared->BufferLocks[slotno]);
+                       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
                        continue;
                }
 
@@ -332,14 +343,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
                ok = SlruPhysicalReadPage(ctl, pageno, slotno);
 
                /* Re-acquire shared control lock and update page state */
-               LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+               LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
                Assert(shared->page_number[slotno] == pageno &&
                           shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS);
 
                shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY;
 
-               LWLockRelease(ctl->locks->BufferLocks[slotno]);
+               LWLockRelease(shared->BufferLocks[slotno]);
 
                /* Now it's okay to ereport if we failed */
                if (!ok)
@@ -364,11 +375,11 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
  * Control lock must be held at entry, and will be held at exit.
  */
 void
-SimpleLruWritePage(SlruCtl ctl, int slotno)
+SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 {
        int                     pageno;
        bool            ok;
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
 
        /* Do nothing if page does not need writing */
        if (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
@@ -378,8 +389,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
        pageno = shared->page_number[slotno];
 
        /* Release shared lock, grab per-buffer lock instead */
-       LWLockRelease(ctl->locks->ControlLock);
-       LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+       LWLockRelease(shared->ControlLock);
+       LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
 
        /*
         * Check to see if someone else already did the write, or took the
@@ -392,8 +403,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
                (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
                 shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS))
        {
-               LWLockRelease(ctl->locks->BufferLocks[slotno]);
-               LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+               LWLockRelease(shared->BufferLocks[slotno]);
+               LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
                return;
        }
 
@@ -412,10 +423,19 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
        shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
 
        /* Okay, do the write */
-       ok = SlruPhysicalWritePage(ctl, pageno, slotno);
+       ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
+
+       /* If we failed, and we're in a flush, better close the files */
+       if (!ok && fdata)
+       {
+               int             i;
+
+               for (i = 0; i < fdata->num_files; i++)
+                       close(fdata->fd[i]);
+       }
 
        /* Re-acquire shared control lock and update page state */
-       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
        Assert(shared->page_number[slotno] == pageno &&
                   (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS ||
@@ -425,7 +445,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
        if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
                shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY;
 
-       LWLockRelease(ctl->locks->BufferLocks[slotno]);
+       LWLockRelease(shared->BufferLocks[slotno]);
 
        /* Now it's okay to ereport if we failed */
        if (!ok)
@@ -445,7 +465,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
 static bool
 SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
        int                     segno = pageno / SLRU_PAGES_PER_SEGMENT;
        int                     rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
        int                     offset = rpageno * BLCKSZ;
@@ -482,6 +502,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
        {
                slru_errcause = SLRU_SEEK_FAILED;
                slru_errno = errno;
+               close(fd);
                return false;
        }
 
@@ -490,6 +511,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
        {
                slru_errcause = SLRU_READ_FAILED;
                slru_errno = errno;
+               close(fd);
                return false;
        }
 
@@ -511,50 +533,80 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
  * info in static variables to let SlruReportIOError make the report.
  *
  * For now, assume it's not worth keeping a file pointer open across
- * read/write operations.  We could cache one virtual file pointer ...
+ * independent read/write operations.  We do batch operations during
+ * SimpleLruFlush, though.
+ *
+ * fdata is NULL for a standalone write, pointer to open-file info during
+ * SimpleLruFlush.
  */
 static bool
-SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
+SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
        int                     segno = pageno / SLRU_PAGES_PER_SEGMENT;
        int                     rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
        int                     offset = rpageno * BLCKSZ;
        char            path[MAXPGPATH];
-       int                     fd;
-
-       SlruFileName(ctl, path, segno);
+       int                     fd = -1;
 
        /*
-        * If the file doesn't already exist, we should create it.  It is
-        * possible for this to need to happen when writing a page that's not
-        * first in its segment; we assume the OS can cope with that.  (Note:
-        * it might seem that it'd be okay to create files only when
-        * SimpleLruZeroPage is called for the first page of a segment.
-        * However, if after a crash and restart the REDO logic elects to
-        * replay the log from a checkpoint before the latest one, then it's
-        * possible that we will get commands to set transaction status of
-        * transactions that have already been truncated from the commit log.
-        * Easiest way to deal with that is to accept references to
-        * nonexistent files here and in SlruPhysicalReadPage.)
+        * During a Flush, we may already have the desired file open.
         */
-       fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
-       if (fd < 0)
+       if (fdata)
        {
-               if (errno != ENOENT)
+               int             i;
+
+               for (i = 0; i < fdata->num_files; i++)
                {
-                       slru_errcause = SLRU_OPEN_FAILED;
-                       slru_errno = errno;
-                       return false;
+                       if (fdata->segno[i] == segno)
+                       {
+                               fd = fdata->fd[i];
+                               break;
+                       }
                }
+       }
 
-               fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
-                                                  S_IRUSR | S_IWUSR);
+       if (fd < 0)
+       {
+               /*
+                * If the file doesn't already exist, we should create it.  It is
+                * possible for this to need to happen when writing a page that's not
+                * first in its segment; we assume the OS can cope with that.
+                * (Note: it might seem that it'd be okay to create files only when
+                * SimpleLruZeroPage is called for the first page of a segment.
+                * However, if after a crash and restart the REDO logic elects to
+                * replay the log from a checkpoint before the latest one, then it's
+                * possible that we will get commands to set transaction status of
+                * transactions that have already been truncated from the commit log.
+                * Easiest way to deal with that is to accept references to
+                * nonexistent files here and in SlruPhysicalReadPage.)
+                */
+               SlruFileName(ctl, path, segno);
+               fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
                if (fd < 0)
                {
-                       slru_errcause = SLRU_CREATE_FAILED;
-                       slru_errno = errno;
-                       return false;
+                       if (errno != ENOENT)
+                       {
+                               slru_errcause = SLRU_OPEN_FAILED;
+                               slru_errno = errno;
+                               return false;
+                       }
+
+                       fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+                                                          S_IRUSR | S_IWUSR);
+                       if (fd < 0)
+                       {
+                               slru_errcause = SLRU_CREATE_FAILED;
+                               slru_errno = errno;
+                               return false;
+                       }
+               }
+
+               if (fdata)
+               {
+                       fdata->fd[fdata->num_files] = fd;
+                       fdata->segno[fdata->num_files] = segno;
+                       fdata->num_files++;
                }
        }
 
@@ -562,6 +614,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
        {
                slru_errcause = SLRU_SEEK_FAILED;
                slru_errno = errno;
+               if (!fdata)
+                       close(fd);
                return false;
        }
 
@@ -573,14 +627,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
                        errno = ENOSPC;
                slru_errcause = SLRU_WRITE_FAILED;
                slru_errno = errno;
+               if (!fdata)
+                       close(fd);
                return false;
        }
 
-       if (close(fd))
+       /*
+        * If not part of Flush, need to fsync now.  We assume this happens
+        * infrequently enough that it's not a performance issue.
+        */
+       if (!fdata)
        {
-               slru_errcause = SLRU_CLOSE_FAILED;
-               slru_errno = errno;
-               return false;
+               if (pg_fsync(fd))
+               {
+                       slru_errcause = SLRU_FSYNC_FAILED;
+                       slru_errno = errno;
+                       close(fd);
+                       return false;
+               }
+
+               if (close(fd))
+               {
+                       slru_errcause = SLRU_CLOSE_FAILED;
+                       slru_errno = errno;
+                       return false;
+               }
        }
 
        return true;
@@ -637,6 +708,13 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
                                  errdetail("could not write to file \"%s\" at offset %u: %m",
                                                        path, offset)));
                        break;
+               case SLRU_FSYNC_FAILED:
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                               errmsg("could not access status of transaction %u", xid),
+                                 errdetail("could not fsync file \"%s\": %m",
+                                                       path)));
+                       break;
                case SLRU_CLOSE_FAILED:
                        ereport(ERROR,
                                        (errcode_for_file_access(),
@@ -668,7 +746,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
 static int
 SlruSelectLRUPage(SlruCtl ctl, int pageno)
 {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
 
        /* Outer loop handles restart after I/O */
        for (;;)
@@ -717,7 +795,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
                        (void) SimpleLruReadPage(ctl, shared->page_number[bestslot],
                                                                         InvalidTransactionId, false);
                else
-                       SimpleLruWritePage(ctl, bestslot);
+                       SimpleLruWritePage(ctl, bestslot, NULL);
 
                /*
                 * Now loop back and try again.  This is the easiest way of
@@ -733,7 +811,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 void
 SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
 {
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
 
        shared->latest_page_number = pageno;
 }
@@ -744,16 +822,20 @@ SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
 void
 SimpleLruFlush(SlruCtl ctl, bool checkpoint)
 {
-#ifdef USE_ASSERT_CHECKING             /* only used in Assert() */
-       SlruShared      shared = (SlruShared) ctl->shared;
-#endif
+       SlruShared      shared = ctl->shared;
+       SlruFlushData fdata;
        int                     slotno;
+       int                     pageno = 0;
+       int                     i;
+       bool            ok;
+
+       fdata.num_files = 0;
 
-       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
        for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
        {
-               SimpleLruWritePage(ctl, slotno);
+               SimpleLruWritePage(ctl, slotno, &fdata);
 
                /*
                 * When called during a checkpoint, we cannot assert that the slot
@@ -765,7 +847,32 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
                           shared->page_status[slotno] == SLRU_PAGE_CLEAN);
        }
 
-       LWLockRelease(ctl->locks->ControlLock);
+       LWLockRelease(shared->ControlLock);
+
+       /*
+        * Now fsync and close any files that were open
+        */
+       ok = true;
+       for (i = 0; i < fdata.num_files; i++)
+       {
+               if (pg_fsync(fdata.fd[i]))
+               {
+                       slru_errcause = SLRU_FSYNC_FAILED;
+                       slru_errno = errno;
+                       pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+                       ok = false;
+               }
+
+               if (close(fdata.fd[i]))
+               {
+                       slru_errcause = SLRU_CLOSE_FAILED;
+                       slru_errno = errno;
+                       pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+                       ok = false;
+               }
+       }
+       if (!ok)
+               SlruReportIOError(ctl, pageno, InvalidTransactionId);
 }
 
 /*
@@ -786,7 +893,7 @@ void
 SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
 {
        int                     slotno;
-       SlruShared      shared = (SlruShared) ctl->shared;
+       SlruShared      shared = ctl->shared;
 
        /*
         * The cutoff point is the start of the segment containing cutoffPage.
@@ -805,7 +912,7 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
         * have been flushed already during the checkpoint, we're just being
         * extra careful here.)
         */
-       LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
 restart:;
 
@@ -817,7 +924,7 @@ restart:;
         */
        if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
        {
-               LWLockRelease(ctl->locks->ControlLock);
+               LWLockRelease(shared->ControlLock);
                ereport(LOG,
                                (errmsg("could not truncate directory \"%s\": apparent wraparound",
                                                ctl->Dir)));
@@ -849,11 +956,11 @@ restart:;
                        (void) SimpleLruReadPage(ctl, shared->page_number[slotno],
                                                                         InvalidTransactionId, false);
                else
-                       SimpleLruWritePage(ctl, slotno);
+                       SimpleLruWritePage(ctl, slotno, NULL);
                goto restart;
        }
 
-       LWLockRelease(ctl->locks->ControlLock);
+       LWLockRelease(shared->ControlLock);
 
        /* Now we can remove the old segment(s) */
        (void) SlruScanDirectory(ctl, cutoffPage, true);
@@ -878,7 +985,8 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
        if (cldir == NULL)
                ereport(ERROR,
                                (errcode_for_file_access(),
-                          errmsg("could not open directory \"%s\": %m", ctl->Dir)));
+                                errmsg("could not open directory \"%s\": %m",
+                                               ctl->Dir)));
 
        errno = 0;
        while ((clde = readdir(cldir)) != NULL)
index ce80a4feff7a6454152d6c154cca5c50296c975d..6bb683386a2f109d2b413f73d76a4b860044dd10 100644 (file)
  *
  * If the bgwriter exits unexpectedly, the postmaster treats that the same
  * as a backend crash: shared memory may be corrupted, so remaining backends
- * should be killed by SIGQUIT and then a recovery cycle started.
+ * should be killed by SIGQUIT and then a recovery cycle started.  (Even if
+ * shared memory isn't corrupted, we have lost information about which
+ * files need to be fsync'd for the next checkpoint, and so a system
+ * restart needs to be forced.)
  *
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.1 2004/05/29 22:48:19 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.2 2004/05/31 03:47:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "utils/guc.h"
 
 
-/*
+/*----------
  * Shared memory area for communication between bgwriter and backends
+ *
+ * The ckpt counters allow backends to watch for completion of a checkpoint
+ * request they send.  Here's how it works:
+ *     * At start of a checkpoint, bgwriter increments ckpt_started.
+ *     * On completion of a checkpoint, bgwriter sets ckpt_done to
+ *       equal ckpt_started.
+ *     * On failure of a checkpoint, bgwriter first increments ckpt_failed,
+ *       then sets ckpt_done to equal ckpt_started.
+ * All three fields are declared sig_atomic_t to ensure they can be read
+ * and written without explicit locking.  The algorithm for backends is:
+ *     1. Record current values of ckpt_failed and ckpt_started (in that
+ *        order!).
+ *     2. Send signal to request checkpoint.
+ *     3. Sleep until ckpt_started changes.  Now you know a checkpoint has
+ *        begun since you started this algorithm (although *not* that it was
+ *        specifically initiated by your signal).
+ *     4. Record new value of ckpt_started.
+ *     5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
+ *        arithmetic here in case counters wrap around.)  Now you know a
+ *        checkpoint has started and completed, but not whether it was
+ *        successful.
+ *     6. If ckpt_failed is different from the originally saved value,
+ *        assume request failed; otherwise it was definitely successful.
+ *
+ * The requests array holds fsync requests sent by backends and not yet
+ * absorbed by the bgwriter.
+ *----------
  */
+typedef struct
+{
+       RelFileNode             rnode;
+       BlockNumber             segno;
+       /* might add a request-type field later */
+} BgWriterRequest;
+
 typedef struct
 {
        pid_t   bgwriter_pid;           /* PID of bgwriter (0 if not started) */
-       sig_atomic_t    checkpoint_count; /* advances when checkpoint done */
+
+       sig_atomic_t    ckpt_started;   /* advances when checkpoint starts */
+       sig_atomic_t    ckpt_done;              /* advances when checkpoint done */
+       sig_atomic_t    ckpt_failed;    /* advances when checkpoint fails */
+
+       int                             num_requests;   /* current # of requests */
+       int                             max_requests;   /* allocated array size */
+       BgWriterRequest requests[1];    /* VARIABLE LENGTH ARRAY */
 } BgWriterShmemStruct;
 
 static BgWriterShmemStruct *BgWriterShmem;
@@ -86,6 +130,10 @@ static volatile sig_atomic_t shutdown_requested = false;
 /*
  * Private state
  */
+static bool            am_bg_writer = false;
+
+static bool            ckpt_active = false;
+
 static time_t  last_checkpoint_time;
 
 
@@ -106,6 +154,7 @@ BackgroundWriterMain(void)
 {
        Assert(BgWriterShmem != NULL);
        BgWriterShmem->bgwriter_pid = MyProcPid;
+       am_bg_writer = true;
 
        /*
         * Properly accept or ignore signals the postmaster might send us
@@ -180,6 +229,17 @@ BackgroundWriterMain(void)
                 */
                InError = false;
 
+               /* Warn any waiting backends that the checkpoint failed. */
+               if (ckpt_active)
+               {
+                       /* use volatile pointer to prevent code rearrangement */
+                       volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+
+                       bgs->ckpt_failed++;
+                       bgs->ckpt_done = bgs->ckpt_started;
+                       ckpt_active = false;
+               }
+
                /*
                 * Exit interrupt holdoff section we implicitly established above.
                 */
@@ -214,8 +274,17 @@ BackgroundWriterMain(void)
                long            udelay;
 
                /*
-                * Process any signals received recently.
+                * Emergency bailout if postmaster has died.  This is to avoid the
+                * necessity for manual cleanup of all postmaster children.
                 */
+               if (!PostmasterIsAlive(true))
+                       exit(1);
+
+               /*
+                * Process any requests or signals received recently.
+                */
+               AbsorbFsyncRequests();
+
                if (got_SIGHUP)
                {
                        got_SIGHUP = false;
@@ -265,8 +334,20 @@ BackgroundWriterMain(void)
                                                         errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
                        }
 
+                       /*
+                        * Indicate checkpoint start to any waiting backends.
+                        */
+                       ckpt_active = true;
+                       BgWriterShmem->ckpt_started++;
+
                        CreateCheckPoint(false, force_checkpoint);
 
+                       /*
+                        * Indicate checkpoint completion to any waiting backends.
+                        */
+                       BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
+                       ckpt_active = false;
+
                        /*
                         * Note we record the checkpoint start time not end time as
                         * last_checkpoint_time.  This is so that time-driven checkpoints
@@ -274,14 +355,11 @@ BackgroundWriterMain(void)
                         */
                        last_checkpoint_time = now;
 
-                       /*
-                        * Indicate checkpoint completion to any waiting backends.
-                        */
-                       BgWriterShmem->checkpoint_count++;
-
                        /*
                         * After any checkpoint, close all smgr files.  This is so we
                         * won't hang onto smgr references to deleted files indefinitely.
+                        * (It is safe to do this because this process does not have a
+                        * relcache, and so no dangling references could remain.)
                         */
                        smgrcloseall();
 
@@ -301,6 +379,8 @@ BackgroundWriterMain(void)
                 * we respond reasonably promptly when someone signals us,
                 * break down the sleep into 1-second increments, and check for
                 * interrupts after each nap.
+                *
+                * We absorb pending requests after each short sleep.
                 */
                udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L;
                while (udelay > 1000000L)
@@ -308,17 +388,11 @@ BackgroundWriterMain(void)
                        if (got_SIGHUP || checkpoint_requested || shutdown_requested)
                                break;
                        pg_usleep(1000000L);
+                       AbsorbFsyncRequests();
                        udelay -= 1000000L;
                }
                if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
                        pg_usleep(udelay);
-
-               /*
-                * Emergency bailout if postmaster has died.  This is to avoid the
-                * necessity for manual cleanup of all postmaster children.
-                */
-               if (!PostmasterIsAlive(true))
-                       exit(1);
        }
 }
 
@@ -387,10 +461,11 @@ int
 BgWriterShmemSize(void)
 {
        /*
-        * This is not worth measuring right now, but may become so after we
-        * add fsync signaling ...
+        * Currently, the size of the requests[] array is arbitrarily set
+        * equal to NBuffers.  This may prove too large or small ...
         */
-       return MAXALIGN(sizeof(BgWriterShmemStruct));
+       return MAXALIGN(sizeof(BgWriterShmemStruct) +
+                                       (NBuffers - 1) * sizeof(BgWriterRequest));
 }
 
 /*
@@ -404,7 +479,7 @@ BgWriterShmemInit(void)
 
        BgWriterShmem = (BgWriterShmemStruct *)
                ShmemInitStruct("Background Writer Data",
-                                               sizeof(BgWriterShmemStruct),
+                                               BgWriterShmemSize(),
                                                &found);
        if (BgWriterShmem == NULL)
                ereport(FATAL,
@@ -414,6 +489,7 @@ BgWriterShmemInit(void)
                return;                                 /* already initialized */
 
        MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
+       BgWriterShmem->max_requests = NBuffers;
 }
 
 /*
@@ -427,8 +503,10 @@ BgWriterShmemInit(void)
 void
 RequestCheckpoint(bool waitforit)
 {
-       volatile sig_atomic_t *count_ptr = &BgWriterShmem->checkpoint_count;
-       sig_atomic_t    old_count = *count_ptr;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+       sig_atomic_t    old_failed = bgs->ckpt_failed;
+       sig_atomic_t    old_started = bgs->ckpt_started;
 
        /*
         * Send signal to request checkpoint.  When waitforit is false,
@@ -442,15 +520,119 @@ RequestCheckpoint(bool waitforit)
                         "could not signal for checkpoint: %m");
 
        /*
-        * If requested, wait for completion.  We detect completion by
-        * observing a change in checkpoint_count in shared memory.
+        * If requested, wait for completion.  We detect completion according
+        * to the algorithm given above.
         */
        if (waitforit)
        {
-               while (*count_ptr == old_count)
+               while (bgs->ckpt_started == old_started)
                {
                        CHECK_FOR_INTERRUPTS();
-                       pg_usleep(1000000L);
+                       pg_usleep(100000L);
+               }
+               old_started = bgs->ckpt_started;
+               /*
+                * We are waiting for ckpt_done >= old_started, in a modulo
+                * sense.  This is a little tricky since we don't know the
+                * width or signedness of sig_atomic_t.  We make the lowest
+                * common denominator assumption that it is only as wide
+                * as "char".  This means that this algorithm will cope
+                * correctly as long as we don't sleep for more than 127
+                * completed checkpoints.  (If we do, we will get another
+                * chance to exit after 128 more checkpoints...)
+                */
+               while (((signed char) (bgs->ckpt_done - old_started)) < 0)
+               {
+                       CHECK_FOR_INTERRUPTS();
+                       pg_usleep(100000L);
                }
+               if (bgs->ckpt_failed != old_failed)
+                       ereport(ERROR,
+                                       (errmsg("checkpoint request failed"),
+                                        errhint("Consult the postmaster log for details.")));
+       }
+}
+
+/*
+ * ForwardFsyncRequest
+ *             Forward a file-fsync request from a backend to the bgwriter
+ *
+ * Whenever a backend is compelled to write directly to a relation
+ * (which should be seldom, if the bgwriter is getting its job done),
+ * the backend calls this routine to pass over knowledge that the relation
+ * is dirty and must be fsync'd before next checkpoint.
+ *
+ * rnode and segno identify the relation segment concerned; they are copied
+ * into the next free slot of the shared requests[] array while
+ * BgWriterCommLock is held exclusively.  Returns true if the request was
+ * queued.
+ *
+ * If we are unable to pass over the request (at present, this can happen
+ * if we are not running under a postmaster, if no bgwriter has started
+ * (bgwriter_pid is 0), or if the shared memory queue is full), we return
+ * false.  That forces the backend to do its own fsync.  We hope that will
+ * be even more seldom.
+ *
+ * Note: we presently make no attempt to eliminate duplicate requests
+ * in the requests[] queue.  The bgwriter will have to eliminate dups
+ * internally anyway, so we may as well avoid holding the lock longer
+ * than we have to here.
+ */
+bool
+ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
+{
+       BgWriterRequest *request;
+
+       if (!IsUnderPostmaster)
+               return false;                   /* probably shouldn't even get here */
+       Assert(BgWriterShmem != NULL);
+
+       LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+       if (BgWriterShmem->bgwriter_pid == 0 ||         /* bgwriter not started */
+               BgWriterShmem->num_requests >= BgWriterShmem->max_requests)     /* queue full */
+       {
+               LWLockRelease(BgWriterCommLock);
+               return false;
+       }
+       request = &BgWriterShmem->requests[BgWriterShmem->num_requests++];      /* claim next slot */
+       request->rnode = rnode;
+       request->segno = segno;
+       LWLockRelease(BgWriterCommLock);
+       return true;
+}
+
+/*
+ * AbsorbFsyncRequests
+ *             Retrieve queued fsync requests and pass them to local smgr.
+ *
+ * This is exported because it must be called during CreateCheckPoint;
+ * we have to be sure we have accepted all pending requests *after* we
+ * establish the checkpoint redo pointer.  Since CreateCheckPoint
+ * sometimes runs in non-bgwriter processes, do nothing if not bgwriter.
+ *
+ * The queued entries are copied out of shared memory and the queue is
+ * reset to empty under BgWriterCommLock; RememberFsyncRequest is then
+ * called once per copied entry after the lock has been released.
+ */
+void
+AbsorbFsyncRequests(void)
+{
+       BgWriterRequest *requests = NULL;
+       BgWriterRequest *request;
+       int                     n;
+
+       if (!am_bg_writer)
+               return;
+
+       /*
+        * We try to avoid holding the lock for a long time by copying the
+        * request array.
+        *
+        * NOTE(review): palloc is called while the lock is held; presumably
+        * an allocation failure is cleaned up by the caller's error recovery
+        * path -- confirm.
+        */
+       LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+
+       n = BgWriterShmem->num_requests;
+       if (n > 0)
+       {
+               requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest));
+               memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest));
+       }
+       BgWriterShmem->num_requests = 0;        /* shared queue is now empty */
+
+       LWLockRelease(BgWriterCommLock);
+
+       for (request = requests; n > 0; request++, n--)        /* no-op when nothing was copied */
+       {
+               RememberFsyncRequest(request->rnode, request->segno);
        }
+       if (requests)
+               pfree(requests);
 }
index f718e33cd598beddb09c76f8467e18f16017c162..2386bc89bf3b7eaf22be77b5448924e89529412f 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1044,6 +1044,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
  *             bothering to write them out first.      This is NOT rollback-able,
  *             and so should be used only with extreme caution!
  *
+ *             There is no particularly good reason why this doesn't have a
+ *             firstDelBlock parameter, except that current callers don't need it.
+ *
  *             We assume that the caller holds an exclusive lock on the relation,
  *             which should assure that no new buffers will be acquired for the rel
  *             meanwhile.
@@ -1052,14 +1055,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 void
 DropRelationBuffers(Relation rel)
 {
-       DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
+       DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
 }
 
 /* ---------------------------------------------------------------------
  *             DropRelFileNodeBuffers
  *
  *             This is the same as DropRelationBuffers, except that the target
- *             relation is specified by RelFileNode and temp status.
+ *             relation is specified by RelFileNode and temp status, and one
+ *             may specify the first block to drop.
  *
  *             This is NOT rollback-able.      One legitimate use is to clear the
  *             buffer cache of buffers for a relation that is being deleted
@@ -1067,7 +1071,8 @@ DropRelationBuffers(Relation rel)
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
+DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+                                          BlockNumber firstDelBlock)
 {
        int                     i;
        BufferDesc *bufHdr;
@@ -1077,7 +1082,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
                for (i = 0; i < NLocBuffer; i++)
                {
                        bufHdr = &LocalBufferDescriptors[i];
-                       if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+                       if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+                               bufHdr->tag.blockNum >= firstDelBlock)
                        {
                                bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
                                bufHdr->cntxDirty = false;
@@ -1094,7 +1100,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
        {
                bufHdr = &BufferDescriptors[i - 1];
 recheck:
-               if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+               if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+                       bufHdr->tag.blockNum >= firstDelBlock)
                {
                        /*
                         * If there is I/O in progress, better wait till it's done;
index 5ef12de949518be73314b6341308886733f6b730..96de54110cfaab2f21d42120c63f3c09fbb6d961 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
  *
  * NOTES:
  *
@@ -484,6 +484,7 @@ Insert(File file)
        DO_DB(_dump_lru());
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 LruInsert(File file)
 {
@@ -685,6 +686,7 @@ filepath(const char *filename)
        return buf;
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 FileAccess(File file)
 {
@@ -954,7 +956,10 @@ FileRead(File file, char *buffer, int amount)
                           file, VfdCache[file].fileName,
                           VfdCache[file].seekPos, amount, buffer));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
        returnCode = read(VfdCache[file].fd, buffer, amount);
        if (returnCode > 0)
                VfdCache[file].seekPos += returnCode;
@@ -975,7 +980,9 @@ FileWrite(File file, char *buffer, int amount)
                           file, VfdCache[file].fileName,
                           VfdCache[file].seekPos, amount, buffer));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
 
        errno = 0;
        returnCode = write(VfdCache[file].fd, buffer, amount);
@@ -992,9 +999,28 @@ FileWrite(File file, char *buffer, int amount)
        return returnCode;
 }
 
+int
+FileSync(File file)            /* hand the descriptor behind 'file' to pg_fsync */
+{
+       int                     returnCode;
+
+       Assert(FileIsValid(file));
+
+       DO_DB(elog(LOG, "FileSync: %d (%s)",
+                          file, VfdCache[file].fileName));
+
+       returnCode = FileAccess(file);  /* 0 on success, -1 on re-open failure */
+       if (returnCode < 0)
+               return returnCode;              /* errno was set by FileAccess */
+
+       return pg_fsync(VfdCache[file].fd);     /* pass pg_fsync's result through */
+}
+
 long
 FileSeek(File file, long offset, int whence)
 {
+       int                     returnCode;
+
        Assert(FileIsValid(file));
 
        DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
@@ -1014,8 +1040,11 @@ FileSeek(File file, long offset, int whence)
                                VfdCache[file].seekPos += offset;
                                break;
                        case SEEK_END:
-                               FileAccess(file);
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               returnCode = FileAccess(file);
+                               if (returnCode < 0)
+                                       return returnCode;
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                break;
                        default:
                                elog(ERROR, "invalid whence: %d", whence);
@@ -1030,14 +1059,17 @@ FileSeek(File file, long offset, int whence)
                                if (offset < 0)
                                        elog(ERROR, "invalid seek offset: %ld", offset);
                                if (VfdCache[file].seekPos != offset)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                break;
                        case SEEK_CUR:
                                if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                break;
                        case SEEK_END:
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                break;
                        default:
                                elog(ERROR, "invalid whence: %d", whence);
@@ -1071,7 +1103,10 @@ FileTruncate(File file, long offset)
        DO_DB(elog(LOG, "FileTruncate %d (%s)",
                           file, VfdCache[file].fileName));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
        returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
        return returnCode;
 }
index 2122a243207b12049b5d655e3a01305e70b3aade..5ac5868f690b32196f7eca791674d2de3c6ca4b0 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/smgr.h"
+#include "utils/hsearch.h"
 #include "utils/memutils.h"
 
 
  *     system's file size limit (often 2GBytes).  In order to do that,
  *     we break relations up into chunks of < 2GBytes and store one chunk
  *     in each of several files that represent the relation.  See the
- *     BLCKSZ and RELSEG_SIZE configuration constants in
- *     include/pg_config.h.  All chunks except the last MUST have size exactly
- *     equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
+ *     BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
+ *     All chunks except the last MUST have size exactly equal to RELSEG_SIZE
+ *     blocks --- see mdnblocks() and mdtruncate().
  *
  *     The file descriptor pointer (md_fd field) stored in the SMgrRelation
  *     cache is, therefore, just the head of a list of MdfdVec objects.
  *     But note the md_fd pointer can be NULL, indicating relation not open.
  *
+ *     Note that mdfd_chain == NULL does not necessarily mean the relation
+ *     doesn't have another segment after this one; we may just not have
+ *     opened the next segment yet.  (We could not have "all segments are
+ *     in the chain" as an invariant anyway, since another backend could
+ *     extend the relation when we weren't looking.)
+ *
  *     All MdfdVec objects are palloc'd in the MdCxt memory context.
  */
 
 typedef struct _MdfdVec
 {
        File            mdfd_vfd;                       /* fd number in fd.c's pool */
-
-#ifndef LET_OS_MANAGE_FILESIZE
-       struct _MdfdVec *mdfd_chain;    /* for large relations */
+       BlockNumber     mdfd_segno;                     /* segment number, from 0 */
+#ifndef LET_OS_MANAGE_FILESIZE         /* for large relations */
+       struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
 #endif
 } MdfdVec;
 
 static MemoryContext MdCxt;            /* context for all md.c allocations */
 
 
-/* routines declared here */
-static MdfdVec *mdopen(SMgrRelation reln);
+/*
+ * In some contexts (currently, standalone backends and the bgwriter process)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint.  This hash
+ * table remembers the pending operations.  We use a hash table not because
+ * we want to look up individual operations, but simply as a convenient way
+ * of eliminating duplicate requests.
+ *
+ * The whole PendingOperationEntry struct serves as the hash key (mdinit sets
+ * both keysize and entrysize to sizeof(PendingOperationEntry)), so an entry
+ * is identified by its (rnode, segno) pair and carries no other payload.
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the bgwriter.)
+ *
+ * XXX for WIN32, may want to expand this to track pending deletes, too.
+ */
+typedef struct
+{
+       RelFileNode     rnode;                  /* the targeted relation */
+       BlockNumber     segno;                  /* which segment */
+} PendingOperationEntry;
+
+static HTAB *pendingOpsTable = NULL;
+
+
+/* local routines */
+static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
+static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 static MdfdVec *_fdvec_alloc(void);
 #ifndef LET_OS_MANAGE_FILESIZE
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
                                                          int oflags);
 #endif
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
+                                                        bool allowNotFound);
 static BlockNumber _mdnblocks(File file, Size blcksz);
 
 
@@ -79,6 +112,31 @@ mdinit(void)
                                                                  ALLOCSET_DEFAULT_INITSIZE,
                                                                  ALLOCSET_DEFAULT_MAXSIZE);
 
+       /*
+        * Create pending-operations hashtable if we need it.  Currently,
+        * we need it if we are standalone (not under a postmaster) OR
+        * if we are a bootstrap-mode subprocess of a postmaster (that is,
+        * a startup or bgwriter process).
+        */
+       if (!IsUnderPostmaster || IsBootstrapProcessingMode())
+       {
+               HASHCTL         hash_ctl;
+
+               MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+               hash_ctl.keysize = sizeof(PendingOperationEntry);
+               hash_ctl.entrysize = sizeof(PendingOperationEntry);
+               hash_ctl.hash = tag_hash;
+               hash_ctl.hcxt = MdCxt;
+               pendingOpsTable = hash_create("Pending Ops Table",
+                                                                         100L,
+                                                                         &hash_ctl,
+                                                                         HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+               if (pendingOpsTable == NULL)
+                       ereport(FATAL,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+       }
+
        return true;
 }
 
@@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo)
        reln->md_fd = _fdvec_alloc();
 
        reln->md_fd->mdfd_vfd = fd;
+       reln->md_fd->mdfd_segno = 0;
 #ifndef LET_OS_MANAGE_FILESIZE
        reln->md_fd->mdfd_chain = NULL;
 #endif
@@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
        int                     nbytes;
        MdfdVec    *v;
 
-       v = _mdfd_getseg(reln, blocknum);
+       v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
        seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
                return false;
        }
 
+       if (!register_dirty_segment(reln, v))
+               return false;
+
 #ifndef LET_OS_MANAGE_FILESIZE
        Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 #endif
@@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 
 /*
  *     mdopen() -- Open the specified relation.  ereport's on failure.
+ *             (Optionally, can return NULL instead of ereport for ENOENT.)
  *
  * Note we only open the first segment, when there are multiple segments.
  */
 static MdfdVec *
-mdopen(SMgrRelation reln)
+mdopen(SMgrRelation reln, bool allowNotFound)
 {
+       MdfdVec    *mdfd;
        char       *path;
        File            fd;
 
@@ -292,6 +356,8 @@ mdopen(SMgrRelation reln)
                if (fd < 0)
                {
                        pfree(path);
+                       if (allowNotFound && errno == ENOENT)
+                               return NULL;
                        ereport(ERROR,
                                        (errcode_for_file_access(),
                                         errmsg("could not open relation %u/%u: %m",
@@ -302,15 +368,16 @@ mdopen(SMgrRelation reln)
 
        pfree(path);
 
-       reln->md_fd = _fdvec_alloc();
+       reln->md_fd = mdfd = _fdvec_alloc();
 
-       reln->md_fd->mdfd_vfd = fd;
+       mdfd->mdfd_vfd = fd;
+       mdfd->mdfd_segno = 0;
 #ifndef LET_OS_MANAGE_FILESIZE
-       reln->md_fd->mdfd_chain = NULL;
+       mdfd->mdfd_chain = NULL;
        Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 #endif
 
-       return reln->md_fd;
+       return mdfd;
 }
 
 /*
@@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
        int                     nbytes;
        MdfdVec    *v;
 
-       v = _mdfd_getseg(reln, blocknum);
+       v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
        seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
        long            seekpos;
        MdfdVec    *v;
 
-       v = _mdfd_getseg(reln, blocknum);
+       v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
        seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
        if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
                return false;
 
+       if (!register_dirty_segment(reln, v))
+               return false;
+
        return true;
 }
 
@@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 BlockNumber
 mdnblocks(SMgrRelation reln)
 {
-       MdfdVec    *v = mdopen(reln);
+       MdfdVec    *v = mdopen(reln, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
        BlockNumber nblocks;
@@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
        if (nblocks == curnblk)
                return nblocks;                 /* no work */
 
-       v = mdopen(reln);
+       v = mdopen(reln, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
        priorblocks = 0;
@@ -576,40 +646,154 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
 }
 
 /*
- *     mdcommit() -- Commit a transaction.
+ *     mdsync() -- Sync previous writes to stable storage.
+ *
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOpsTable.
  */
 bool
-mdcommit(void)
+mdsync(void)
 {
+       HASH_SEQ_STATUS hstat;
+       PendingOperationEntry *entry;
+
+       if (!pendingOpsTable)
+               return false;
+
        /*
-        * We don't actually have to do anything here...
+        * If we are in the bgwriter, the sync had better include all fsync
+        * requests that were queued by backends before the checkpoint REDO
+        * point was determined.  We do slightly better than that by accepting
+        * all requests queued up to the point where we start fsync'ing.
         */
+       AbsorbFsyncRequests();
+
+       hash_seq_init(&hstat, pendingOpsTable);
+       while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+       {
+               /*
+                * If fsync is off then we don't have to bother opening the file
+                * at all.  (We delay checking until this point so that changing
+                * fsync on the fly behaves sensibly.)
+                */
+               if (enableFsync)
+               {
+                       SMgrRelation reln;
+                       MdfdVec *seg;
+
+                       /*
+                        * Find or create an smgr hash entry for this relation.
+                        * This may seem a bit unclean -- md calling smgr?  But it's
+                        * really the best solution.  It ensures that the open file
+                        * reference isn't permanently leaked if we get an error here.
+                        * (You may say "but an unreferenced SMgrRelation is still a
+                        * leak!"  Not really, because the only case in which a checkpoint
+                        * is done by a process that isn't about to shut down is in the
+                        * bgwriter, and it will periodically do smgrcloseall().  This
+                        * fact justifies our not closing the reln in the success path
+                        * either, which is a good thing since in non-bgwriter cases
+                        * we couldn't safely do that.)  Furthermore, in many cases
+                        * the relation will have been dirtied through this same smgr
+                        * relation, and so we can save a file open/close cycle.
+                        */
+                       reln = smgropen(entry->rnode);
+
+                       /*
+                        * It is possible that the relation has been dropped or truncated
+                        * since the fsync request was entered.  Therefore, we have to
+                        * allow file-not-found errors.  This applies both during
+                        * _mdfd_getseg() and during FileSync, since fd.c might have
+                        * closed the file behind our back.
+                        */
+                       seg = _mdfd_getseg(reln,
+                                                          entry->segno * ((BlockNumber) RELSEG_SIZE),
+                                                          true);
+                       if (seg)
+                       {
+                               if (FileSync(seg->mdfd_vfd) < 0 &&
+                                       errno != ENOENT)
+                               {
+                                       ereport(LOG,
+                                                       (errcode_for_file_access(),
+                                                        errmsg("could not fsync segment %u of relation %u/%u: %m",
+                                                                       entry->segno,
+                                                                       entry->rnode.tblNode,
+                                                                       entry->rnode.relNode)));
+                                       return false;
+                               }
+                       }
+               }
+
+               /* Okay, delete this entry */
+               if (hash_search(pendingOpsTable, entry,
+                                               HASH_REMOVE, NULL) == NULL)
+                       elog(ERROR, "pendingOpsTable corrupted");
+       }
+
        return true;
 }
 
 /*
- *     mdabort() -- Abort a transaction.
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * mdsync to process later.  Otherwise, try to pass off the fsync request
+ * to the background writer process.  If that fails, just do the fsync
+ * locally before returning (we expect this will not happen often enough
+ * to be a performance problem).
+ *
+ * A false result implies I/O failure during local fsync.  errno will be
+ * valid for error reporting.
  */
-bool
-mdabort(void)
+static bool
+register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 {
-       /*
-        * We don't actually have to do anything here...
-        */
+       if (pendingOpsTable)
+       {
+               PendingOperationEntry entry;
+
+               /* ensure any pad bytes in the struct are zeroed */
+               MemSet(&entry, 0, sizeof(entry));
+               entry.rnode = reln->smgr_rnode;
+               entry.segno = seg->mdfd_segno;
+
+               if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL)
+                       return true;
+               /* out of memory: fall through to do it locally */
+       }
+       else
+       {
+               if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
+                       return true;
+       }
+
+       if (FileSync(seg->mdfd_vfd) < 0)
+               return false;
        return true;
 }
 
 /*
- *     mdsync() -- Sync previous writes to stable storage.
+ * RememberFsyncRequest() -- callback from bgwriter side of fsync request
+ *
+ * We stuff the fsync request into the local hash table for execution
+ * during the bgwriter's next checkpoint.
  */
-bool
-mdsync(void)
+void
+RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
-       sync();
-       if (IsUnderPostmaster)
-               pg_usleep(2000000L);
-       sync();
-       return true;
+       PendingOperationEntry entry;
+
+       Assert(pendingOpsTable);
+
+       /* ensure any pad bytes in the struct are zeroed */
+       MemSet(&entry, 0, sizeof(entry));
+       entry.rnode = rnode;
+       entry.segno = segno;
+
+       if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL)
+               ereport(FATAL,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
 }
 
 /*
@@ -618,18 +802,11 @@ mdsync(void)
 static MdfdVec *
 _fdvec_alloc(void)
 {
-       MdfdVec *v;
-
-       v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
-       v->mdfd_vfd = -1;
-#ifndef LET_OS_MANAGE_FILESIZE
-       v->mdfd_chain = NULL;
-#endif
-
-       return v;
+       return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
 }
 
 #ifndef LET_OS_MANAGE_FILESIZE
+
 /*
  * Open the specified segment of the relation,
  * and make a MdfdVec object for it.  Returns NULL on failure.
@@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
        char       *path,
                           *fullpath;
 
-       /* be sure we have enough space for the '.segno', if any */
        path = relpath(reln->smgr_rnode);
 
        if (segno > 0)
        {
+               /* be sure we have enough space for the '.segno' */
                fullpath = (char *) palloc(strlen(path) + 12);
                sprintf(fullpath, "%s.%u", path, segno);
                pfree(path);
@@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
 
        /* fill the entry */
        v->mdfd_vfd = fd;
+       v->mdfd_segno = segno;
        v->mdfd_chain = NULL;
        Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 
        /* all done */
        return v;
 }
-#endif
+
+#endif /* LET_OS_MANAGE_FILESIZE */
 
 /*
  *     _mdfd_getseg() -- Find the segment of the relation holding the
- *                                       specified block.  ereport's on failure.
+ *             specified block.  ereport's on failure.
+ *             (If allowNotFound is true, returns NULL instead of ereport'ing on ENOENT.)
  */
 static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
 {
-       MdfdVec    *v = mdopen(reln);
-
+       MdfdVec    *v = mdopen(reln, allowNotFound);
 #ifndef LET_OS_MANAGE_FILESIZE
-       BlockNumber segno;
-       BlockNumber i;
+       BlockNumber segstogo;
+       BlockNumber nextsegno;
 
-       for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
-                segno > 0;
-                i++, segno--)
-       {
+       if (!v)
+               return NULL;                    /* only possible if allowNotFound */
 
+       for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
+                segstogo > 0;
+                nextsegno++, segstogo--)
+       {
                if (v->mdfd_chain == NULL)
                {
                        /*
@@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
                         * one new segment per call, so this restriction seems
                         * reasonable.
                         */
-                       v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
-
+                       v->mdfd_chain = _mdfd_openseg(reln,
+                                                                                 nextsegno,
+                                                                                 (segstogo == 1) ? O_CREAT : 0);
                        if (v->mdfd_chain == NULL)
+                       {
+                               if (allowNotFound && errno == ENOENT)
+                                       return NULL;
                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not open segment %u of relation %u/%u (target block %u): %m",
-                                                               i,
+                                                               nextsegno,
                                                                reln->smgr_rnode.tblNode,
                                                                reln->smgr_rnode.relNode,
                                                                blkno)));
+                       }
                }
                v = v->mdfd_chain;
        }
index d242744a4d7c47d2cc6700e1c3af8ecc23ddc522..c204e2796c4b2125b5f83a1a3fb574e2a9bd85a1 100644 (file)
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -56,7 +56,7 @@ typedef struct f_smgr
 static const f_smgr smgrsw[] = {
        /* magnetic disk */
        {mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
-        mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
+        mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync
        }
 };
 
@@ -407,7 +407,7 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
         * Get rid of any leftover buffers for the rel (shouldn't be any in the
         * commit case, but there can be in the abort case).
         */
-       DropRelFileNodeBuffers(rnode, isTemp);
+       DropRelFileNodeBuffers(rnode, isTemp, 0);
 
        /*
         * Tell the free space map to forget this relation.  It won't be accessed
@@ -638,7 +638,7 @@ smgrcommit(void)
                if (smgrsw[i].smgr_commit)
                {
                        if (! (*(smgrsw[i].smgr_commit)) ())
-                               elog(FATAL, "transaction commit failed on %s: %m",
+                               elog(ERROR, "transaction commit failed on %s: %m",
                                         DatumGetCString(DirectFunctionCall1(smgrout,
                                                                                                         Int16GetDatum(i))));
                }
@@ -658,7 +658,7 @@ smgrabort(void)
                if (smgrsw[i].smgr_abort)
                {
                        if (! (*(smgrsw[i].smgr_abort)) ())
-                               elog(FATAL, "transaction abort failed on %s: %m",
+                               elog(ERROR, "transaction abort failed on %s: %m",
                                         DatumGetCString(DirectFunctionCall1(smgrout,
                                                                                                         Int16GetDatum(i))));
                }
@@ -678,7 +678,7 @@ smgrsync(void)
                if (smgrsw[i].smgr_sync)
                {
                        if (! (*(smgrsw[i].smgr_sync)) ())
-                               elog(PANIC, "storage sync failed on %s: %m",
+                               elog(ERROR, "storage sync failed on %s: %m",
                                         DatumGetCString(DirectFunctionCall1(smgrout,
                                                                                                         Int16GetDatum(i))));
                }
@@ -707,6 +707,13 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 
                reln = smgropen(xlrec->rnode);
 
+               /*
+                * First, force bufmgr to drop any buffers it has for the to-be-
+                * truncated blocks.  We must do this, else subsequent XLogReadBuffer
+                * operations will not re-extend the file properly.
+                */
+               DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);
+
                /* Can't use smgrtruncate because it would try to xlog */
 
                /*
index fec968e7a202c4b53146e0e451c64afdfe9d8fdf..213cca5c21654510ba3f78ed90f20ed981bc6e7a 100644 (file)
@@ -6,26 +6,17 @@
  * Portions Copyright (c) 2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.5 2004/05/28 05:13:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $
  */
 #ifndef SLRU_H
 #define SLRU_H
 
-#include "access/xlog.h"
+#include "storage/lwlock.h"
 
-/* exported because lwlock.c needs it */
-#define NUM_CLOG_BUFFERS       8
 
-/*
- * Note: the separation between SlruLockData and SlruSharedData is purely
- * historical; the structs could be combined.
- */
-typedef struct SlruLockData
-{
-       LWLockId        ControlLock;
-       LWLockId        BufferLocks[NUM_CLOG_BUFFERS];  /* Per-buffer I/O locks */
-} SlruLockData;
-typedef SlruLockData *SlruLock;
+/* Opaque structs known only in slru.c */
+typedef struct SlruSharedData *SlruShared;
+typedef struct SlruFlushData *SlruFlush;
 
 /*
  * SlruCtlData is an unshared structure that points to the active information
@@ -33,13 +24,13 @@ typedef SlruLockData *SlruLock;
  */
 typedef struct SlruCtlData
 {
-       void       *shared;                     /* pointer to SlruSharedData */
-       SlruLock        locks;
+       SlruShared      shared;
+
+       LWLockId        ControlLock;
 
        /*
-        * Dir is set during SimpleLruShmemInit and does not change thereafter.
-        * The value is automatically inherited by backends via fork, and
-        * doesn't need to be in shared memory.
+        * Dir is set during SimpleLruInit and does not change thereafter.
+        * Since it's always the same, it doesn't need to be in shared memory.
         */
        char            Dir[MAXPGPATH];
 
@@ -51,13 +42,16 @@ typedef struct SlruCtlData
        bool            (*PagePrecedes) (int, int);
 
 } SlruCtlData;
+
 typedef SlruCtlData *SlruCtl;
 
+
 extern int     SimpleLruShmemSize(void);
 extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir);
 extern int     SimpleLruZeroPage(SlruCtl ctl, int pageno);
-extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite);
-extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
+extern char *SimpleLruReadPage(SlruCtl ctl, int pageno,
+                                                          TransactionId xid, bool forwrite);
+extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
 extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
 extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
 extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
index c11af72e78945f91dfe76bfab00eeab7eabea886..ed56e9639e889ec64fe0dd2c45476561d9b0dcaf 100644 (file)
@@ -5,13 +5,17 @@
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.1 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.2 2004/05/31 03:48:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef _BGWRITER_H
 #define _BGWRITER_H
 
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+
 /* GUC options */
 extern int     BgWriterDelay;
 extern int     BgWriterPercent;
@@ -23,6 +27,9 @@ extern void BackgroundWriterMain(void);
 
 extern void RequestCheckpoint(bool waitforit);
 
+extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
+extern void AbsorbFsyncRequests(void);
+
 extern int     BgWriterShmemSize(void);
 extern void BgWriterShmemInit(void);
 
index 27752d412b56435ddbe61bf77adcd83554d1df92..95b426bb8b93f21faa93c2069167eb8fe048ea49 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.80 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.81 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -154,7 +154,8 @@ extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
 extern void RelationTruncate(Relation rel, BlockNumber nblocks);
 extern int     FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
 extern void DropRelationBuffers(Relation rel);
-extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp);
+extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+                                                                  BlockNumber firstDelBlock);
 extern void DropBuffers(Oid dbid);
 
 #ifdef NOT_USED
index 177925cf3e80776dbc34dd543ca8e36b5fcbea76..430ed5d8c74181f11b10a41428453f123159f06b 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.44 2004/02/23 23:03:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -15,7 +15,7 @@
 /*
  * calls:
  *
- *     File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
+ *     File {Close, Read, Write, Seek, Tell, Sync}
  *     {File Name Open, Allocate, Free} File
  *
  * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
@@ -66,6 +66,7 @@ extern void FileClose(File file);
 extern void FileUnlink(File file);
 extern int     FileRead(File file, char *buffer, int amount);
 extern int     FileWrite(File file, char *buffer, int amount);
+extern int     FileSync(File file);
 extern long FileSeek(File file, long offset, int whence);
 extern int     FileTruncate(File file, long offset);
 
index 34f9c6613c709f9122a054dfb37deab756bede29..e06d9a4bf77f43af92bdb216ad5b009c75a868c1 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.10 2003/12/20 17:31:21 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.11 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -37,6 +37,7 @@ typedef enum LWLockId
        ControlFileLock,
        CheckpointLock,
        RelCacheInitLock,
+       BgWriterCommLock,
 
        NumFixedLWLocks,                        /* must be last except for
                                                                 * MaxDynamicLWLock */
index 41367d35e819b92ff1a5b889753e1f193006366a..6a28c3824fad8e6b5f1a90cabb455e8219b8b53f 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.41 2004/02/11 22:55:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.42 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -83,10 +83,10 @@ extern bool mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer);
 extern bool mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer);
 extern BlockNumber mdnblocks(SMgrRelation reln);
 extern BlockNumber mdtruncate(SMgrRelation reln, BlockNumber nblocks);
-extern bool mdcommit(void);
-extern bool mdabort(void);
 extern bool mdsync(void);
 
+extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
+
 /* smgrtype.c */
 extern Datum smgrout(PG_FUNCTION_ARGS);
 extern Datum smgrin(PG_FUNCTION_ARGS);