granicus.if.org Git - postgresql/blobdiff - src/backend/storage/file/fd.c
pgindent run for 8.2.
[postgresql] / src / backend / storage / file / fd.c
index 76d2fcfd22356793b22a60bbbdafcdaf07d300d8..a594b16edf35f12e15e04d7772f190eda54165b0 100644 (file)
@@ -3,11 +3,11 @@
  * fd.c
  *       Virtual file descriptor code.
  *
- * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
+ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.66 2000/11/10 03:53:44 vadim Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.130 2006/10/04 00:29:57 momjian Exp $
  *
  * NOTES:
  *
 
 #include "postgres.h"
 
-#include <sys/types.h>
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
-#include <errno.h>
 #include <unistd.h>
 #include <fcntl.h>
 
 #include "miscadmin.h"
+#include "access/xact.h"
 #include "storage/fd.h"
+#include "storage/ipc.h"
+
 
 /*
- * Problem: Postgres does a system(ld...) to do dynamic loading.
- * This will open several extra files in addition to those used by
- * Postgres.  We need to guarantee that there are file descriptors free
- * for ld to use.
- *
- * The current solution is to limit the number of file descriptors
- * that this code will allocate at one time: it leaves RESERVE_FOR_LD free.
+ * We must leave some file descriptors free for system(), the dynamic loader,
+ * and other code that tries to open files without consulting fd.c.  This
+ * is the number left free.  (While we can be pretty sure we won't get
+ * EMFILE, there's never any guarantee that we won't get ENFILE due to
+ * other processes chewing up FDs.  So it's a bad idea to try to open files
+ * without consulting fd.c.  Nonetheless we cannot control all code.)
  *
- * (Even though most dynamic loaders now use dlopen(3) or the
- * equivalent, the OS must still open several files to perform the
- * dynamic loading.  Keep this here.)
+ * Because this is just a fixed setting, we are effectively assuming that
+ * no such code will leave FDs open over the long term; otherwise the slop
+ * is likely to be insufficient.  Note in particular that we expect that
+ * loading a shared library does not result in any permanent increase in
+ * the number of open files.  (This appears to be true on most if not
+ * all platforms as of Feb 2004.)
  */
-#ifndef RESERVE_FOR_LD
-#define RESERVE_FOR_LD 10
-#endif
+#define NUM_RESERVED_FDS               10
+
+/*
+ * If we have fewer than this many usable FDs after allowing for the reserved
+ * ones, set_max_safe_fds() gives up with a FATAL error.
+ */
+#define FD_MINFREE                             10
+
 
 /*
- * We need to ensure that we have at least some file descriptors
- * available to postgreSQL after we've reserved the ones for LD,
- * so we set that value here.
+ * A number of platforms allow individual processes to open many more files
+ * than they can really support when *many* processes do the same thing.
+ * This GUC parameter lets the DBA limit max_safe_fds to something less than
+ * what the postmaster's initial probe suggests will work.
+ */
+int                    max_files_per_process = 1000;
+
+/*
+ * Maximum number of file descriptors to open for either VFD entries or
+ * AllocateFile/AllocateDir operations.  This is initialized to a conservative
+ * value, and remains that way indefinitely in bootstrap or standalone-backend
+ * cases.  In normal postmaster operation, the postmaster calls
+ * set_max_safe_fds() late in initialization to update the value, and that
+ * value is then inherited by forked subprocesses.
  *
- * I think 10 is an appropriate value so that's what it'll be
- * for now.
+ * Note: the value of max_files_per_process is taken into account while
+ * setting this variable, and so need not be tested separately.
  */
-#ifndef FD_MINFREE
-#define FD_MINFREE 10
-#endif
+static int     max_safe_fds = 32;      /* default if not changed */
+
 
 /* Debugging.... */
 
 
 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
 
-#define FileUnknownPos (-1)
+#define FileUnknownPos (-1L)
+
+/* these are the assigned bits in fdstate below: */
+#define FD_TEMPORARY           (1 << 0)        /* T = delete when closed */
+#define FD_XACT_TEMPORARY      (1 << 1)        /* T = delete at eoXact */
 
 typedef struct vfd
 {
        signed short fd;                        /* current FD, or VFD_CLOSED if none */
        unsigned short fdstate;         /* bitflags for VFD's state */
-
-/* these are the assigned bits in fdstate: */
-#define FD_DIRTY               (1 << 0)/* written to, but not yet fsync'd */
-#define FD_TEMPORARY   (1 << 1)/* should be unlinked when closed */
-
+       SubTransactionId create_subid;          /* for TEMPORARY fds, creating subxact */
        File            nextFree;               /* link to next free VFD, if in freelist */
-       File            lruMoreRecently;/* doubly linked recency-of-use list */
+       File            lruMoreRecently;        /* doubly linked recency-of-use list */
        File            lruLessRecently;
        long            seekPos;                /* current logical file position */
        char       *fileName;           /* name of file, or NULL for unused VFD */
        /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
-       int                     fileFlags;              /* open(2) flags for opening the file */
+       int                     fileFlags;              /* open(2) flags for (re)opening the file */
        int                     fileMode;               /* mode to pass to open(2) */
 } Vfd;
 
@@ -130,19 +148,37 @@ static Size SizeVfdCache = 0;
 static int     nfile = 0;
 
 /*
- * List of stdio FILEs opened with AllocateFile.
+ * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
+ * and AllocateDir.
  *
- * Since we don't want to encourage heavy use of AllocateFile, it seems
- * OK to put a pretty small maximum limit on the number of simultaneously
- * allocated files.
+ * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
+ * it seems OK to put a pretty small maximum limit on the number of
+ * simultaneously allocated descs.
  */
-#define MAX_ALLOCATED_FILES  32
+#define MAX_ALLOCATED_DESCS  32
 
-static int     numAllocatedFiles = 0;
-static FILE *allocatedFiles[MAX_ALLOCATED_FILES];
+typedef enum
+{
+       AllocateDescFile,
+       AllocateDescDir
+} AllocateDescKind;
+
+typedef struct
+{
+       AllocateDescKind kind;
+       union
+       {
+               FILE       *file;
+               DIR                *dir;
+       }                       desc;
+       SubTransactionId create_subid;
+} AllocateDesc;
+
+static int     numAllocatedDescs = 0;
+static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
 
 /*
- * Number of temporary files opened during the current transaction;
+ * Number of temporary files opened during the current session;
  * this is used in generation of tempfile names.
  */
 static long tempFileCounter = 0;
@@ -188,23 +224,216 @@ static File AllocateVfd(void);
 static void FreeVfd(File file);
 
 static int     FileAccess(File file);
-static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
-static char *filepath(char *filename);
-static long pg_nofile(void);
+static char *make_database_relative(const char *filename);
+static void AtProcExit_Files(int code, Datum arg);
+static void CleanupTempFiles(bool isProcExit);
+static void RemovePgTempFilesInDir(const char *tmpdirname);
+
 
-#ifndef XLOG
 /*
- * pg_fsync --- same as fsync except does nothing if -F switch was given
+ * pg_fsync --- do fsync with or without writethrough, as selected by sync_method (always writethrough if HAVE_FSYNC_WRITETHROUGH_ONLY)
  */
 int
 pg_fsync(int fd)
+{
+#ifndef HAVE_FSYNC_WRITETHROUGH_ONLY
+       if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH)
+               return pg_fsync_no_writethrough(fd);
+       else
+#endif
+               return pg_fsync_writethrough(fd);
+}
+
+
+/*
+ * pg_fsync_no_writethrough --- same as fsync except does nothing if
+ *     enableFsync is off
+ */
+int
+pg_fsync_no_writethrough(int fd)
 {
        if (enableFsync)
                return fsync(fd);
        else
                return 0;
 }
+
+/*
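+ * pg_fsync_writethrough --- _commit() on WIN32, else F_FULLFSYNC if defined, else returns -1; does nothing if enableFsync is off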
+ */
+int
+pg_fsync_writethrough(int fd)
+{
+       if (enableFsync)
+       {
+#ifdef WIN32
+               return _commit(fd);
+#elif defined(F_FULLFSYNC)
+               return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
+#else
+               return -1;
 #endif
+       }
+       else
+               return 0;
+}
+
+/*
+ * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
+ *
+ * Not all platforms have fdatasync; treat as fsync if not available.
+ */
+int
+pg_fdatasync(int fd)
+{
+       if (enableFsync)
+       {
+#ifdef HAVE_FDATASYNC
+               return fdatasync(fd);
+#else
+               return fsync(fd);
+#endif
+       }
+       else
+               return 0;
+}
+
+/*
+ * InitFileAccess --- initialize this module during backend startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ */
+void
+InitFileAccess(void)
+{
+       Assert(SizeVfdCache == 0);      /* call me only once */
+
+       /* initialize cache header entry */
+       VfdCache = (Vfd *) malloc(sizeof(Vfd));
+       if (VfdCache == NULL)
+               ereport(FATAL,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
+
+       MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
+       VfdCache->fd = VFD_CLOSED;
+
+       SizeVfdCache = 1;
+
+       /* register proc-exit hook to ensure temp files are dropped at exit */
+       on_proc_exit(AtProcExit_Files, 0);
+}
+
+/*
+ * count_usable_fds --- count how many FDs the system will let us open,
+ *             and estimate how many are already open.
+ *
+ * We stop counting if usable_fds reaches max_to_probe.  Note: a small
+ * value of max_to_probe might result in an underestimate of already_open;
+ * we must fill in any "gaps" in the set of used FDs before the calculation
+ * of already_open will give the right answer. In practice, max_to_probe
+ * of a couple of dozen should be enough to ensure good results.
+ *
+ * We assume stdin (FD 0) is available for dup'ing.
+ */
+static void
+count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
+{
+       int                *fd;
+       int                     size;
+       int                     used = 0;
+       int                     highestfd = 0;
+       int                     j;
+
+       size = 1024;
+       fd = (int *) palloc(size * sizeof(int));
+
+       /* dup until failure or probe limit reached */
+       for (;;)
+       {
+               int                     thisfd;
+
+               thisfd = dup(0);
+               if (thisfd < 0)
+               {
+                       /* Expect EMFILE or ENFILE, else it's fishy */
+                       if (errno != EMFILE && errno != ENFILE)
+                               elog(WARNING, "dup(0) failed after %d successes: %m", used);
+                       break;
+               }
+
+               if (used >= size)
+               {
+                       size *= 2;
+                       fd = (int *) repalloc(fd, size * sizeof(int));
+               }
+               fd[used++] = thisfd;
+
+               if (highestfd < thisfd)
+                       highestfd = thisfd;
+
+               if (used >= max_to_probe)
+                       break;
+       }
+
+       /* release the files we opened */
+       for (j = 0; j < used; j++)
+               close(fd[j]);
+
+       pfree(fd);
+
+       /*
+        * Return results.  usable_fds is just the number of successful dups. We
+        * assume that the system limit is highestfd+1 (remember 0 is a legal FD
+        * number) and so already_open is highestfd+1 - usable_fds.
+        */
+       *usable_fds = used;
+       *already_open = highestfd + 1 - used;
+}
+
+/*
+ * set_max_safe_fds
+ *             Determine number of filedescriptors that fd.c is allowed to use
+ */
+void
+set_max_safe_fds(void)
+{
+       int                     usable_fds;
+       int                     already_open;
+
+       /*----------
+        * We want to set max_safe_fds to
+        *                      MIN(usable_fds, max_files_per_process - already_open)
+        * less the slop factor for files that are opened without consulting
+        * fd.c.  This ensures that we won't exceed either max_files_per_process
+        * or the experimentally-determined EMFILE limit.
+        *----------
+        */
+       count_usable_fds(max_files_per_process,
+                                        &usable_fds, &already_open);
+
+       max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
+
+       /*
+        * Take off the FDs reserved for system() etc.
+        */
+       max_safe_fds -= NUM_RESERVED_FDS;
+
+       /*
+        * Make sure we still have enough to get by.
+        */
+       if (max_safe_fds < FD_MINFREE)
+               ereport(FATAL,
+                               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                errmsg("insufficient file descriptors available to start server process"),
+                                errdetail("System allows %d, we need at least %d.",
+                                                  max_safe_fds + NUM_RESERVED_FDS,
+                                                  FD_MINFREE + NUM_RESERVED_FDS)));
+
+       elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
+                max_safe_fds, usable_fds, already_open);
+}
 
 /*
  * BasicOpenFile --- same as open(2) except can free other FDs if needed
@@ -212,7 +441,7 @@ pg_fsync(int fd)
  * This is exported for use by places that really want a plain kernel FD,
  * but need to be proof against running out of FDs.  Once an FD has been
  * successfully returned, it is the caller's responsibility to ensure that
- * it will not be leaked on elog()!  Most users should *not* call this
+ * it will not be leaked on ereport()! Most users should *not* call this
  * routine directly, but instead use the VFD abstraction level, which
  * provides protection against descriptor leaks as well as management of
  * files that need to be open for more than a short period of time.
@@ -225,7 +454,7 @@ pg_fsync(int fd)
 int
 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
 {
-       int             fd;
+       int                     fd;
 
 tryAgain:
        fd = open(fileName, fileFlags, fileMode);
@@ -235,10 +464,11 @@ tryAgain:
 
        if (errno == EMFILE || errno == ENFILE)
        {
-               int             save_errno = errno;
+               int                     save_errno = errno;
 
-               DO_DB(elog(DEBUG, "BasicOpenFile: not enough descs, retry, er= %d",
-                                  errno));
+               ereport(LOG,
+                               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                errmsg("out of file descriptors: %m; release and retry")));
                errno = 0;
                if (ReleaseLruFile())
                        goto tryAgain;
@@ -248,39 +478,6 @@ tryAgain:
        return -1;                                      /* failure */
 }
 
-/*
- * pg_nofile: determine number of filedescriptors that fd.c is allowed to use
- */
-static long
-pg_nofile(void)
-{
-       static long no_files = 0;
-
-       if (no_files == 0)
-       {
-               /* need do this calculation only once */
-#ifndef HAVE_SYSCONF
-               no_files = (long) NOFILE;
-#else
-               no_files = sysconf(_SC_OPEN_MAX);
-               if (no_files == -1)
-               {
-                       elog(DEBUG, "pg_nofile: Unable to get _SC_OPEN_MAX using sysconf(); using %d", NOFILE);
-                       no_files = (long) NOFILE;
-               }
-#endif
-
-               if ((no_files - RESERVE_FOR_LD) < FD_MINFREE)
-                       elog(FATAL, "pg_nofile: insufficient File Descriptors in postmaster to start backend (%ld).\n"
-                                "                   O/S allows %ld, Postmaster reserves %d, We need %d (MIN) after that.",
-                                no_files - RESERVE_FOR_LD, no_files, RESERVE_FOR_LD, FD_MINFREE);
-
-               no_files -= RESERVE_FOR_LD;
-       }
-
-       return no_files;
-}
-
 #if defined(FDDEBUG)
 
 static void
@@ -290,18 +487,17 @@ _dump_lru(void)
        Vfd                *vfdP = &VfdCache[mru];
        char            buf[2048];
 
-       sprintf(buf, "LRU: MOST %d ", mru);
+       snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
        while (mru != 0)
        {
                mru = vfdP->lruLessRecently;
                vfdP = &VfdCache[mru];
-               sprintf(buf + strlen(buf), "%d ", mru);
+               snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
        }
-       sprintf(buf + strlen(buf), "LEAST");
-       elog(DEBUG, buf);
+       snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
+       elog(LOG, buf);
 }
-
-#endif  /* FDDEBUG */
+#endif   /* FDDEBUG */
 
 static void
 Delete(File file)
@@ -310,7 +506,7 @@ Delete(File file)
 
        Assert(file != 0);
 
-       DO_DB(elog(DEBUG, "Delete %d (%s)",
+       DO_DB(elog(LOG, "Delete %d (%s)",
                           file, VfdCache[file].fileName));
        DO_DB(_dump_lru());
 
@@ -326,11 +522,10 @@ static void
 LruDelete(File file)
 {
        Vfd                *vfdP;
-       int                     returnValue;
 
        Assert(file != 0);
 
-       DO_DB(elog(DEBUG, "LruDelete %d (%s)",
+       DO_DB(elog(LOG, "LruDelete %d (%s)",
                           file, VfdCache[file].fileName));
 
        vfdP = &VfdCache[file];
@@ -340,19 +535,12 @@ LruDelete(File file)
 
        /* save the seek position */
        vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
-       Assert(vfdP->seekPos != -1);
-
-       /* if we have written to the file, sync it before closing */
-       if (vfdP->fdstate & FD_DIRTY)
-       {
-               returnValue = pg_fsync(vfdP->fd);
-               Assert(returnValue != -1);
-               vfdP->fdstate &= ~FD_DIRTY;
-       }
+       Assert(vfdP->seekPos != -1L);
 
        /* close the file */
-       returnValue = close(vfdP->fd);
-       Assert(returnValue != -1);
+       if (close(vfdP->fd))
+               elog(ERROR, "failed to close \"%s\": %m",
+                        vfdP->fileName);
 
        --nfile;
        vfdP->fd = VFD_CLOSED;
@@ -365,7 +553,7 @@ Insert(File file)
 
        Assert(file != 0);
 
-       DO_DB(elog(DEBUG, "Insert %d (%s)",
+       DO_DB(elog(LOG, "Insert %d (%s)",
                           file, VfdCache[file].fileName));
        DO_DB(_dump_lru());
 
@@ -379,50 +567,52 @@ Insert(File file)
        DO_DB(_dump_lru());
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 LruInsert(File file)
 {
        Vfd                *vfdP;
-       int                     returnValue;
 
        Assert(file != 0);
 
-       DO_DB(elog(DEBUG, "LruInsert %d (%s)",
+       DO_DB(elog(LOG, "LruInsert %d (%s)",
                           file, VfdCache[file].fileName));
 
        vfdP = &VfdCache[file];
 
        if (FileIsNotOpen(file))
        {
-               while (nfile + numAllocatedFiles >= pg_nofile())
+               while (nfile + numAllocatedDescs >= max_safe_fds)
                {
-                       if (! ReleaseLruFile())
+                       if (!ReleaseLruFile())
                                break;
                }
 
                /*
-                * The open could still fail for lack of file descriptors, eg due
-                * to overall system file table being full.  So, be prepared to
-                * release another FD if necessary...
+                * The open could still fail for lack of file descriptors, eg due to
+                * overall system file table being full.  So, be prepared to release
+                * another FD if necessary...
                 */
                vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
                                                                 vfdP->fileMode);
                if (vfdP->fd < 0)
                {
-                       DO_DB(elog(DEBUG, "RE_OPEN FAILED: %d", errno));
+                       DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
                        return vfdP->fd;
                }
                else
                {
-                       DO_DB(elog(DEBUG, "RE_OPEN SUCCESS"));
+                       DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
                        ++nfile;
                }
 
                /* seek to the right position */
                if (vfdP->seekPos != 0L)
                {
-                       returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
-                       Assert(returnValue != -1);
+                       long            returnValue;
+
+                       returnValue = (long) lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
+                       Assert(returnValue != -1L);
                }
        }
 
@@ -438,13 +628,13 @@ LruInsert(File file)
 static bool
 ReleaseLruFile(void)
 {
-       DO_DB(elog(DEBUG, "ReleaseLruFile. Opened %d", nfile));
+       DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
 
        if (nfile > 0)
        {
                /*
-                * There are opened files and so there should be at least one used
-                * vfd in the ring.
+                * There are opened files and so there should be at least one used vfd
+                * in the ring.
                 */
                Assert(VfdCache[0].lruMoreRecently != 0);
                LruDelete(VfdCache[0].lruMoreRecently);
@@ -459,45 +649,36 @@ AllocateVfd(void)
        Index           i;
        File            file;
 
-       DO_DB(elog(DEBUG, "AllocateVfd. Size %d", SizeVfdCache));
+       DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
 
-       if (SizeVfdCache == 0)
-       {
-               /* initialize header entry first time through */
-               VfdCache = (Vfd *) malloc(sizeof(Vfd));
-               Assert(VfdCache != NULL);
-               MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
-               VfdCache->fd = VFD_CLOSED;
-
-               SizeVfdCache = 1;
-
-               /*
-                * register proc-exit call to ensure temp files are dropped at
-                * exit
-                */
-               on_proc_exit(AtEOXact_Files, 0);
-       }
+       Assert(SizeVfdCache > 0);       /* InitFileAccess not called? */
 
        if (VfdCache[0].nextFree == 0)
        {
-
                /*
-                * The free list is empty so it is time to increase the size of
-                * the array.  We choose to double it each time this happens.
-                * However, there's not much point in starting *real* small.
+                * The free list is empty so it is time to increase the size of the
+                * array.  We choose to double it each time this happens. However,
+                * there's not much point in starting *real* small.
                 */
                Size            newCacheSize = SizeVfdCache * 2;
+               Vfd                *newVfdCache;
 
                if (newCacheSize < 32)
                        newCacheSize = 32;
 
-               VfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
-               Assert(VfdCache != NULL);
+               /*
+                * Be careful not to clobber VfdCache ptr if realloc fails.
+                */
+               newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
+               if (newVfdCache == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+               VfdCache = newVfdCache;
 
                /*
                 * Initialize the new entries and link them into the free list.
                 */
-
                for (i = SizeVfdCache; i < newCacheSize; i++)
                {
                        MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
@@ -510,7 +691,6 @@ AllocateVfd(void)
                /*
                 * Record the new size
                 */
-
                SizeVfdCache = newCacheSize;
        }
 
@@ -526,7 +706,7 @@ FreeVfd(File file)
 {
        Vfd                *vfdP = &VfdCache[file];
 
-       DO_DB(elog(DEBUG, "FreeVfd: %d (%s)",
+       DO_DB(elog(LOG, "FreeVfd: %d (%s)",
                           file, vfdP->fileName ? vfdP->fileName : ""));
 
        if (vfdP->fileName != NULL)
@@ -534,57 +714,41 @@ FreeVfd(File file)
                free(vfdP->fileName);
                vfdP->fileName = NULL;
        }
+       vfdP->fdstate = 0x0;
 
        vfdP->nextFree = VfdCache[0].nextFree;
        VfdCache[0].nextFree = file;
 }
 
-/* filepath()
- * Convert given pathname to absolute.
+/*
+ * make_database_relative()
+ *             Prepend DatabasePath to the given file name.
  *
- * (Generally, this isn't actually necessary, considering that we
- * should be cd'd into the database directory.  Presently it is only
- * necessary to do it in "bootstrap" mode.     Maybe we should change
- * bootstrap mode to do the cd, and save a few cycles/bytes here.)
+ * Result is a palloc'd string.
  */
 static char *
-filepath(char *filename)
+make_database_relative(const char *filename)
 {
        char       *buf;
-       int                     len;
-
-       /* Not an absolute path name? Then fill in with database path... */
-       if (*filename != SEP_CHAR)
-       {
-               len = strlen(DatabasePath) + strlen(filename) + 2;
-               buf = (char *) palloc(len);
-               sprintf(buf, "%s%c%s", DatabasePath, SEP_CHAR, filename);
-       }
-       else
-       {
-               buf = (char *) palloc(strlen(filename) + 1);
-               strcpy(buf, filename);
-       }
-
-#ifdef FILEDEBUG
-       printf("filepath: path is %s\n", buf);
-#endif
 
+       Assert(!is_absolute_path(filename));
+       buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2);
+       sprintf(buf, "%s/%s", DatabasePath, filename);
        return buf;
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 FileAccess(File file)
 {
        int                     returnValue;
 
-       DO_DB(elog(DEBUG, "FileAccess %d (%s)",
+       DO_DB(elog(LOG, "FileAccess %d (%s)",
                           file, VfdCache[file].fileName));
 
        /*
-        * Is the file open?  If not, open it and put it at the head of the
-        * LRU ring (possibly closing the least recently used file to get an
-        * FD).
+        * Is the file open?  If not, open it and put it at the head of the LRU
+        * ring (possibly closing the least recently used file to get an FD).
         */
 
        if (FileIsNotOpen(file))
@@ -595,11 +759,9 @@ FileAccess(File file)
        }
        else if (VfdCache[0].lruLessRecently != file)
        {
-
                /*
-                * We now know that the file is open and that it is not the last
-                * one accessed, so we need to move it to the head of the Lru
-                * ring.
+                * We now know that the file is open and that it is not the last one
+                * accessed, so we need to move it to the head of the Lru ring.
                 */
 
                Delete(file);
@@ -620,29 +782,40 @@ FileInvalidate(File file)
        if (!FileIsNotOpen(file))
                LruDelete(file);
 }
-
 #endif
 
-static File
-fileNameOpenFile(FileName fileName,
-                                int fileFlags,
-                                int fileMode)
+/*
+ * open a file in an arbitrary directory
+ *
+ * NB: if the passed pathname is relative (which it usually is),
+ * it will be interpreted relative to the process' working directory
+ * (which should always be $PGDATA when this code is running).
+ */
+File
+PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
 {
+       char       *fnamecopy;
        File            file;
        Vfd                *vfdP;
 
-       if (fileName == NULL)
-               elog(ERROR, "fileNameOpenFile: NULL fname");
-
-       DO_DB(elog(DEBUG, "fileNameOpenFile: %s %x %o",
+       DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
                           fileName, fileFlags, fileMode));
 
+       /*
+        * We need a malloc'd copy of the file name; fail cleanly if no room.
+        */
+       fnamecopy = strdup(fileName);
+       if (fnamecopy == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
+
        file = AllocateVfd();
        vfdP = &VfdCache[file];
 
-       while (nfile + numAllocatedFiles >= pg_nofile())
+       while (nfile + numAllocatedDescs >= max_safe_fds)
        {
-               if (! ReleaseLruFile())
+               if (!ReleaseLruFile())
                        break;
        }
 
@@ -651,36 +824,30 @@ fileNameOpenFile(FileName fileName,
        if (vfdP->fd < 0)
        {
                FreeVfd(file);
+               free(fnamecopy);
                return -1;
        }
        ++nfile;
-       DO_DB(elog(DEBUG, "fileNameOpenFile: success %d",
+       DO_DB(elog(LOG, "PathNameOpenFile: success %d",
                           vfdP->fd));
 
        Insert(file);
 
-       vfdP->fileName = malloc(strlen(fileName) + 1);
-       strcpy(vfdP->fileName, fileName);
-
-       vfdP->fileFlags = fileFlags & ~(O_TRUNC | O_EXCL);
+       vfdP->fileName = fnamecopy;
+       /* Saved flags are adjusted to be OK for re-opening file */
+       vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
        vfdP->fileMode = fileMode;
        vfdP->seekPos = 0;
-#ifdef XLOG
-       /*
-        * Have to fsync file on commit. Alternative way - log
-        * file creation and fsync log before actual file creation.
-        */
-       if (fileFlags & O_CREAT)
-               vfdP->fdstate = FD_DIRTY;
-#else
        vfdP->fdstate = 0x0;
-#endif
 
        return file;
 }
 
 /*
- * open a file in the database directory ($PGDATA/base/...)
+ * open a file in the database directory ($PGDATA/base/DIROID/)
+ *
+ * The passed name MUST be a relative path.  Effectively, this
+ * prepends DatabasePath to it and then acts like PathNameOpenFile.
  */
 File
 FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
@@ -688,50 +855,80 @@ FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
        File            fd;
        char       *fname;
 
-       fname = filepath(fileName);
-       fd = fileNameOpenFile(fname, fileFlags, fileMode);
+       fname = make_database_relative(fileName);
+       fd = PathNameOpenFile(fname, fileFlags, fileMode);
        pfree(fname);
        return fd;
 }
 
-/*
- * open a file in an arbitrary directory
- */
-File
-PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
-{
-       return fileNameOpenFile(fileName, fileFlags, fileMode);
-}
-
 /*
  * Open a temporary file that will disappear when we close it.
  *
  * This routine takes care of generating an appropriate tempfile name.
  * There's no need to pass in fileFlags or fileMode either, since only
  * one setting makes any sense for a temp file.
+ *
+ * interXact: if true, don't close the file at end-of-transaction. In
+ * most cases, you don't want temporary files to outlive the transaction
+ * that created them, so this should be false -- but if you need
+ * "somewhat" temporary storage, this might be useful. In either case,
+ * the file is removed when the File is explicitly closed.
  */
 File
-OpenTemporaryFile(void)
+OpenTemporaryFile(bool interXact)
 {
-       char            tempfilename[64];
+       char            tempfilepath[MAXPGPATH];
        File            file;
 
        /*
-        * Generate a tempfile name that's unique within the current
-        * transaction
+        * Generate a tempfile name that should be unique within the current
+        * database instance.
         */
-       snprintf(tempfilename, sizeof(tempfilename),
-                        "pg_sorttemp%d.%ld", MyProcPid, tempFileCounter++);
+       snprintf(tempfilepath, sizeof(tempfilepath),
+                        "%s/%s%d.%ld", PG_TEMP_FILES_DIR, PG_TEMP_FILE_PREFIX,
+                        MyProcPid, tempFileCounter++);
 
-       /* Open the file */
-       file = FileNameOpenFile(tempfilename,
-                                                       O_RDWR | O_CREAT | O_TRUNC | PG_BINARY, 0600);
+       /*
+        * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
+        * temp file that can be reused.
+        */
+       file = FileNameOpenFile(tempfilepath,
+                                                       O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+                                                       0600);
        if (file <= 0)
-               elog(ERROR, "Failed to create temporary file %s", tempfilename);
+       {
+               char       *dirpath;
+
+               /*
+                * We might need to create the temp-files subdirectory
+                * (PG_TEMP_FILES_DIR), if no one has yet done so.
+                *
+                * Don't check for error from mkdir; it could fail if someone else
+                * just did the same thing.  If it doesn't work then we'll bomb out on
+                * the second create attempt, instead.
+                */
+               dirpath = make_database_relative(PG_TEMP_FILES_DIR);
+               mkdir(dirpath, S_IRWXU);
+               pfree(dirpath);
+
+               file = FileNameOpenFile(tempfilepath,
+                                                               O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+                                                               0600);
+               if (file <= 0)
+                       elog(ERROR, "could not create temporary file \"%s\": %m",
+                                tempfilepath);
+       }
 
-       /* Mark it for deletion at close or EOXact */
+       /* Mark it for deletion at close */
        VfdCache[file].fdstate |= FD_TEMPORARY;
 
+       /* Mark it for deletion at EOXact */
+       if (!interXact)
+       {
+               VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
+               VfdCache[file].create_subid = GetCurrentSubTransactionId();
+       }
+
        return file;
 }
 
@@ -741,40 +938,40 @@ OpenTemporaryFile(void)
 void
 FileClose(File file)
 {
-       int                     returnValue;
+       Vfd                *vfdP;
 
        Assert(FileIsValid(file));
 
-       DO_DB(elog(DEBUG, "FileClose: %d (%s)",
+       DO_DB(elog(LOG, "FileClose: %d (%s)",
                           file, VfdCache[file].fileName));
 
+       vfdP = &VfdCache[file];
+
        if (!FileIsNotOpen(file))
        {
-
                /* remove the file from the lru ring */
                Delete(file);
 
-               /* if we did any writes, sync the file before closing */
-               if (VfdCache[file].fdstate & FD_DIRTY)
-               {
-                       returnValue = pg_fsync(VfdCache[file].fd);
-                       Assert(returnValue != -1);
-                       VfdCache[file].fdstate &= ~FD_DIRTY;
-               }
-
                /* close the file */
-               returnValue = close(VfdCache[file].fd);
-               Assert(returnValue != -1);
+               if (close(vfdP->fd))
+                       elog(ERROR, "failed to close \"%s\": %m",
+                                vfdP->fileName);
 
                --nfile;
-               VfdCache[file].fd = VFD_CLOSED;
+               vfdP->fd = VFD_CLOSED;
        }
 
        /*
         * Delete the file if it was temporary
         */
-       if (VfdCache[file].fdstate & FD_TEMPORARY)
-               unlink(VfdCache[file].fileName);
+       if (vfdP->fdstate & FD_TEMPORARY)
+       {
+               /* reset flag so that die() interrupt won't cause problems */
+               vfdP->fdstate &= ~FD_TEMPORARY;
+               if (unlink(vfdP->fileName))
+                       elog(LOG, "failed to unlink \"%s\": %m",
+                                vfdP->fileName);
+       }
 
        /*
         * Return the Vfd slot to the free list
@@ -790,7 +987,7 @@ FileUnlink(File file)
 {
        Assert(FileIsValid(file));
 
-       DO_DB(elog(DEBUG, "FileUnlink: %d (%s)",
+       DO_DB(elog(LOG, "FileUnlink: %d (%s)",
                           file, VfdCache[file].fileName));
 
        /* force FileClose to delete it */
@@ -806,15 +1003,49 @@ FileRead(File file, char *buffer, int amount)
 
        Assert(FileIsValid(file));
 
-       DO_DB(elog(DEBUG, "FileRead: %d (%s) %d %p",
-                          file, VfdCache[file].fileName, amount, buffer));
+       DO_DB(elog(LOG, "FileRead: %d (%s) %ld %d %p",
+                          file, VfdCache[file].fileName,
+                          VfdCache[file].seekPos, amount, buffer));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
+retry:
        returnCode = read(VfdCache[file].fd, buffer, amount);
-       if (returnCode > 0)
+
+       if (returnCode >= 0)
                VfdCache[file].seekPos += returnCode;
        else
+       {
+               /*
+                * Windows may run out of kernel buffers and return "Insufficient
+                * system resources" error.  Wait a bit and retry to solve it.
+                *
+                * It is rumored that EINTR is also possible on some Unix filesystems,
+                * in which case immediate retry is indicated.
+                */
+#ifdef WIN32
+               DWORD           error = GetLastError();
+
+               switch (error)
+               {
+                       case ERROR_NO_SYSTEM_RESOURCES:
+                               pg_usleep(1000L);
+                               errno = EINTR;
+                               break;
+                       default:
+                               _dosmaperr(error);
+                               break;
+               }
+#endif
+               /* OK to retry if interrupted */
+               if (errno == EINTR)
+                       goto retry;
+
+               /* Trouble, so assume we don't know the file position anymore */
                VfdCache[file].seekPos = FileUnknownPos;
+       }
 
        return returnCode;
 }
@@ -826,32 +1057,81 @@ FileWrite(File file, char *buffer, int amount)
 
        Assert(FileIsValid(file));
 
-       DO_DB(elog(DEBUG, "FileWrite: %d (%s) %d %p",
-                          file, VfdCache[file].fileName, amount, buffer));
+       DO_DB(elog(LOG, "FileWrite: %d (%s) %ld %d %p",
+                          file, VfdCache[file].fileName,
+                          VfdCache[file].seekPos, amount, buffer));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
+retry:
+       errno = 0;
        returnCode = write(VfdCache[file].fd, buffer, amount);
-       if (returnCode > 0)
-       {
+
+       /* if write didn't set errno, assume problem is no disk space */
+       if (returnCode != amount && errno == 0)
+               errno = ENOSPC;
+
+       if (returnCode >= 0)
                VfdCache[file].seekPos += returnCode;
-#ifndef XLOG
-               /* mark the file as needing fsync */
-               VfdCache[file].fdstate |= FD_DIRTY;
-#endif
-       }
        else
+       {
+               /*
+                * See comments in FileRead()
+                */
+#ifdef WIN32
+               DWORD           error = GetLastError();
+
+               switch (error)
+               {
+                       case ERROR_NO_SYSTEM_RESOURCES:
+                               pg_usleep(1000L);
+                               errno = EINTR;
+                               break;
+                       default:
+                               _dosmaperr(error);
+                               break;
+               }
+#endif
+               /* OK to retry if interrupted */
+               if (errno == EINTR)
+                       goto retry;
+
+               /* Trouble, so assume we don't know the file position anymore */
                VfdCache[file].seekPos = FileUnknownPos;
+       }
 
        return returnCode;
 }
 
+/*
+ * FileSync --- fsync the kernel file behind a virtual file descriptor.
+ *
+ * FileAccess() is called first; if it reports failure (a negative result),
+ * that result is returned unchanged and no fsync is attempted.  Otherwise
+ * the return value is whatever pg_fsync() returns for the file's fd.
+ */
+int
+FileSync(File file)
+{
+       int                     returnCode;
+
+       Assert(FileIsValid(file));
+
+       DO_DB(elog(LOG, "FileSync: %d (%s)",
+                          file, VfdCache[file].fileName));
+
+       /* presumably this makes sure VfdCache[file].fd is open --- confirm */
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
+       return pg_fsync(VfdCache[file].fd);
+}
+
 long
 FileSeek(File file, long offset, int whence)
 {
+       int                     returnCode;
+
        Assert(FileIsValid(file));
 
-       DO_DB(elog(DEBUG, "FileSeek: %d (%s) %ld %d",
-                          file, VfdCache[file].fileName, offset, whence));
+       DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
+                          file, VfdCache[file].fileName,
+                          VfdCache[file].seekPos, offset, whence));
 
        if (FileIsNotOpen(file))
        {
@@ -859,18 +1139,21 @@ FileSeek(File file, long offset, int whence)
                {
                        case SEEK_SET:
                                if (offset < 0)
-                                       elog(ERROR, "FileSeek: invalid offset: %ld", offset);
+                                       elog(ERROR, "invalid seek offset: %ld", offset);
                                VfdCache[file].seekPos = offset;
                                break;
                        case SEEK_CUR:
                                VfdCache[file].seekPos += offset;
                                break;
                        case SEEK_END:
-                               FileAccess(file);
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               returnCode = FileAccess(file);
+                               if (returnCode < 0)
+                                       return returnCode;
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                break;
                        default:
-                               elog(ERROR, "FileSeek: invalid whence: %d", whence);
+                               elog(ERROR, "invalid whence: %d", whence);
                                break;
                }
        }
@@ -880,19 +1163,22 @@ FileSeek(File file, long offset, int whence)
                {
                        case SEEK_SET:
                                if (offset < 0)
-                                       elog(ERROR, "FileSeek: invalid offset: %ld", offset);
+                                       elog(ERROR, "invalid seek offset: %ld", offset);
                                if (VfdCache[file].seekPos != offset)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                break;
                        case SEEK_CUR:
                                if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                break;
                        case SEEK_END:
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                break;
                        default:
-                               elog(ERROR, "FileSeek: invalid whence: %d", whence);
+                               elog(ERROR, "invalid whence: %d", whence);
                                break;
                }
        }
@@ -907,11 +1193,10 @@ long
 FileTell(File file)
 {
        Assert(FileIsValid(file));
-       DO_DB(elog(DEBUG, "FileTell %d (%s)",
+       DO_DB(elog(LOG, "FileTell %d (%s)",
                           file, VfdCache[file].fileName));
        return VfdCache[file].seekPos;
 }
-
 #endif
 
 int
@@ -921,154 +1206,201 @@ FileTruncate(File file, long offset)
 
        Assert(FileIsValid(file));
 
-       DO_DB(elog(DEBUG, "FileTruncate %d (%s)",
+       DO_DB(elog(LOG, "FileTruncate %d (%s)",
                           file, VfdCache[file].fileName));
 
-       FileSync(file);
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
        returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
        return returnCode;
 }
 
+
 /*
- * FileSync --- if a file is marked as dirty, fsync it.
- *
- * The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to
- * write the file, but that we *have* written it and need to execute an
- * fsync() to ensure the changes are down on disk before we mark the current
- * transaction committed.
- *
- * FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call.
- * It is cleared after successfully fsync'ing the file.  FileClose() will
- * fsync a dirty File that is about to be closed, since there will be no
- * other place to remember the need to fsync after the VFD is gone.
+ * Routines that want to use stdio (ie, FILE*) should use AllocateFile
+ * rather than plain fopen().  This lets fd.c deal with freeing FDs if
+ * necessary to open the file. When done, call FreeFile rather than fclose.
  *
- * Note that the DIRTY bit is logically associated with the actual disk file,
- * not with any particular kernel FD we might have open for it.  We assume
- * that fsync will force out any dirty buffers for that file, whether or not
- * they were written through the FD being used for the fsync call --- they
- * might even have been written by some other backend!
+ * Note that files that will be open for any significant length of time
+ * should NOT be handled this way, since they cannot share kernel file
+ * descriptors with other files; there is grave risk of running out of FDs
+ * if anyone locks down too many FDs.  Most callers of this routine are
+ * simply reading a config file that they will read and close immediately.
  *
- * Note also that LruDelete currently fsyncs a dirty file that it is about
- * to close the kernel file descriptor for.  The idea there is to avoid
- * having to re-open the kernel descriptor later.  But it's not real clear
- * that this is a performance win; we could end up fsyncing the same file
- * multiple times in a transaction, which would probably cost more time
- * than is saved by avoiding an open() call.  This should be studied.
+ * fd.c will automatically close all files opened with AllocateFile at
+ * transaction commit or abort; this prevents FD leakage if a routine
+ * that calls AllocateFile is terminated prematurely by ereport(ERROR).
  *
- * This routine used to think it could skip the fsync if the file is
- * physically closed, but that is now WRONG; see comments for FileMarkDirty.
+ * Ideally this should be the *only* direct call of fopen() in the backend.
  */
-int
-FileSync(File file)
+FILE *
+AllocateFile(const char *name, const char *mode)
 {
-       int                     returnCode;
+       FILE       *file;
 
-       Assert(FileIsValid(file));
+       DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
+                          numAllocatedDescs, name));
+
+       /*
+        * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+        * allocatedDescs[]; the test against max_safe_fds prevents AllocateFile
+        * from hogging every one of the available FDs, which'd lead to infinite
+        * looping.
+        */
+       if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+               numAllocatedDescs >= max_safe_fds - 1)
+               elog(ERROR, "too many private files demanded");
 
-       if (!(VfdCache[file].fdstate & FD_DIRTY))
+TryAgain:
+       if ((file = fopen(name, mode)) != NULL)
        {
-               /* Need not sync if file is not dirty. */
-               returnCode = 0;
+               AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+               desc->kind = AllocateDescFile;
+               desc->desc.file = file;
+               desc->create_subid = GetCurrentSubTransactionId();
+               numAllocatedDescs++;
+               return desc->desc.file;
        }
-       else if (!enableFsync)
+
+       if (errno == EMFILE || errno == ENFILE)
        {
-               /* Don't force the file open if pg_fsync isn't gonna sync it. */
-               returnCode = 0;
-               VfdCache[file].fdstate &= ~FD_DIRTY;
+               int                     save_errno = errno;
+
+               ereport(LOG,
+                               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                errmsg("out of file descriptors: %m; release and retry")));
+               errno = 0;
+               if (ReleaseLruFile())
+                       goto TryAgain;
+               errno = save_errno;
        }
-       else
+
+       /*
+        * TEMPORARY hack to log the Windows error code on fopen failures, in
+        * hopes of diagnosing some hard-to-reproduce problems.
+        */
+#ifdef WIN32
        {
+               int                     save_errno = errno;
 
-               /*
-                * We don't use FileAccess() because we don't want to force the
-                * file to the front of the LRU ring; we aren't expecting to
-                * access it again soon.
-                */
-               if (FileIsNotOpen(file))
-               {
-                       returnCode = LruInsert(file);
-                       if (returnCode != 0)
-                               return returnCode;
-               }
-               returnCode = pg_fsync(VfdCache[file].fd);
-               if (returnCode == 0)
-                       VfdCache[file].fdstate &= ~FD_DIRTY;
+               elog(LOG, "Windows fopen(\"%s\",\"%s\") failed: code %lu, errno %d",
+                        name, mode, GetLastError(), save_errno);
+               errno = save_errno;
        }
+#endif
 
-       return returnCode;
+       return NULL;
 }
 
 /*
- * FileMarkDirty --- mark a file as needing fsync at transaction commit.
+ * Free an AllocateDesc of either type.
  *
- * Since FileWrite marks the file dirty, this routine is not needed in
- * normal use. It is called when the buffer manager detects that some other
- * backend has written out a shared buffer that this backend dirtied (but
- * didn't write) in the current xact.  In that scenario, we need to fsync
- * the file before we can commit.  We cannot assume that the other backend
- * has fsync'd the file yet; we need to do our own fsync to ensure that
- * (a) the disk page is written and (b) this backend's commit is delayed
- * until the write is complete.
+ * The argument *must* point into the allocatedDescs[] array.
+ */
+static int
+FreeDesc(AllocateDesc *desc)
+{
+       int                     result;
+
+       /* Close the underlying object */
+       switch (desc->kind)
+       {
+               case AllocateDescFile:
+                       result = fclose(desc->desc.file);
+                       break;
+               case AllocateDescDir:
+                       result = closedir(desc->desc.dir);
+                       break;
+               default:
+                       elog(ERROR, "AllocateDesc kind not recognized");
+                       result = 0;                     /* keep compiler quiet */
+                       break;
+       }
+
+       /*
+        * Compact storage in the allocatedDescs array: the last entry is copied
+        * into the slot being freed, so entries do not keep their order.  (If
+        * desc is itself the last entry, this is just a self-assignment.)
+        * Callers iterating over the array must re-examine the slot afterwards.
+        */
+       numAllocatedDescs--;
+       *desc = allocatedDescs[numAllocatedDescs];
+
+       /* result is the return value of the fclose()/closedir() call above */
+       return result;
+}
+
+/*
+ * Close a file returned by AllocateFile.
  *
- * Note we are assuming that an fsync issued by this backend will write
- * kernel disk buffers that were dirtied by another backend.  Furthermore,
- * it doesn't matter whether we currently have the file physically open;
- * we must fsync even if we have to re-open the file to do it.
+ * Note we do not check fclose's return value --- it is up to the caller
+ * to handle close errors.
  */
-void
-FileMarkDirty(File file)
+int
+FreeFile(FILE *file)
 {
-       Assert(FileIsValid(file));
+       int                     i;
 
-       DO_DB(elog(DEBUG, "FileMarkDirty: %d (%s)",
-                          file, VfdCache[file].fileName));
+       DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
+
+       /* Remove file from list of allocated files, if it's present */
+       for (i = numAllocatedDescs; --i >= 0;)
+       {
+               AllocateDesc *desc = &allocatedDescs[i];
+
+               if (desc->kind == AllocateDescFile && desc->desc.file == file)
+                       return FreeDesc(desc);
+       }
 
-       VfdCache[file].fdstate |= FD_DIRTY;
+       /* Only get here if someone passes us a file not in allocatedDescs */
+       elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
+
+       return fclose(file);
 }
 
 
 /*
- * Routines that want to use stdio (ie, FILE*) should use AllocateFile
- * rather than plain fopen().  This lets fd.c deal with freeing FDs if
- * necessary to open the file. When done, call FreeFile rather than fclose.
- *
- * Note that files that will be open for any significant length of time
- * should NOT be handled this way, since they cannot share kernel file
- * descriptors with other files; there is grave risk of running out of FDs
- * if anyone locks down too many FDs.  Most callers of this routine are
- * simply reading a config file that they will read and close immediately.
- *
- * fd.c will automatically close all files opened with AllocateFile at
- * transaction commit or abort; this prevents FD leakage if a routine
- * that calls AllocateFile is terminated prematurely by elog(ERROR).
+ * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
+ * rather than plain opendir().  This lets fd.c deal with freeing FDs if
+ * necessary to open the directory, and with closing it after an elog.
+ * When done, call FreeDir rather than closedir.
  *
- * Ideally this should be the *only* direct call of fopen() in the backend.
+ * Ideally this should be the *only* direct call of opendir() in the backend.
  */
-
-FILE *
-AllocateFile(char *name, char *mode)
+DIR *
+AllocateDir(const char *dirname)
 {
-       FILE       *file;
+       DIR                *dir;
 
-       DO_DB(elog(DEBUG, "AllocateFile: Allocated %d", numAllocatedFiles));
+       DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
+                          numAllocatedDescs, dirname));
 
-       if (numAllocatedFiles >= MAX_ALLOCATED_FILES)
-               elog(ERROR, "AllocateFile: too many private FDs demanded");
+       /*
+        * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+        * allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
+        * from hogging every one of the available FDs, which'd lead to infinite
+        * looping.
+        */
+       if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+               numAllocatedDescs >= max_safe_fds - 1)
+               elog(ERROR, "too many private dirs demanded");
 
 TryAgain:
-       if ((file = fopen(name, mode)) != NULL)
+       if ((dir = opendir(dirname)) != NULL)
        {
-               allocatedFiles[numAllocatedFiles++] = file;
-               return file;
+               AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+               desc->kind = AllocateDescDir;
+               desc->desc.dir = dir;
+               desc->create_subid = GetCurrentSubTransactionId();
+               numAllocatedDescs++;
+               return desc->desc.dir;
        }
 
        if (errno == EMFILE || errno == ENFILE)
        {
-               int             save_errno = errno;
+               int                     save_errno = errno;
 
-               DO_DB(elog(DEBUG, "AllocateFile: not enough descs, retry, er= %d",
-                                  errno));
+               ereport(LOG,
+                               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                errmsg("out of file descriptors: %m; release and retry")));
                errno = 0;
                if (ReleaseLruFile())
                        goto TryAgain;
@@ -1078,28 +1410,89 @@ TryAgain:
        return NULL;
 }
 
-void
-FreeFile(FILE *file)
+/*
+ * Read a directory opened with AllocateDir, ereport'ing any error.
+ *
+ * This is easier to use than raw readdir() since it takes care of some
+ * otherwise rather tedious and error-prone manipulation of errno.  Also,
+ * if you are happy with a generic error message for AllocateDir failure,
+ * you can just do
+ *
+ *             dir = AllocateDir(path);
+ *             while ((dirent = ReadDir(dir, path)) != NULL)
+ *                     process dirent;
+ *             FreeDir(dir);
+ *
+ * since a NULL dir parameter is taken as indicating AllocateDir failed.
+ * (Make sure errno hasn't been changed since AllocateDir if you use this
+ * shortcut.)
+ *
+ * The pathname passed to AllocateDir must be passed to this routine too,
+ * but it is only used for error reporting.
+ */
+struct dirent *
+ReadDir(DIR *dir, const char *dirname)
+{
+       struct dirent *dent;
+
+       /* Give a generic message for AllocateDir failure, if caller didn't */
+       if (dir == NULL)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not open directory \"%s\": %m",
+                                               dirname)));
+
+       /*
+        * Clear errno before the call, so that if readdir() returns NULL a
+        * nonzero errno afterwards can be attributed to this call.
+        */
+       errno = 0;
+       if ((dent = readdir(dir)) != NULL)
+               return dent;
+
+#ifdef WIN32
+
+       /*
+        * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
+        * released version
+        */
+       if (GetLastError() == ERROR_NO_MORE_FILES)
+               errno = 0;
+#endif
+
+       /*
+        * readdir() returned NULL: with errno set this is reported as a read
+        * error; with errno still zero we just return NULL to the caller
+        * (end of directory).
+        */
+       if (errno)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not read directory \"%s\": %m",
+                                               dirname)));
+       return NULL;
+}
+
+/*
+ * Close a directory opened with AllocateDir.
+ *
+ * Note we do not check closedir's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+FreeDir(DIR *dir)
 {
        int                     i;
 
-       DO_DB(elog(DEBUG, "FreeFile: Allocated %d", numAllocatedFiles));
+       DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
 
-       /* Remove file from list of allocated files, if it's present */
-       for (i = numAllocatedFiles; --i >= 0;)
+       /* Remove dir from list of allocated dirs, if it's present */
+       for (i = numAllocatedDescs; --i >= 0;)
        {
-               if (allocatedFiles[i] == file)
-               {
-                       allocatedFiles[i] = allocatedFiles[--numAllocatedFiles];
-                       break;
-               }
+               AllocateDesc *desc = &allocatedDescs[i];
+
+               if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
+                       return FreeDesc(desc);
        }
-       if (i < 0)
-               elog(NOTICE, "FreeFile: file was not obtained from AllocateFile");
 
-       fclose(file);
+       /* Only get here if someone passes us a dir not in allocatedDescs */
+       elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
+
+       return closedir(dir);
 }
 
+
 /*
  * closeAllVfds
  *
@@ -1124,21 +1517,88 @@ closeAllVfds(void)
 }
 
 /*
- * AtEOXact_Files
+ * AtEOSubXact_Files
  *
- * This routine is called during transaction commit or abort or backend
- * exit (it doesn't particularly care which).  All still-open temporary-file
- * VFDs are closed, which also causes the underlying files to be deleted.
- * Furthermore, all "allocated" stdio files are closed.
+ * Take care of subtransaction commit/abort.  At abort, we close temp files
+ * and free "allocated" files/dirs that the subtransaction opened.  At
+ * commit, we reassign those items to the parent subtransaction.
+ */
+void
+AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+                                 SubTransactionId parentSubid)
+{
+       Index           i;
+
+       /*
+        * Pass 1: VFDs flagged FD_XACT_TEMPORARY whose create_subid is mySubid.
+        * The scan starts at slot 1; slot 0 is only asserted to be not open.
+        */
+       if (SizeVfdCache > 0)
+       {
+               Assert(FileIsNotOpen(0));               /* Make sure ring not corrupted */
+               for (i = 1; i < SizeVfdCache; i++)
+               {
+                       unsigned short fdstate = VfdCache[i].fdstate;
+
+                       if ((fdstate & FD_XACT_TEMPORARY) &&
+                               VfdCache[i].create_subid == mySubid)
+                       {
+                               /* commit: hand the file to the parent; abort: close it */
+                               if (isCommit)
+                                       VfdCache[i].create_subid = parentSubid;
+                               else if (VfdCache[i].fileName != NULL)
+                                       FileClose(i);
+                       }
+               }
+       }
+
+       /*
+        * Pass 2: entries of allocatedDescs[] whose create_subid is mySubid,
+        * regardless of their kind.
+        */
+       for (i = 0; i < numAllocatedDescs; i++)
+       {
+               if (allocatedDescs[i].create_subid == mySubid)
+               {
+                       if (isCommit)
+                               allocatedDescs[i].create_subid = parentSubid;
+                       else
+                       {
+                               /*
+                                * have to recheck the item after FreeDesc (ugly): FreeDesc
+                                * moves the array's last entry into slot i, so step i back
+                                * and let the loop's i++ revisit the same slot.
+                                * NOTE(review): for i == 0 this relies on Index wrapping
+                                * around --- presumably Index is unsigned; confirm.
+                                */
+                               FreeDesc(&allocatedDescs[i--]);
+                       }
+               }
+       }
+}
+
+/*
+ * AtEOXact_Files
  *
- * This routine is not involved in fsync'ing non-temporary files at xact
- * commit; that is done by FileSync under control of the buffer manager.
- * During a commit, that is done *before* control gets here.  If we still
- * have any needs-fsync bits set when we get here, we assume this is abort
- * and clear them.
+ * This routine is called during transaction commit or abort (it doesn't
+ * particularly care which).  All still-open per-transaction temporary file
+ * VFDs are closed, which also causes the underlying files to be
+ * deleted. Furthermore, all "allocated" stdio files and dirs are closed.
  */
 void
 AtEOXact_Files(void)
+{
+       /*
+        * isProcExit = false: CleanupTempFiles closes only temp files flagged
+        * FD_XACT_TEMPORARY, then frees every entry in allocatedDescs[].
+        */
+       CleanupTempFiles(false);
+}
+
+/*
+ * AtProcExit_Files
+ *
+ * on_proc_exit hook to clean up temp files during backend shutdown.
+ * Here, we want to clean up *all* temp files including interXact ones.
+ */
+static void
+AtProcExit_Files(int code, Datum arg)
+{
+       /* code and arg are not used here; true is CleanupTempFiles' isProcExit */
+       CleanupTempFiles(true);
+}
+
+/*
+ * Close temporary files and delete their underlying files.
+ *
+ * isProcExit: if true, this is being called as the backend process is
+ * exiting. If that's the case, we should remove all temporary files; if
+ * that's not the case, we are being called for transaction commit/abort
+ * and should only remove transaction-local temp files.  In either case,
+ * also clean up "allocated" stdio files and dirs.
+ */
+static void
+CleanupTempFiles(bool isProcExit)
 {
        Index           i;
 
@@ -1147,20 +1607,110 @@ AtEOXact_Files(void)
                Assert(FileIsNotOpen(0));               /* Make sure ring not corrupted */
                for (i = 1; i < SizeVfdCache; i++)
                {
-                       if ((VfdCache[i].fdstate & FD_TEMPORARY) &&
-                               VfdCache[i].fileName != NULL)
-                               FileClose(i);
-                       else
-                               VfdCache[i].fdstate &= ~FD_DIRTY;
+                       unsigned short fdstate = VfdCache[i].fdstate;
+
+                       if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
+                       {
+                               /*
+                                * If we're in the process of exiting a backend process, close
+                                * all temporary files. Otherwise, only close temporary files
+                                * local to the current transaction.
+                                */
+                               if (isProcExit || (fdstate & FD_XACT_TEMPORARY))
+                                       FileClose(i);
+                       }
                }
        }
 
-       while (numAllocatedFiles > 0)
-               FreeFile(allocatedFiles[0]);
+       while (numAllocatedDescs > 0)
+               FreeDesc(&allocatedDescs[0]);
+}
+
+
+/*
+ * Remove temporary files left over from a prior postmaster session
+ *
+ * This should be called during postmaster startup.  It will forcibly
+ * remove any leftover files created by OpenTemporaryFile.
+ *
+ * NOTE: we could, but don't, call this during a post-backend-crash restart
+ * cycle.  The argument for not doing it is that someone might want to examine
+ * the temp files for debugging purposes.  This does however mean that
+ * OpenTemporaryFile had better allow for collision with an existing temp
+ * file name.
+ */
+void
+RemovePgTempFiles(void)
+{
+       char            temp_path[MAXPGPATH];
+       DIR                *db_dir;
+       struct dirent *db_de;
+
+       /*
+        * Remove old temp files from every database's pgsql_tmp directory.
+        * NOTE(review): no NULL check on db_dir; presumably ReadDir copes -- confirm.
+        */
+       db_dir = AllocateDir("base");
+
+       while ((db_de = ReadDir(db_dir, "base")) != NULL)
+       {
+               if (strcmp(db_de->d_name, ".") == 0 ||
+                       strcmp(db_de->d_name, "..") == 0)
+                       continue;
+
+               snprintf(temp_path, sizeof(temp_path), "base/%s/%s",
+                                db_de->d_name, PG_TEMP_FILES_DIR);
+               RemovePgTempFilesInDir(temp_path);      /* silent if no such directory */
+       }
+
+       FreeDir(db_dir);
 
        /*
-        * Reset the tempfile name counter to 0; not really necessary, but
-        * helps keep the names from growing unreasonably long.
+        * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
+        * DataDir as well.
         */
-       tempFileCounter = 0;
+#ifdef EXEC_BACKEND
+       RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
+#endif
+}
+
+/* Helper for RemovePgTempFiles: unlink temp files in one dir, log strays */
+static void
+RemovePgTempFilesInDir(const char *tmpdirname)
+{
+       DIR                *temp_dir;
+       struct dirent *temp_de;
+       char            rm_path[MAXPGPATH];
+
+       temp_dir = AllocateDir(tmpdirname);
+       if (temp_dir == NULL)
+       {
+               /* ENOENT is silently tolerated; any other errno is logged */
+               if (errno != ENOENT)
+                       elog(LOG,
+                                "could not open temporary-files directory \"%s\": %m",
+                                tmpdirname);
+               return;                 /* either way, there is nothing to scan */
+       }
+
+       while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
+       {
+               if (strcmp(temp_de->d_name, ".") == 0 ||
+                       strcmp(temp_de->d_name, "..") == 0)
+                       continue;
+
+               snprintf(rm_path, sizeof(rm_path), "%s/%s",
+                                tmpdirname, temp_de->d_name);  /* path of this entry */
+
+               if (strncmp(temp_de->d_name,
+                                       PG_TEMP_FILE_PREFIX,
+                                       strlen(PG_TEMP_FILE_PREFIX)) == 0)
+                       unlink(rm_path);        /* note we ignore any error */
+               else                    /* name lacks the prefix: log it, do not delete */
+                       elog(LOG,
+                                "unexpected file found in temporary-files directory: \"%s\"",
+                                rm_path);
+       }
+
+       FreeDir(temp_dir);
 }