]> granicus.if.org Git - postgresql/blobdiff - src/backend/storage/file/fd.c
pgindent run for 8.2.
[postgresql] / src / backend / storage / file / fd.c
index 271a752a623fc2a9a5263702979a1e2492c0ac22..a594b16edf35f12e15e04d7772f190eda54165b0 100644 (file)
@@ -3,11 +3,11 @@
  * fd.c
  *       Virtual file descriptor code.
  *
- * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.100 2003/08/04 00:43:23 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.130 2006/10/04 00:29:57 momjian Exp $
  *
  * NOTES:
  *
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
-#include <dirent.h>
-#include <errno.h>
 #include <unistd.h>
 #include <fcntl.h>
 
 #include "miscadmin.h"
+#include "access/xact.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 
 
-/* Filename components for OpenTemporaryFile */
-#define PG_TEMP_FILES_DIR "pgsql_tmp"
-#define PG_TEMP_FILE_PREFIX "pgsql_tmp"
-
-
 /*
- * Problem: Postgres does a system(ld...) to do dynamic loading.
- * This will open several extra files in addition to those used by
- * Postgres.  We need to guarantee that there are file descriptors free
- * for ld to use.
- *
- * The current solution is to limit the number of file descriptors
- * that this code will allocate at one time: it leaves RESERVE_FOR_LD free.
+ * We must leave some file descriptors free for system(), the dynamic loader,
+ * and other code that tries to open files without consulting fd.c.  This
+ * is the number left free.  (While we can be pretty sure we won't get
+ * EMFILE, there's never any guarantee that we won't get ENFILE due to
+ * other processes chewing up FDs.     So it's a bad idea to try to open files
+ * without consulting fd.c.  Nonetheless we cannot control all code.)
  *
- * (Even though most dynamic loaders now use dlopen(3) or the
- * equivalent, the OS must still open several files to perform the
- * dynamic loading.  And stdin/stdout/stderr count too.  Keep this here.)
+ * Because this is just a fixed setting, we are effectively assuming that
+ * no such code will leave FDs open over the long term; otherwise the slop
+ * is likely to be insufficient.  Note in particular that we expect that
+ * loading a shared library does not result in any permanent increase in
+ * the number of open files.  (This appears to be true on most if not
+ * all platforms as of Feb 2004.)
  */
-#ifndef RESERVE_FOR_LD
-#define RESERVE_FOR_LD 10
-#endif
+#define NUM_RESERVED_FDS               10
 
 /*
- * We need to ensure that we have at least some file descriptors
- * available to postgreSQL after we've reserved the ones for LD,
- * so we set that value here.
- *
- * I think 10 is an appropriate value so that's what it'll be
- * for now.
+ * If we have fewer than this many usable FDs after allowing for the reserved
+ * ones, choke.
  */
-#ifndef FD_MINFREE
-#define FD_MINFREE 10
-#endif
+#define FD_MINFREE                             10
+
 
 /*
- * A number of platforms return values for sysconf(_SC_OPEN_MAX) that are
- * far beyond what they can really support.  This GUC parameter limits what
- * we will believe.
+ * A number of platforms allow individual processes to open many more files
+ * than they can really support when *many* processes do the same thing.
+ * This GUC parameter lets the DBA limit max_safe_fds to something less than
+ * what the postmaster's initial probe suggests will work.
  */
 int                    max_files_per_process = 1000;
 
+/*
+ * Maximum number of file descriptors to open for either VFD entries or
+ * AllocateFile/AllocateDir operations.  This is initialized to a conservative
+ * value, and remains that way indefinitely in bootstrap or standalone-backend
+ * cases.  In normal postmaster operation, the postmaster calls
+ * set_max_safe_fds() late in initialization to update the value, and that
+ * value is then inherited by forked subprocesses.
+ *
+ * Note: the value of max_files_per_process is taken into account while
+ * setting this variable, and so need not be tested separately.
+ */
+static int     max_safe_fds = 32;      /* default if not changed */
+
 
 /* Debugging.... */
 
@@ -120,6 +123,7 @@ typedef struct vfd
 {
        signed short fd;                        /* current FD, or VFD_CLOSED if none */
        unsigned short fdstate;         /* bitflags for VFD's state */
+       SubTransactionId create_subid;          /* for TEMPORARY fds, creating subxact */
        File            nextFree;               /* link to next free VFD, if in freelist */
        File            lruMoreRecently;        /* doubly linked recency-of-use list */
        File            lruLessRecently;
@@ -144,16 +148,34 @@ static Size SizeVfdCache = 0;
 static int     nfile = 0;
 
 /*
- * List of stdio FILEs opened with AllocateFile.
+ * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
+ * and AllocateDir.
  *
- * Since we don't want to encourage heavy use of AllocateFile, it seems
- * OK to put a pretty small maximum limit on the number of simultaneously
- * allocated files.
+ * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
+ * it seems OK to put a pretty small maximum limit on the number of
+ * simultaneously allocated descs.
  */
-#define MAX_ALLOCATED_FILES  32
+#define MAX_ALLOCATED_DESCS  32
 
-static int     numAllocatedFiles = 0;
-static FILE *allocatedFiles[MAX_ALLOCATED_FILES];
+typedef enum
+{
+       AllocateDescFile,               /* entry holds a stdio FILE (desc.file) */
+       AllocateDescDir                 /* entry holds a <dirent.h> DIR (desc.dir) */
+} AllocateDescKind;
+
+typedef struct
+{
+       AllocateDescKind kind;  /* says which member of desc is in use */
+       union
+       {
+               FILE       *file;
+               DIR                *dir;
+       }                       desc;
+       SubTransactionId create_subid;  /* GetCurrentSubTransactionId() at creation */
+} AllocateDesc;
+
+static int     numAllocatedDescs = 0;
+static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
 
 /*
  * Number of temporary files opened during the current session;
@@ -202,18 +224,33 @@ static File AllocateVfd(void);
 static void FreeVfd(File file);
 
 static int     FileAccess(File file);
-static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
-static char *filepath(const char *filename);
-static long pg_nofile(void);
-static void AtProcExit_Files(void);
+static char *make_database_relative(const char *filename);
+static void AtProcExit_Files(int code, Datum arg);
 static void CleanupTempFiles(bool isProcExit);
+static void RemovePgTempFilesInDir(const char *tmpdirname);
 
 
 /*
- * pg_fsync --- same as fsync except does nothing if enableFsync is off
+ * pg_fsync --- do fsync with or without writethrough
  */
 int
 pg_fsync(int fd)
+{
+#ifndef HAVE_FSYNC_WRITETHROUGH_ONLY
+       if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH)
+               return pg_fsync_no_writethrough(fd);
+       else                                    /* body is the return after the #endif */
+#endif   /* !HAVE_FSYNC_WRITETHROUGH_ONLY */
+               return pg_fsync_writethrough(fd);       /* the only path when HAVE_FSYNC_WRITETHROUGH_ONLY is defined */
+}
+
+
+/*
+ * pg_fsync_no_writethrough --- same as fsync except does nothing if
+ *     enableFsync is off
+ */
+int
+pg_fsync_no_writethrough(int fd)
 {
        if (enableFsync)
                return fsync(fd);
@@ -221,6 +258,26 @@ pg_fsync(int fd)
                return 0;
 }
 
+/*
+ * pg_fsync_writethrough --- fsync using the platform's write-through call
+ *
+ * On WIN32 this calls _commit(); where F_FULLFSYNC is defined it issues
+ * fcntl(fd, F_FULLFSYNC).  On any other platform there is nothing to call,
+ * so the request fails with errno = ENOSYS rather than leaving errno at
+ * whatever an earlier, unrelated call happened to set.
+ *
+ * Returns 0 on success (or immediately if enableFsync is off), -1 with
+ * errno set on failure.
+ */
+int
+pg_fsync_writethrough(int fd)
+{
+       if (enableFsync)
+       {
+#ifdef WIN32
+               return _commit(fd);
+#elif defined(F_FULLFSYNC)
+               return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
+#else
+               /* no write-through primitive here; give callers a usable errno */
+               errno = ENOSYS;
+               return -1;
+#endif
+       }
+       else
+               return 0;
+}
+
 /*
  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
  *
@@ -241,6 +298,143 @@ pg_fdatasync(int fd)
                return 0;
 }
 
+/*
+ * InitFileAccess --- initialize this module during backend startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ *
+ * Allocates the one-element VfdCache array with malloc(), zeroes the
+ * header entry, marks it VFD_CLOSED, and sets SizeVfdCache to 1.
+ * AllocateVfd() asserts SizeVfdCache > 0, so this must run before any
+ * VFD is allocated.  Reports FATAL if the allocation fails.
+ */
+void
+InitFileAccess(void)
+{
+       Assert(SizeVfdCache == 0);      /* call me only once */
+
+       /* initialize cache header entry */
+       VfdCache = (Vfd *) malloc(sizeof(Vfd));
+       if (VfdCache == NULL)
+               ereport(FATAL,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
+
+       MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
+       VfdCache->fd = VFD_CLOSED;
+
+       SizeVfdCache = 1;
+
+       /* register proc-exit hook to ensure temp files are dropped at exit */
+       on_proc_exit(AtProcExit_Files, 0);
+}
+
+/*
+ * count_usable_fds --- count how many FDs the system will let us open,
+ *             and estimate how many are already open.
+ *
+ * We stop counting if usable_fds reaches max_to_probe.  Note: a small
+ * value of max_to_probe might result in an underestimate of already_open;
+ * we must fill in any "gaps" in the set of used FDs before the calculation
+ * of already_open will give the right answer. In practice, max_to_probe
+ * of a couple of dozen should be enough to ensure good results.
+ *
+ * We assume stdin (FD 0) is available for dup'ing.
+ *
+ * max_to_probe: upper bound on the number of successful dup() calls.
+ * *usable_fds: set to the number of dup() calls that succeeded.
+ * *already_open: set to highestfd + 1 - *usable_fds.
+ *
+ * Every descriptor obtained here is closed again before returning.
+ */
+static void
+count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
+{
+       int                *fd;
+       int                     size;
+       int                     used = 0;
+       int                     highestfd = 0;
+       int                     j;
+
+       size = 1024;            /* initial capacity of fd[]; doubled when full */
+       fd = (int *) palloc(size * sizeof(int));
+
+       /* dup until failure or probe limit reached */
+       for (;;)
+       {
+               int                     thisfd;
+
+               thisfd = dup(0);
+               if (thisfd < 0)
+               {
+                       /* Expect EMFILE or ENFILE, else it's fishy */
+                       if (errno != EMFILE && errno != ENFILE)
+                               elog(WARNING, "dup(0) failed after %d successes: %m", used);
+                       break;
+               }
+
+               if (used >= size)
+               {
+                       size *= 2;
+                       fd = (int *) repalloc(fd, size * sizeof(int));
+               }
+               fd[used++] = thisfd;
+
+               if (highestfd < thisfd)
+                       highestfd = thisfd;
+
+               if (used >= max_to_probe)
+                       break;
+       }
+
+       /* release the files we opened */
+       for (j = 0; j < used; j++)
+               close(fd[j]);
+
+       pfree(fd);
+
+       /*
+        * Return results.  usable_fds is just the number of successful dups. We
+        * assume that the system limit is highestfd+1 (remember 0 is a legal FD
+        * number) and so already_open is highestfd+1 - usable_fds.
+        */
+       *usable_fds = used;
+       *already_open = highestfd + 1 - used;
+}
+
+/*
+ * set_max_safe_fds
+ *             Determine number of filedescriptors that fd.c is allowed to use
+ *
+ * Probes with count_usable_fds(), stores the result in max_safe_fds, and
+ * reports FATAL if fewer than FD_MINFREE descriptors remain once
+ * NUM_RESERVED_FDS have been set aside.
+ */
+void
+set_max_safe_fds(void)
+{
+       int                     usable_fds;
+       int                     already_open;
+
+       /*----------
+        * We want to set max_safe_fds to
+        *                      MIN(usable_fds, max_files_per_process - already_open)
+        * less the slop factor for files that are opened without consulting
+        * fd.c.  This ensures that we won't exceed either max_files_per_process
+        * or the experimentally-determined EMFILE limit.
+        *----------
+        */
+       count_usable_fds(max_files_per_process,
+                                        &usable_fds, &already_open);
+
+       max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
+
+       /*
+        * Take off the FDs reserved for system() etc.
+        */
+       max_safe_fds -= NUM_RESERVED_FDS;
+
+       /*
+        * Make sure we still have enough to get by.
+        */
+       if (max_safe_fds < FD_MINFREE)
+               ereport(FATAL,
+                               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                errmsg("insufficient file descriptors available to start server process"),
+                                errdetail("System allows %d, we need at least %d.",
+                                                  max_safe_fds + NUM_RESERVED_FDS,     /* value before the reserve was subtracted */
+                                                  FD_MINFREE + NUM_RESERVED_FDS)));
+
+       elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
+                max_safe_fds, usable_fds, already_open);
+}
+
 /*
  * BasicOpenFile --- same as open(2) except can free other FDs if needed
  *
@@ -274,7 +468,7 @@ tryAgain:
 
                ereport(LOG,
                                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
-                         errmsg("out of file descriptors: %m; release and retry")));
+                                errmsg("out of file descriptors: %m; release and retry")));
                errno = 0;
                if (ReleaseLruFile())
                        goto tryAgain;
@@ -284,63 +478,6 @@ tryAgain:
        return -1;                                      /* failure */
 }
 
-/*
- * pg_nofile: determine number of filedescriptors that fd.c is allowed to use
- */
-static long
-pg_nofile(void)
-{
-       static long no_files = 0;
-
-       /* need do this calculation only once */
-       if (no_files == 0)
-       {
-               /*
-                * Ask the system what its files-per-process limit is.
-                */
-#ifdef HAVE_SYSCONF
-               no_files = sysconf(_SC_OPEN_MAX);
-               if (no_files <= 0)
-               {
-#ifdef NOFILE
-                       no_files = (long) NOFILE;
-#else
-                       no_files = (long) max_files_per_process;
-#endif
-                       elog(LOG, "sysconf(_SC_OPEN_MAX) failed; using %ld",
-                                no_files);
-               }
-#else                                                  /* !HAVE_SYSCONF */
-#ifdef NOFILE
-               no_files = (long) NOFILE;
-#else
-               no_files = (long) max_files_per_process;
-#endif
-#endif   /* HAVE_SYSCONF */
-
-               /*
-                * Some platforms return hopelessly optimistic values.  Apply a
-                * configurable upper limit.
-                */
-               if (no_files > (long) max_files_per_process)
-                       no_files = (long) max_files_per_process;
-
-               /*
-                * Make sure we have enough to get by after reserving some for LD.
-                */
-               if ((no_files - RESERVE_FOR_LD) < FD_MINFREE)
-                       ereport(FATAL,
-                                       (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
-                                        errmsg("insufficient file descriptors available to start backend"),
-                                        errdetail("System allows %ld, we need at least %d.",
-                                                          no_files, RESERVE_FOR_LD + FD_MINFREE)));
-
-               no_files -= RESERVE_FOR_LD;
-       }
-
-       return no_files;
-}
-
 #if defined(FDDEBUG)
 
 static void
@@ -402,7 +539,7 @@ LruDelete(File file)
 
        /* close the file */
        if (close(vfdP->fd))
-               elog(LOG, "failed to close \"%s\": %m",
+               elog(ERROR, "failed to close \"%s\": %m",
                         vfdP->fileName);
 
        --nfile;
@@ -430,6 +567,7 @@ Insert(File file)
        DO_DB(_dump_lru());
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 LruInsert(File file)
 {
@@ -444,16 +582,16 @@ LruInsert(File file)
 
        if (FileIsNotOpen(file))
        {
-               while (nfile + numAllocatedFiles >= pg_nofile())
+               while (nfile + numAllocatedDescs >= max_safe_fds)
                {
                        if (!ReleaseLruFile())
                                break;
                }
 
                /*
-                * The open could still fail for lack of file descriptors, eg due
-                * to overall system file table being full.  So, be prepared to
-                * release another FD if necessary...
+                * The open could still fail for lack of file descriptors, eg due to
+                * overall system file table being full.  So, be prepared to release
+                * another FD if necessary...
                 */
                vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
                                                                 vfdP->fileMode);
@@ -495,8 +633,8 @@ ReleaseLruFile(void)
        if (nfile > 0)
        {
                /*
-                * There are opened files and so there should be at least one used
-                * vfd in the ring.
+                * There are opened files and so there should be at least one used vfd
+                * in the ring.
                 */
                Assert(VfdCache[0].lruMoreRecently != 0);
                LruDelete(VfdCache[0].lruMoreRecently);
@@ -511,34 +649,16 @@ AllocateVfd(void)
        Index           i;
        File            file;
 
-       DO_DB(elog(LOG, "AllocateVfd. Size %d", SizeVfdCache));
+       DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
 
-       if (SizeVfdCache == 0)
-       {
-               /* initialize header entry first time through */
-               VfdCache = (Vfd *) malloc(sizeof(Vfd));
-               if (VfdCache == NULL)
-                       ereport(FATAL,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of memory")));
-               MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
-               VfdCache->fd = VFD_CLOSED;
-
-               SizeVfdCache = 1;
-
-               /*
-                * register proc-exit call to ensure temp files are dropped at
-                * exit
-                */
-               on_proc_exit(AtProcExit_Files, 0);
-       }
+       Assert(SizeVfdCache > 0);       /* InitFileAccess not called? */
 
        if (VfdCache[0].nextFree == 0)
        {
                /*
-                * The free list is empty so it is time to increase the size of
-                * the array.  We choose to double it each time this happens.
-                * However, there's not much point in starting *real* small.
+                * The free list is empty so it is time to increase the size of the
+                * array.  We choose to double it each time this happens. However,
+                * there's not much point in starting *real* small.
                 */
                Size            newCacheSize = SizeVfdCache * 2;
                Vfd                *newVfdCache;
@@ -600,37 +720,24 @@ FreeVfd(File file)
        VfdCache[0].nextFree = file;
 }
 
-/* filepath()
- * Convert given pathname to absolute.
+/*
+ * make_database_relative()
+ *             Prepend DatabasePath to the given file name.
  *
  * Result is a palloc'd string.
- *
- * (Generally, this isn't actually necessary, considering that we
- * should be cd'd into the database directory.  Presently it is only
- * necessary to do it in "bootstrap" mode.     Maybe we should change
- * bootstrap mode to do the cd, and save a few cycles/bytes here.)
  */
 static char *
-filepath(const char *filename)
+make_database_relative(const char *filename)
 {
        char       *buf;
 
-       /* Not an absolute path name? Then fill in with database path... */
-       if (!is_absolute_path(filename))
-       {
-               buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2);
-               sprintf(buf, "%s/%s", DatabasePath, filename);
-       }
-       else
-               buf = pstrdup(filename);
-
-#ifdef FILEDEBUG
-       printf("filepath: path is %s\n", buf);
-#endif
-
+       Assert(!is_absolute_path(filename));
+       buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2);
+       sprintf(buf, "%s/%s", DatabasePath, filename);
        return buf;
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 FileAccess(File file)
 {
@@ -640,9 +747,8 @@ FileAccess(File file)
                           file, VfdCache[file].fileName));
 
        /*
-        * Is the file open?  If not, open it and put it at the head of the
-        * LRU ring (possibly closing the least recently used file to get an
-        * FD).
+        * Is the file open?  If not, open it and put it at the head of the LRU
+        * ring (possibly closing the least recently used file to get an FD).
         */
 
        if (FileIsNotOpen(file))
@@ -654,9 +760,8 @@ FileAccess(File file)
        else if (VfdCache[0].lruLessRecently != file)
        {
                /*
-                * We now know that the file is open and that it is not the last
-                * one accessed, so we need to move it to the head of the Lru
-                * ring.
+                * We now know that the file is open and that it is not the last one
+                * accessed, so we need to move it to the head of the Lru ring.
                 */
 
                Delete(file);
@@ -679,16 +784,21 @@ FileInvalidate(File file)
 }
 #endif
 
-static File
-fileNameOpenFile(FileName fileName,
-                                int fileFlags,
-                                int fileMode)
+/*
+ * open a file in an arbitrary directory
+ *
+ * NB: if the passed pathname is relative (which it usually is),
+ * it will be interpreted relative to the process' working directory
+ * (which should always be $PGDATA when this code is running).
+ */
+File
+PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
 {
        char       *fnamecopy;
        File            file;
        Vfd                *vfdP;
 
-       DO_DB(elog(LOG, "fileNameOpenFile: %s %x %o",
+       DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
                           fileName, fileFlags, fileMode));
 
        /*
@@ -703,7 +813,7 @@ fileNameOpenFile(FileName fileName,
        file = AllocateVfd();
        vfdP = &VfdCache[file];
 
-       while (nfile + numAllocatedFiles >= pg_nofile())
+       while (nfile + numAllocatedDescs >= max_safe_fds)
        {
                if (!ReleaseLruFile())
                        break;
@@ -718,7 +828,7 @@ fileNameOpenFile(FileName fileName,
                return -1;
        }
        ++nfile;
-       DO_DB(elog(LOG, "fileNameOpenFile: success %d",
+       DO_DB(elog(LOG, "PathNameOpenFile: success %d",
                           vfdP->fd));
 
        Insert(file);
@@ -734,7 +844,10 @@ fileNameOpenFile(FileName fileName,
 }
 
 /*
- * open a file in the database directory ($PGDATA/base/...)
+ * open a file in the database directory ($PGDATA/base/DIROID/)
+ *
+ * The passed name MUST be a relative path.  Effectively, this
+ * prepends DatabasePath to it and then acts like PathNameOpenFile.
  */
 File
 FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
@@ -742,21 +855,12 @@ FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
        File            fd;
        char       *fname;
 
-       fname = filepath(fileName);
-       fd = fileNameOpenFile(fname, fileFlags, fileMode);
+       fname = make_database_relative(fileName);
+       fd = PathNameOpenFile(fname, fileFlags, fileMode);
        pfree(fname);
        return fd;
 }
 
-/*
- * open a file in an arbitrary directory
- */
-File
-PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
-{
-       return fileNameOpenFile(fileName, fileFlags, fileMode);
-}
-
 /*
  * Open a temporary file that will disappear when we close it.
  *
@@ -785,8 +889,8 @@ OpenTemporaryFile(bool interXact)
                         MyProcPid, tempFileCounter++);
 
        /*
-        * Open the file.  Note: we don't use O_EXCL, in case there is an
-        * orphaned temp file that can be reused.
+        * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
+        * temp file that can be reused.
         */
        file = FileNameOpenFile(tempfilepath,
                                                        O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
@@ -796,14 +900,14 @@ OpenTemporaryFile(bool interXact)
                char       *dirpath;
 
                /*
-                * We might need to create the pg_tempfiles subdirectory, if no
-                * one has yet done so.
+                * We might need to create the pg_tempfiles subdirectory, if no one
+                * has yet done so.
                 *
                 * Don't check for error from mkdir; it could fail if someone else
-                * just did the same thing.  If it doesn't work then we'll bomb
-                * out on the second create attempt, instead.
+                * just did the same thing.  If it doesn't work then we'll bomb out on
+                * the second create attempt, instead.
                 */
-               dirpath = filepath(PG_TEMP_FILES_DIR);
+               dirpath = make_database_relative(PG_TEMP_FILES_DIR);
                mkdir(dirpath, S_IRWXU);
                pfree(dirpath);
 
@@ -820,7 +924,10 @@ OpenTemporaryFile(bool interXact)
 
        /* Mark it for deletion at EOXact */
        if (!interXact)
+       {
                VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
+               VfdCache[file].create_subid = GetCurrentSubTransactionId();
+       }
 
        return file;
 }
@@ -847,7 +954,7 @@ FileClose(File file)
 
                /* close the file */
                if (close(vfdP->fd))
-                       elog(LOG, "failed to close \"%s\": %m",
+                       elog(ERROR, "failed to close \"%s\": %m",
                                 vfdP->fileName);
 
                --nfile;
@@ -900,12 +1007,45 @@ FileRead(File file, char *buffer, int amount)
                           file, VfdCache[file].fileName,
                           VfdCache[file].seekPos, amount, buffer));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
+retry:
        returnCode = read(VfdCache[file].fd, buffer, amount);
-       if (returnCode > 0)
+
+       if (returnCode >= 0)
                VfdCache[file].seekPos += returnCode;
        else
+       {
+               /*
+                * Windows may run out of kernel buffers and return "Insufficient
+                * system resources" error.  Wait a bit and retry to solve it.
+                *
+                * It is rumored that EINTR is also possible on some Unix filesystems,
+                * in which case immediate retry is indicated.
+                */
+#ifdef WIN32
+               DWORD           error = GetLastError();
+
+               switch (error)
+               {
+                       case ERROR_NO_SYSTEM_RESOURCES:
+                               pg_usleep(1000L);
+                               errno = EINTR;
+                               break;
+                       default:
+                               _dosmaperr(error);
+                               break;
+               }
+#endif
+               /* OK to retry if interrupted */
+               if (errno == EINTR)
+                       goto retry;
+
+               /* Trouble, so assume we don't know the file position anymore */
                VfdCache[file].seekPos = FileUnknownPos;
+       }
 
        return returnCode;
 }
@@ -921,8 +1061,11 @@ FileWrite(File file, char *buffer, int amount)
                           file, VfdCache[file].fileName,
                           VfdCache[file].seekPos, amount, buffer));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
 
+retry:
        errno = 0;
        returnCode = write(VfdCache[file].fd, buffer, amount);
 
@@ -930,17 +1073,60 @@ FileWrite(File file, char *buffer, int amount)
        if (returnCode != amount && errno == 0)
                errno = ENOSPC;
 
-       if (returnCode > 0)
+       if (returnCode >= 0)
                VfdCache[file].seekPos += returnCode;
        else
+       {
+               /*
+                * See comments in FileRead()
+                */
+#ifdef WIN32
+               DWORD           error = GetLastError();
+
+               switch (error)
+               {
+                       case ERROR_NO_SYSTEM_RESOURCES:
+                               pg_usleep(1000L);
+                               errno = EINTR;
+                               break;
+                       default:
+                               _dosmaperr(error);
+                               break;
+               }
+#endif
+               /* OK to retry if interrupted */
+               if (errno == EINTR)
+                       goto retry;
+
+               /* Trouble, so assume we don't know the file position anymore */
                VfdCache[file].seekPos = FileUnknownPos;
+       }
 
        return returnCode;
 }
 
+int    /* FileAccess()'s negative result if it fails, else pg_fsync()'s result */
+FileSync(File file)
+{
+       int                     returnCode;
+
+       Assert(FileIsValid(file));
+
+       DO_DB(elog(LOG, "FileSync: %d (%s)",
+                          file, VfdCache[file].fileName));
+
+       returnCode = FileAccess(file);  /* make sure the VFD has an open FD */
+       if (returnCode < 0)
+               return returnCode;              /* could not re-open the file */
+
+       return pg_fsync(VfdCache[file].fd);
+}
+
 long
 FileSeek(File file, long offset, int whence)
 {
+       int                     returnCode;
+
        Assert(FileIsValid(file));
 
        DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
@@ -960,8 +1146,11 @@ FileSeek(File file, long offset, int whence)
                                VfdCache[file].seekPos += offset;
                                break;
                        case SEEK_END:
-                               FileAccess(file);
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               returnCode = FileAccess(file);
+                               if (returnCode < 0)
+                                       return returnCode;
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                break;
                        default:
                                elog(ERROR, "invalid whence: %d", whence);
@@ -976,14 +1165,17 @@ FileSeek(File file, long offset, int whence)
                                if (offset < 0)
                                        elog(ERROR, "invalid seek offset: %ld", offset);
                                if (VfdCache[file].seekPos != offset)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                break;
                        case SEEK_CUR:
                                if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                                       VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                                  offset, whence);
                                break;
                        case SEEK_END:
-                               VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+                               VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+                                                                                          offset, whence);
                                break;
                        default:
                                elog(ERROR, "invalid whence: %d", whence);
@@ -1017,7 +1209,10 @@ FileTruncate(File file, long offset)
        DO_DB(elog(LOG, "FileTruncate %d (%s)",
                           file, VfdCache[file].fileName));
 
-       FileAccess(file);
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
        returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
        return returnCode;
 }
@@ -1041,21 +1236,33 @@ FileTruncate(File file, long offset)
  * Ideally this should be the *only* direct call of fopen() in the backend.
  */
 FILE *
-AllocateFile(char *name, char *mode)
+AllocateFile(const char *name, const char *mode)
 {
        FILE       *file;
 
-       DO_DB(elog(LOG, "AllocateFile: Allocated %d", numAllocatedFiles));
+       DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
+                          numAllocatedDescs, name));
 
-       if (numAllocatedFiles >= MAX_ALLOCATED_FILES)
-               elog(ERROR, "too many private FDs demanded");
+       /*
+        * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+        * allocatedDescs[]; the test against max_safe_fds prevents AllocateFile
+        * from hogging every one of the available FDs, which'd lead to infinite
+        * looping.
+        */
+       if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+               numAllocatedDescs >= max_safe_fds - 1)
+               elog(ERROR, "too many private files demanded");
 
 TryAgain:
        if ((file = fopen(name, mode)) != NULL)
        {
-               allocatedFiles[numAllocatedFiles] = file;
-               numAllocatedFiles++;
-               return file;
+               AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+               desc->kind = AllocateDescFile;
+               desc->desc.file = file;
+               desc->create_subid = GetCurrentSubTransactionId();
+               numAllocatedDescs++;
+               return desc->desc.file;
        }
 
        if (errno == EMFILE || errno == ENFILE)
@@ -1064,39 +1271,228 @@ TryAgain:
 
                ereport(LOG,
                                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
-                         errmsg("out of file descriptors: %m; release and retry")));
+                                errmsg("out of file descriptors: %m; release and retry")));
                errno = 0;
                if (ReleaseLruFile())
                        goto TryAgain;
                errno = save_errno;
        }
 
+       /*
+        * TEMPORARY hack to log the Windows error code on fopen failures, in
+        * hopes of diagnosing some hard-to-reproduce problems.
+        */
+#ifdef WIN32
+       {
+               int                     save_errno = errno;
+
+               elog(LOG, "Windows fopen(\"%s\",\"%s\") failed: code %lu, errno %d",
+                        name, mode, GetLastError(), save_errno);
+               errno = save_errno;
+       }
+#endif
+
        return NULL;
 }
 
-void
+/*
+ * Free an AllocateDesc of either type.
+ *
+ * Closes the underlying object (fclose() for a stdio file, closedir() for a
+ * directory) and then removes the entry from allocatedDescs[] by copying the
+ * last live entry of the array over it.  Entry order is therefore not
+ * preserved, and after this call the slot that "desc" points at holds a
+ * different entry (or is no longer part of the live portion of the array,
+ * if it was the last one).  Code that walks the array while freeing entries
+ * has to re-examine the same index afterwards.
+ *
+ * The argument *must* point into the allocatedDescs[] array.
+ *
+ * Returns whatever fclose() or closedir() returned.  An unrecognized
+ * desc->kind is reported with elog(ERROR).
+ */
+static int
+FreeDesc(AllocateDesc *desc)
+{
+       int                     result;
+
+       /* Close the underlying object */
+       switch (desc->kind)
+       {
+               case AllocateDescFile:
+                       result = fclose(desc->desc.file);
+                       break;
+               case AllocateDescDir:
+                       result = closedir(desc->desc.dir);
+                       break;
+               default:
+                       elog(ERROR, "AllocateDesc kind not recognized");
+                       result = 0;                     /* keep compiler quiet */
+                       break;
+       }
+
+       /* Compact storage in the allocatedDescs array */
+       numAllocatedDescs--;    /* now indexes the former last entry */
+       *desc = allocatedDescs[numAllocatedDescs];      /* move it into the hole */
+
+       return result;
+}
+
+/*
+ * Close a file returned by AllocateFile.
+ *
+ * Note we do not check fclose's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
 FreeFile(FILE *file)
 {
        int                     i;
 
-       DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedFiles));
+       DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
 
        /* Remove file from list of allocated files, if it's present */
-       for (i = numAllocatedFiles; --i >= 0;)
+       for (i = numAllocatedDescs; --i >= 0;)
        {
-               if (allocatedFiles[i] == file)
-               {
-                       numAllocatedFiles--;
-                       allocatedFiles[i] = allocatedFiles[numAllocatedFiles];
-                       break;
-               }
+               AllocateDesc *desc = &allocatedDescs[i];
+
+               if (desc->kind == AllocateDescFile && desc->desc.file == file)
+                       return FreeDesc(desc);
+       }
+
+       /* Only get here if someone passes us a file not in allocatedDescs */
+       elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
+
+       return fclose(file);
+}
+
+
+/*
+ * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
+ * rather than plain opendir().  This lets fd.c deal with freeing FDs if
+ * necessary to open the directory, and with closing it after an elog.
+ * When done, call FreeDir rather than closedir.
+ *
+ * Ideally this should be the *only* direct call of opendir() in the backend.
+ *
+ * dirname: path handed to opendir() unchanged.
+ *
+ * Returns the open DIR* on success, after recording it in allocatedDescs[]
+ * together with the current subtransaction id.  Returns NULL if opendir()
+ * fails; errno then holds the value from the failed opendir().  If the
+ * failure was EMFILE or ENFILE, a LOG message is emitted and the open is
+ * retried for as long as ReleaseLruFile() reports success.
+ *
+ * Raises elog(ERROR) without attempting the open if too many descriptors
+ * are already allocated (see the limit test below).
+ */
+DIR *
+AllocateDir(const char *dirname)
+{
+       DIR                *dir;
+
+       DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
+                          numAllocatedDescs, dirname));
+
+       /*
+        * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+        * allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
+        * from hogging every one of the available FDs, which'd lead to infinite
+        * looping.
+        */
+       if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+               numAllocatedDescs >= max_safe_fds - 1)
+               elog(ERROR, "too many private dirs demanded");
+
+TryAgain:
+       if ((dir = opendir(dirname)) != NULL)
+       {
+               /* Success: claim the next free slot at the end of the array */
+               AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+               desc->kind = AllocateDescDir;
+               desc->desc.dir = dir;
+               desc->create_subid = GetCurrentSubTransactionId();
+               numAllocatedDescs++;
+               return desc->desc.dir;
+       }
+
+       if (errno == EMFILE || errno == ENFILE)
+       {
+               int                     save_errno = errno;
+
+               ereport(LOG,
+                               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                errmsg("out of file descriptors: %m; release and retry")));
+               errno = 0;
+               if (ReleaseLruFile())
+                       goto TryAgain;
+               errno = save_errno;     /* nothing released: report opendir's errno */
+       }
+
+       return NULL;
+}
+
+/*
+ * Read a directory opened with AllocateDir, ereport'ing any error.
+ *
+ * This is easier to use than raw readdir() since it takes care of some
+ * otherwise rather tedious and error-prone manipulation of errno.  Also,
+ * if you are happy with a generic error message for AllocateDir failure,
+ * you can just do
+ *
+ *             dir = AllocateDir(path);
+ *             while ((dirent = ReadDir(dir, path)) != NULL)
+ *                     process dirent;
+ *             FreeDir(dir);
+ *
+ * since a NULL dir parameter is taken as indicating AllocateDir failed.
+ * (Make sure errno hasn't been changed since AllocateDir if you use this
+ * shortcut.)
+ *
+ * The pathname passed to AllocateDir must be passed to this routine too,
+ * but it is only used for error reporting.
+ *
+ * Returns the next entry as delivered by readdir(), or NULL once readdir()
+ * returns NULL with errno still zero (end of directory).  A NULL dir, or a
+ * NULL from readdir() with errno set, is reported with ereport(ERROR).
+ */
+struct dirent *
+ReadDir(DIR *dir, const char *dirname)
+{
+       struct dirent *dent;
+
+       /* Give a generic message for AllocateDir failure, if caller didn't */
+       if (dir == NULL)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not open directory \"%s\": %m",
+                                               dirname)));
+
+       errno = 0;      /* so a nonzero errno after a NULL readdir() means failure */
+       if ((dent = readdir(dir)) != NULL)
+               return dent;
+
+#ifdef WIN32
+
+       /*
+        * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
+        * released version
+        *
+        * When GetLastError() says ERROR_NO_MORE_FILES, clear errno so that the
+        * check below treats this as a normal end of directory.
+        */
+       if (GetLastError() == ERROR_NO_MORE_FILES)
+               errno = 0;
+#endif
+
+       if (errno)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not read directory \"%s\": %m",
+                                               dirname)));
+       return NULL;
+}
+
+/*
+ * Close a directory opened with AllocateDir.
+ *
+ * Note we do not check closedir's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+FreeDir(DIR *dir)
+{
+       int                     i;
+
+       DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
+
+       /* Remove dir from list of allocated dirs, if it's present */
+       for (i = numAllocatedDescs; --i >= 0;)
+       {
+               AllocateDesc *desc = &allocatedDescs[i];
+
+               if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
+                       return FreeDesc(desc);
        }
-       if (i < 0)
-               elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
 
-       fclose(file);
+       /* Only get here if someone passes us a dir not in allocatedDescs */
+       elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
+
+       return closedir(dir);
 }
 
+
 /*
  * closeAllVfds
  *
@@ -1120,6 +1516,52 @@ closeAllVfds(void)
        }
 }
 
+/*
+ * AtEOSubXact_Files
+ *
+ * Take care of subtransaction commit/abort.  At abort, we close temp files
+ * that the subtransaction may have opened.  At commit, we reassign the
+ * files that were opened to the parent subtransaction.
+ *
+ * isCommit: true for subtransaction commit, false for abort.
+ * mySubid: id of the ending subtransaction; only entries whose create_subid
+ *             equals it are touched.
+ * parentSubid: id that matching entries are re-labelled with at commit.
+ *
+ * Two sets of objects are processed: VfdCache entries flagged
+ * FD_XACT_TEMPORARY, and every entry of allocatedDescs[].  At abort the
+ * former are closed with FileClose() (if they have a fileName) and the
+ * latter with FreeDesc(), whose return value is ignored here.
+ */
+void
+AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+                                 SubTransactionId parentSubid)
+{
+       Index           i;
+
+       if (SizeVfdCache > 0)
+       {
+               Assert(FileIsNotOpen(0));               /* Make sure ring not corrupted */
+               for (i = 1; i < SizeVfdCache; i++)
+               {
+                       unsigned short fdstate = VfdCache[i].fdstate;
+
+                       if ((fdstate & FD_XACT_TEMPORARY) &&
+                               VfdCache[i].create_subid == mySubid)
+                       {
+                               if (isCommit)
+                                       VfdCache[i].create_subid = parentSubid;
+                               else if (VfdCache[i].fileName != NULL)
+                                       FileClose(i);
+                       }
+               }
+       }
+
+       for (i = 0; i < numAllocatedDescs; i++)
+       {
+               if (allocatedDescs[i].create_subid == mySubid)
+               {
+                       if (isCommit)
+                               allocatedDescs[i].create_subid = parentSubid;
+                       else
+                       {
+                               /*
+                                * have to recheck the item after FreeDesc (ugly)
+                                *
+                                * FreeDesc copies the last array entry into slot i, so the
+                                * post-decrement here cancels the loop's i++ and slot i is
+                                * examined again on the next iteration.
+                                * NOTE(review): if Index is an unsigned type, i-- at i == 0
+                                * wraps around and the following i++ brings it back to 0 --
+                                * confirm against the Index typedef.
+                                */
+                               FreeDesc(&allocatedDescs[i--]);
+                       }
+               }
+       }
+}
+
 /*
  * AtEOXact_Files
  *
@@ -1141,7 +1583,7 @@ AtEOXact_Files(void)
  * Here, we want to clean up *all* temp files including interXact ones.
  */
 static void
-AtProcExit_Files(void)
+AtProcExit_Files(int code, Datum arg)
 {
        CleanupTempFiles(true);
 }
@@ -1153,7 +1595,7 @@ AtProcExit_Files(void)
  * exiting. If that's the case, we should remove all temporary files; if
  * that's not the case, we are being called for transaction commit/abort
  * and should only remove transaction-local temp files.  In either case,
- * also clean up "allocated" stdio files.
+ * also clean up "allocated" stdio files and dirs.
  */
 static void
 CleanupTempFiles(bool isProcExit)
@@ -1170,9 +1612,9 @@ CleanupTempFiles(bool isProcExit)
                        if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
                        {
                                /*
-                                * If we're in the process of exiting a backend process,
-                                * close all temporary files. Otherwise, only close
-                                * temporary files local to the current transaction.
+                                * If we're in the process of exiting a backend process, close
+                                * all temporary files. Otherwise, only close temporary files
+                                * local to the current transaction.
                                 */
                                if (isProcExit || (fdstate & FD_XACT_TEMPORARY))
                                        FileClose(i);
@@ -1180,8 +1622,8 @@ CleanupTempFiles(bool isProcExit)
                }
        }
 
-       while (numAllocatedFiles > 0)
-               FreeFile(allocatedFiles[0]);
+       while (numAllocatedDescs > 0)
+               FreeDesc(&allocatedDescs[0]);
 }
 
 
@@ -1200,57 +1642,75 @@ CleanupTempFiles(bool isProcExit)
 void
 RemovePgTempFiles(void)
 {
-       char            db_path[MAXPGPATH];
        char            temp_path[MAXPGPATH];
-       char            rm_path[MAXPGPATH];
        DIR                *db_dir;
-       DIR                *temp_dir;
        struct dirent *db_de;
-       struct dirent *temp_de;
 
        /*
-        * Cycle through pg_tempfiles for all databases and remove old temp
-        * files.
+        * Cycle through pgsql_tmp directories for all databases and remove old
+        * temp files.
         */
-       snprintf(db_path, sizeof(db_path), "%s/base", DataDir);
-       if ((db_dir = opendir(db_path)) != NULL)
+       db_dir = AllocateDir("base");
+
+       while ((db_de = ReadDir(db_dir, "base")) != NULL)
        {
-               while ((db_de = readdir(db_dir)) != NULL)
-               {
-                       if (strcmp(db_de->d_name, ".") == 0 ||
-                               strcmp(db_de->d_name, "..") == 0)
-                               continue;
-
-                       snprintf(temp_path, sizeof(temp_path),
-                                        "%s/%s/%s",
-                                        db_path, db_de->d_name,
-                                        PG_TEMP_FILES_DIR);
-                       if ((temp_dir = opendir(temp_path)) != NULL)
-                       {
-                               while ((temp_de = readdir(temp_dir)) != NULL)
-                               {
-                                       if (strcmp(temp_de->d_name, ".") == 0 ||
-                                               strcmp(temp_de->d_name, "..") == 0)
-                                               continue;
-
-                                       snprintf(rm_path, sizeof(temp_path),
-                                                        "%s/%s/%s/%s",
-                                                        db_path, db_de->d_name,
-                                                        PG_TEMP_FILES_DIR,
-                                                        temp_de->d_name);
-
-                                       if (strncmp(temp_de->d_name,
-                                                               PG_TEMP_FILE_PREFIX,
-                                                               strlen(PG_TEMP_FILE_PREFIX)) == 0)
-                                               unlink(rm_path);
-                                       else
-                                               elog(LOG,
-                                                        "unexpected file found in temporary-files directory: \"%s\"",
-                                                        rm_path);
-                               }
-                               closedir(temp_dir);
-                       }
-               }
-               closedir(db_dir);
+               if (strcmp(db_de->d_name, ".") == 0 ||
+                       strcmp(db_de->d_name, "..") == 0)
+                       continue;
+
+               snprintf(temp_path, sizeof(temp_path), "base/%s/%s",
+                                db_de->d_name, PG_TEMP_FILES_DIR);
+               RemovePgTempFilesInDir(temp_path);
        }
+
+       FreeDir(db_dir);
+
+       /*
+        * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
+        * DataDir as well.
+        */
+#ifdef EXEC_BACKEND
+       RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
+#endif
+}
+
+/* Process one pgsql_tmp directory for RemovePgTempFiles
+ *
+ * tmpdirname: path of the directory to scan; it is also used as the prefix
+ *             of every path built for unlink() and for log messages.
+ *
+ * If the directory cannot be opened, nothing is removed: ENOENT is accepted
+ * silently, any other errno is reported at LOG level, and we return.
+ * Otherwise every entry other than "." and ".." is examined: entries whose
+ * name begins with PG_TEMP_FILE_PREFIX are unlinked (failures ignored), and
+ * anything else is left in place and reported at LOG level.
+ * A read failure inside ReadDir is raised by ReadDir itself via
+ * ereport(ERROR).
+ */
+static void
+RemovePgTempFilesInDir(const char *tmpdirname)
+{
+       DIR                *temp_dir;
+       struct dirent *temp_de;
+       char            rm_path[MAXPGPATH];
+
+       temp_dir = AllocateDir(tmpdirname);
+       if (temp_dir == NULL)
+       {
+               /* anything except ENOENT is fishy */
+               if (errno != ENOENT)
+                       elog(LOG,
+                                "could not open temporary-files directory \"%s\": %m",
+                                tmpdirname);
+               return;
+       }
+
+       while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
+       {
+               if (strcmp(temp_de->d_name, ".") == 0 ||
+                       strcmp(temp_de->d_name, "..") == 0)
+                       continue;
+
+               snprintf(rm_path, sizeof(rm_path), "%s/%s",
+                                tmpdirname, temp_de->d_name);
+
+               if (strncmp(temp_de->d_name,
+                                       PG_TEMP_FILE_PREFIX,
+                                       strlen(PG_TEMP_FILE_PREFIX)) == 0)
+                       unlink(rm_path);        /* note we ignore any error */
+               else
+                       elog(LOG,
+                                "unexpected file found in temporary-files directory: \"%s\"",
+                                rm_path);
+       }
+
+       FreeDir(temp_dir);
 }