* fd.c
* Virtual file descriptor code.
*
- * Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Id: fd.c,v 1.37 1999/02/13 23:18:05 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.130 2006/10/04 00:29:57 momjian Exp $
*
* NOTES:
*
*-------------------------------------------------------------------------
*/
-#include <sys/types.h>
-#include <stdio.h>
+#include "postgres.h"
+
#include <sys/file.h>
#include <sys/param.h>
-#include <errno.h>
#include <sys/stat.h>
-#include <string.h>
#include <unistd.h>
#include <fcntl.h>
-#include "postgres.h"
-#include "miscadmin.h" /* for DataDir */
-#include "utils/palloc.h"
+#include "miscadmin.h"
+#include "access/xact.h"
#include "storage/fd.h"
+#include "storage/ipc.h"
+
/*
- * Problem: Postgres does a system(ld...) to do dynamic loading. This
- * will open several extra files in addition to those used by
- * Postgres. We need to do this hack to guarentee that there are file
- * descriptors free for ld to use.
+ * We must leave some file descriptors free for system(), the dynamic loader,
+ * and other code that tries to open files without consulting fd.c. This
+ * is the number left free. (While we can be pretty sure we won't get
+ * EMFILE, there's never any guarantee that we won't get ENFILE due to
+ * other processes chewing up FDs. So it's a bad idea to try to open files
+ * without consulting fd.c. Nonetheless we cannot control all code.)
*
- * The current solution is to limit the number of files descriptors
- * that this code will allocated at one time. (it leaves
- * RESERVE_FOR_LD free).
- *
- * (Even though most dynamic loaders now use dlopen(3) or the
- * equivalent, the OS must still open several files to perform the
- * dynamic loading. Keep this here.)
+ * Because this is just a fixed setting, we are effectively assuming that
+ * no such code will leave FDs open over the long term; otherwise the slop
+ * is likely to be insufficient. Note in particular that we expect that
+ * loading a shared library does not result in any permanent increase in
+ * the number of open files. (This appears to be true on most if not
+ * all platforms as of Feb 2004.)
*/
-#ifndef RESERVE_FOR_LD
-#define RESERVE_FOR_LD 10
-#endif
+#define NUM_RESERVED_FDS 10
+
+/*
+ * If we have fewer than this many usable FDs after allowing for the reserved
+ * ones, choke.
+ */
+#define FD_MINFREE 10
+
+
+/*
+ * A number of platforms allow individual processes to open many more files
+ * than they can really support when *many* processes do the same thing.
+ * This GUC parameter lets the DBA limit max_safe_fds to something less than
+ * what the postmaster's initial probe suggests will work.
+ */
+int max_files_per_process = 1000;
/*
- * We need to ensure that we have at least some file descriptors
- * available to postgreSQL after we've reserved the ones for LD,
- * so we set that value here.
+ * Maximum number of file descriptors to open for either VFD entries or
+ * AllocateFile/AllocateDir operations. This is initialized to a conservative
+ * value, and remains that way indefinitely in bootstrap or standalone-backend
+ * cases. In normal postmaster operation, the postmaster calls
+ * set_max_safe_fds() late in initialization to update the value, and that
+ * value is then inherited by forked subprocesses.
*
- * I think 10 is an apropriate value so that's what it'll be
- * for now.
+ * Note: the value of max_files_per_process is taken into account while
+ * setting this variable, and so need not be tested separately.
*/
-#ifndef FD_MINFREE
-#define FD_MINFREE 10
-#endif
+static int max_safe_fds = 32; /* default if not changed */
+
/* Debugging.... */
#define DO_DB(A) /* A */
#endif
-#define VFD_CLOSED -1
+#define VFD_CLOSED (-1)
-#include "storage/fd.h"
-#include "utils/elog.h"
+#define FileIsValid(file) \
+ ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
-typedef struct vfd
-{
- signed short fd;
- unsigned short fdstate;
+#define FileUnknownPos (-1L)
-#define FD_DIRTY (1 << 0)
+/* these are the assigned bits in fdstate below: */
+#define FD_TEMPORARY (1 << 0) /* T = delete when closed */
+#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
- File nextFree;
- File lruMoreRecently;
+typedef struct vfd
+{
+ signed short fd; /* current FD, or VFD_CLOSED if none */
+ unsigned short fdstate; /* bitflags for VFD's state */
+ SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */
+ File nextFree; /* link to next free VFD, if in freelist */
+ File lruMoreRecently; /* doubly linked recency-of-use list */
File lruLessRecently;
- long seekPos;
- char *fileName;
- int fileFlags;
- int fileMode;
+ long seekPos; /* current logical file position */
+ char *fileName; /* name of file, or NULL for unused VFD */
+ /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
+ int fileFlags; /* open(2) flags for (re)opening the file */
+ int fileMode; /* mode to pass to open(2) */
} Vfd;
/*
* Virtual File Descriptor array pointer and size. This grows as
- * needed.
+ * needed. 'File' values are indexes into this array.
+ * Note that VfdCache[0] is not a usable VFD, just a list header.
*/
static Vfd *VfdCache;
static Size SizeVfdCache = 0;
/*
- * Number of file descriptors known to be open.
+ * Number of file descriptors known to be in use by VFD entries.
*/
static int nfile = 0;
/*
+ * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
+ * and AllocateDir.
+ *
+ * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
+ * it seems OK to put a pretty small maximum limit on the number of
+ * simultaneously allocated descs.
+ */
+#define MAX_ALLOCATED_DESCS 32
+
+typedef enum
+{
+ AllocateDescFile,
+ AllocateDescDir
+} AllocateDescKind;
+
+typedef struct
+{
+ AllocateDescKind kind;
+ union
+ {
+ FILE *file;
+ DIR *dir;
+ } desc;
+ SubTransactionId create_subid;
+} AllocateDesc;
+
+static int numAllocatedDescs = 0;
+static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
+
+/*
+ * Number of temporary files opened during the current session;
+ * this is used in generation of tempfile names.
+ */
+static long tempFileCounter = 0;
+
+
+/*--------------------
+ *
* Private Routines
*
* Delete - delete a file from the Lru ring
- * LruDelete - remove a file from the Lru ring and close
+ * LruDelete - remove a file from the Lru ring and close its FD
* Insert - put a file at the front of the Lru ring
- * LruInsert - put a file at the front of the Lru ring and open
- * AssertLruRoom - make sure that there is a free fd.
+ * LruInsert - put a file at the front of the Lru ring and open it
+ * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
+ * AllocateVfd - grab a free (or new) file record (from VfdCache)
+ * FreeVfd - free a file record
*
- * the Last Recently Used ring is a doubly linked list that begins and
+ * The Least Recently Used ring is a doubly linked list that begins and
* ends on element zero. Element zero is special -- it doesn't represent
* a file and its "fd" field always == VFD_CLOSED. Element zero is just an
* anchor that shows us the beginning/end of the ring.
+ * Only VFD elements that are currently really open (have an FD assigned) are
+ * in the Lru ring. Elements that are "virtually" open can be recognized
+ * by having a non-null fileName field.
*
* example:
*
* \\less--> MostRecentlyUsedFile <---/ |
* \more---/ \--less--/
*
- * AllocateVfd - grab a free (or new) file record (from VfdArray)
- * FreeVfd - free a file record
- *
+ *--------------------
*/
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int LruInsert(File file);
-static void AssertLruRoom(void);
+static bool ReleaseLruFile(void);
static File AllocateVfd(void);
static void FreeVfd(File file);
static int FileAccess(File file);
-static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
-static char *filepath(char *filename);
-static long pg_nofile(void);
+static char *make_database_relative(const char *filename);
+static void AtProcExit_Files(int code, Datum arg);
+static void CleanupTempFiles(bool isProcExit);
+static void RemovePgTempFilesInDir(const char *tmpdirname);
+
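+/*
+ * pg_fsync --- do fsync with or without writethrough
+ *
+ * Uses pg_fsync_writethrough() when sync_method is
+ * SYNC_METHOD_FSYNC_WRITETHROUGH, or unconditionally when
+ * HAVE_FSYNC_WRITETHROUGH_ONLY is defined; otherwise uses
+ * pg_fsync_no_writethrough().
+ */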
int
pg_fsync(int fd)
{
- extern int fsyncOff;
-
- return fsyncOff ? 0 : fsync(fd);
+#ifndef HAVE_FSYNC_WRITETHROUGH_ONLY
+ if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH)
+ return pg_fsync_no_writethrough(fd);
+ else
+#endif
+ return pg_fsync_writethrough(fd);
}
-#define fsync pg_fsync
-static long
-pg_nofile(void)
+/*
+ * pg_fsync_no_writethrough --- ordinary fsync() of the given descriptor.
+ *
+ * When enableFsync is off this is a no-op that reports success (0);
+ * otherwise the result of fsync() is passed straight back to the caller.
+ */
+int
+pg_fsync_no_writethrough(int fd)
+{
+ /* syncing disabled: report success without touching the file */
+ if (!enableFsync)
+ return 0;
+
+ return fsync(fd);
+}
+
+/*
+ * pg_fsync_writethrough --- fsync variant that asks for write-through
+ *
+ * Does nothing (and returns 0) if enableFsync is off. Otherwise uses
+ * _commit() on WIN32, or fcntl(F_FULLFSYNC) where that is defined.
+ * On platforms offering neither, fails with -1 and errno set to ENOSYS,
+ * so that callers reporting %m see a meaningful cause rather than
+ * whatever stale value errno happened to hold.
+ */
+int
+pg_fsync_writethrough(int fd)
{
- static long no_files = 0;
+ if (enableFsync)
+ {
+#ifdef WIN32
+ return _commit(fd);
+#elif defined(F_FULLFSYNC)
+ return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
+#else
+ /* no write-through primitive here: say so explicitly */
+ errno = ENOSYS;
+ return -1;
+#endif
+ }
+ else
+ return 0;
+}
- if (no_files == 0)
+/*
+ * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
+ *
+ * Not all platforms have fdatasync; treat as fsync if not available.
+ */
+int
+pg_fdatasync(int fd)
+{
+ if (enableFsync)
{
-#ifndef HAVE_SYSCONF
- no_files = (long) NOFILE;
+#ifdef HAVE_FDATASYNC
+ return fdatasync(fd);
#else
- no_files = sysconf(_SC_OPEN_MAX);
- if (no_files == -1)
+ return fsync(fd);
+#endif
+ }
+ else
+ return 0;
+}
+
+/*
+ * InitFileAccess --- initialize this module during backend startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ */
+void
+InitFileAccess(void)
+{
+ Assert(SizeVfdCache == 0); /* call me only once */
+
+ /* initialize cache header entry */
+ VfdCache = (Vfd *) malloc(sizeof(Vfd));
+ if (VfdCache == NULL)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
+ VfdCache->fd = VFD_CLOSED;
+
+ SizeVfdCache = 1;
+
+ /* register proc-exit hook to ensure temp files are dropped at exit */
+ on_proc_exit(AtProcExit_Files, 0);
+}
+
+/*
+ * count_usable_fds --- count how many FDs the system will let us open,
+ * and estimate how many are already open.
+ *
+ * We stop counting if usable_fds reaches max_to_probe. Note: a small
+ * value of max_to_probe might result in an underestimate of already_open;
+ * we must fill in any "gaps" in the set of used FDs before the calculation
+ * of already_open will give the right answer. In practice, max_to_probe
+ * of a couple of dozen should be enough to ensure good results.
+ *
+ * We assume stdin (FD 0) is available for dup'ing, since the probe works
+ * by calling dup(0) repeatedly.
+ */
+static void
+count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
+{
+ int *fd;
+ int size;
+ int used = 0;
+ int highestfd = 0;
+ int j;
+
+ size = 1024;
+ fd = (int *) palloc(size * sizeof(int));
+
+ /* dup until failure or probe limit reached */
+ for (;;)
+ {
+ int thisfd;
+
+ thisfd = dup(0);
+ if (thisfd < 0)
{
- elog(DEBUG, "pg_nofile: Unable to get _SC_OPEN_MAX using sysconf() using (%d)", NOFILE);
- no_files = (long) NOFILE;
+ /* Expect EMFILE or ENFILE, else it's fishy */
+ if (errno != EMFILE && errno != ENFILE)
+ elog(WARNING, "dup(0) failed after %d successes: %m", used);
+ break;
}
-#endif
+
+ if (used >= size)
+ {
+ size *= 2;
+ fd = (int *) repalloc(fd, size * sizeof(int));
+ }
+ fd[used++] = thisfd;
+
+ if (highestfd < thisfd)
+ highestfd = thisfd;
+
+ if (used >= max_to_probe)
+ break;
+ }
+
+ /* release the files we opened */
+ for (j = 0; j < used; j++)
+ close(fd[j]);
+
+ pfree(fd);
+
+ /*
+ * Return results. usable_fds is just the number of successful dups. We
+ * assume that the system limit is highestfd+1 (remember 0 is a legal FD
+ * number) and so already_open is highestfd+1 - usable_fds.
+ */
+ *usable_fds = used;
+ *already_open = highestfd + 1 - used;
+}
+
+/*
+ * set_max_safe_fds
+ * Determine number of file descriptors that fd.c is allowed to use
+ */
+void
+set_max_safe_fds(void)
+{
+ int usable_fds;
+ int already_open;
+
+ /*----------
+ * We want to set max_safe_fds to
+ * MIN(usable_fds, max_files_per_process - already_open)
+ * less the slop factor for files that are opened without consulting
+ * fd.c. This ensures that we won't exceed either max_files_per_process
+ * or the experimentally-determined EMFILE limit.
+ *----------
+ */
+ count_usable_fds(max_files_per_process,
+ &usable_fds, &already_open);
+
+ max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
+
+ /*
+ * Take off the FDs reserved for system() etc.
+ */
+ max_safe_fds -= NUM_RESERVED_FDS;
+
+ /*
+ * Make sure we still have enough to get by.
+ */
+ if (max_safe_fds < FD_MINFREE)
+ ereport(FATAL,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("insufficient file descriptors available to start server process"),
+ errdetail("System allows %d, we need at least %d.",
+ max_safe_fds + NUM_RESERVED_FDS,
+ FD_MINFREE + NUM_RESERVED_FDS)));
+
+ elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
+ max_safe_fds, usable_fds, already_open);
+}
+
+/*
+ * BasicOpenFile --- same as open(2) except can free other FDs if needed
+ *
+ * This is exported for use by places that really want a plain kernel FD,
+ * but need to be proof against running out of FDs. Once an FD has been
+ * successfully returned, it is the caller's responsibility to ensure that
+ * it will not be leaked on ereport()! Most users should *not* call this
+ * routine directly, but instead use the VFD abstraction level, which
+ * provides protection against descriptor leaks as well as management of
+ * files that need to be open for more than a short period of time.
+ *
+ * Ideally this should be the *only* direct call of open() in the backend.
+ * In practice, the postmaster calls open() directly, and there are some
+ * direct open() calls done early in backend startup. Those are OK since
+ * this module wouldn't have any open files to close at that point anyway.
+ */
+int
+BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
+{
+ int fd;
+
+tryAgain:
+ fd = open(fileName, fileFlags, fileMode);
+
+ if (fd >= 0)
+ return fd; /* success! */
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto tryAgain;
+ errno = save_errno;
}
- if ((no_files - RESERVE_FOR_LD) < FD_MINFREE)
- elog(FATAL, "pg_nofile: insufficient File Descriptors in postmaster to start backend (%ld).\n"
- " O/S allows %ld, Postmaster reserves %d, We need %d (MIN) after that.",
- no_files - RESERVE_FOR_LD, no_files, RESERVE_FOR_LD, FD_MINFREE);
- return no_files - RESERVE_FOR_LD;
+ return -1; /* failure */
}
#if defined(FDDEBUG)
+
+/*
+ * _dump_lru --- debugging aid (compiled only when FDDEBUG is defined).
+ *
+ * Logs the LRU ring as a list of VFD indexes, starting from the most
+ * recently used entry and following the lruLessRecently links until the
+ * ring returns to element zero.
+ */
static void
-_dump_lru()
+_dump_lru(void)
{
int mru = VfdCache[0].lruLessRecently;
Vfd *vfdP = &VfdCache[mru];
char buf[2048];
- sprintf(buf, "LRU: MOST %d ", mru);
+ snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
while (mru != 0)
{
mru = vfdP->lruLessRecently;
vfdP = &VfdCache[mru];
- sprintf(buf + strlen(buf), "%d ", mru);
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
}
- sprintf(buf + strlen(buf), "LEAST");
- elog(DEBUG, buf);
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
+ /* pass buf as data, never as the format string */
+ elog(LOG, "%s", buf);
}
-
-#endif /* FDDEBUG */
+#endif /* FDDEBUG */
static void
Delete(File file)
{
- Vfd *fileP;
+ Vfd *vfdP;
+
+ Assert(file != 0);
- DO_DB(elog(DEBUG, "Delete %d (%s)",
+ DO_DB(elog(LOG, "Delete %d (%s)",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
- Assert(file != 0);
-
- fileP = &VfdCache[file];
+ vfdP = &VfdCache[file];
- VfdCache[fileP->lruLessRecently].lruMoreRecently = VfdCache[file].lruMoreRecently;
- VfdCache[fileP->lruMoreRecently].lruLessRecently = VfdCache[file].lruLessRecently;
+ VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
+ VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
DO_DB(_dump_lru());
}
static void
LruDelete(File file)
{
- Vfd *fileP;
- int returnValue;
-
- DO_DB(elog(DEBUG, "LruDelete %d (%s)",
- file, VfdCache[file].fileName));
+ Vfd *vfdP;
Assert(file != 0);
- fileP = &VfdCache[file];
+ DO_DB(elog(LOG, "LruDelete %d (%s)",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
/* delete the vfd record from the LRU ring */
Delete(file);
/* save the seek position */
- fileP->seekPos = (long) lseek(fileP->fd, 0L, SEEK_CUR);
- Assert(fileP->seekPos != -1);
-
- /* if we have written to the file, sync it */
- if (fileP->fdstate & FD_DIRTY)
- {
- returnValue = fsync(fileP->fd);
- Assert(returnValue != -1);
- fileP->fdstate &= ~FD_DIRTY;
- }
+ vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
+ Assert(vfdP->seekPos != -1L);
/* close the file */
- returnValue = close(fileP->fd);
- Assert(returnValue != -1);
+ if (close(vfdP->fd))
+ elog(ERROR, "failed to close \"%s\": %m",
+ vfdP->fileName);
--nfile;
- fileP->fd = VFD_CLOSED;
-
+ vfdP->fd = VFD_CLOSED;
}
static void
{
Vfd *vfdP;
- DO_DB(elog(DEBUG, "Insert %d (%s)",
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "Insert %d (%s)",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
DO_DB(_dump_lru());
}
+/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
LruInsert(File file)
{
Vfd *vfdP;
- int returnValue;
- DO_DB(elog(DEBUG, "LruInsert %d (%s)",
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "LruInsert %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (FileIsNotOpen(file))
{
-
- if (nfile >= pg_nofile())
- AssertLruRoom();
-
- /*
- * Note, we check to see if there's a free file descriptor before
- * attempting to open a file. One general way to do this is to try
- * to open the null device which everybody should be able to open
- * all the time. If this fails, we assume this is because there's
- * no free file descriptors.
- */
-tryAgain:
- vfdP->fd = open(vfdP->fileName, vfdP->fileFlags, vfdP->fileMode);
- if (vfdP->fd < 0 && (errno == EMFILE || errno == ENFILE))
+ while (nfile + numAllocatedDescs >= max_safe_fds)
{
- errno = 0;
- AssertLruRoom();
- goto tryAgain;
+ if (!ReleaseLruFile())
+ break;
}
+ /*
+ * The open could still fail for lack of file descriptors, eg due to
+ * overall system file table being full. So, be prepared to release
+ * another FD if necessary...
+ */
+ vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
+ vfdP->fileMode);
if (vfdP->fd < 0)
{
- DO_DB(elog(DEBUG, "RE_OPEN FAILED: %d",
- errno));
+ DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
return vfdP->fd;
}
else
{
- DO_DB(elog(DEBUG, "RE_OPEN SUCCESS"));
+ DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
++nfile;
}
/* seek to the right position */
if (vfdP->seekPos != 0L)
{
- returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
- Assert(returnValue != -1);
- }
-
- /* init state on open */
- vfdP->fdstate = 0x0;
+ long returnValue;
+ returnValue = (long) lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
+ Assert(returnValue != -1L);
+ }
}
/*
return 0;
}
-static void
-AssertLruRoom()
+static bool
+ReleaseLruFile(void)
{
- DO_DB(elog(DEBUG, "AssertLruRoom. Opened %d", nfile));
+ DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
- if (nfile <= 0)
- elog(FATAL, "AssertLruRoom: No opened files - no one can be closed");
-
- /*
- * There are opened files and so there should be at least one used vfd
- * in the ring.
- */
- Assert(VfdCache[0].lruMoreRecently != 0);
- LruDelete(VfdCache[0].lruMoreRecently);
+ if (nfile > 0)
+ {
+ /*
+ * There are opened files and so there should be at least one used vfd
+ * in the ring.
+ */
+ Assert(VfdCache[0].lruMoreRecently != 0);
+ LruDelete(VfdCache[0].lruMoreRecently);
+ return true; /* freed a file */
+ }
+ return false; /* no files available to free */
}
static File
-AllocateVfd()
+AllocateVfd(void)
{
Index i;
File file;
- DO_DB(elog(DEBUG, "AllocateVfd. Size %d", SizeVfdCache));
-
- if (SizeVfdCache == 0)
- {
-
- /* initialize */
- VfdCache = (Vfd *) malloc(sizeof(Vfd));
- VfdCache->nextFree = 0;
- VfdCache->lruMoreRecently = 0;
- VfdCache->lruLessRecently = 0;
- VfdCache->fd = VFD_CLOSED;
- VfdCache->fdstate = 0x0;
+ DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
- SizeVfdCache = 1;
- }
+ Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
if (VfdCache[0].nextFree == 0)
{
-
/*
- * The free list is empty so it is time to increase the size of
- * the array
+ * The free list is empty so it is time to increase the size of the
+ * array. We choose to double it each time this happens. However,
+ * there's not much point in starting *real* small.
*/
+ Size newCacheSize = SizeVfdCache * 2;
+ Vfd *newVfdCache;
- VfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * SizeVfdCache * 2);
- Assert(VfdCache != NULL);
+ if (newCacheSize < 32)
+ newCacheSize = 32;
/*
- * Set up the free list for the new entries
+ * Be careful not to clobber VfdCache ptr if realloc fails.
*/
+ newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
+ if (newVfdCache == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ VfdCache = newVfdCache;
- for (i = SizeVfdCache; i < 2 * SizeVfdCache; i++)
+ /*
+ * Initialize the new entries and link them into the free list.
+ */
+ for (i = SizeVfdCache; i < newCacheSize; i++)
{
- MemSet((char *) &(VfdCache[i]), 0, sizeof(VfdCache[0]));
+ MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
VfdCache[i].nextFree = i + 1;
VfdCache[i].fd = VFD_CLOSED;
}
-
- /*
- * Element 0 is the first and last element of the free list
- */
-
+ VfdCache[newCacheSize - 1].nextFree = 0;
VfdCache[0].nextFree = SizeVfdCache;
- VfdCache[2 * SizeVfdCache - 1].nextFree = 0;
/*
* Record the new size
*/
-
- SizeVfdCache *= 2;
+ SizeVfdCache = newCacheSize;
}
+
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
static void
FreeVfd(File file)
{
- DO_DB(elog(DEBUG, "FreeVfd: %d (%s)",
- file, VfdCache[file].fileName));
+ Vfd *vfdP = &VfdCache[file];
+
+ DO_DB(elog(LOG, "FreeVfd: %d (%s)",
+ file, vfdP->fileName ? vfdP->fileName : ""));
+
+ if (vfdP->fileName != NULL)
+ {
+ free(vfdP->fileName);
+ vfdP->fileName = NULL;
+ }
+ vfdP->fdstate = 0x0;
- VfdCache[file].nextFree = VfdCache[0].nextFree;
+ vfdP->nextFree = VfdCache[0].nextFree;
VfdCache[0].nextFree = file;
}
-/* filepath()
- * Open specified file name.
- * Fill in absolute path fields if necessary.
+/*
+ * make_database_relative()
+ * Prepend DatabasePath to the given file name.
*
+ * Result is a palloc'd string.
*/
static char *
-filepath(char *filename)
+make_database_relative(const char *filename)
{
char *buf;
- int len;
-
- /* Not an absolute path name? Then fill in with database path... */
- if (*filename != SEP_CHAR)
- {
- len = strlen(DatabasePath) + strlen(filename) + 2;
- buf = (char *) palloc(len);
- sprintf(buf, "%s%c%s", DatabasePath, SEP_CHAR, filename);
- }
- else
- {
- buf = (char *) palloc(strlen(filename) + 1);
- strcpy(buf, filename);
- }
-
-#ifdef FILEDEBUG
- printf("filepath: path is %s\n", buf);
-#endif
+ Assert(!is_absolute_path(filename));
+ buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2);
+ sprintf(buf, "%s/%s", DatabasePath, filename);
return buf;
}
+/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
FileAccess(File file)
{
int returnValue;
- DO_DB(elog(DEBUG, "FileAccess %d (%s)",
+ DO_DB(elog(LOG, "FileAccess %d (%s)",
file, VfdCache[file].fileName));
/*
- * Is the file open? If not, close the least recently used, then open
- * it and stick it at the head of the used ring
+ * Is the file open? If not, open it and put it at the head of the LRU
+ * ring (possibly closing the least recently used file to get an FD).
*/
if (FileIsNotOpen(file))
{
-
returnValue = LruInsert(file);
if (returnValue != 0)
return returnValue;
-
}
- else
+ else if (VfdCache[0].lruLessRecently != file)
{
-
/*
- * We now know that the file is open and that it is not the last
- * one accessed, so we need to more it to the head of the Lru
- * ring.
+ * We now know that the file is open and that it is not the last one
+ * accessed, so we need to move it to the head of the Lru ring.
*/
Delete(file);
void
FileInvalidate(File file)
{
- Assert(file > 0);
+ Assert(FileIsValid(file));
if (!FileIsNotOpen(file))
LruDelete(file);
}
-
#endif
-/* VARARGS2 */
-static File
-fileNameOpenFile(FileName fileName,
- int fileFlags,
- int fileMode)
+/*
+ * open a file in an arbitrary directory
+ *
+ * NB: if the passed pathname is relative (which it usually is),
+ * it will be interpreted relative to the process' working directory
+ * (which should always be $PGDATA when this code is running).
+ */
+File
+PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
+ char *fnamecopy;
File file;
Vfd *vfdP;
- DO_DB(elog(DEBUG, "fileNameOpenFile: %s %x %o",
+ DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
fileName, fileFlags, fileMode));
+ /*
+ * We need a malloc'd copy of the file name; fail cleanly if no room.
+ */
+ fnamecopy = strdup(fileName);
+ if (fnamecopy == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
file = AllocateVfd();
vfdP = &VfdCache[file];
- if (nfile >= pg_nofile())
- AssertLruRoom();
-
-tryAgain:
- vfdP->fd = open(fileName, fileFlags, fileMode);
- if (vfdP->fd < 0 && (errno == EMFILE || errno == ENFILE))
+ while (nfile + numAllocatedDescs >= max_safe_fds)
{
- DO_DB(elog(DEBUG, "fileNameOpenFile: not enough descs, retry, er= %d",
- errno));
- errno = 0;
- AssertLruRoom();
- goto tryAgain;
+ if (!ReleaseLruFile())
+ break;
}
- vfdP->fdstate = 0x0;
+ vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
if (vfdP->fd < 0)
{
FreeVfd(file);
+ free(fnamecopy);
return -1;
}
++nfile;
- DO_DB(elog(DEBUG, "fileNameOpenFile: success %d",
+ DO_DB(elog(LOG, "PathNameOpenFile: success %d",
vfdP->fd));
Insert(file);
- if (fileName == NULL)
- elog(ERROR, "fileNameOpenFile: NULL fname");
- vfdP->fileName = malloc(strlen(fileName) + 1);
- strcpy(vfdP->fileName, fileName);
-
- vfdP->fileFlags = fileFlags & ~(O_TRUNC | O_EXCL);
+ vfdP->fileName = fnamecopy;
+ /* Saved flags are adjusted to be OK for re-opening file */
+ vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
vfdP->fileMode = fileMode;
vfdP->seekPos = 0;
+ vfdP->fdstate = 0x0;
return file;
}
/*
- * open a file in the database directory ($PGDATA/base/...)
+ * open a file in the database directory ($PGDATA/base/DIROID/)
+ *
+ * The passed name MUST be a relative path. Effectively, this
+ * prepends DatabasePath to it and then acts like PathNameOpenFile.
*/
File
FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
File fd;
char *fname;
- fname = filepath(fileName);
- fd = fileNameOpenFile(fname, fileFlags, fileMode);
+ fname = make_database_relative(fileName);
+ fd = PathNameOpenFile(fname, fileFlags, fileMode);
pfree(fname);
return fd;
}
/*
- * open a file in an arbitrary directory
+ * Open a temporary file that will disappear when we close it.
+ *
+ * This routine takes care of generating an appropriate tempfile name.
+ * There's no need to pass in fileFlags or fileMode either, since only
+ * one setting makes any sense for a temp file.
+ *
+ * interXact: if true, don't close the file at end-of-transaction. In
+ * most cases, you don't want temporary files to outlive the transaction
+ * that created them, so this should be false -- but if you need
+ * "somewhat" temporary storage, this might be useful. In either case,
+ * the file is removed when the File is explicitly closed.
*/
File
-PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
+OpenTemporaryFile(bool interXact)
{
- return fileNameOpenFile(fileName, fileFlags, fileMode);
+ char tempfilepath[MAXPGPATH];
+ File file;
+
+ /*
+ * Generate a tempfile name that should be unique within the current
+ * database instance.
+ */
+ snprintf(tempfilepath, sizeof(tempfilepath),
+ "%s/%s%d.%ld", PG_TEMP_FILES_DIR, PG_TEMP_FILE_PREFIX,
+ MyProcPid, tempFileCounter++);
+
+ /*
+ * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
+ * temp file that can be reused.
+ */
+ file = FileNameOpenFile(tempfilepath,
+ O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+ 0600);
+ if (file <= 0)
+ {
+ char *dirpath;
+
+ /*
+ * We might need to create the PG_TEMP_FILES_DIR subdirectory, if no
+ * one has yet done so.
+ *
+ * Don't check for error from mkdir; it could fail if someone else
+ * just did the same thing. If it doesn't work then we'll bomb out on
+ * the second create attempt, instead.
+ */
+ dirpath = make_database_relative(PG_TEMP_FILES_DIR);
+ mkdir(dirpath, S_IRWXU);
+ pfree(dirpath);
+
+ file = FileNameOpenFile(tempfilepath,
+ O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+ 0600);
+ if (file <= 0)
+ elog(ERROR, "could not create temporary file \"%s\": %m",
+ tempfilepath);
+ }
+
+ /* Mark it for deletion at close */
+ VfdCache[file].fdstate |= FD_TEMPORARY;
+
+ /* Mark it for deletion at EOXact */
+ if (!interXact)
+ {
+ VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
+ VfdCache[file].create_subid = GetCurrentSubTransactionId();
+ }
+
+ return file;
}
+/*
+ * close a file when done with it
+ */
void
FileClose(File file)
{
- int returnValue;
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
- DO_DB(elog(DEBUG, "FileClose: %d (%s)",
+ DO_DB(elog(LOG, "FileClose: %d (%s)",
file, VfdCache[file].fileName));
+ vfdP = &VfdCache[file];
+
if (!FileIsNotOpen(file))
{
-
/* remove the file from the lru ring */
Delete(file);
- /* if we did any writes, sync the file before closing */
- if (VfdCache[file].fdstate & FD_DIRTY)
- {
- returnValue = fsync(VfdCache[file].fd);
- Assert(returnValue != -1);
- VfdCache[file].fdstate &= ~FD_DIRTY;
- }
-
/* close the file */
- returnValue = close(VfdCache[file].fd);
- Assert(returnValue != -1);
+ if (close(vfdP->fd))
+ elog(ERROR, "failed to close \"%s\": %m",
+ vfdP->fileName);
--nfile;
- VfdCache[file].fd = VFD_CLOSED;
+ vfdP->fd = VFD_CLOSED;
}
/*
- * Add the Vfd slot to the free list
+ * Delete the file if it was temporary
*/
- FreeVfd(file);
+ if (vfdP->fdstate & FD_TEMPORARY)
+ {
+ /* reset flag so that die() interrupt won't cause problems */
+ vfdP->fdstate &= ~FD_TEMPORARY;
+ if (unlink(vfdP->fileName))
+ elog(LOG, "failed to unlink \"%s\": %m",
+ vfdP->fileName);
+ }
/*
- * Free the filename string
+ * Return the Vfd slot to the free list
*/
- free(VfdCache[file].fileName);
+ FreeVfd(file);
}
+/*
+ * close a file and forcibly delete the underlying Unix file
+ */
void
FileUnlink(File file)
{
- int returnValue;
+ Assert(FileIsValid(file));
- DO_DB(elog(DEBUG, "FileUnlink: %d (%s)",
+ DO_DB(elog(LOG, "FileUnlink: %d (%s)",
file, VfdCache[file].fileName));
- if (!FileIsNotOpen(file))
- {
-
- /* remove the file from the lru ring */
- Delete(file);
+ /* force FileClose to delete it */
+ VfdCache[file].fdstate |= FD_TEMPORARY;
- /* if we did any writes, sync the file before closing */
- if (VfdCache[file].fdstate & FD_DIRTY)
- {
- returnValue = fsync(VfdCache[file].fd);
- Assert(returnValue != -1);
- VfdCache[file].fdstate &= ~FD_DIRTY;
- }
-
- /* close the file */
- returnValue = close(VfdCache[file].fd);
- Assert(returnValue != -1);
-
- --nfile;
- VfdCache[file].fd = VFD_CLOSED;
- }
- /* add the Vfd slot to the free list */
- FreeVfd(file);
-
- /* free the filename string */
- unlink(VfdCache[file].fileName);
- free(VfdCache[file].fileName);
+ FileClose(file);
}
int
{
int returnCode;
- DO_DB(elog(DEBUG, "FileRead: %d (%s) %d %p",
- file, VfdCache[file].fileName, amount, buffer));
+ Assert(FileIsValid(file));
- FileAccess(file);
+ DO_DB(elog(LOG, "FileRead: %d (%s) %ld %d %p",
+ file, VfdCache[file].fileName,
+ VfdCache[file].seekPos, amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+retry:
returnCode = read(VfdCache[file].fd, buffer, amount);
- if (returnCode > 0)
+
+ if (returnCode >= 0)
VfdCache[file].seekPos += returnCode;
+ else
+ {
+ /*
+ * Windows may run out of kernel buffers and return "Insufficient
+ * system resources" error. Wait a bit and retry to solve it.
+ *
+ * It is rumored that EINTR is also possible on some Unix filesystems,
+ * in which case immediate retry is indicated.
+ */
+#ifdef WIN32
+ DWORD error = GetLastError();
+
+ switch (error)
+ {
+ case ERROR_NO_SYSTEM_RESOURCES:
+ pg_usleep(1000L);
+ errno = EINTR;
+ break;
+ default:
+ _dosmaperr(error);
+ break;
+ }
+#endif
+ /* OK to retry if interrupted */
+ if (errno == EINTR)
+ goto retry;
+
+ /* Trouble, so assume we don't know the file position anymore */
+ VfdCache[file].seekPos = FileUnknownPos;
+ }
return returnCode;
}
{
int returnCode;
- DO_DB(elog(DEBUG, "FileWrite: %d (%s) %d %p",
- file, VfdCache[file].fileName, amount, buffer));
+ Assert(FileIsValid(file));
- FileAccess(file);
+ DO_DB(elog(LOG, "FileWrite: %d (%s) %ld %d %p",
+ file, VfdCache[file].fileName,
+ VfdCache[file].seekPos, amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+retry:
+ errno = 0;
returnCode = write(VfdCache[file].fd, buffer, amount);
- if (returnCode > 0)
- { /* changed by Boris with Mao's advice */
+
+ /* if write didn't set errno, assume problem is no disk space */
+ if (returnCode != amount && errno == 0)
+ errno = ENOSPC;
+
+ if (returnCode >= 0)
VfdCache[file].seekPos += returnCode;
- }
+ else
+ {
+ /*
+ * See comments in FileRead()
+ */
+#ifdef WIN32
+ DWORD error = GetLastError();
- /* record the write */
- VfdCache[file].fdstate |= FD_DIRTY;
+ switch (error)
+ {
+ case ERROR_NO_SYSTEM_RESOURCES:
+ pg_usleep(1000L);
+ errno = EINTR;
+ break;
+ default:
+ _dosmaperr(error);
+ break;
+ }
+#endif
+ /* OK to retry if interrupted */
+ if (errno == EINTR)
+ goto retry;
+
+ /* Trouble, so assume we don't know the file position anymore */
+ VfdCache[file].seekPos = FileUnknownPos;
+ }
return returnCode;
}
+int
+FileSync(File file)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileSync: %d (%s)",
+ file, VfdCache[file].fileName));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ return pg_fsync(VfdCache[file].fd);
+}
+
long
FileSeek(File file, long offset, int whence)
{
int returnCode;
- DO_DB(elog(DEBUG, "FileSeek: %d (%s) %ld %d",
- file, VfdCache[file].fileName, offset, whence));
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
+ file, VfdCache[file].fileName,
+ VfdCache[file].seekPos, offset, whence));
if (FileIsNotOpen(file))
{
switch (whence)
{
case SEEK_SET:
+ if (offset < 0)
+ elog(ERROR, "invalid seek offset: %ld", offset);
VfdCache[file].seekPos = offset;
- return offset;
+ break;
case SEEK_CUR:
- VfdCache[file].seekPos = VfdCache[file].seekPos + offset;
- return VfdCache[file].seekPos;
+ VfdCache[file].seekPos += offset;
+ break;
case SEEK_END:
- FileAccess(file);
- returnCode = VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
- return returnCode;
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
+ break;
default:
- elog(ERROR, "FileSeek: invalid whence: %d", whence);
+ elog(ERROR, "invalid whence: %d", whence);
break;
}
}
else
{
- returnCode = VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
- return returnCode;
+ switch (whence)
+ {
+ case SEEK_SET:
+ if (offset < 0)
+ elog(ERROR, "invalid seek offset: %ld", offset);
+ if (VfdCache[file].seekPos != offset)
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
+ break;
+ case SEEK_CUR:
+ if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
+ break;
+ case SEEK_END:
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
+ break;
+ default:
+ elog(ERROR, "invalid whence: %d", whence);
+ break;
+ }
}
- /* NOTREACHED */
- return -1L;
+ return VfdCache[file].seekPos;
}
/*
long
FileTell(File file)
{
- DO_DB(elog(DEBUG, "FileTell %d (%s)",
+ Assert(FileIsValid(file));
+ DO_DB(elog(LOG, "FileTell %d (%s)",
file, VfdCache[file].fileName));
return VfdCache[file].seekPos;
}
-
#endif
int
-FileTruncate(File file, int offset)
+FileTruncate(File file, long offset)
{
int returnCode;
- DO_DB(elog(DEBUG, "FileTruncate %d (%s)",
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileTruncate %d (%s)",
file, VfdCache[file].fileName));
- FileSync(file);
- FileAccess(file);
- returnCode = ftruncate(VfdCache[file].fd, offset);
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
return returnCode;
}
-int
-FileSync(File file)
+
+/*
+ * Routines that want to use stdio (ie, FILE*) should use AllocateFile
+ * rather than plain fopen(). This lets fd.c deal with freeing FDs if
+ * necessary to open the file. When done, call FreeFile rather than fclose.
+ *
+ * Note that files that will be open for any significant length of time
+ * should NOT be handled this way, since they cannot share kernel file
+ * descriptors with other files; there is grave risk of running out of FDs
+ * if anyone locks down too many FDs. Most callers of this routine are
+ * simply reading a config file that they will read and close immediately.
+ *
+ * fd.c will automatically close all files opened with AllocateFile at
+ * transaction commit or abort; this prevents FD leakage if a routine
+ * that calls AllocateFile is terminated prematurely by ereport(ERROR).
+ *
+ * Ideally this should be the *only* direct call of fopen() in the backend.
+ */
+FILE *
+AllocateFile(const char *name, const char *mode)
{
- int returnCode;
+ FILE *file;
+
+ DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
+ numAllocatedDescs, name));
/*
- * If the file isn't open, then we don't need to sync it; we always
- * sync files when we close them. Also, if we haven't done any writes
- * that we haven't already synced, we can ignore the request.
+ * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+ * allocatedDescs[]; the test against max_safe_fds prevents AllocateFile
+ * from hogging every one of the available FDs, which'd lead to infinite
+ * looping.
*/
+ if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+ numAllocatedDescs >= max_safe_fds - 1)
+ elog(ERROR, "too many private files demanded");
- if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY))
- returnCode = 0;
- else
+TryAgain:
+ if ((file = fopen(name, mode)) != NULL)
{
- returnCode = fsync(VfdCache[file].fd);
- VfdCache[file].fdstate &= ~FD_DIRTY;
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescFile;
+ desc->desc.file = file;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.file;
}
- return returnCode;
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto TryAgain;
+ errno = save_errno;
+ }
+
+ /*
+ * TEMPORARY hack to log the Windows error code on fopen failures, in
+ * hopes of diagnosing some hard-to-reproduce problems.
+ */
+#ifdef WIN32
+ {
+ int save_errno = errno;
+
+ elog(LOG, "Windows fopen(\"%s\",\"%s\") failed: code %lu, errno %d",
+ name, mode, GetLastError(), save_errno);
+ errno = save_errno;
+ }
+#endif
+
+ return NULL;
}
-int
-FileNameUnlink(char *filename)
+/*
+ * Free an AllocateDesc of either type.
+ *
+ * The argument *must* point into the allocatedDescs[] array.
+ */
+static int
+FreeDesc(AllocateDesc *desc)
{
- int retval;
- char *fname;
+ int result;
- fname = filepath(filename);
- retval = unlink(fname);
- pfree(fname);
- return retval;
+ /* Close the underlying object */
+ switch (desc->kind)
+ {
+ case AllocateDescFile:
+ result = fclose(desc->desc.file);
+ break;
+ case AllocateDescDir:
+ result = closedir(desc->desc.dir);
+ break;
+ default:
+ elog(ERROR, "AllocateDesc kind not recognized");
+ result = 0; /* keep compiler quiet */
+ break;
+ }
+
+ /* Compact storage in the allocatedDescs array */
+ numAllocatedDescs--;
+ *desc = allocatedDescs[numAllocatedDescs];
+
+ return result;
}
/*
- * if we want to be sure that we have a real file descriptor available
- * (e.g., we want to know this in psort) we call AllocateFile to force
- * availability. when we are done we call FreeFile to deallocate the
- * descriptor.
+ * Close a file returned by AllocateFile.
*
- * allocatedFiles keeps track of how many have been allocated so we
- * can give a warning if there are too few left.
+ * Note we do not check fclose's return value --- it is up to the caller
+ * to handle close errors.
*/
-static int allocatedFiles = 0;
+int
+FreeFile(FILE *file)
+{
+ int i;
-FILE *
-AllocateFile(char *name, char *mode)
+ DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
+
+ /* Remove file from list of allocated files, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescFile && desc->desc.file == file)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a file not in allocatedDescs */
+ elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
+
+ return fclose(file);
+}
+
+
+/*
+ * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
+ * rather than plain opendir(). This lets fd.c deal with freeing FDs if
+ * necessary to open the directory, and with closing it after an elog.
+ * When done, call FreeDir rather than closedir.
+ *
+ * Ideally this should be the *only* direct call of opendir() in the backend.
+ */
+DIR *
+AllocateDir(const char *dirname)
{
- FILE *file;
- int fdleft;
+ DIR *dir;
- DO_DB(elog(DEBUG, "AllocateFile: Allocated %d.", allocatedFiles));
+ DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
+ numAllocatedDescs, dirname));
+
+ /*
+ * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+ * allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
+ * from hogging every one of the available FDs, which'd lead to infinite
+ * looping.
+ */
+ if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+ numAllocatedDescs >= max_safe_fds - 1)
+ elog(ERROR, "too many private dirs demanded");
TryAgain:
- if ((file = fopen(name, mode)) == NULL)
+ if ((dir = opendir(dirname)) != NULL)
{
- if (errno == EMFILE || errno == ENFILE)
- {
- DO_DB(elog(DEBUG, "AllocateFile: not enough descs, retry, er= %d",
- errno));
- errno = 0;
- AssertLruRoom();
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescDir;
+ desc->desc.dir = dir;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.dir;
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
goto TryAgain;
+ errno = save_errno;
+ }
+
+ return NULL;
+}
+
+/*
+ * Read a directory opened with AllocateDir, ereport'ing any error.
+ *
+ * This is easier to use than raw readdir() since it takes care of some
+ * otherwise rather tedious and error-prone manipulation of errno. Also,
+ * if you are happy with a generic error message for AllocateDir failure,
+ * you can just do
+ *
+ * dir = AllocateDir(path);
+ * while ((dirent = ReadDir(dir, path)) != NULL)
+ * process dirent;
+ * FreeDir(dir);
+ *
+ * since a NULL dir parameter is taken as indicating AllocateDir failed.
+ * (Make sure errno hasn't been changed since AllocateDir if you use this
+ * shortcut.)
+ *
+ * The pathname passed to AllocateDir must be passed to this routine too,
+ * but it is only used for error reporting.
+ */
+struct dirent *
+ReadDir(DIR *dir, const char *dirname)
+{
+ struct dirent *dent;
+
+ /* Give a generic message for AllocateDir failure, if caller didn't */
+ if (dir == NULL)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open directory \"%s\": %m",
+ dirname)));
+
+ errno = 0;
+ if ((dent = readdir(dir)) != NULL)
+ return dent;
+
+#ifdef WIN32
+
+ /*
+ * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
+ * a released version
+ */
+ if (GetLastError() == ERROR_NO_MORE_FILES)
+ errno = 0;
+#endif
+
+ if (errno)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read directory \"%s\": %m",
+ dirname)));
+ return NULL;
+}
+
+/*
+ * Close a directory opened with AllocateDir.
+ *
+ * Note we do not check closedir's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+FreeDir(DIR *dir)
+{
+ int i;
+
+ DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
+
+ /* Remove dir from list of allocated dirs, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a dir not in allocatedDescs */
+ elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
+
+ return closedir(dir);
+}
+
+
+/*
+ * closeAllVfds
+ *
+ * Force all VFDs into the physically-closed state, so that the fewest
+ * possible number of kernel file descriptors are in use. There is no
+ * change in the logical state of the VFDs.
+ */
+void
+closeAllVfds(void)
+{
+ Index i;
+
+ if (SizeVfdCache > 0)
+ {
+ Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
+ for (i = 1; i < SizeVfdCache; i++)
+ {
+ if (!FileIsNotOpen(i))
+ LruDelete(i);
}
}
- else
+}
+
+/*
+ * AtEOSubXact_Files
+ *
+ * Take care of subtransaction commit/abort. At abort, we close temp files
+ * that the subtransaction may have opened. At commit, we reassign the
+ * files that were opened to the parent subtransaction.
+ */
+void
+AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+ SubTransactionId parentSubid)
+{
+ Index i;
+
+ if (SizeVfdCache > 0)
{
- ++allocatedFiles;
- fdleft = pg_nofile() - allocatedFiles;
- if (fdleft < 6)
- elog(NOTICE, "warning: few usable file descriptors left (%d)", fdleft);
+ Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
+ for (i = 1; i < SizeVfdCache; i++)
+ {
+ unsigned short fdstate = VfdCache[i].fdstate;
+
+ if ((fdstate & FD_XACT_TEMPORARY) &&
+ VfdCache[i].create_subid == mySubid)
+ {
+ if (isCommit)
+ VfdCache[i].create_subid = parentSubid;
+ else if (VfdCache[i].fileName != NULL)
+ FileClose(i);
+ }
+ }
+ }
+
+ for (i = 0; i < numAllocatedDescs; i++)
+ {
+ if (allocatedDescs[i].create_subid == mySubid)
+ {
+ if (isCommit)
+ allocatedDescs[i].create_subid = parentSubid;
+ else
+ {
+ /* have to recheck the item after FreeDesc (ugly) */
+ FreeDesc(&allocatedDescs[i--]);
+ }
+ }
}
- return file;
}
/*
- * XXX What happens if FreeFile() is called without a previous
- * AllocateFile()?
+ * AtEOXact_Files
+ *
+ * This routine is called during transaction commit or abort (it doesn't
+ * particularly care which). All still-open per-transaction temporary file
+ * VFDs are closed, which also causes the underlying files to be
+ * deleted. Furthermore, all "allocated" stdio files and dirs are closed.
*/
void
-FreeFile(FILE *file)
+AtEOXact_Files(void)
{
- DO_DB(elog(DEBUG, "FreeFile: Allocated %d.", allocatedFiles));
+ CleanupTempFiles(false);
+}
- Assert(allocatedFiles > 0);
- fclose(file);
- --allocatedFiles;
+/*
+ * AtProcExit_Files
+ *
+ * on_proc_exit hook to clean up temp files during backend shutdown.
+ * Here, we want to clean up *all* temp files including interXact ones.
+ */
+static void
+AtProcExit_Files(int code, Datum arg)
+{
+ CleanupTempFiles(true);
}
+/*
+ * Close temporary files and delete their underlying files.
+ *
+ * isProcExit: if true, this is being called as the backend process is
+ * exiting. If that's the case, we should remove all temporary files; if
+ * that's not the case, we are being called for transaction commit/abort
+ * and should only remove transaction-local temp files. In either case,
+ * also clean up "allocated" stdio files and dirs.
+ */
+static void
+CleanupTempFiles(bool isProcExit)
+{
+ Index i;
+
+ if (SizeVfdCache > 0)
+ {
+ Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
+ for (i = 1; i < SizeVfdCache; i++)
+ {
+ unsigned short fdstate = VfdCache[i].fdstate;
+
+ if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
+ {
+ /*
+ * If we're in the process of exiting a backend process, close
+ * all temporary files. Otherwise, only close temporary files
+ * local to the current transaction.
+ */
+ if (isProcExit || (fdstate & FD_XACT_TEMPORARY))
+ FileClose(i);
+ }
+ }
+ }
+
+ while (numAllocatedDescs > 0)
+ FreeDesc(&allocatedDescs[0]);
+}
+
+
+/*
+ * Remove temporary files left over from a prior postmaster session
+ *
+ * This should be called during postmaster startup. It will forcibly
+ * remove any leftover files created by OpenTemporaryFile.
+ *
+ * NOTE: we could, but don't, call this during a post-backend-crash restart
+ * cycle. The argument for not doing it is that someone might want to examine
+ * the temp files for debugging purposes. This does however mean that
+ * OpenTemporaryFile had better allow for collision with an existing temp
+ * file name.
+ */
void
-closeAllVfds()
+RemovePgTempFiles(void)
{
- int i;
+ char temp_path[MAXPGPATH];
+ DIR *db_dir;
+ struct dirent *db_de;
+
+ /*
+ * Cycle through pgsql_tmp directories for all databases and remove old
+ * temp files.
+ */
+ db_dir = AllocateDir("base");
+
+ while ((db_de = ReadDir(db_dir, "base")) != NULL)
+ {
+ if (strcmp(db_de->d_name, ".") == 0 ||
+ strcmp(db_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(temp_path, sizeof(temp_path), "base/%s/%s",
+ db_de->d_name, PG_TEMP_FILES_DIR);
+ RemovePgTempFilesInDir(temp_path);
+ }
+
+ FreeDir(db_dir);
- Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
- for (i = 1; i < SizeVfdCache; i++)
+ /*
+ * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
+ * DataDir as well.
+ */
+#ifdef EXEC_BACKEND
+ RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
+#endif
+}
+
+/* Process one pgsql_tmp directory for RemovePgTempFiles */
+static void
+RemovePgTempFilesInDir(const char *tmpdirname)
+{
+ DIR *temp_dir;
+ struct dirent *temp_de;
+ char rm_path[MAXPGPATH];
+
+ temp_dir = AllocateDir(tmpdirname);
+ if (temp_dir == NULL)
+ {
+ /* anything except ENOENT is fishy */
+ if (errno != ENOENT)
+ elog(LOG,
+ "could not open temporary-files directory \"%s\": %m",
+ tmpdirname);
+ return;
+ }
+
+ while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
{
- if (!FileIsNotOpen(i))
- LruDelete(i);
+ if (strcmp(temp_de->d_name, ".") == 0 ||
+ strcmp(temp_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(rm_path, sizeof(rm_path), "%s/%s",
+ tmpdirname, temp_de->d_name);
+
+ if (strncmp(temp_de->d_name,
+ PG_TEMP_FILE_PREFIX,
+ strlen(PG_TEMP_FILE_PREFIX)) == 0)
+ unlink(rm_path); /* note we ignore any error */
+ else
+ elog(LOG,
+ "unexpected file found in temporary-files directory: \"%s\"",
+ rm_path);
}
+
+ FreeDir(temp_dir);
}