X-Git-Url: https://granicus.if.org/sourcecode?a=blobdiff_plain;f=src%2Fbackend%2Fstorage%2Ffile%2Ffd.c;h=a594b16edf35f12e15e04d7772f190eda54165b0;hb=f99a569a2ee3763b4ae174e81250c95ca0fdcbb6;hp=271a752a623fc2a9a5263702979a1e2492c0ac22;hpb=089003fb462fcce46c02bf47322b429f73c33c50;p=postgresql diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 271a752a62..a594b16edf 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -3,11 +3,11 @@ * fd.c * Virtual file descriptor code. * - * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.100 2003/08/04 00:43:23 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.130 2006/10/04 00:29:57 momjian Exp $ * * NOTES: * @@ -43,57 +43,60 @@ #include #include #include -#include -#include #include #include #include "miscadmin.h" +#include "access/xact.h" #include "storage/fd.h" #include "storage/ipc.h" -/* Filename components for OpenTemporaryFile */ -#define PG_TEMP_FILES_DIR "pgsql_tmp" -#define PG_TEMP_FILE_PREFIX "pgsql_tmp" - - /* - * Problem: Postgres does a system(ld...) to do dynamic loading. - * This will open several extra files in addition to those used by - * Postgres. We need to guarantee that there are file descriptors free - * for ld to use. - * - * The current solution is to limit the number of file descriptors - * that this code will allocate at one time: it leaves RESERVE_FOR_LD free. + * We must leave some file descriptors free for system(), the dynamic loader, + * and other code that tries to open files without consulting fd.c. This + * is the number left free. 
(While we can be pretty sure we won't get + * EMFILE, there's never any guarantee that we won't get ENFILE due to + * other processes chewing up FDs. So it's a bad idea to try to open files + * without consulting fd.c. Nonetheless we cannot control all code.) * - * (Even though most dynamic loaders now use dlopen(3) or the - * equivalent, the OS must still open several files to perform the - * dynamic loading. And stdin/stdout/stderr count too. Keep this here.) + * Because this is just a fixed setting, we are effectively assuming that + * no such code will leave FDs open over the long term; otherwise the slop + * is likely to be insufficient. Note in particular that we expect that + * loading a shared library does not result in any permanent increase in + * the number of open files. (This appears to be true on most if not + * all platforms as of Feb 2004.) */ -#ifndef RESERVE_FOR_LD -#define RESERVE_FOR_LD 10 -#endif +#define NUM_RESERVED_FDS 10 /* - * We need to ensure that we have at least some file descriptors - * available to postgreSQL after we've reserved the ones for LD, - * so we set that value here. - * - * I think 10 is an appropriate value so that's what it'll be - * for now. + * If we have fewer than this many usable FDs after allowing for the reserved + * ones, choke. */ -#ifndef FD_MINFREE -#define FD_MINFREE 10 -#endif +#define FD_MINFREE 10 + /* - * A number of platforms return values for sysconf(_SC_OPEN_MAX) that are - * far beyond what they can really support. This GUC parameter limits what - * we will believe. + * A number of platforms allow individual processes to open many more files + * than they can really support when *many* processes do the same thing. + * This GUC parameter lets the DBA limit max_safe_fds to something less than + * what the postmaster's initial probe suggests will work. */ int max_files_per_process = 1000; +/* + * Maximum number of file descriptors to open for either VFD entries or + * AllocateFile/AllocateDir operations. 
This is initialized to a conservative + * value, and remains that way indefinitely in bootstrap or standalone-backend + * cases. In normal postmaster operation, the postmaster calls + * set_max_safe_fds() late in initialization to update the value, and that + * value is then inherited by forked subprocesses. + * + * Note: the value of max_files_per_process is taken into account while + * setting this variable, and so need not be tested separately. + */ +static int max_safe_fds = 32; /* default if not changed */ + /* Debugging.... */ @@ -120,6 +123,7 @@ typedef struct vfd { signed short fd; /* current FD, or VFD_CLOSED if none */ unsigned short fdstate; /* bitflags for VFD's state */ + SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */ File nextFree; /* link to next free VFD, if in freelist */ File lruMoreRecently; /* doubly linked recency-of-use list */ File lruLessRecently; @@ -144,16 +148,34 @@ static Size SizeVfdCache = 0; static int nfile = 0; /* - * List of stdio FILEs opened with AllocateFile. + * List of stdio FILEs and DIRs opened with AllocateFile + * and AllocateDir. * - * Since we don't want to encourage heavy use of AllocateFile, it seems - * OK to put a pretty small maximum limit on the number of simultaneously - * allocated files. + * Since we don't want to encourage heavy use of AllocateFile or AllocateDir, + * it seems OK to put a pretty small maximum limit on the number of + * simultaneously allocated descs. 
*/ -#define MAX_ALLOCATED_FILES 32 +#define MAX_ALLOCATED_DESCS 32 -static int numAllocatedFiles = 0; -static FILE *allocatedFiles[MAX_ALLOCATED_FILES]; +typedef enum +{ + AllocateDescFile, + AllocateDescDir +} AllocateDescKind; + +typedef struct +{ + AllocateDescKind kind; + union + { + FILE *file; + DIR *dir; + } desc; + SubTransactionId create_subid; +} AllocateDesc; + +static int numAllocatedDescs = 0; +static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS]; /* * Number of temporary files opened during the current session; @@ -202,18 +224,33 @@ static File AllocateVfd(void); static void FreeVfd(File file); static int FileAccess(File file); -static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode); -static char *filepath(const char *filename); -static long pg_nofile(void); -static void AtProcExit_Files(void); +static char *make_database_relative(const char *filename); +static void AtProcExit_Files(int code, Datum arg); static void CleanupTempFiles(bool isProcExit); +static void RemovePgTempFilesInDir(const char *tmpdirname); /* - * pg_fsync --- same as fsync except does nothing if enableFsync is off + * pg_fsync --- do fsync with or without writethrough */ int pg_fsync(int fd) +{ +#ifndef HAVE_FSYNC_WRITETHROUGH_ONLY + if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH) + return pg_fsync_no_writethrough(fd); + else +#endif + return pg_fsync_writethrough(fd); +} + + +/* + * pg_fsync_no_writethrough --- same as fsync except does nothing if + * enableFsync is off + */ +int +pg_fsync_no_writethrough(int fd) { if (enableFsync) return fsync(fd); @@ -221,6 +258,26 @@ pg_fsync(int fd) return 0; } +/* + * pg_fsync_writethrough + */ +int +pg_fsync_writethrough(int fd) +{ + if (enableFsync) + { +#ifdef WIN32 + return _commit(fd); +#elif defined(F_FULLFSYNC) + return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? 
-1 : 0; +#else + return -1; +#endif + } + else + return 0; +} + /* * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off * @@ -241,6 +298,143 @@ pg_fdatasync(int fd) return 0; } +/* + * InitFileAccess --- initialize this module during backend startup + * + * This is called during either normal or standalone backend start. + * It is *not* called in the postmaster. + */ +void +InitFileAccess(void) +{ + Assert(SizeVfdCache == 0); /* call me only once */ + + /* initialize cache header entry */ + VfdCache = (Vfd *) malloc(sizeof(Vfd)); + if (VfdCache == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd)); + VfdCache->fd = VFD_CLOSED; + + SizeVfdCache = 1; + + /* register proc-exit hook to ensure temp files are dropped at exit */ + on_proc_exit(AtProcExit_Files, 0); +} + +/* + * count_usable_fds --- count how many FDs the system will let us open, + * and estimate how many are already open. + * + * We stop counting if usable_fds reaches max_to_probe. Note: a small + * value of max_to_probe might result in an underestimate of already_open; + * we must fill in any "gaps" in the set of used FDs before the calculation + * of already_open will give the right answer. In practice, max_to_probe + * of a couple of dozen should be enough to ensure good results. 
+ * + * We assume stdin (FD 0) is available for dup'ing + */ +static void +count_usable_fds(int max_to_probe, int *usable_fds, int *already_open) +{ + int *fd; + int size; + int used = 0; + int highestfd = 0; + int j; + + size = 1024; + fd = (int *) palloc(size * sizeof(int)); + + /* dup until failure or probe limit reached */ + for (;;) + { + int thisfd; + + thisfd = dup(0); + if (thisfd < 0) + { + /* Expect EMFILE or ENFILE, else it's fishy */ + if (errno != EMFILE && errno != ENFILE) + elog(WARNING, "dup(0) failed after %d successes: %m", used); + break; + } + + if (used >= size) + { + size *= 2; + fd = (int *) repalloc(fd, size * sizeof(int)); + } + fd[used++] = thisfd; + + if (highestfd < thisfd) + highestfd = thisfd; + + if (used >= max_to_probe) + break; + } + + /* release the files we opened */ + for (j = 0; j < used; j++) + close(fd[j]); + + pfree(fd); + + /* + * Return results. usable_fds is just the number of successful dups. We + * assume that the system limit is highestfd+1 (remember 0 is a legal FD + * number) and so already_open is highestfd+1 - usable_fds. + */ + *usable_fds = used; + *already_open = highestfd + 1 - used; +} + +/* + * set_max_safe_fds + * Determine number of filedescriptors that fd.c is allowed to use + */ +void +set_max_safe_fds(void) +{ + int usable_fds; + int already_open; + + /*---------- + * We want to set max_safe_fds to + * MIN(usable_fds, max_files_per_process - already_open) + * less the slop factor for files that are opened without consulting + * fd.c. This ensures that we won't exceed either max_files_per_process + * or the experimentally-determined EMFILE limit. + *---------- + */ + count_usable_fds(max_files_per_process, + &usable_fds, &already_open); + + max_safe_fds = Min(usable_fds, max_files_per_process - already_open); + + /* + * Take off the FDs reserved for system() etc. + */ + max_safe_fds -= NUM_RESERVED_FDS; + + /* + * Make sure we still have enough to get by. 
+ */ + if (max_safe_fds < FD_MINFREE) + ereport(FATAL, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("insufficient file descriptors available to start server process"), + errdetail("System allows %d, we need at least %d.", + max_safe_fds + NUM_RESERVED_FDS, + FD_MINFREE + NUM_RESERVED_FDS))); + + elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d", + max_safe_fds, usable_fds, already_open); +} + /* * BasicOpenFile --- same as open(2) except can free other FDs if needed * @@ -274,7 +468,7 @@ tryAgain: ereport(LOG, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), - errmsg("out of file descriptors: %m; release and retry"))); + errmsg("out of file descriptors: %m; release and retry"))); errno = 0; if (ReleaseLruFile()) goto tryAgain; @@ -284,63 +478,6 @@ tryAgain: return -1; /* failure */ } -/* - * pg_nofile: determine number of filedescriptors that fd.c is allowed to use - */ -static long -pg_nofile(void) -{ - static long no_files = 0; - - /* need do this calculation only once */ - if (no_files == 0) - { - /* - * Ask the system what its files-per-process limit is. - */ -#ifdef HAVE_SYSCONF - no_files = sysconf(_SC_OPEN_MAX); - if (no_files <= 0) - { -#ifdef NOFILE - no_files = (long) NOFILE; -#else - no_files = (long) max_files_per_process; -#endif - elog(LOG, "sysconf(_SC_OPEN_MAX) failed; using %ld", - no_files); - } -#else /* !HAVE_SYSCONF */ -#ifdef NOFILE - no_files = (long) NOFILE; -#else - no_files = (long) max_files_per_process; -#endif -#endif /* HAVE_SYSCONF */ - - /* - * Some platforms return hopelessly optimistic values. Apply a - * configurable upper limit. - */ - if (no_files > (long) max_files_per_process) - no_files = (long) max_files_per_process; - - /* - * Make sure we have enough to get by after reserving some for LD. 
- */ - if ((no_files - RESERVE_FOR_LD) < FD_MINFREE) - ereport(FATAL, - (errcode(ERRCODE_INSUFFICIENT_RESOURCES), - errmsg("insufficient file descriptors available to start backend"), - errdetail("System allows %ld, we need at least %d.", - no_files, RESERVE_FOR_LD + FD_MINFREE))); - - no_files -= RESERVE_FOR_LD; - } - - return no_files; -} - #if defined(FDDEBUG) static void @@ -402,7 +539,7 @@ LruDelete(File file) /* close the file */ if (close(vfdP->fd)) - elog(LOG, "failed to close \"%s\": %m", + elog(ERROR, "failed to close \"%s\": %m", vfdP->fileName); --nfile; @@ -430,6 +567,7 @@ Insert(File file) DO_DB(_dump_lru()); } +/* returns 0 on success, -1 on re-open failure (with errno set) */ static int LruInsert(File file) { @@ -444,16 +582,16 @@ LruInsert(File file) if (FileIsNotOpen(file)) { - while (nfile + numAllocatedFiles >= pg_nofile()) + while (nfile + numAllocatedDescs >= max_safe_fds) { if (!ReleaseLruFile()) break; } /* - * The open could still fail for lack of file descriptors, eg due - * to overall system file table being full. So, be prepared to - * release another FD if necessary... + * The open could still fail for lack of file descriptors, eg due to + * overall system file table being full. So, be prepared to release + * another FD if necessary... */ vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags, vfdP->fileMode); @@ -495,8 +633,8 @@ ReleaseLruFile(void) if (nfile > 0) { /* - * There are opened files and so there should be at least one used - * vfd in the ring. + * There are opened files and so there should be at least one used vfd + * in the ring. */ Assert(VfdCache[0].lruMoreRecently != 0); LruDelete(VfdCache[0].lruMoreRecently); @@ -511,34 +649,16 @@ AllocateVfd(void) Index i; File file; - DO_DB(elog(LOG, "AllocateVfd. Size %d", SizeVfdCache)); + DO_DB(elog(LOG, "AllocateVfd. 
Size %lu", SizeVfdCache)); - if (SizeVfdCache == 0) - { - /* initialize header entry first time through */ - VfdCache = (Vfd *) malloc(sizeof(Vfd)); - if (VfdCache == NULL) - ereport(FATAL, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd)); - VfdCache->fd = VFD_CLOSED; - - SizeVfdCache = 1; - - /* - * register proc-exit call to ensure temp files are dropped at - * exit - */ - on_proc_exit(AtProcExit_Files, 0); - } + Assert(SizeVfdCache > 0); /* InitFileAccess not called? */ if (VfdCache[0].nextFree == 0) { /* - * The free list is empty so it is time to increase the size of - * the array. We choose to double it each time this happens. - * However, there's not much point in starting *real* small. + * The free list is empty so it is time to increase the size of the + * array. We choose to double it each time this happens. However, + * there's not much point in starting *real* small. */ Size newCacheSize = SizeVfdCache * 2; Vfd *newVfdCache; @@ -600,37 +720,24 @@ FreeVfd(File file) VfdCache[0].nextFree = file; } -/* filepath() - * Convert given pathname to absolute. +/* + * make_database_relative() + * Prepend DatabasePath to the given file name. * * Result is a palloc'd string. - * - * (Generally, this isn't actually necessary, considering that we - * should be cd'd into the database directory. Presently it is only - * necessary to do it in "bootstrap" mode. Maybe we should change - * bootstrap mode to do the cd, and save a few cycles/bytes here.) */ static char * -filepath(const char *filename) +make_database_relative(const char *filename) { char *buf; - /* Not an absolute path name? Then fill in with database path... 
*/ - if (!is_absolute_path(filename)) - { - buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2); - sprintf(buf, "%s/%s", DatabasePath, filename); - } - else - buf = pstrdup(filename); - -#ifdef FILEDEBUG - printf("filepath: path is %s\n", buf); -#endif - + Assert(!is_absolute_path(filename)); + buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2); + sprintf(buf, "%s/%s", DatabasePath, filename); return buf; } +/* returns 0 on success, -1 on re-open failure (with errno set) */ static int FileAccess(File file) { @@ -640,9 +747,8 @@ FileAccess(File file) file, VfdCache[file].fileName)); /* - * Is the file open? If not, open it and put it at the head of the - * LRU ring (possibly closing the least recently used file to get an - * FD). + * Is the file open? If not, open it and put it at the head of the LRU + * ring (possibly closing the least recently used file to get an FD). */ if (FileIsNotOpen(file)) @@ -654,9 +760,8 @@ FileAccess(File file) else if (VfdCache[0].lruLessRecently != file) { /* - * We now know that the file is open and that it is not the last - * one accessed, so we need to move it to the head of the Lru - * ring. + * We now know that the file is open and that it is not the last one + * accessed, so we need to move it to the head of the Lru ring. */ Delete(file); @@ -679,16 +784,21 @@ FileInvalidate(File file) } #endif -static File -fileNameOpenFile(FileName fileName, - int fileFlags, - int fileMode) +/* + * open a file in an arbitrary directory + * + * NB: if the passed pathname is relative (which it usually is), + * it will be interpreted relative to the process' working directory + * (which should always be $PGDATA when this code is running). 
+ */ +File +PathNameOpenFile(FileName fileName, int fileFlags, int fileMode) { char *fnamecopy; File file; Vfd *vfdP; - DO_DB(elog(LOG, "fileNameOpenFile: %s %x %o", + DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o", fileName, fileFlags, fileMode)); /* @@ -703,7 +813,7 @@ fileNameOpenFile(FileName fileName, file = AllocateVfd(); vfdP = &VfdCache[file]; - while (nfile + numAllocatedFiles >= pg_nofile()) + while (nfile + numAllocatedDescs >= max_safe_fds) { if (!ReleaseLruFile()) break; @@ -718,7 +828,7 @@ fileNameOpenFile(FileName fileName, return -1; } ++nfile; - DO_DB(elog(LOG, "fileNameOpenFile: success %d", + DO_DB(elog(LOG, "PathNameOpenFile: success %d", vfdP->fd)); Insert(file); @@ -734,7 +844,10 @@ fileNameOpenFile(FileName fileName, } /* - * open a file in the database directory ($PGDATA/base/...) + * open a file in the database directory ($PGDATA/base/DIROID/) + * + * The passed name MUST be a relative path. Effectively, this + * prepends DatabasePath to it and then acts like PathNameOpenFile. */ File FileNameOpenFile(FileName fileName, int fileFlags, int fileMode) @@ -742,21 +855,12 @@ FileNameOpenFile(FileName fileName, int fileFlags, int fileMode) File fd; char *fname; - fname = filepath(fileName); - fd = fileNameOpenFile(fname, fileFlags, fileMode); + fname = make_database_relative(fileName); + fd = PathNameOpenFile(fname, fileFlags, fileMode); pfree(fname); return fd; } -/* - * open a file in an arbitrary directory - */ -File -PathNameOpenFile(FileName fileName, int fileFlags, int fileMode) -{ - return fileNameOpenFile(fileName, fileFlags, fileMode); -} - /* * Open a temporary file that will disappear when we close it. * @@ -785,8 +889,8 @@ OpenTemporaryFile(bool interXact) MyProcPid, tempFileCounter++); /* - * Open the file. Note: we don't use O_EXCL, in case there is an - * orphaned temp file that can be reused. + * Open the file. Note: we don't use O_EXCL, in case there is an orphaned + * temp file that can be reused. 
*/ file = FileNameOpenFile(tempfilepath, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY, @@ -796,14 +900,14 @@ OpenTemporaryFile(bool interXact) char *dirpath; /* - * We might need to create the pg_tempfiles subdirectory, if no - * one has yet done so. + * We might need to create the pg_tempfiles subdirectory, if no one + * has yet done so. * * Don't check for error from mkdir; it could fail if someone else - * just did the same thing. If it doesn't work then we'll bomb - * out on the second create attempt, instead. + * just did the same thing. If it doesn't work then we'll bomb out on + * the second create attempt, instead. */ - dirpath = filepath(PG_TEMP_FILES_DIR); + dirpath = make_database_relative(PG_TEMP_FILES_DIR); mkdir(dirpath, S_IRWXU); pfree(dirpath); @@ -820,7 +924,10 @@ OpenTemporaryFile(bool interXact) /* Mark it for deletion at EOXact */ if (!interXact) + { VfdCache[file].fdstate |= FD_XACT_TEMPORARY; + VfdCache[file].create_subid = GetCurrentSubTransactionId(); + } return file; } @@ -847,7 +954,7 @@ FileClose(File file) /* close the file */ if (close(vfdP->fd)) - elog(LOG, "failed to close \"%s\": %m", + elog(ERROR, "failed to close \"%s\": %m", vfdP->fileName); --nfile; @@ -900,12 +1007,45 @@ FileRead(File file, char *buffer, int amount) file, VfdCache[file].fileName, VfdCache[file].seekPos, amount, buffer)); - FileAccess(file); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + +retry: returnCode = read(VfdCache[file].fd, buffer, amount); - if (returnCode > 0) + + if (returnCode >= 0) VfdCache[file].seekPos += returnCode; else + { + /* + * Windows may run out of kernel buffers and return "Insufficient + * system resources" error. Wait a bit and retry to solve it. + * + * It is rumored that EINTR is also possible on some Unix filesystems, + * in which case immediate retry is indicated. 
+ */ +#ifdef WIN32 + DWORD error = GetLastError(); + + switch (error) + { + case ERROR_NO_SYSTEM_RESOURCES: + pg_usleep(1000L); + errno = EINTR; + break; + default: + _dosmaperr(error); + break; + } +#endif + /* OK to retry if interrupted */ + if (errno == EINTR) + goto retry; + + /* Trouble, so assume we don't know the file position anymore */ VfdCache[file].seekPos = FileUnknownPos; + } return returnCode; } @@ -921,8 +1061,11 @@ FileWrite(File file, char *buffer, int amount) file, VfdCache[file].fileName, VfdCache[file].seekPos, amount, buffer)); - FileAccess(file); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; +retry: errno = 0; returnCode = write(VfdCache[file].fd, buffer, amount); @@ -930,17 +1073,60 @@ FileWrite(File file, char *buffer, int amount) if (returnCode != amount && errno == 0) errno = ENOSPC; - if (returnCode > 0) + if (returnCode >= 0) VfdCache[file].seekPos += returnCode; else + { + /* + * See comments in FileRead() + */ +#ifdef WIN32 + DWORD error = GetLastError(); + + switch (error) + { + case ERROR_NO_SYSTEM_RESOURCES: + pg_usleep(1000L); + errno = EINTR; + break; + default: + _dosmaperr(error); + break; + } +#endif + /* OK to retry if interrupted */ + if (errno == EINTR) + goto retry; + + /* Trouble, so assume we don't know the file position anymore */ VfdCache[file].seekPos = FileUnknownPos; + } return returnCode; } +int +FileSync(File file) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileSync: %d (%s)", + file, VfdCache[file].fileName)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + return pg_fsync(VfdCache[file].fd); +} + long FileSeek(File file, long offset, int whence) { + int returnCode; + Assert(FileIsValid(file)); DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d", @@ -960,8 +1146,11 @@ FileSeek(File file, long offset, int whence) VfdCache[file].seekPos += offset; break; case SEEK_END: - FileAccess(file); - VfdCache[file].seekPos = 
lseek(VfdCache[file].fd, offset, whence); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + VfdCache[file].seekPos = lseek(VfdCache[file].fd, + offset, whence); break; default: elog(ERROR, "invalid whence: %d", whence); @@ -976,14 +1165,17 @@ FileSeek(File file, long offset, int whence) if (offset < 0) elog(ERROR, "invalid seek offset: %ld", offset); if (VfdCache[file].seekPos != offset) - VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); + VfdCache[file].seekPos = lseek(VfdCache[file].fd, + offset, whence); break; case SEEK_CUR: if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos) - VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); + VfdCache[file].seekPos = lseek(VfdCache[file].fd, + offset, whence); break; case SEEK_END: - VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); + VfdCache[file].seekPos = lseek(VfdCache[file].fd, + offset, whence); break; default: elog(ERROR, "invalid whence: %d", whence); @@ -1017,7 +1209,10 @@ FileTruncate(File file, long offset) DO_DB(elog(LOG, "FileTruncate %d (%s)", file, VfdCache[file].fileName)); - FileAccess(file); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + returnCode = ftruncate(VfdCache[file].fd, (size_t) offset); return returnCode; } @@ -1041,21 +1236,33 @@ FileTruncate(File file, long offset) * Ideally this should be the *only* direct call of fopen() in the backend. 
*/ FILE * -AllocateFile(char *name, char *mode) +AllocateFile(const char *name, const char *mode) { FILE *file; - DO_DB(elog(LOG, "AllocateFile: Allocated %d", numAllocatedFiles)); + DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)", + numAllocatedDescs, name)); - if (numAllocatedFiles >= MAX_ALLOCATED_FILES) - elog(ERROR, "too many private FDs demanded"); + /* + * The test against MAX_ALLOCATED_DESCS prevents us from overflowing + * allocatedFiles[]; the test against max_safe_fds prevents AllocateFile + * from hogging every one of the available FDs, which'd lead to infinite + * looping. + */ + if (numAllocatedDescs >= MAX_ALLOCATED_DESCS || + numAllocatedDescs >= max_safe_fds - 1) + elog(ERROR, "too many private files demanded"); TryAgain: if ((file = fopen(name, mode)) != NULL) { - allocatedFiles[numAllocatedFiles] = file; - numAllocatedFiles++; - return file; + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescFile; + desc->desc.file = file; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.file; } if (errno == EMFILE || errno == ENFILE) @@ -1064,39 +1271,228 @@ TryAgain: ereport(LOG, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), - errmsg("out of file descriptors: %m; release and retry"))); + errmsg("out of file descriptors: %m; release and retry"))); errno = 0; if (ReleaseLruFile()) goto TryAgain; errno = save_errno; } + /* + * TEMPORARY hack to log the Windows error code on fopen failures, in + * hopes of diagnosing some hard-to-reproduce problems. + */ +#ifdef WIN32 + { + int save_errno = errno; + + elog(LOG, "Windows fopen(\"%s\",\"%s\") failed: code %lu, errno %d", + name, mode, GetLastError(), save_errno); + errno = save_errno; + } +#endif + return NULL; } -void +/* + * Free an AllocateDesc of either type. + * + * The argument *must* point into the allocatedDescs[] array. 
+ */ +static int +FreeDesc(AllocateDesc *desc) +{ + int result; + + /* Close the underlying object */ + switch (desc->kind) + { + case AllocateDescFile: + result = fclose(desc->desc.file); + break; + case AllocateDescDir: + result = closedir(desc->desc.dir); + break; + default: + elog(ERROR, "AllocateDesc kind not recognized"); + result = 0; /* keep compiler quiet */ + break; + } + + /* Compact storage in the allocatedDescs array */ + numAllocatedDescs--; + *desc = allocatedDescs[numAllocatedDescs]; + + return result; +} + +/* + * Close a file returned by AllocateFile. + * + * Note we do not check fclose's return value --- it is up to the caller + * to handle close errors. + */ +int FreeFile(FILE *file) { int i; - DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedFiles)); + DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs)); /* Remove file from list of allocated files, if it's present */ - for (i = numAllocatedFiles; --i >= 0;) + for (i = numAllocatedDescs; --i >= 0;) { - if (allocatedFiles[i] == file) - { - numAllocatedFiles--; - allocatedFiles[i] = allocatedFiles[numAllocatedFiles]; - break; - } + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescFile && desc->desc.file == file) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile"); + + return fclose(file); +} + + +/* + * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir + * rather than plain opendir(). This lets fd.c deal with freeing FDs if + * necessary to open the directory, and with closing it after an elog. + * When done, call FreeDir rather than closedir. + * + * Ideally this should be the *only* direct call of opendir() in the backend. 
+ */ +DIR * +AllocateDir(const char *dirname) +{ + DIR *dir; + + DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)", + numAllocatedDescs, dirname)); + + /* + * The test against MAX_ALLOCATED_DESCS prevents us from overflowing + * allocatedDescs[]; the test against max_safe_fds prevents AllocateDir + * from hogging every one of the available FDs, which'd lead to infinite + * looping. + */ + if (numAllocatedDescs >= MAX_ALLOCATED_DESCS || + numAllocatedDescs >= max_safe_fds - 1) + elog(ERROR, "too many private dirs demanded"); + +TryAgain: + if ((dir = opendir(dirname)) != NULL) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescDir; + desc->desc.dir = dir; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.dir; + } + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + errno = 0; + if (ReleaseLruFile()) + goto TryAgain; + errno = save_errno; + } + + return NULL; +} + +/* + * Read a directory opened with AllocateDir, ereport'ing any error. + * + * This is easier to use than raw readdir() since it takes care of some + * otherwise rather tedious and error-prone manipulation of errno. Also, + * if you are happy with a generic error message for AllocateDir failure, + * you can just do + * + * dir = AllocateDir(path); + * while ((dirent = ReadDir(dir, path)) != NULL) + * process dirent; + * FreeDir(dir); + * + * since a NULL dir parameter is taken as indicating AllocateDir failed. + * (Make sure errno hasn't been changed since AllocateDir if you use this + * shortcut.) + * + * The pathname passed to AllocateDir must be passed to this routine too, + * but it is only used for error reporting. 
+ */ +struct dirent * +ReadDir(DIR *dir, const char *dirname) +{ + struct dirent *dent; + + /* Give a generic message for AllocateDir failure, if caller didn't */ + if (dir == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + dirname))); + + errno = 0; + if ((dent = readdir(dir)) != NULL) + return dent; + +#ifdef WIN32 + + /* + * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in + * released version + */ + if (GetLastError() == ERROR_NO_MORE_FILES) + errno = 0; +#endif + + if (errno) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read directory \"%s\": %m", + dirname))); + return NULL; +} + +/* + * Close a directory opened with AllocateDir. + * + * Note we do not check closedir's return value --- it is up to the caller + * to handle close errors. + */ +int +FreeDir(DIR *dir) +{ + int i; + + DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs)); + + /* Remove dir from list of allocated dirs, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescDir && desc->desc.dir == dir) + return FreeDesc(desc); } - if (i < 0) - elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile"); - fclose(file); + /* Only get here if someone passes us a dir not in allocatedDescs */ + elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir"); + + return closedir(dir); } + /* * closeAllVfds * @@ -1120,6 +1516,52 @@ closeAllVfds(void) } } +/* + * AtEOSubXact_Files + * + * Take care of subtransaction commit/abort. At abort, we close temp files + * that the subtransaction may have opened. At commit, we reassign the + * files that were opened to the parent subtransaction. 
+ */ +void +AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, + SubTransactionId parentSubid) +{ + Index i; + + if (SizeVfdCache > 0) + { + Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ + for (i = 1; i < SizeVfdCache; i++) + { + unsigned short fdstate = VfdCache[i].fdstate; + + if ((fdstate & FD_XACT_TEMPORARY) && + VfdCache[i].create_subid == mySubid) + { + if (isCommit) + VfdCache[i].create_subid = parentSubid; + else if (VfdCache[i].fileName != NULL) + FileClose(i); + } + } + } + + for (i = 0; i < numAllocatedDescs; i++) + { + if (allocatedDescs[i].create_subid == mySubid) + { + if (isCommit) + allocatedDescs[i].create_subid = parentSubid; + else + { + /* have to recheck the item after FreeDesc (ugly) */ + FreeDesc(&allocatedDescs[i--]); + } + } + } +} + /* * AtEOXact_Files * @@ -1141,7 +1583,7 @@ AtEOXact_Files(void) * Here, we want to clean up *all* temp files including interXact ones. */ static void -AtProcExit_Files(void) +AtProcExit_Files(int code, Datum arg) { CleanupTempFiles(true); } @@ -1153,7 +1595,7 @@ AtProcExit_Files(void) * exiting. If that's the case, we should remove all temporary files; if * that's not the case, we are being called for transaction commit/abort * and should only remove transaction-local temp files. In either case, - * also clean up "allocated" stdio files. + * also clean up "allocated" stdio files and dirs. */ static void CleanupTempFiles(bool isProcExit) @@ -1170,9 +1612,9 @@ CleanupTempFiles(bool isProcExit) if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL) { /* - * If we're in the process of exiting a backend process, - * close all temporary files. Otherwise, only close - * temporary files local to the current transaction. + * If we're in the process of exiting a backend process, close + * all temporary files. Otherwise, only close temporary files + * local to the current transaction. 
*/ if (isProcExit || (fdstate & FD_XACT_TEMPORARY)) FileClose(i); @@ -1180,8 +1622,8 @@ CleanupTempFiles(bool isProcExit) } } - while (numAllocatedFiles > 0) - FreeFile(allocatedFiles[0]); + while (numAllocatedDescs > 0) + FreeDesc(&allocatedDescs[0]); } @@ -1200,57 +1642,75 @@ CleanupTempFiles(bool isProcExit) void RemovePgTempFiles(void) { - char db_path[MAXPGPATH]; char temp_path[MAXPGPATH]; - char rm_path[MAXPGPATH]; DIR *db_dir; - DIR *temp_dir; struct dirent *db_de; - struct dirent *temp_de; /* - * Cycle through pg_tempfiles for all databases and remove old temp - * files. + * Cycle through pgsql_tmp directories for all databases and remove old + * temp files. */ - snprintf(db_path, sizeof(db_path), "%s/base", DataDir); - if ((db_dir = opendir(db_path)) != NULL) + db_dir = AllocateDir("base"); + + while ((db_de = ReadDir(db_dir, "base")) != NULL) { - while ((db_de = readdir(db_dir)) != NULL) - { - if (strcmp(db_de->d_name, ".") == 0 || - strcmp(db_de->d_name, "..") == 0) - continue; - - snprintf(temp_path, sizeof(temp_path), - "%s/%s/%s", - db_path, db_de->d_name, - PG_TEMP_FILES_DIR); - if ((temp_dir = opendir(temp_path)) != NULL) - { - while ((temp_de = readdir(temp_dir)) != NULL) - { - if (strcmp(temp_de->d_name, ".") == 0 || - strcmp(temp_de->d_name, "..") == 0) - continue; - - snprintf(rm_path, sizeof(temp_path), - "%s/%s/%s/%s", - db_path, db_de->d_name, - PG_TEMP_FILES_DIR, - temp_de->d_name); - - if (strncmp(temp_de->d_name, - PG_TEMP_FILE_PREFIX, - strlen(PG_TEMP_FILE_PREFIX)) == 0) - unlink(rm_path); - else - elog(LOG, - "unexpected file found in temporary-files directory: \"%s\"", - rm_path); - } - closedir(temp_dir); - } - } - closedir(db_dir); + if (strcmp(db_de->d_name, ".") == 0 || + strcmp(db_de->d_name, "..") == 0) + continue; + + snprintf(temp_path, sizeof(temp_path), "base/%s/%s", + db_de->d_name, PG_TEMP_FILES_DIR); + RemovePgTempFilesInDir(temp_path); } + + FreeDir(db_dir); + + /* + * In EXEC_BACKEND case there is a pgsql_tmp directory at 
the top level of + * DataDir as well. + */ +#ifdef EXEC_BACKEND + RemovePgTempFilesInDir(PG_TEMP_FILES_DIR); +#endif +} + +/* Process one pgsql_tmp directory for RemovePgTempFiles */ +static void +RemovePgTempFilesInDir(const char *tmpdirname) +{ + DIR *temp_dir; + struct dirent *temp_de; + char rm_path[MAXPGPATH]; + + temp_dir = AllocateDir(tmpdirname); + if (temp_dir == NULL) + { + /* anything except ENOENT is fishy */ + if (errno != ENOENT) + elog(LOG, + "could not open temporary-files directory \"%s\": %m", + tmpdirname); + return; + } + + while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL) + { + if (strcmp(temp_de->d_name, ".") == 0 || + strcmp(temp_de->d_name, "..") == 0) + continue; + + snprintf(rm_path, sizeof(rm_path), "%s/%s", + tmpdirname, temp_de->d_name); + + if (strncmp(temp_de->d_name, + PG_TEMP_FILE_PREFIX, + strlen(PG_TEMP_FILE_PREFIX)) == 0) + unlink(rm_path); /* note we ignore any error */ + else + elog(LOG, + "unexpected file found in temporary-files directory: \"%s\"", + rm_path); + } + + FreeDir(temp_dir); }