1 /*-------------------------------------------------------------------------
4 * Virtual file descriptor code.
6 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.120 2005/08/08 03:11:49 tgl Exp $
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 256 on many modern
20 * operating systems, but can be as low as 32 on others.)
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a file is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
33 * This file used to contain a bunch of stuff to support RAID levels 0
34 * (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
35 * because the parallel query processing code that called it is all
36 * gone. If you really need it you could get it from the original
38 *-------------------------------------------------------------------------
44 #include <sys/param.h>
49 #include "miscadmin.h"
50 #include "access/xact.h"
51 #include "storage/fd.h"
52 #include "storage/ipc.h"
56 * We must leave some file descriptors free for system(), the dynamic loader,
57 * and other code that tries to open files without consulting fd.c. This
58 * is the number left free. (While we can be pretty sure we won't get
59 * EMFILE, there's never any guarantee that we won't get ENFILE due to
60 * other processes chewing up FDs. So it's a bad idea to try to open files
61 * without consulting fd.c. Nonetheless we cannot control all code.)
63 * Because this is just a fixed setting, we are effectively assuming that
64 * no such code will leave FDs open over the long term; otherwise the slop
65 * is likely to be insufficient. Note in particular that we expect that
66 * loading a shared library does not result in any permanent increase in
67 * the number of open files. (This appears to be true on most if not
68 * all platforms as of Feb 2004.)
70 #define NUM_RESERVED_FDS 10
73 * If we have fewer than this many usable FDs after allowing for the reserved
80 * A number of platforms allow individual processes to open many more files
81 * than they can really support when *many* processes do the same thing.
82 * This GUC parameter lets the DBA limit max_safe_fds to something less than
83 * what the postmaster's initial probe suggests will work.
85 int max_files_per_process = 1000;
88 * Maximum number of file descriptors to open for either VFD entries or
89 * AllocateFile/AllocateDir operations. This is initialized to a conservative
90 * value, and remains that way indefinitely in bootstrap or standalone-backend
91 * cases. In normal postmaster operation, the postmaster calls
92 * set_max_safe_fds() late in initialization to update the value, and that
93 * value is then inherited by forked subprocesses.
95 * Note: the value of max_files_per_process is taken into account while
96 * setting this variable, and so need not be tested separately.
98 static int max_safe_fds = 32; /* default if not changed */
106 #define DO_DB(A) /* A */
109 #define VFD_CLOSED (-1)
111 #define FileIsValid(file) \
112 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
114 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
116 #define FileUnknownPos (-1L)
118 /* these are the assigned bits in fdstate below: */
119 #define FD_TEMPORARY (1 << 0) /* T = delete when closed */
120 #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
124 signed short fd; /* current FD, or VFD_CLOSED if none */
125 unsigned short fdstate; /* bitflags for VFD's state */
126 SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */
127 File nextFree; /* link to next free VFD, if in freelist */
128 File lruMoreRecently; /* doubly linked recency-of-use list */
129 File lruLessRecently;
130 long seekPos; /* current logical file position */
131 char *fileName; /* name of file, or NULL for unused VFD */
132 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
133 int fileFlags; /* open(2) flags for (re)opening the file */
134 int fileMode; /* mode to pass to open(2) */
138 * Virtual File Descriptor array pointer and size. This grows as
139 * needed. 'File' values are indexes into this array.
140 * Note that VfdCache[0] is not a usable VFD, just a list header.
142 static Vfd *VfdCache;
143 static Size SizeVfdCache = 0;
146 * Number of file descriptors known to be in use by VFD entries.
148 static int nfile = 0;
151 * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
154 * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
155 * it seems OK to put a pretty small maximum limit on the number of
156 * simultaneously allocated descs.
158 #define MAX_ALLOCATED_DESCS 32
168 AllocateDescKind kind;
174 SubTransactionId create_subid;
177 static int numAllocatedDescs = 0;
178 static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
181 * Number of temporary files opened during the current session;
182 * this is used in generation of tempfile names.
184 static long tempFileCounter = 0;
187 /*--------------------
191 * Delete - delete a file from the Lru ring
192 * LruDelete - remove a file from the Lru ring and close its FD
193 * Insert - put a file at the front of the Lru ring
194 * LruInsert - put a file at the front of the Lru ring and open it
195 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
196 * AllocateVfd - grab a free (or new) file record (from VfdCache)
197 * FreeVfd - free a file record
199 * The Least Recently Used ring is a doubly linked list that begins and
200 * ends on element zero. Element zero is special -- it doesn't represent
201 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
202 * anchor that shows us the beginning/end of the ring.
203 * Only VFD elements that are currently really open (have an FD assigned) are
204 * in the Lru ring. Elements that are "virtually" open can be recognized
205 * by having a non-null fileName field.
209 * /--less----\ /---------\
211 * #0 --more---> LeastRecentlyUsed --more-\ \
213 * \\less--> MostRecentlyUsedFile <---/ |
214 * \more---/ \--less--/
216 *--------------------
218 static void Delete(File file);
219 static void LruDelete(File file);
220 static void Insert(File file);
221 static int LruInsert(File file);
222 static bool ReleaseLruFile(void);
223 static File AllocateVfd(void);
224 static void FreeVfd(File file);
226 static int FileAccess(File file);
227 static char *make_database_relative(const char *filename);
228 static void AtProcExit_Files(int code, Datum arg);
229 static void CleanupTempFiles(bool isProcExit);
230 static void RemovePgTempFilesInDir(const char *tmpdirname);
234 * pg_fsync --- do fsync with or without writethrough
239 #ifndef HAVE_FSYNC_WRITETHROUGH_ONLY
240 if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH)
241 return pg_fsync_no_writethrough(fd);
244 return pg_fsync_writethrough(fd);
249 * pg_fsync_no_writethrough --- same as fsync except does nothing if
253 pg_fsync_no_writethrough(int fd)
262 * pg_fsync_writethrough
265 pg_fsync_writethrough(int fd)
270 #elif defined(__darwin__)
271 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
280 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
282 * Not all platforms have fdatasync; treat as fsync if not available.
289 #ifdef HAVE_FDATASYNC
290 return fdatasync(fd);
300 * InitFileAccess --- initialize this module during backend startup
302 * This is called during either normal or standalone backend start.
303 * It is *not* called in the postmaster.
308 Assert(SizeVfdCache == 0); /* call me only once */
310 /* initialize cache header entry */
311 VfdCache = (Vfd *) malloc(sizeof(Vfd));
312 if (VfdCache == NULL)
314 (errcode(ERRCODE_OUT_OF_MEMORY),
315 errmsg("out of memory")));
317 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
318 VfdCache->fd = VFD_CLOSED;
322 /* register proc-exit hook to ensure temp files are dropped at exit */
323 on_proc_exit(AtProcExit_Files, 0);
327 * count_usable_fds --- count how many FDs the system will let us open,
328 * and estimate how many are already open.
330 * We stop counting if usable_fds reaches max_to_probe. Note: a small
331 * value of max_to_probe might result in an underestimate of already_open;
332 * we must fill in any "gaps" in the set of used FDs before the calculation
333 * of already_open will give the right answer. In practice, max_to_probe
334 * of a couple of dozen should be enough to ensure good results.
336 * We assume stdin (FD 0) is available for dup'ing
339 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
348 fd = (int *) palloc(size * sizeof(int));
350 /* dup until failure or probe limit reached */
358 /* Expect EMFILE or ENFILE, else it's fishy */
359 if (errno != EMFILE && errno != ENFILE)
360 elog(WARNING, "dup(0) failed after %d successes: %m", used);
367 fd = (int *) repalloc(fd, size * sizeof(int));
371 if (highestfd < thisfd)
374 if (used >= max_to_probe)
378 /* release the files we opened */
379 for (j = 0; j < used; j++)
385 * Return results. usable_fds is just the number of successful dups.
386 * We assume that the system limit is highestfd+1 (remember 0 is a
387 * legal FD number) and so already_open is highestfd+1 - usable_fds.
390 *already_open = highestfd + 1 - used;
395 * Determine number of file descriptors that fd.c is allowed to use
398 set_max_safe_fds(void)
404 * We want to set max_safe_fds to
405 * MIN(usable_fds, max_files_per_process - already_open)
406 * less the slop factor for files that are opened without consulting
407 * fd.c. This ensures that we won't exceed either max_files_per_process
408 * or the experimentally-determined EMFILE limit.
411 count_usable_fds(max_files_per_process,
412 &usable_fds, &already_open);
414 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
417 * Take off the FDs reserved for system() etc.
419 max_safe_fds -= NUM_RESERVED_FDS;
422 * Make sure we still have enough to get by.
424 if (max_safe_fds < FD_MINFREE)
426 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
427 errmsg("insufficient file descriptors available to start server process"),
428 errdetail("System allows %d, we need at least %d.",
429 max_safe_fds + NUM_RESERVED_FDS,
430 FD_MINFREE + NUM_RESERVED_FDS)));
432 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
433 max_safe_fds, usable_fds, already_open);
437 * BasicOpenFile --- same as open(2) except can free other FDs if needed
439 * This is exported for use by places that really want a plain kernel FD,
440 * but need to be proof against running out of FDs. Once an FD has been
441 * successfully returned, it is the caller's responsibility to ensure that
442 * it will not be leaked on ereport()! Most users should *not* call this
443 * routine directly, but instead use the VFD abstraction level, which
444 * provides protection against descriptor leaks as well as management of
445 * files that need to be open for more than a short period of time.
447 * Ideally this should be the *only* direct call of open() in the backend.
448 * In practice, the postmaster calls open() directly, and there are some
449 * direct open() calls done early in backend startup. Those are OK since
450 * this module wouldn't have any open files to close at that point anyway.
453 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
458 fd = open(fileName, fileFlags, fileMode);
461 return fd; /* success! */
463 if (errno == EMFILE || errno == ENFILE)
465 int save_errno = errno;
468 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
469 errmsg("out of file descriptors: %m; release and retry")));
471 if (ReleaseLruFile())
476 return -1; /* failure */
484 int mru = VfdCache[0].lruLessRecently;
485 Vfd *vfdP = &VfdCache[mru];
488 snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
491 mru = vfdP->lruLessRecently;
492 vfdP = &VfdCache[mru];
493 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
495 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
507 DO_DB(elog(LOG, "Delete %d (%s)",
508 file, VfdCache[file].fileName));
511 vfdP = &VfdCache[file];
513 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
514 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
526 DO_DB(elog(LOG, "LruDelete %d (%s)",
527 file, VfdCache[file].fileName));
529 vfdP = &VfdCache[file];
531 /* delete the vfd record from the LRU ring */
534 /* save the seek position */
535 vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
536 Assert(vfdP->seekPos != -1L);
540 elog(ERROR, "failed to close \"%s\": %m",
544 vfdP->fd = VFD_CLOSED;
554 DO_DB(elog(LOG, "Insert %d (%s)",
555 file, VfdCache[file].fileName));
558 vfdP = &VfdCache[file];
560 vfdP->lruMoreRecently = 0;
561 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
562 VfdCache[0].lruLessRecently = file;
563 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
568 /* returns 0 on success, -1 on re-open failure (with errno set) */
576 DO_DB(elog(LOG, "LruInsert %d (%s)",
577 file, VfdCache[file].fileName));
579 vfdP = &VfdCache[file];
581 if (FileIsNotOpen(file))
583 while (nfile + numAllocatedDescs >= max_safe_fds)
585 if (!ReleaseLruFile())
590 * The open could still fail for lack of file descriptors, eg due
591 * to overall system file table being full. So, be prepared to
592 * release another FD if necessary...
594 vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
598 DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
603 DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
607 /* seek to the right position */
608 if (vfdP->seekPos != 0L)
612 returnValue = (long) lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
613 Assert(returnValue != -1L);
618 * put it at the head of the Lru ring
629 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
634 * There are opened files and so there should be at least one used
637 Assert(VfdCache[0].lruMoreRecently != 0);
638 LruDelete(VfdCache[0].lruMoreRecently);
639 return true; /* freed a file */
641 return false; /* no files available to free */
650 DO_DB(elog(LOG, "AllocateVfd. Size %d", SizeVfdCache));
652 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
654 if (VfdCache[0].nextFree == 0)
657 * The free list is empty so it is time to increase the size of
658 * the array. We choose to double it each time this happens.
659 * However, there's not much point in starting *real* small.
661 Size newCacheSize = SizeVfdCache * 2;
664 if (newCacheSize < 32)
668 * Be careful not to clobber VfdCache ptr if realloc fails.
670 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
671 if (newVfdCache == NULL)
673 (errcode(ERRCODE_OUT_OF_MEMORY),
674 errmsg("out of memory")));
675 VfdCache = newVfdCache;
678 * Initialize the new entries and link them into the free list.
680 for (i = SizeVfdCache; i < newCacheSize; i++)
682 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
683 VfdCache[i].nextFree = i + 1;
684 VfdCache[i].fd = VFD_CLOSED;
686 VfdCache[newCacheSize - 1].nextFree = 0;
687 VfdCache[0].nextFree = SizeVfdCache;
690 * Record the new size
692 SizeVfdCache = newCacheSize;
695 file = VfdCache[0].nextFree;
697 VfdCache[0].nextFree = VfdCache[file].nextFree;
705 Vfd *vfdP = &VfdCache[file];
707 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
708 file, vfdP->fileName ? vfdP->fileName : ""));
710 if (vfdP->fileName != NULL)
712 free(vfdP->fileName);
713 vfdP->fileName = NULL;
717 vfdP->nextFree = VfdCache[0].nextFree;
718 VfdCache[0].nextFree = file;
722 * make_database_relative()
723 * Prepend DatabasePath to the given file name.
725 * Result is a palloc'd string.
728 make_database_relative(const char *filename)
732 Assert(!is_absolute_path(filename));
733 buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2);
734 sprintf(buf, "%s/%s", DatabasePath, filename);
738 /* returns 0 on success, -1 on re-open failure (with errno set) */
740 FileAccess(File file)
744 DO_DB(elog(LOG, "FileAccess %d (%s)",
745 file, VfdCache[file].fileName));
748 * Is the file open? If not, open it and put it at the head of the
749 * LRU ring (possibly closing the least recently used file to get an
753 if (FileIsNotOpen(file))
755 returnValue = LruInsert(file);
756 if (returnValue != 0)
759 else if (VfdCache[0].lruLessRecently != file)
762 * We now know that the file is open and that it is not the last
763 * one accessed, so we need to move it to the head of the Lru
775 * Called when we get a shared invalidation message on some relation.
779 FileInvalidate(File file)
781 Assert(FileIsValid(file));
782 if (!FileIsNotOpen(file))
788 * open a file in an arbitrary directory
790 * NB: if the passed pathname is relative (which it usually is),
791 * it will be interpreted relative to the process' working directory
792 * (which should always be $PGDATA when this code is running).
795 PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
801 DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
802 fileName, fileFlags, fileMode));
805 * We need a malloc'd copy of the file name; fail cleanly if no room.
807 fnamecopy = strdup(fileName);
808 if (fnamecopy == NULL)
810 (errcode(ERRCODE_OUT_OF_MEMORY),
811 errmsg("out of memory")));
813 file = AllocateVfd();
814 vfdP = &VfdCache[file];
816 while (nfile + numAllocatedDescs >= max_safe_fds)
818 if (!ReleaseLruFile())
822 vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
831 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
836 vfdP->fileName = fnamecopy;
837 /* Saved flags are adjusted to be OK for re-opening file */
838 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
839 vfdP->fileMode = fileMode;
847 * open a file in the database directory ($PGDATA/base/DIROID/)
849 * The passed name MUST be a relative path. Effectively, this
850 * prepends DatabasePath to it and then acts like PathNameOpenFile.
853 FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
858 fname = make_database_relative(fileName);
859 fd = PathNameOpenFile(fname, fileFlags, fileMode);
865 * Open a temporary file that will disappear when we close it.
867 * This routine takes care of generating an appropriate tempfile name.
868 * There's no need to pass in fileFlags or fileMode either, since only
869 * one setting makes any sense for a temp file.
871 * interXact: if true, don't close the file at end-of-transaction. In
872 * most cases, you don't want temporary files to outlive the transaction
873 * that created them, so this should be false -- but if you need
874 * "somewhat" temporary storage, this might be useful. In either case,
875 * the file is removed when the File is explicitly closed.
878 OpenTemporaryFile(bool interXact)
880 char tempfilepath[MAXPGPATH];
884 * Generate a tempfile name that should be unique within the current
887 snprintf(tempfilepath, sizeof(tempfilepath),
888 "%s/%s%d.%ld", PG_TEMP_FILES_DIR, PG_TEMP_FILE_PREFIX,
889 MyProcPid, tempFileCounter++);
892 * Open the file. Note: we don't use O_EXCL, in case there is an
893 * orphaned temp file that can be reused.
895 file = FileNameOpenFile(tempfilepath,
896 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
903 * We might need to create the PG_TEMP_FILES_DIR subdirectory, if no
904 * one has yet done so.
906 * Don't check for error from mkdir; it could fail if someone else
907 * just did the same thing. If it doesn't work then we'll bomb
908 * out on the second create attempt, instead.
910 dirpath = make_database_relative(PG_TEMP_FILES_DIR);
911 mkdir(dirpath, S_IRWXU);
914 file = FileNameOpenFile(tempfilepath,
915 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
918 elog(ERROR, "could not create temporary file \"%s\": %m",
922 /* Mark it for deletion at close */
923 VfdCache[file].fdstate |= FD_TEMPORARY;
925 /* Mark it for deletion at EOXact */
928 VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
929 VfdCache[file].create_subid = GetCurrentSubTransactionId();
936 * close a file when done with it
943 Assert(FileIsValid(file));
945 DO_DB(elog(LOG, "FileClose: %d (%s)",
946 file, VfdCache[file].fileName));
948 vfdP = &VfdCache[file];
950 if (!FileIsNotOpen(file))
952 /* remove the file from the lru ring */
957 elog(ERROR, "failed to close \"%s\": %m",
961 vfdP->fd = VFD_CLOSED;
965 * Delete the file if it was temporary
967 if (vfdP->fdstate & FD_TEMPORARY)
969 /* reset flag so that die() interrupt won't cause problems */
970 vfdP->fdstate &= ~FD_TEMPORARY;
971 if (unlink(vfdP->fileName))
972 elog(LOG, "failed to unlink \"%s\": %m",
977 * Return the Vfd slot to the free list
983 * close a file and forcibly delete the underlying Unix file
986 FileUnlink(File file)
988 Assert(FileIsValid(file));
990 DO_DB(elog(LOG, "FileUnlink: %d (%s)",
991 file, VfdCache[file].fileName));
993 /* force FileClose to delete it */
994 VfdCache[file].fdstate |= FD_TEMPORARY;
1000 FileRead(File file, char *buffer, int amount)
1004 Assert(FileIsValid(file));
1006 DO_DB(elog(LOG, "FileRead: %d (%s) %ld %d %p",
1007 file, VfdCache[file].fileName,
1008 VfdCache[file].seekPos, amount, buffer));
1010 returnCode = FileAccess(file);
1014 returnCode = read(VfdCache[file].fd, buffer, amount);
1016 VfdCache[file].seekPos += returnCode;
1018 VfdCache[file].seekPos = FileUnknownPos;
1024 FileWrite(File file, char *buffer, int amount)
1028 Assert(FileIsValid(file));
1030 DO_DB(elog(LOG, "FileWrite: %d (%s) %ld %d %p",
1031 file, VfdCache[file].fileName,
1032 VfdCache[file].seekPos, amount, buffer));
1034 returnCode = FileAccess(file);
1039 returnCode = write(VfdCache[file].fd, buffer, amount);
1041 /* if write didn't set errno, assume problem is no disk space */
1042 if (returnCode != amount && errno == 0)
1046 VfdCache[file].seekPos += returnCode;
1048 VfdCache[file].seekPos = FileUnknownPos;
1058 Assert(FileIsValid(file));
1060 DO_DB(elog(LOG, "FileSync: %d (%s)",
1061 file, VfdCache[file].fileName));
1063 returnCode = FileAccess(file);
1067 return pg_fsync(VfdCache[file].fd);
1071 FileSeek(File file, long offset, int whence)
1075 Assert(FileIsValid(file));
1077 DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
1078 file, VfdCache[file].fileName,
1079 VfdCache[file].seekPos, offset, whence));
1081 if (FileIsNotOpen(file))
1087 elog(ERROR, "invalid seek offset: %ld", offset);
1088 VfdCache[file].seekPos = offset;
1091 VfdCache[file].seekPos += offset;
1094 returnCode = FileAccess(file);
1097 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1101 elog(ERROR, "invalid whence: %d", whence);
1111 elog(ERROR, "invalid seek offset: %ld", offset);
1112 if (VfdCache[file].seekPos != offset)
1113 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1117 if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
1118 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1122 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1126 elog(ERROR, "invalid whence: %d", whence);
1130 return VfdCache[file].seekPos;
1134 * XXX not actually used but here for completeness
1140 Assert(FileIsValid(file));
1141 DO_DB(elog(LOG, "FileTell %d (%s)",
1142 file, VfdCache[file].fileName));
1143 return VfdCache[file].seekPos;
1148 FileTruncate(File file, long offset)
1152 Assert(FileIsValid(file));
1154 DO_DB(elog(LOG, "FileTruncate %d (%s)",
1155 file, VfdCache[file].fileName));
1157 returnCode = FileAccess(file);
1161 returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
1167 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
1168 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
1169 * necessary to open the file. When done, call FreeFile rather than fclose.
1171 * Note that files that will be open for any significant length of time
1172 * should NOT be handled this way, since they cannot share kernel file
1173 * descriptors with other files; there is grave risk of running out of FDs
1174 * if anyone locks down too many FDs. Most callers of this routine are
1175 * simply reading a config file that they will read and close immediately.
1177 * fd.c will automatically close all files opened with AllocateFile at
1178 * transaction commit or abort; this prevents FD leakage if a routine
1179 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
1181 * Ideally this should be the *only* direct call of fopen() in the backend.
1184 AllocateFile(char *name, char *mode)
1188 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
1189 numAllocatedDescs, name));
1192 * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
1193 * allocatedDescs[]; the test against max_safe_fds prevents
1194 * AllocateFile from hogging every one of the available FDs, which'd
1195 * lead to infinite looping.
1197 if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
1198 numAllocatedDescs >= max_safe_fds - 1)
1199 elog(ERROR, "too many private files demanded");
1202 if ((file = fopen(name, mode)) != NULL)
1204 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
1206 desc->kind = AllocateDescFile;
1207 desc->desc.file = file;
1208 desc->create_subid = GetCurrentSubTransactionId();
1209 numAllocatedDescs++;
1210 return desc->desc.file;
1213 if (errno == EMFILE || errno == ENFILE)
1215 int save_errno = errno;
1218 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1219 errmsg("out of file descriptors: %m; release and retry")));
1221 if (ReleaseLruFile())
1230 * Free an AllocateDesc of either type.
1232 * The argument *must* point into the allocatedDescs[] array.
1235 FreeDesc(AllocateDesc *desc)
1239 /* Close the underlying object */
1242 case AllocateDescFile:
1243 result = fclose(desc->desc.file);
1245 case AllocateDescDir:
1246 result = closedir(desc->desc.dir);
1249 elog(ERROR, "AllocateDesc kind not recognized");
1250 result = 0; /* keep compiler quiet */
1254 /* Compact storage in the allocatedDescs array */
1255 numAllocatedDescs--;
1256 *desc = allocatedDescs[numAllocatedDescs];
1262 * Close a file returned by AllocateFile.
1264 * Note we do not check fclose's return value --- it is up to the caller
1265 * to handle close errors.
1268 FreeFile(FILE *file)
1272 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
1274 /* Remove file from list of allocated files, if it's present */
1275 for (i = numAllocatedDescs; --i >= 0;)
1277 AllocateDesc *desc = &allocatedDescs[i];
1279 if (desc->kind == AllocateDescFile && desc->desc.file == file)
1280 return FreeDesc(desc);
1283 /* Only get here if someone passes us a file not in allocatedDescs */
1284 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
1286 return fclose(file);
1291 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
1292 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
1293 * necessary to open the directory, and with closing it after an elog.
1294 * When done, call FreeDir rather than closedir.
1296 * Ideally this should be the *only* direct call of opendir() in the backend.
1299 AllocateDir(const char *dirname)
1303 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
1304 numAllocatedDescs, dirname));
1307 * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
1308 * allocatedDescs[]; the test against max_safe_fds prevents
1309 * AllocateDir from hogging every one of the available FDs, which'd
1310 * lead to infinite looping.
1312 if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
1313 numAllocatedDescs >= max_safe_fds - 1)
1314 elog(ERROR, "too many private dirs demanded");
1317 if ((dir = opendir(dirname)) != NULL)
1319 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
1321 desc->kind = AllocateDescDir;
1322 desc->desc.dir = dir;
1323 desc->create_subid = GetCurrentSubTransactionId();
1324 numAllocatedDescs++;
1325 return desc->desc.dir;
1328 if (errno == EMFILE || errno == ENFILE)
1330 int save_errno = errno;
1333 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1334 errmsg("out of file descriptors: %m; release and retry")));
1336 if (ReleaseLruFile())
1345 * Read a directory opened with AllocateDir, ereport'ing any error.
1347 * This is easier to use than raw readdir() since it takes care of some
1348 * otherwise rather tedious and error-prone manipulation of errno. Also,
1349 * if you are happy with a generic error message for AllocateDir failure,
1352 * dir = AllocateDir(path);
1353 * while ((dirent = ReadDir(dir, path)) != NULL)
1357 * since a NULL dir parameter is taken as indicating AllocateDir failed.
1358 * (Make sure errno hasn't been changed since AllocateDir if you use this
1361 * The pathname passed to AllocateDir must be passed to this routine too,
1362 * but it is only used for error reporting.
1365 ReadDir(DIR *dir, const char *dirname)
1367 struct dirent *dent;
1369 /* Give a generic message for AllocateDir failure, if caller didn't */
1372 (errcode_for_file_access(),
1373 errmsg("could not open directory \"%s\": %m",
1377 if ((dent = readdir(dir)) != NULL)
1382 * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but
1383 * not in released version
1385 if (GetLastError() == ERROR_NO_MORE_FILES)
1391 (errcode_for_file_access(),
1392 errmsg("could not read directory \"%s\": %m",
1398 * Close a directory opened with AllocateDir.
1400 * Note we do not check closedir's return value --- it is up to the caller
1401 * to handle close errors.
1408 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
1410 /* Remove dir from list of allocated dirs, if it's present */
1411 for (i = numAllocatedDescs; --i >= 0;)
1413 AllocateDesc *desc = &allocatedDescs[i];
1415 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
1416 return FreeDesc(desc);
1419 /* Only get here if someone passes us a dir not in allocatedDescs */
1420 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
1422 return closedir(dir);
1429 * Force all VFDs into the physically-closed state, so that the fewest
1430 * possible number of kernel file descriptors are in use. There is no
1431 * change in the logical state of the VFDs.
1438 if (SizeVfdCache > 0)
1440 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
1441 for (i = 1; i < SizeVfdCache; i++)
1443 if (!FileIsNotOpen(i))
1452 * Take care of subtransaction commit/abort. At abort, we close temp files
1453 * that the subtransaction may have opened. At commit, we reassign the
1454 * files that were opened to the parent subtransaction.
1457 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
1458 SubTransactionId parentSubid)
1462 if (SizeVfdCache > 0)
1464 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
1465 for (i = 1; i < SizeVfdCache; i++)
1467 unsigned short fdstate = VfdCache[i].fdstate;
1469 if ((fdstate & FD_XACT_TEMPORARY) &&
1470 VfdCache[i].create_subid == mySubid)
1473 VfdCache[i].create_subid = parentSubid;
1474 else if (VfdCache[i].fileName != NULL)
1480 for (i = 0; i < numAllocatedDescs; i++)
1482 if (allocatedDescs[i].create_subid == mySubid)
1485 allocatedDescs[i].create_subid = parentSubid;
1488 /* have to recheck the item after FreeDesc (ugly) */
1489 FreeDesc(&allocatedDescs[i--]);
1498 * This routine is called during transaction commit or abort (it doesn't
1499 * particularly care which). All still-open per-transaction temporary file
1500 * VFDs are closed, which also causes the underlying files to be
1501 * deleted. Furthermore, all "allocated" stdio files are closed.
1504 AtEOXact_Files(void)
1506 CleanupTempFiles(false);
1512 * on_proc_exit hook to clean up temp files during backend shutdown.
1513 * Here, we want to clean up *all* temp files including interXact ones.
1516 AtProcExit_Files(int code, Datum arg)
1518 CleanupTempFiles(true);
1522 * Close temporary files and delete their underlying files.
1524 * isProcExit: if true, this is being called as the backend process is
1525 * exiting. If that's the case, we should remove all temporary files; if
1526 * that's not the case, we are being called for transaction commit/abort
1527 * and should only remove transaction-local temp files. In either case,
1528 * also clean up "allocated" stdio files and dirs.
1531 CleanupTempFiles(bool isProcExit)
1535 if (SizeVfdCache > 0)
1537 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
1538 for (i = 1; i < SizeVfdCache; i++)
1540 unsigned short fdstate = VfdCache[i].fdstate;
1542 if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
1545 * If we're in the process of exiting a backend process,
1546 * close all temporary files. Otherwise, only close
1547 * temporary files local to the current transaction.
1549 if (isProcExit || (fdstate & FD_XACT_TEMPORARY))
1555 while (numAllocatedDescs > 0)
1556 FreeDesc(&allocatedDescs[0]);
1561 * Remove temporary files left over from a prior postmaster session
1563 * This should be called during postmaster startup. It will forcibly
1564 * remove any leftover files created by OpenTemporaryFile.
1566 * NOTE: we could, but don't, call this during a post-backend-crash restart
1567 * cycle. The argument for not doing it is that someone might want to examine
1568 * the temp files for debugging purposes. This does however mean that
1569 * OpenTemporaryFile had better allow for collision with an existing temp
1573 RemovePgTempFiles(void)
1575 char temp_path[MAXPGPATH];
1577 struct dirent *db_de;
1580 * Cycle through pgsql_tmp directories for all databases and remove old
1583 db_dir = AllocateDir("base");
1585 while ((db_de = ReadDir(db_dir, "base")) != NULL)
1587 if (strcmp(db_de->d_name, ".") == 0 ||
1588 strcmp(db_de->d_name, "..") == 0)
1591 snprintf(temp_path, sizeof(temp_path), "base/%s/%s",
1592 db_de->d_name, PG_TEMP_FILES_DIR);
1593 RemovePgTempFilesInDir(temp_path);
1599 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top
1600 * level of DataDir as well.
1603 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
1607 /* Process one pgsql_tmp directory for RemovePgTempFiles */
1609 RemovePgTempFilesInDir(const char *tmpdirname)
1612 struct dirent *temp_de;
1613 char rm_path[MAXPGPATH];
1615 temp_dir = AllocateDir(tmpdirname);
1616 if (temp_dir == NULL)
1618 /* anything except ENOENT is fishy */
1619 if (errno != ENOENT)
1621 "could not open temporary-files directory \"%s\": %m",
1626 while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
1628 if (strcmp(temp_de->d_name, ".") == 0 ||
1629 strcmp(temp_de->d_name, "..") == 0)
1632 snprintf(rm_path, sizeof(rm_path), "%s/%s",
1633 tmpdirname, temp_de->d_name);
1635 if (strncmp(temp_de->d_name,
1636 PG_TEMP_FILE_PREFIX,
1637 strlen(PG_TEMP_FILE_PREFIX)) == 0)
1638 unlink(rm_path); /* note we ignore any error */
1641 "unexpected file found in temporary-files directory: \"%s\"",