1 /*-------------------------------------------------------------------------
4 * Virtual file descriptor code.
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.145 2008/09/19 04:57:10 alvherre Exp $
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 256 on many modern
20 * operating systems, but can be as low as 32 on others.)
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a file is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
33 * This file used to contain a bunch of stuff to support RAID levels 0
34 * (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
35 * because the parallel query processing code that called it is all
36 * gone. If you really need it you could get it from the original
38 *-------------------------------------------------------------------------
44 #include <sys/param.h>
49 #include "miscadmin.h"
50 #include "access/xact.h"
51 #include "catalog/pg_tablespace.h"
52 #include "storage/fd.h"
53 #include "storage/ipc.h"
54 #include "utils/guc.h"
58 * We must leave some file descriptors free for system(), the dynamic loader,
59 * and other code that tries to open files without consulting fd.c. This
60 * is the number left free. (While we can be pretty sure we won't get
61 * EMFILE, there's never any guarantee that we won't get ENFILE due to
62 * other processes chewing up FDs. So it's a bad idea to try to open files
63 * without consulting fd.c. Nonetheless we cannot control all code.)
65 * Because this is just a fixed setting, we are effectively assuming that
66 * no such code will leave FDs open over the long term; otherwise the slop
67 * is likely to be insufficient. Note in particular that we expect that
68 * loading a shared library does not result in any permanent increase in
69 * the number of open files. (This appears to be true on most if not
70 * all platforms as of Feb 2004.)
72 #define NUM_RESERVED_FDS 10
75 * If we have fewer than this many usable FDs after allowing for the reserved
82 * A number of platforms allow individual processes to open many more files
83 * than they can really support when *many* processes do the same thing.
84 * This GUC parameter lets the DBA limit max_safe_fds to something less than
85 * what the postmaster's initial probe suggests will work.
87 int max_files_per_process = 1000;
90 * Maximum number of file descriptors to open for either VFD entries or
91 * AllocateFile/AllocateDir operations. This is initialized to a conservative
92 * value, and remains that way indefinitely in bootstrap or standalone-backend
93 * cases. In normal postmaster operation, the postmaster calls
94 * set_max_safe_fds() late in initialization to update the value, and that
95 * value is then inherited by forked subprocesses.
97 * Note: the value of max_files_per_process is taken into account while
98 * setting this variable, and so need not be tested separately.
100 static int max_safe_fds = 32; /* default if not changed */
108 #define DO_DB(A) /* A */
/* Value stored in a Vfd's "fd" field when no kernel FD is currently assigned. */
111 #define VFD_CLOSED (-1)
/*
 * A File is valid when it is an in-range index into VfdCache, is not slot 0
 * (which is only a list header, never a usable VFD), and its slot is in use
 * (fileName is non-NULL).  The range tests come first so that VfdCache is
 * never indexed with an out-of-range value.
 */
113 #define FileIsValid(file) \
114 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
/* True when the VFD holds no kernel FD at the moment (its fd is VFD_CLOSED). */
116 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
/* seekPos value recorded after a failed read/write, when the position is no longer known. */
118 #define FileUnknownPos ((off_t) -1)
120 /* these are the assigned bits in fdstate below: */
121 #define FD_TEMPORARY (1 << 0) /* T = delete when closed */
122 #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
125 * Flag to tell whether it's worth scanning VfdCache looking for temp files to
128 static bool have_xact_temporary_files = false;
132 int fd; /* current FD, or VFD_CLOSED if none */
133 unsigned short fdstate; /* bitflags for VFD's state */
134 SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */
135 File nextFree; /* link to next free VFD, if in freelist */
136 File lruMoreRecently; /* doubly linked recency-of-use list */
137 File lruLessRecently;
138 off_t seekPos; /* current logical file position */
139 char *fileName; /* name of file, or NULL for unused VFD */
140 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
141 int fileFlags; /* open(2) flags for (re)opening the file */
142 int fileMode; /* mode to pass to open(2) */
146 * Virtual File Descriptor array pointer and size. This grows as
147 * needed. 'File' values are indexes into this array.
148 * Note that VfdCache[0] is not a usable VFD, just a list header.
150 static Vfd *VfdCache;
151 static Size SizeVfdCache = 0;
154 * Number of file descriptors known to be in use by VFD entries.
156 static int nfile = 0;
159 * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
162 * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
163 * it seems OK to put a pretty small maximum limit on the number of
164 * simultaneously allocated descs.
166 #define MAX_ALLOCATED_DESCS 32
176 AllocateDescKind kind;
182 SubTransactionId create_subid;
185 static int numAllocatedDescs = 0;
186 static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
189 * Number of temporary files opened during the current session;
190 * this is used in generation of tempfile names.
192 static long tempFileCounter = 0;
195 * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
196 * this has not been set in the current transaction.
198 static Oid *tempTableSpaces = NULL;
199 static int numTempTableSpaces = -1;
200 static int nextTempTableSpace = 0;
203 /*--------------------
207 * Delete - delete a file from the Lru ring
208 * LruDelete - remove a file from the Lru ring and close its FD
209 * Insert - put a file at the front of the Lru ring
210 * LruInsert - put a file at the front of the Lru ring and open it
211 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
212 * AllocateVfd - grab a free (or new) file record (from VfdCache)
213 * FreeVfd - free a file record
215 * The Least Recently Used ring is a doubly linked list that begins and
216 * ends on element zero. Element zero is special -- it doesn't represent
217 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
218 * anchor that shows us the beginning/end of the ring.
219 * Only VFD elements that are currently really open (have an FD assigned) are
220 * in the Lru ring. Elements that are "virtually" open can be recognized
221 * by having a non-null fileName field.
225 * /--less----\ /---------\
227 * #0 --more---> LeastRecentlyUsed --more-\ \
229 * \\less--> MostRecentlyUsedFile <---/ |
230 * \more---/ \--less--/
232 *--------------------
/* Forward declarations of the private LRU-ring and free-list helpers. */
234 static void Delete(File file);
235 static void LruDelete(File file);
236 static void Insert(File file);
237 static int LruInsert(File file); /* 0 on success, -1 on re-open failure (with errno set) */
238 static bool ReleaseLruFile(void); /* true if a file was freed, false if none was available */
239 static File AllocateVfd(void);
240 static void FreeVfd(File file);
242 static int FileAccess(File file); /* 0 on success, -1 on re-open failure (with errno set) */
243 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError); /* subroutine for OpenTemporaryFile */
244 static void AtProcExit_Files(int code, Datum arg); /* registered via on_proc_exit() so temp files are dropped at exit */
245 static void CleanupTempFiles(bool isProcExit);
246 static void RemovePgTempFilesInDir(const char *tmpdirname);
250 * pg_fsync --- do fsync with or without writethrough
255 #ifndef HAVE_FSYNC_WRITETHROUGH_ONLY
256 if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH)
257 return pg_fsync_no_writethrough(fd);
260 return pg_fsync_writethrough(fd);
265 * pg_fsync_no_writethrough --- same as fsync except does nothing if
269 pg_fsync_no_writethrough(int fd)
278 * pg_fsync_writethrough
281 pg_fsync_writethrough(int fd)
287 #elif defined(F_FULLFSYNC)
288 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
298 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
300 * Not all platforms have fdatasync; treat as fsync if not available.
307 #ifdef HAVE_FDATASYNC
308 return fdatasync(fd);
318 * InitFileAccess --- initialize this module during backend startup
320 * This is called during either normal or standalone backend start.
321 * It is *not* called in the postmaster.
326 Assert(SizeVfdCache == 0); /* call me only once */
328 /* initialize cache header entry */
329 VfdCache = (Vfd *) malloc(sizeof(Vfd));
330 if (VfdCache == NULL)
332 (errcode(ERRCODE_OUT_OF_MEMORY),
333 errmsg("out of memory")));
335 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
336 VfdCache->fd = VFD_CLOSED;
340 /* register proc-exit hook to ensure temp files are dropped at exit */
341 on_proc_exit(AtProcExit_Files, 0);
345 * count_usable_fds --- count how many FDs the system will let us open,
346 * and estimate how many are already open.
348 * We stop counting if usable_fds reaches max_to_probe. Note: a small
349 * value of max_to_probe might result in an underestimate of already_open;
350 * we must fill in any "gaps" in the set of used FDs before the calculation
351 * of already_open will give the right answer. In practice, max_to_probe
352 * of a couple of dozen should be enough to ensure good results.
354 * We assume stdin (FD 0) is available for dup'ing
357 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
366 fd = (int *) palloc(size * sizeof(int));
368 /* dup until failure or probe limit reached */
376 /* Expect EMFILE or ENFILE, else it's fishy */
377 if (errno != EMFILE && errno != ENFILE)
378 elog(WARNING, "dup(0) failed after %d successes: %m", used);
385 fd = (int *) repalloc(fd, size * sizeof(int));
389 if (highestfd < thisfd)
392 if (used >= max_to_probe)
396 /* release the files we opened */
397 for (j = 0; j < used; j++)
403 * Return results. usable_fds is just the number of successful dups. We
404 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
405 * number) and so already_open is highestfd+1 - usable_fds.
408 *already_open = highestfd + 1 - used;
413 * Determine number of file descriptors that fd.c is allowed to use
416 set_max_safe_fds(void)
422 * We want to set max_safe_fds to
423 * MIN(usable_fds, max_files_per_process - already_open)
424 * less the slop factor for files that are opened without consulting
425 * fd.c. This ensures that we won't exceed either max_files_per_process
426 * or the experimentally-determined EMFILE limit.
429 count_usable_fds(max_files_per_process,
430 &usable_fds, &already_open);
432 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
435 * Take off the FDs reserved for system() etc.
437 max_safe_fds -= NUM_RESERVED_FDS;
440 * Make sure we still have enough to get by.
442 if (max_safe_fds < FD_MINFREE)
444 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
445 errmsg("insufficient file descriptors available to start server process"),
446 errdetail("System allows %d, we need at least %d.",
447 max_safe_fds + NUM_RESERVED_FDS,
448 FD_MINFREE + NUM_RESERVED_FDS)));
450 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
451 max_safe_fds, usable_fds, already_open);
455 * BasicOpenFile --- same as open(2) except can free other FDs if needed
457 * This is exported for use by places that really want a plain kernel FD,
458 * but need to be proof against running out of FDs. Once an FD has been
459 * successfully returned, it is the caller's responsibility to ensure that
460 * it will not be leaked on ereport()! Most users should *not* call this
461 * routine directly, but instead use the VFD abstraction level, which
462 * provides protection against descriptor leaks as well as management of
463 * files that need to be open for more than a short period of time.
465 * Ideally this should be the *only* direct call of open() in the backend.
466 * In practice, the postmaster calls open() directly, and there are some
467 * direct open() calls done early in backend startup. Those are OK since
468 * this module wouldn't have any open files to close at that point anyway.
471 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
476 fd = open(fileName, fileFlags, fileMode);
479 return fd; /* success! */
481 if (errno == EMFILE || errno == ENFILE)
483 int save_errno = errno;
486 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
487 errmsg("out of file descriptors: %m; release and retry")));
489 if (ReleaseLruFile())
494 return -1; /* failure */
502 int mru = VfdCache[0].lruLessRecently;
503 Vfd *vfdP = &VfdCache[mru];
506 snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
509 mru = vfdP->lruLessRecently;
510 vfdP = &VfdCache[mru];
511 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
513 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
525 DO_DB(elog(LOG, "Delete %d (%s)",
526 file, VfdCache[file].fileName));
529 vfdP = &VfdCache[file];
531 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
532 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
544 DO_DB(elog(LOG, "LruDelete %d (%s)",
545 file, VfdCache[file].fileName));
547 vfdP = &VfdCache[file];
549 /* delete the vfd record from the LRU ring */
552 /* save the seek position */
553 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
554 Assert(vfdP->seekPos != (off_t) -1);
558 elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
561 vfdP->fd = VFD_CLOSED;
571 DO_DB(elog(LOG, "Insert %d (%s)",
572 file, VfdCache[file].fileName));
575 vfdP = &VfdCache[file];
577 vfdP->lruMoreRecently = 0;
578 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
579 VfdCache[0].lruLessRecently = file;
580 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
585 /* returns 0 on success, -1 on re-open failure (with errno set) */
593 DO_DB(elog(LOG, "LruInsert %d (%s)",
594 file, VfdCache[file].fileName));
596 vfdP = &VfdCache[file];
598 if (FileIsNotOpen(file))
600 while (nfile + numAllocatedDescs >= max_safe_fds)
602 if (!ReleaseLruFile())
607 * The open could still fail for lack of file descriptors, eg due to
608 * overall system file table being full. So, be prepared to release
609 * another FD if necessary...
611 vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
615 DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
620 DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
624 /* seek to the right position */
625 if (vfdP->seekPos != (off_t) 0)
629 returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
630 Assert(returnValue != (off_t) -1);
635 * put it at the head of the Lru ring
646 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
651 * There are opened files and so there should be at least one used vfd
654 Assert(VfdCache[0].lruMoreRecently != 0);
655 LruDelete(VfdCache[0].lruMoreRecently);
656 return true; /* freed a file */
658 return false; /* no files available to free */
667 DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
669 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
671 if (VfdCache[0].nextFree == 0)
674 * The free list is empty so it is time to increase the size of the
675 * array. We choose to double it each time this happens. However,
676 * there's not much point in starting *real* small.
678 Size newCacheSize = SizeVfdCache * 2;
681 if (newCacheSize < 32)
685 * Be careful not to clobber VfdCache ptr if realloc fails.
687 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
688 if (newVfdCache == NULL)
690 (errcode(ERRCODE_OUT_OF_MEMORY),
691 errmsg("out of memory")));
692 VfdCache = newVfdCache;
695 * Initialize the new entries and link them into the free list.
697 for (i = SizeVfdCache; i < newCacheSize; i++)
699 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
700 VfdCache[i].nextFree = i + 1;
701 VfdCache[i].fd = VFD_CLOSED;
703 VfdCache[newCacheSize - 1].nextFree = 0;
704 VfdCache[0].nextFree = SizeVfdCache;
707 * Record the new size
709 SizeVfdCache = newCacheSize;
712 file = VfdCache[0].nextFree;
714 VfdCache[0].nextFree = VfdCache[file].nextFree;
722 Vfd *vfdP = &VfdCache[file];
724 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
725 file, vfdP->fileName ? vfdP->fileName : ""));
727 if (vfdP->fileName != NULL)
729 free(vfdP->fileName);
730 vfdP->fileName = NULL;
734 vfdP->nextFree = VfdCache[0].nextFree;
735 VfdCache[0].nextFree = file;
738 /* returns 0 on success, -1 on re-open failure (with errno set) */
740 FileAccess(File file)
744 DO_DB(elog(LOG, "FileAccess %d (%s)",
745 file, VfdCache[file].fileName));
748 * Is the file open? If not, open it and put it at the head of the LRU
749 * ring (possibly closing the least recently used file to get an FD).
752 if (FileIsNotOpen(file))
754 returnValue = LruInsert(file);
755 if (returnValue != 0)
758 else if (VfdCache[0].lruLessRecently != file)
761 * We now know that the file is open and that it is not the last one
762 * accessed, so we need to move it to the head of the Lru ring.
773 * Called when we get a shared invalidation message on some relation.
777 FileInvalidate(File file)
779 Assert(FileIsValid(file));
780 if (!FileIsNotOpen(file))
786 * open a file in an arbitrary directory
788 * NB: if the passed pathname is relative (which it usually is),
789 * it will be interpreted relative to the process' working directory
790 * (which should always be $PGDATA when this code is running).
793 PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
799 DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
800 fileName, fileFlags, fileMode));
803 * We need a malloc'd copy of the file name; fail cleanly if no room.
805 fnamecopy = strdup(fileName);
806 if (fnamecopy == NULL)
808 (errcode(ERRCODE_OUT_OF_MEMORY),
809 errmsg("out of memory")));
811 file = AllocateVfd();
812 vfdP = &VfdCache[file];
814 while (nfile + numAllocatedDescs >= max_safe_fds)
816 if (!ReleaseLruFile())
820 vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
829 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
834 vfdP->fileName = fnamecopy;
835 /* Saved flags are adjusted to be OK for re-opening file */
836 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
837 vfdP->fileMode = fileMode;
845 * Open a temporary file that will disappear when we close it.
847 * This routine takes care of generating an appropriate tempfile name.
848 * There's no need to pass in fileFlags or fileMode either, since only
849 * one setting makes any sense for a temp file.
851 * interXact: if true, don't close the file at end-of-transaction. In
852 * most cases, you don't want temporary files to outlive the transaction
853 * that created them, so this should be false -- but if you need
854 * "somewhat" temporary storage, this might be useful. In either case,
855 * the file is removed when the File is explicitly closed.
858 OpenTemporaryFile(bool interXact)
863 * If some temp tablespace(s) have been given to us, try to use the next
864 * one. If a given tablespace can't be found, we silently fall back to
865 * the database's default tablespace.
867 * BUT: if the temp file is slated to outlive the current transaction,
868 * force it into the database's default tablespace, so that it will not
869 * pose a threat to possible tablespace drop attempts.
871 if (numTempTableSpaces > 0 && !interXact)
873 Oid tblspcOid = GetNextTempTableSpace();
875 if (OidIsValid(tblspcOid))
876 file = OpenTemporaryFileInTablespace(tblspcOid, false);
880 * If not, or if tablespace is bad, create in database's default
881 * tablespace. MyDatabaseTableSpace should normally be set before we get
882 * here, but just in case it isn't, fall back to pg_default tablespace.
885 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
886 MyDatabaseTableSpace :
887 DEFAULTTABLESPACE_OID,
890 /* Mark it for deletion at close */
891 VfdCache[file].fdstate |= FD_TEMPORARY;
893 /* Mark it for deletion at EOXact */
896 VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
897 VfdCache[file].create_subid = GetCurrentSubTransactionId();
899 /* ensure cleanup happens at eoxact */
900 have_xact_temporary_files = true;
907 * Open a temporary file in a specific tablespace.
908 * Subroutine for OpenTemporaryFile, which see for details.
911 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
913 char tempdirpath[MAXPGPATH];
914 char tempfilepath[MAXPGPATH];
918 * Identify the tempfile directory for this tablespace.
920 * If someone tries to specify pg_global, use pg_default instead.
922 if (tblspcOid == DEFAULTTABLESPACE_OID ||
923 tblspcOid == GLOBALTABLESPACE_OID)
925 /* The default tablespace is {datadir}/base */
926 snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
931 /* All other tablespaces are accessed via symlinks */
932 snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s",
933 tblspcOid, PG_TEMP_FILES_DIR);
937 * Generate a tempfile name that should be unique within the current
940 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
941 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
944 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
945 * temp file that can be reused.
947 file = PathNameOpenFile(tempfilepath,
948 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
953 * We might need to create the tablespace's tempfile directory, if no
954 * one has yet done so.
956 * Don't check for error from mkdir; it could fail if someone else
957 * just did the same thing. If it doesn't work then we'll bomb out on
958 * the second create attempt, instead.
960 mkdir(tempdirpath, S_IRWXU);
962 file = PathNameOpenFile(tempfilepath,
963 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
965 if (file <= 0 && rejectError)
966 elog(ERROR, "could not create temporary file \"%s\": %m",
974 * close a file when done with it
980 struct stat filestats;
982 Assert(FileIsValid(file));
984 DO_DB(elog(LOG, "FileClose: %d (%s)",
985 file, VfdCache[file].fileName));
987 vfdP = &VfdCache[file];
989 if (!FileIsNotOpen(file))
991 /* remove the file from the lru ring */
996 elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
999 vfdP->fd = VFD_CLOSED;
1003 * Delete the file if it was temporary
1005 if (vfdP->fdstate & FD_TEMPORARY)
1007 /* reset flag so that die() interrupt won't cause problems */
1008 vfdP->fdstate &= ~FD_TEMPORARY;
1009 if (log_temp_files >= 0)
1011 if (stat(vfdP->fileName, &filestats) == 0)
1013 if (filestats.st_size >= log_temp_files)
1015 (errmsg("temporary file: path \"%s\", size %lu",
1017 (unsigned long) filestats.st_size)));
1020 elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1022 if (unlink(vfdP->fileName))
1023 elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1027 * Return the Vfd slot to the free list
1033 FileRead(File file, char *buffer, int amount)
1037 Assert(FileIsValid(file));
1039 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1040 file, VfdCache[file].fileName,
1041 (int64) VfdCache[file].seekPos,
1044 returnCode = FileAccess(file);
1049 returnCode = read(VfdCache[file].fd, buffer, amount);
1051 if (returnCode >= 0)
1052 VfdCache[file].seekPos += returnCode;
1056 * Windows may run out of kernel buffers and return "Insufficient
1057 * system resources" error. Wait a bit and retry to solve it.
1059 * It is rumored that EINTR is also possible on some Unix filesystems,
1060 * in which case immediate retry is indicated.
1063 DWORD error = GetLastError();
1067 case ERROR_NO_SYSTEM_RESOURCES:
1076 /* OK to retry if interrupted */
1080 /* Trouble, so assume we don't know the file position anymore */
1081 VfdCache[file].seekPos = FileUnknownPos;
1088 FileWrite(File file, char *buffer, int amount)
1092 Assert(FileIsValid(file));
1094 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1095 file, VfdCache[file].fileName,
1096 (int64) VfdCache[file].seekPos,
1099 returnCode = FileAccess(file);
1105 returnCode = write(VfdCache[file].fd, buffer, amount);
1107 /* if write didn't set errno, assume problem is no disk space */
1108 if (returnCode != amount && errno == 0)
1111 if (returnCode >= 0)
1112 VfdCache[file].seekPos += returnCode;
1116 * See comments in FileRead()
1119 DWORD error = GetLastError();
1123 case ERROR_NO_SYSTEM_RESOURCES:
1132 /* OK to retry if interrupted */
1136 /* Trouble, so assume we don't know the file position anymore */
1137 VfdCache[file].seekPos = FileUnknownPos;
1148 Assert(FileIsValid(file));
1150 DO_DB(elog(LOG, "FileSync: %d (%s)",
1151 file, VfdCache[file].fileName));
1153 returnCode = FileAccess(file);
1157 return pg_fsync(VfdCache[file].fd);
1161 FileSeek(File file, off_t offset, int whence)
1165 Assert(FileIsValid(file));
1167 DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1168 file, VfdCache[file].fileName,
1169 (int64) VfdCache[file].seekPos,
1170 (int64) offset, whence));
1172 if (FileIsNotOpen(file))
1178 elog(ERROR, "invalid seek offset: " INT64_FORMAT,
1180 VfdCache[file].seekPos = offset;
1183 VfdCache[file].seekPos += offset;
1186 returnCode = FileAccess(file);
1189 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1193 elog(ERROR, "invalid whence: %d", whence);
1203 elog(ERROR, "invalid seek offset: " INT64_FORMAT,
1205 if (VfdCache[file].seekPos != offset)
1206 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1210 if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
1211 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1215 VfdCache[file].seekPos = lseek(VfdCache[file].fd,
1219 elog(ERROR, "invalid whence: %d", whence);
1223 return VfdCache[file].seekPos;
1227 * XXX not actually used but here for completeness
1233 Assert(FileIsValid(file));
1234 DO_DB(elog(LOG, "FileTell %d (%s)",
1235 file, VfdCache[file].fileName));
1236 return VfdCache[file].seekPos;
1241 FileTruncate(File file, off_t offset)
1245 Assert(FileIsValid(file));
1247 DO_DB(elog(LOG, "FileTruncate %d (%s)",
1248 file, VfdCache[file].fileName));
1250 returnCode = FileAccess(file);
1254 returnCode = ftruncate(VfdCache[file].fd, offset);
1260 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
1261 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
1262 * necessary to open the file. When done, call FreeFile rather than fclose.
1264 * Note that files that will be open for any significant length of time
1265 * should NOT be handled this way, since they cannot share kernel file
1266 * descriptors with other files; there is grave risk of running out of FDs
1267 * if anyone locks down too many FDs. Most callers of this routine are
1268 * simply reading a config file that they will read and close immediately.
1270 * fd.c will automatically close all files opened with AllocateFile at
1271 * transaction commit or abort; this prevents FD leakage if a routine
1272 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
1274 * Ideally this should be the *only* direct call of fopen() in the backend.
1277 AllocateFile(const char *name, const char *mode)
1281 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
1282 numAllocatedDescs, name));
1285 * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
1286 * allocatedDescs[]; the test against max_safe_fds prevents AllocateFile
1287 * from hogging every one of the available FDs, which'd lead to infinite
1290 if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
1291 numAllocatedDescs >= max_safe_fds - 1)
1292 elog(ERROR, "too many private files demanded");
1295 if ((file = fopen(name, mode)) != NULL)
1297 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
1299 desc->kind = AllocateDescFile;
1300 desc->desc.file = file;
1301 desc->create_subid = GetCurrentSubTransactionId();
1302 numAllocatedDescs++;
1303 return desc->desc.file;
1306 if (errno == EMFILE || errno == ENFILE)
1308 int save_errno = errno;
1311 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1312 errmsg("out of file descriptors: %m; release and retry")));
1314 if (ReleaseLruFile())
1323 * Free an AllocateDesc of either type.
1325 * The argument *must* point into the allocatedDescs[] array.
1328 FreeDesc(AllocateDesc *desc)
1332 /* Close the underlying object */
1335 case AllocateDescFile:
1336 result = fclose(desc->desc.file);
1338 case AllocateDescDir:
1339 result = closedir(desc->desc.dir);
1342 elog(ERROR, "AllocateDesc kind not recognized");
1343 result = 0; /* keep compiler quiet */
1347 /* Compact storage in the allocatedDescs array */
1348 numAllocatedDescs--;
1349 *desc = allocatedDescs[numAllocatedDescs];
1355 * Close a file returned by AllocateFile.
1357 * Note we do not check fclose's return value --- it is up to the caller
1358 * to handle close errors.
1361 FreeFile(FILE *file)
1365 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
1367 /* Remove file from list of allocated files, if it's present */
1368 for (i = numAllocatedDescs; --i >= 0;)
1370 AllocateDesc *desc = &allocatedDescs[i];
1372 if (desc->kind == AllocateDescFile && desc->desc.file == file)
1373 return FreeDesc(desc);
1376 /* Only get here if someone passes us a file not in allocatedDescs */
1377 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
1379 return fclose(file);
1384 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
1385 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
1386 * necessary to open the directory, and with closing it after an elog.
1387 * When done, call FreeDir rather than closedir.
1389 * Ideally this should be the *only* direct call of opendir() in the backend.
1392 AllocateDir(const char *dirname)
1396 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
1397 numAllocatedDescs, dirname));
1400 * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
1401 * allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
1402 * from hogging every one of the available FDs, which'd lead to infinite
1405 if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
1406 numAllocatedDescs >= max_safe_fds - 1)
1407 elog(ERROR, "too many private dirs demanded");
1410 if ((dir = opendir(dirname)) != NULL)
1412 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
1414 desc->kind = AllocateDescDir;
1415 desc->desc.dir = dir;
1416 desc->create_subid = GetCurrentSubTransactionId();
1417 numAllocatedDescs++;
1418 return desc->desc.dir;
1421 if (errno == EMFILE || errno == ENFILE)
1423 int save_errno = errno;
1426 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1427 errmsg("out of file descriptors: %m; release and retry")));
1429 if (ReleaseLruFile())
1438 * Read a directory opened with AllocateDir, ereport'ing any error.
1440 * This is easier to use than raw readdir() since it takes care of some
1441 * otherwise rather tedious and error-prone manipulation of errno. Also,
1442 * if you are happy with a generic error message for AllocateDir failure,
1445 * dir = AllocateDir(path);
1446 * while ((dirent = ReadDir(dir, path)) != NULL)
1450 * since a NULL dir parameter is taken as indicating AllocateDir failed.
1451 * (Make sure errno hasn't been changed since AllocateDir if you use this
1454 * The pathname passed to AllocateDir must be passed to this routine too,
1455 * but it is only used for error reporting.
1458 ReadDir(DIR *dir, const char *dirname)
1460 struct dirent *dent;
1462 /* Give a generic message for AllocateDir failure, if caller didn't */
1465 (errcode_for_file_access(),
1466 errmsg("could not open directory \"%s\": %m",
1470 if ((dent = readdir(dir)) != NULL)
1476 * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
1479 if (GetLastError() == ERROR_NO_MORE_FILES)
1485 (errcode_for_file_access(),
1486 errmsg("could not read directory \"%s\": %m",
1492 * Close a directory opened with AllocateDir.
1494 * Note we do not check closedir's return value --- it is up to the caller
1495 * to handle close errors.
1502 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
1504 /* Remove dir from list of allocated dirs, if it's present */
1505 for (i = numAllocatedDescs; --i >= 0;)
1507 AllocateDesc *desc = &allocatedDescs[i];
1509 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
1510 return FreeDesc(desc);
1513 /* Only get here if someone passes us a dir not in allocatedDescs */
1514 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
1516 return closedir(dir);
1523 * Force all VFDs into the physically-closed state, so that the fewest
1524 * possible number of kernel file descriptors are in use. There is no
1525 * change in the logical state of the VFDs.
1532 if (SizeVfdCache > 0)
1534 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
1535 for (i = 1; i < SizeVfdCache; i++)
1537 if (!FileIsNotOpen(i))
1545 * SetTempTablespaces
1547 * Define a list (actually an array) of OIDs of tablespaces to use for
1548 * temporary files. This list will be used until end of transaction,
1549 * unless this function is called again before then. It is caller's
1550 * responsibility that the passed-in array has adequate lifespan (typically
1551 * it'd be allocated in TopTransactionContext).
1554 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
1556 Assert(numSpaces >= 0);
1557 tempTableSpaces = tableSpaces;
1558 numTempTableSpaces = numSpaces;
1561 * Select a random starting point in the list. This is to minimize
1562 * conflicts between backends that are most likely sharing the same list
1563 * of temp tablespaces. Note that if we create multiple temp files in the
1564 * same transaction, we'll advance circularly through the list --- this
1565 * ensures that large temporary sort files are nicely spread across all
1566 * available tablespaces.
1569 nextTempTableSpace = random() % numSpaces;
1571 nextTempTableSpace = 0;
1575 * TempTablespacesAreSet
1577 * Returns TRUE if SetTempTablespaces has been called in current transaction.
1578 * (This is just so that tablespaces.c doesn't need its own per-transaction
1582 TempTablespacesAreSet(void)
1584 return (numTempTableSpaces >= 0);
1588 * GetNextTempTableSpace
1590 * Select the next temp tablespace to use. A result of InvalidOid means
1591 * to use the current database's default tablespace.
1594 GetNextTempTableSpace(void)
1596 if (numTempTableSpaces > 0)
1598 /* Advance nextTempTableSpace counter with wraparound */
1599 if (++nextTempTableSpace >= numTempTableSpaces)
1600 nextTempTableSpace = 0;
1601 return tempTableSpaces[nextTempTableSpace];
1610 * Take care of subtransaction commit/abort. At abort, we close temp files
1611 * that the subtransaction may have opened. At commit, we reassign the
1612 * files that were opened to the parent subtransaction.
1615 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
1616 SubTransactionId parentSubid)
1620 if (have_xact_temporary_files)
1622 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
1623 for (i = 1; i < SizeVfdCache; i++)
1625 unsigned short fdstate = VfdCache[i].fdstate;
1627 if ((fdstate & FD_XACT_TEMPORARY) &&
1628 VfdCache[i].create_subid == mySubid)
1631 VfdCache[i].create_subid = parentSubid;
1632 else if (VfdCache[i].fileName != NULL)
1638 for (i = 0; i < numAllocatedDescs; i++)
1640 if (allocatedDescs[i].create_subid == mySubid)
1643 allocatedDescs[i].create_subid = parentSubid;
1646 /* have to recheck the item after FreeDesc (ugly) */
1647 FreeDesc(&allocatedDescs[i--]);
1656 * This routine is called during transaction commit or abort (it doesn't
1657 * particularly care which). All still-open per-transaction temporary file
1658 * VFDs are closed, which also causes the underlying files to be
1659 * deleted. Furthermore, all "allocated" stdio files are closed.
1660 * We also forget any transaction-local temp tablespace list.
1663 AtEOXact_Files(void)
1665 CleanupTempFiles(false);
1666 tempTableSpaces = NULL;
1667 numTempTableSpaces = -1;
1673 * on_proc_exit hook to clean up temp files during backend shutdown.
1674 * Here, we want to clean up *all* temp files including interXact ones.
1677 AtProcExit_Files(int code, Datum arg)
1679 CleanupTempFiles(true);
1683 * Close temporary files and delete their underlying files.
1685 * isProcExit: if true, this is being called as the backend process is
1686 * exiting. If that's the case, we should remove all temporary files; if
1687 * that's not the case, we are being called for transaction commit/abort
1688 * and should only remove transaction-local temp files. In either case,
1689 * also clean up "allocated" stdio files and dirs.
1692 CleanupTempFiles(bool isProcExit)
1697 * Careful here: at proc_exit we need extra cleanup, not just
1698 * xact_temporary files.
1700 if (isProcExit || have_xact_temporary_files)
1702 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
1703 for (i = 1; i < SizeVfdCache; i++)
1705 unsigned short fdstate = VfdCache[i].fdstate;
1707 if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
1710 * If we're in the process of exiting a backend process, close
1711 * all temporary files. Otherwise, only close temporary files
1712 * local to the current transaction.
1714 if (isProcExit || (fdstate & FD_XACT_TEMPORARY))
1719 have_xact_temporary_files = false;
1722 while (numAllocatedDescs > 0)
1723 FreeDesc(&allocatedDescs[0]);
1728 * Remove temporary files left over from a prior postmaster session
1730 * This should be called during postmaster startup. It will forcibly
1731 * remove any leftover files created by OpenTemporaryFile.
1733 * NOTE: we could, but don't, call this during a post-backend-crash restart
1734 * cycle. The argument for not doing it is that someone might want to examine
1735 * the temp files for debugging purposes. This does however mean that
1736 * OpenTemporaryFile had better allow for collision with an existing temp
1740 RemovePgTempFiles(void)
1742 char temp_path[MAXPGPATH];
1744 struct dirent *spc_de;
1747 * First process temp files in pg_default ($PGDATA/base)
1749 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
1750 RemovePgTempFilesInDir(temp_path);
1753 * Cycle through temp directories for all non-default tablespaces.
1755 spc_dir = AllocateDir("pg_tblspc");
1757 while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
1759 if (strcmp(spc_de->d_name, ".") == 0 ||
1760 strcmp(spc_de->d_name, "..") == 0)
1763 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
1764 spc_de->d_name, PG_TEMP_FILES_DIR);
1765 RemovePgTempFilesInDir(temp_path);
1771 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
1775 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
1779 /* Process one pgsql_tmp directory for RemovePgTempFiles */
1781 RemovePgTempFilesInDir(const char *tmpdirname)
1784 struct dirent *temp_de;
1785 char rm_path[MAXPGPATH];
1787 temp_dir = AllocateDir(tmpdirname);
1788 if (temp_dir == NULL)
1790 /* anything except ENOENT is fishy */
1791 if (errno != ENOENT)
1793 "could not open temporary-files directory \"%s\": %m",
1798 while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
1800 if (strcmp(temp_de->d_name, ".") == 0 ||
1801 strcmp(temp_de->d_name, "..") == 0)
1804 snprintf(rm_path, sizeof(rm_path), "%s/%s",
1805 tmpdirname, temp_de->d_name);
1807 if (strncmp(temp_de->d_name,
1808 PG_TEMP_FILE_PREFIX,
1809 strlen(PG_TEMP_FILE_PREFIX)) == 0)
1810 unlink(rm_path); /* note we ignore any error */
1813 "unexpected file found in temporary-files directory: \"%s\"",