1 /*-------------------------------------------------------------------------
4 * Virtual file descriptor code.
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * src/backend/storage/file/fd.c
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 256 on many modern
20 * operating systems, but can be as low as 32 on others.)
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a file is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them; there is no automatic mechanism in fd.c for that.
42 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
43 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
44 * They behave like the corresponding native functions, except that the handle
45 * is registered with the current subtransaction, and will be automatically
46 * closed at abort. These are intended mainly for short operations like
47 * reading a configuration file; there is a limit on the number of files that
48 * can be opened using these functions at any one time.
50 * Finally, BasicOpenFile is just a thin wrapper around open() that can
51 * release file descriptors in use by the virtual file descriptors if
52 * necessary. There is no automatic cleanup of file descriptors returned by
53 * BasicOpenFile; it is solely the caller's responsibility to close the file
54 * descriptor by calling close(2).
56 *-------------------------------------------------------------------------
62 #include <sys/param.h>
70 #ifdef HAVE_SYS_RESOURCE_H
71 #include <sys/resource.h> /* for getrlimit */
74 #include "miscadmin.h"
75 #include "access/xact.h"
76 #include "access/xlog.h"
77 #include "catalog/catalog.h"
78 #include "catalog/pg_tablespace.h"
80 #include "portability/mem.h"
81 #include "storage/fd.h"
82 #include "storage/ipc.h"
83 #include "utils/guc.h"
84 #include "utils/resowner_private.h"
87 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
88 #if defined(HAVE_SYNC_FILE_RANGE)
89 #define PG_FLUSH_DATA_WORKS 1
90 #elif !defined(WIN32) && defined(MS_ASYNC)
91 #define PG_FLUSH_DATA_WORKS 1
92 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
93 #define PG_FLUSH_DATA_WORKS 1
97 * We must leave some file descriptors free for system(), the dynamic loader,
98 * and other code that tries to open files without consulting fd.c. This
99 * is the number left free. (While we can be pretty sure we won't get
100 * EMFILE, there's never any guarantee that we won't get ENFILE due to
101 * other processes chewing up FDs. So it's a bad idea to try to open files
102 * without consulting fd.c. Nonetheless we cannot control all code.)
104 * Because this is just a fixed setting, we are effectively assuming that
105 * no such code will leave FDs open over the long term; otherwise the slop
106 * is likely to be insufficient. Note in particular that we expect that
107 * loading a shared library does not result in any permanent increase in
108 * the number of open files. (This appears to be true on most if not
109 * all platforms as of Feb 2004.)
111 #define NUM_RESERVED_FDS 10
114 * If we have fewer than this many usable FDs after allowing for the reserved
117 #define FD_MINFREE 10
120 * Default mode for created files, unless something else is specified using
121 * the *Perm() function variants.
123 #define PG_FILE_MODE_DEFAULT (S_IRUSR | S_IWUSR)
126 * A number of platforms allow individual processes to open many more files
127 * than they can really support when *many* processes do the same thing.
128 * This GUC parameter lets the DBA limit max_safe_fds to something less than
129 * what the postmaster's initial probe suggests will work.
131 int max_files_per_process = 1000;
134 * Maximum number of file descriptors to open for either VFD entries or
135 * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
136 * to a conservative value, and remains that way indefinitely in bootstrap or
137 * standalone-backend cases. In normal postmaster operation, the postmaster
138 * calls set_max_safe_fds() late in initialization to update the value, and
139 * that value is then inherited by forked subprocesses.
141 * Note: the value of max_files_per_process is taken into account while
142 * setting this variable, and so need not be tested separately.
144 int max_safe_fds = 32; /* default if not changed */
152 int _do_db_save_errno = errno; \
154 errno = _do_db_save_errno; \
161 #define VFD_CLOSED (-1)
163 #define FileIsValid(file) \
164 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
166 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
169 * Note: a VFD's seekPos is normally always valid, but if for some reason
170 * an lseek() fails, it might become set to FileUnknownPos. We can struggle
171 * along without knowing the seek position in many cases, but in some places
172 * we have to fail if we don't have it.
174 #define FileUnknownPos ((off_t) -1)
175 #define FilePosIsUnknown(pos) ((pos) < 0)
177 /* these are the assigned bits in fdstate below: */
178 #define FD_TEMPORARY (1 << 0) /* T = delete when closed */
179 #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
183 int fd; /* current FD, or VFD_CLOSED if none */
184 unsigned short fdstate; /* bitflags for VFD's state */
185 ResourceOwner resowner; /* owner, for automatic cleanup */
186 File nextFree; /* link to next free VFD, if in freelist */
187 File lruMoreRecently; /* doubly linked recency-of-use list */
188 File lruLessRecently;
189 off_t seekPos; /* current logical file position, or -1 */
190 off_t fileSize; /* current size of file (0 if not temporary) */
191 char *fileName; /* name of file, or NULL for unused VFD */
192 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
193 int fileFlags; /* open(2) flags for (re)opening the file */
194 mode_t fileMode; /* mode to pass to open(2) */
198 * Virtual File Descriptor array pointer and size. This grows as
199 * needed. 'File' values are indexes into this array.
200 * Note that VfdCache[0] is not a usable VFD, just a list header.
202 static Vfd *VfdCache;
203 static Size SizeVfdCache = 0;
206 * Number of file descriptors known to be in use by VFD entries.
208 static int nfile = 0;
211 * Flag to tell whether it's worth scanning VfdCache looking for temp files
214 static bool have_xact_temporary_files = false;
217 * Tracks the total size of all temporary files. Note: when temp_file_limit
218 * is being enforced, this cannot overflow since the limit cannot be more
219 * than INT_MAX kilobytes. When not enforcing, it could theoretically
220 * overflow, but we don't care.
222 static uint64 temporary_files_size = 0;
225 * List of OS handles opened with AllocateFile, AllocateDir and
238 AllocateDescKind kind;
239 SubTransactionId create_subid;
248 static int numAllocatedDescs = 0;
249 static int maxAllocatedDescs = 0;
250 static AllocateDesc *allocatedDescs = NULL;
253 * Number of temporary files opened during the current session;
254 * this is used in generation of tempfile names.
256 static long tempFileCounter = 0;
259 * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
260 * this has not been set in the current transaction.
262 static Oid *tempTableSpaces = NULL;
263 static int numTempTableSpaces = -1;
264 static int nextTempTableSpace = 0;
267 /*--------------------
271 * Delete - delete a file from the Lru ring
272 * LruDelete - remove a file from the Lru ring and close its FD
273 * Insert - put a file at the front of the Lru ring
274 * LruInsert - put a file at the front of the Lru ring and open it
275 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
276 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
277 * AllocateVfd - grab a free (or new) file record (from VfdCache)
278 * FreeVfd - free a file record
280 * The Least Recently Used ring is a doubly linked list that begins and
281 * ends on element zero. Element zero is special -- it doesn't represent
282 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
283 * anchor that shows us the beginning/end of the ring.
284 * Only VFD elements that are currently really open (have an FD assigned) are
285 * in the Lru ring. Elements that are "virtually" open can be recognized
286 * by having a non-null fileName field.
290 * /--less----\ /---------\
292 * #0 --more---> LeastRecentlyUsed --more-\ \
294 * \\less--> MostRecentlyUsedFile <---/ |
295 * \more---/ \--less--/
297 *--------------------
299 static void Delete(File file);
300 static void LruDelete(File file);
301 static void Insert(File file);
302 static int LruInsert(File file);
303 static bool ReleaseLruFile(void);
304 static void ReleaseLruFiles(void);
305 static File AllocateVfd(void);
306 static void FreeVfd(File file);
308 static int FileAccess(File file);
309 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
310 static bool reserveAllocatedDesc(void);
311 static int FreeDesc(AllocateDesc *desc);
312 static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
314 static void AtProcExit_Files(int code, Datum arg);
315 static void CleanupTempFiles(bool isProcExit);
316 static void RemovePgTempFilesInDir(const char *tmpdirname);
317 static void RemovePgTempRelationFiles(const char *tsdirname);
318 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
319 static bool looks_like_temp_rel_name(const char *name);
321 static void walkdir(const char *path,
322 void (*action) (const char *fname, bool isdir, int elevel),
323 bool process_symlinks,
325 #ifdef PG_FLUSH_DATA_WORKS
326 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
328 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
330 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
331 static int fsync_parent_path(const char *fname, int elevel);
335 * pg_fsync --- do fsync with or without writethrough
340 /* #if is to skip the sync_method test if there's no need for it */
341 #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
342 if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
343 return pg_fsync_writethrough(fd);
346 return pg_fsync_no_writethrough(fd);
351 * pg_fsync_no_writethrough --- same as fsync except does nothing if
355 pg_fsync_no_writethrough(int fd)
364 * pg_fsync_writethrough
367 pg_fsync_writethrough(int fd)
373 #elif defined(F_FULLFSYNC)
374 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
385 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
387 * Not all platforms have fdatasync; treat as fsync if not available.
394 #ifdef HAVE_FDATASYNC
395 return fdatasync(fd);
405 * pg_flush_data --- advise OS that the described dirty data should be flushed
407 * offset of 0 with nbytes 0 means that the entire file should be flushed;
408 * in this case, this function may have side-effects on the file's
412 pg_flush_data(int fd, off_t offset, off_t nbytes)
415 * Right now file flushing is primarily used to make later
416 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
417 * if fsyncs are disabled - that's a decision we might want to make
418 * configurable at some point.
424 * We compile all alternatives that are supported on the current platform,
425 * to find portability problems more easily.
427 #if defined(HAVE_SYNC_FILE_RANGE)
432 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
433 * tells the OS that writeback for the specified blocks should be
434 * started, but that we don't want to wait for completion. Note that
435 * this call might block if too much dirty data exists in the range.
436 * This is the preferable method on OSs supporting it, as it works
437 * reliably when available (in contrast to msync()) and doesn't flush out
438 * clean data (like FADV_DONTNEED).
440 rc = sync_file_range(fd, offset, nbytes,
441 SYNC_FILE_RANGE_WRITE);
443 /* don't error out, this is just a performance optimization */
447 (errcode_for_file_access(),
448 errmsg("could not flush dirty data: %m")));
454 #if !defined(WIN32) && defined(MS_ASYNC)
457 static int pagesize = 0;
460 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
461 * writeback. On linux it only does so if MS_SYNC is specified, but
462 * then it does the writeback synchronously. Luckily all common linux
463 * systems have sync_file_range(). This is preferable over
464 * FADV_DONTNEED because it doesn't flush out clean data.
466 * We map the file (mmap()), tell the kernel to sync back the contents
467 * (msync()), and then remove the mapping again (munmap()).
470 /* mmap() needs actual length if we want to map whole file */
471 if (offset == 0 && nbytes == 0)
473 nbytes = lseek(fd, 0, SEEK_END);
477 (errcode_for_file_access(),
478 errmsg("could not determine dirty data size: %m")));
484 * Some platforms reject partial-page mmap() attempts. To deal with
485 * that, just truncate the request to a page boundary. If any extra
486 * bytes don't get flushed, well, it's only a hint anyway.
489 /* fetch pagesize only once */
491 pagesize = sysconf(_SC_PAGESIZE);
493 /* align length to pagesize, dropping any fractional page */
495 nbytes = (nbytes / pagesize) * pagesize;
497 /* fractional-page request is a no-op */
502 * mmap could well fail, particularly on 32-bit platforms where there
503 * may simply not be enough address space. If so, silently fall
504 * through to the next implementation.
506 if (nbytes <= (off_t) SSIZE_MAX)
507 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
515 rc = msync(p, (size_t) nbytes, MS_ASYNC);
519 (errcode_for_file_access(),
520 errmsg("could not flush dirty data: %m")));
521 /* NB: need to fall through to munmap()! */
524 rc = munmap(p, (size_t) nbytes);
527 /* FATAL error because mapping would remain */
529 (errcode_for_file_access(),
530 errmsg("could not munmap() while flushing data: %m")));
537 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
542 * Signal the kernel that the passed in range should not be cached
543 * anymore. This has the desired side effect of writing out dirty
544 * data, and the undesired side effect of likely discarding useful
545 * clean cached blocks. For the latter reason this is the least
549 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
553 /* don't error out, this is just a performance optimization */
555 (errcode_for_file_access(),
556 errmsg("could not flush dirty data: %m")));
566 * fsync_fname -- fsync a file or directory, handling errors properly
568 * Try to fsync a file or directory. When doing the latter, ignore errors that
569 * indicate the OS just doesn't allow/require fsyncing directories.
572 fsync_fname(const char *fname, bool isdir)
574 fsync_fname_ext(fname, isdir, false, ERROR);
578 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
580 * This routine ensures that, after returning, the effect of renaming the file
581 * persists in case of a crash. A crash while this routine is running will
582 * leave you with either the pre-existing or the moved file in place of the
583 * new file; no mixed state or truncated files are possible.
585 * It does so by using fsync on the old filename and the possibly existing
586 * target filename before the rename, and the target file and directory after.
588 * Note that rename() cannot be used across arbitrary directories, as they
589 * might not be on the same filesystem. Therefore this routine does not
590 * support renaming across directories.
592 * Log errors with the caller specified severity.
594 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
598 durable_rename(const char *oldfile, const char *newfile, int elevel)
603 * First fsync the old and target path (if it exists), to ensure that they
604 * are properly persistent on disk. Syncing the target file is not
605 * strictly necessary, but it makes it easier to reason about crashes;
606 * because it's then guaranteed that either source or target file exists
609 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
612 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
618 (errcode_for_file_access(),
619 errmsg("could not open file \"%s\": %m", newfile)));
625 if (pg_fsync(fd) != 0)
629 /* close file upon error, might not be in transaction context */
631 CloseTransientFile(fd);
635 (errcode_for_file_access(),
636 errmsg("could not fsync file \"%s\": %m", newfile)));
639 CloseTransientFile(fd);
642 /* Time to do the real deal... */
643 if (rename(oldfile, newfile) < 0)
646 (errcode_for_file_access(),
647 errmsg("could not rename file \"%s\" to \"%s\": %m",
653 * To guarantee renaming the file is persistent, fsync the file with its
654 * new name, and its containing directory.
656 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
659 if (fsync_parent_path(newfile, elevel) != 0)
666 * durable_unlink -- remove a file in a durable manner
668 * This routine ensures that, after returning, the effect of removing the file
669 * persists in case of a crash. A crash while this routine is running will
670 * not leave the system in a mixed state.
672 * It does so by using fsync on the parent directory of the file after the
673 * actual removal is done.
675 * Log errors with the severity specified by caller.
677 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
681 durable_unlink(const char *fname, int elevel)
683 if (unlink(fname) < 0)
686 (errcode_for_file_access(),
687 errmsg("could not remove file \"%s\": %m",
693 * To guarantee that the removal of the file is persistent, fsync its
696 if (fsync_parent_path(fname, elevel) != 0)
703 * durable_link_or_rename -- rename a file in a durable manner.
705 * Similar to durable_rename(), except that this routine tries (but does not
706 * guarantee) not to overwrite the target file.
708 * Note that a crash in an unfortunate moment can leave you with two links to
711 * Log errors with the caller specified severity.
713 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
717 durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
720 * Ensure that, if we crash directly after the rename/link, a file with
721 * valid contents is moved into place.
723 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
726 #if HAVE_WORKING_LINK
727 if (link(oldfile, newfile) < 0)
730 (errcode_for_file_access(),
731 errmsg("could not link file \"%s\" to \"%s\": %m",
737 /* XXX: Add racy file existence check? */
738 if (rename(oldfile, newfile) < 0)
741 (errcode_for_file_access(),
742 errmsg("could not rename file \"%s\" to \"%s\": %m",
749 * Make the change persistent in case of an OS crash: both the new entry and
750 * its parent directory need to be flushed.
752 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
755 /* Same for parent directory */
756 if (fsync_parent_path(newfile, elevel) != 0)
763 * InitFileAccess --- initialize this module during backend startup
765 * This is called during either normal or standalone backend start.
766 * It is *not* called in the postmaster.
771 Assert(SizeVfdCache == 0); /* call me only once */
773 /* initialize cache header entry */
774 VfdCache = (Vfd *) malloc(sizeof(Vfd));
775 if (VfdCache == NULL)
777 (errcode(ERRCODE_OUT_OF_MEMORY),
778 errmsg("out of memory")));
780 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
781 VfdCache->fd = VFD_CLOSED;
785 /* register proc-exit hook to ensure temp files are dropped at exit */
786 on_proc_exit(AtProcExit_Files, 0);
790 * count_usable_fds --- count how many FDs the system will let us open,
791 * and estimate how many are already open.
793 * We stop counting if usable_fds reaches max_to_probe. Note: a small
794 * value of max_to_probe might result in an underestimate of already_open;
795 * we must fill in any "gaps" in the set of used FDs before the calculation
796 * of already_open will give the right answer. In practice, max_to_probe
797 * of a couple of dozen should be enough to ensure good results.
799 * We assume stdin (FD 0) is available for dup'ing
802 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
810 #ifdef HAVE_GETRLIMIT
812 int getrlimit_status;
816 fd = (int *) palloc(size * sizeof(int));
818 #ifdef HAVE_GETRLIMIT
819 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
820 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
821 #else /* but BSD doesn't ... */
822 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
823 #endif /* RLIMIT_NOFILE */
824 if (getrlimit_status != 0)
825 ereport(WARNING, (errmsg("getrlimit failed: %m")));
826 #endif /* HAVE_GETRLIMIT */
828 /* dup until failure or probe limit reached */
833 #ifdef HAVE_GETRLIMIT
836 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
839 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
846 /* Expect EMFILE or ENFILE, else it's fishy */
847 if (errno != EMFILE && errno != ENFILE)
848 elog(WARNING, "dup(0) failed after %d successes: %m", used);
855 fd = (int *) repalloc(fd, size * sizeof(int));
859 if (highestfd < thisfd)
862 if (used >= max_to_probe)
866 /* release the files we opened */
867 for (j = 0; j < used; j++)
873 * Return results. usable_fds is just the number of successful dups. We
874 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
875 * number) and so already_open is highestfd+1 - usable_fds.
878 *already_open = highestfd + 1 - used;
883 * Determine the number of file descriptors that fd.c is allowed to use
886 set_max_safe_fds(void)
892 * We want to set max_safe_fds to
893 * MIN(usable_fds, max_files_per_process - already_open)
894 * less the slop factor for files that are opened without consulting
895 * fd.c. This ensures that we won't exceed either max_files_per_process
896 * or the experimentally-determined EMFILE limit.
899 count_usable_fds(max_files_per_process,
900 &usable_fds, &already_open);
902 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
905 * Take off the FDs reserved for system() etc.
907 max_safe_fds -= NUM_RESERVED_FDS;
910 * Make sure we still have enough to get by.
912 if (max_safe_fds < FD_MINFREE)
914 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
915 errmsg("insufficient file descriptors available to start server process"),
916 errdetail("System allows %d, we need at least %d.",
917 max_safe_fds + NUM_RESERVED_FDS,
918 FD_MINFREE + NUM_RESERVED_FDS)));
920 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
921 max_safe_fds, usable_fds, already_open);
925 * Open a file with BasicOpenFilePerm() and pass default file mode for the
926 * fileMode parameter.
929 BasicOpenFile(const char *fileName, int fileFlags)
931 return BasicOpenFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
935 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
937 * This is exported for use by places that really want a plain kernel FD,
938 * but need to be proof against running out of FDs. Once an FD has been
939 * successfully returned, it is the caller's responsibility to ensure that
940 * it will not be leaked on ereport()! Most users should *not* call this
941 * routine directly, but instead use the VFD abstraction level, which
942 * provides protection against descriptor leaks as well as management of
943 * files that need to be open for more than a short period of time.
945 * Ideally this should be the *only* direct call of open() in the backend.
946 * In practice, the postmaster calls open() directly, and there are some
947 * direct open() calls done early in backend startup. Those are OK since
948 * this module wouldn't have any open files to close at that point anyway.
951 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
956 fd = open(fileName, fileFlags, fileMode);
959 return fd; /* success! */
961 if (errno == EMFILE || errno == ENFILE)
963 int save_errno = errno;
966 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
967 errmsg("out of file descriptors: %m; release and retry")));
969 if (ReleaseLruFile())
974 return -1; /* failure */
982 int mru = VfdCache[0].lruLessRecently;
983 Vfd *vfdP = &VfdCache[mru];
986 snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
989 mru = vfdP->lruLessRecently;
990 vfdP = &VfdCache[mru];
991 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
993 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
994 elog(LOG, "%s", buf);
1005 DO_DB(elog(LOG, "Delete %d (%s)",
1006 file, VfdCache[file].fileName));
1009 vfdP = &VfdCache[file];
1011 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1012 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1018 LruDelete(File file)
1024 DO_DB(elog(LOG, "LruDelete %d (%s)",
1025 file, VfdCache[file].fileName));
1027 vfdP = &VfdCache[file];
1030 * Normally we should know the seek position, but if for some reason we
1031 * have lost track of it, try again to get it. If we still can't get it,
1032 * we have a problem: we will be unable to restore the file seek position
1033 * when and if the file is re-opened. But we can't really throw an error
1034 * and refuse to close the file, or activities such as transaction cleanup
1037 if (FilePosIsUnknown(vfdP->seekPos))
1039 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1040 if (FilePosIsUnknown(vfdP->seekPos))
1041 elog(LOG, "could not seek file \"%s\" before closing: %m",
1046 * Close the file. We aren't expecting this to fail; if it does, better
1047 * to leak the FD than to mess up our internal state.
1049 if (close(vfdP->fd))
1050 elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1051 vfdP->fd = VFD_CLOSED;
1054 /* delete the vfd record from the LRU ring */
1065 DO_DB(elog(LOG, "Insert %d (%s)",
1066 file, VfdCache[file].fileName));
1069 vfdP = &VfdCache[file];
1071 vfdP->lruMoreRecently = 0;
1072 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1073 VfdCache[0].lruLessRecently = file;
1074 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1079 /* returns 0 on success, -1 on re-open failure (with errno set) */
1081 LruInsert(File file)
1087 DO_DB(elog(LOG, "LruInsert %d (%s)",
1088 file, VfdCache[file].fileName));
1090 vfdP = &VfdCache[file];
1092 if (FileIsNotOpen(file))
1094 /* Close excess kernel FDs. */
1098 * The open could still fail for lack of file descriptors, eg due to
1099 * overall system file table being full. So, be prepared to release
1100 * another FD if necessary...
1102 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1106 DO_DB(elog(LOG, "re-open failed: %m"));
1115 * Seek to the right position. We need no special case for seekPos
1116 * equal to FileUnknownPos, as lseek() will certainly reject that
1117 * (thus completing the logic noted in LruDelete() that we will fail
1118 * to re-open a file if we couldn't get its seek position before
1121 if (vfdP->seekPos != (off_t) 0)
1123 if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1126 * If we fail to restore the seek position, treat it like an
1129 int save_errno = errno;
1131 elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1133 (void) close(vfdP->fd);
1134 vfdP->fd = VFD_CLOSED;
1143 * put it at the head of the Lru ring
1152 * Release one kernel FD by closing the least-recently-used VFD.
1155 ReleaseLruFile(void)
1157 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1162 * There are opened files and so there should be at least one used vfd
1165 Assert(VfdCache[0].lruMoreRecently != 0);
1166 LruDelete(VfdCache[0].lruMoreRecently);
1167 return true; /* freed a file */
1169 return false; /* no files available to free */
1173 * Release kernel FDs as needed to get under the max_safe_fds limit.
1174 * After calling this, it's OK to try to open another file.
1177 ReleaseLruFiles(void)
1179 while (nfile + numAllocatedDescs >= max_safe_fds)
1181 if (!ReleaseLruFile())
1192 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1194 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1196 if (VfdCache[0].nextFree == 0)
1199 * The free list is empty so it is time to increase the size of the
1200 * array. We choose to double it each time this happens. However,
1201 * there's not much point in starting *real* small.
1203 Size newCacheSize = SizeVfdCache * 2;
1206 if (newCacheSize < 32)
1210 * Be careful not to clobber VfdCache ptr if realloc fails.
1212 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1213 if (newVfdCache == NULL)
1215 (errcode(ERRCODE_OUT_OF_MEMORY),
1216 errmsg("out of memory")));
1217 VfdCache = newVfdCache;
1220 * Initialize the new entries and link them into the free list.
1222 for (i = SizeVfdCache; i < newCacheSize; i++)
1224 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1225 VfdCache[i].nextFree = i + 1;
1226 VfdCache[i].fd = VFD_CLOSED;
1228 VfdCache[newCacheSize - 1].nextFree = 0;
1229 VfdCache[0].nextFree = SizeVfdCache;
1232 * Record the new size
1234 SizeVfdCache = newCacheSize;
1237 file = VfdCache[0].nextFree;
1239 VfdCache[0].nextFree = VfdCache[file].nextFree;
1247 Vfd *vfdP = &VfdCache[file];
1249 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1250 file, vfdP->fileName ? vfdP->fileName : ""));
1252 if (vfdP->fileName != NULL)
1254 free(vfdP->fileName);
1255 vfdP->fileName = NULL;
1257 vfdP->fdstate = 0x0;
1259 vfdP->nextFree = VfdCache[0].nextFree;
1260 VfdCache[0].nextFree = file;
1263 /* returns 0 on success, -1 on re-open failure (with errno set) */
1265 FileAccess(File file)
1269 DO_DB(elog(LOG, "FileAccess %d (%s)",
1270 file, VfdCache[file].fileName));
1273 * Is the file open? If not, open it and put it at the head of the LRU
1274 * ring (possibly closing the least recently used file to get an FD).
1277 if (FileIsNotOpen(file))
1279 returnValue = LruInsert(file);
1280 if (returnValue != 0)
1283 else if (VfdCache[0].lruLessRecently != file)
1286 * We now know that the file is open and that it is not the last one
1287 * accessed, so we need to move it to the head of the Lru ring.
1298 * Called when we get a shared invalidation message on some relation.
1302 FileInvalidate(File file)
1304 Assert(FileIsValid(file));
1305 if (!FileIsNotOpen(file))
1311 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1312 * fileMode parameter.
1315 PathNameOpenFile(const char *fileName, int fileFlags)
1317 return PathNameOpenFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
1321 * open a file in an arbitrary directory
1323 * NB: if the passed pathname is relative (which it usually is),
1324 * it will be interpreted relative to the process' working directory
1325 * (which should always be $PGDATA when this code is running).
/*
 * PathNameOpenFilePerm: open fileName through a newly allocated VFD.
 *
 * fileName  - path to open; a strdup copy is stored in the VFD.
 * fileFlags - open flags; the copy saved in the VFD has O_CREAT, O_TRUNC
 *             and O_EXCL masked off so the file can be re-opened later.
 * fileMode  - mode handed to BasicOpenFilePerm() and saved in the VFD.
 *
 * A failed strdup is reported with ERRCODE_OUT_OF_MEMORY.  The VFD comes
 * from AllocateVfd(); it ends up with fdstate 0 and a NULL resowner.
 * errno is captured in save_errno after the open attempt (presumably on
 * the failure path, so cleanup cannot clobber it).
 */
1328 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1334 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1335 fileName, fileFlags, fileMode));
1338 * We need a malloc'd copy of the file name; fail cleanly if no room.
1340 fnamecopy = strdup(fileName);
1341 if (fnamecopy == NULL)
1343 (errcode(ERRCODE_OUT_OF_MEMORY),
1344 errmsg("out of memory")));
1346 file = AllocateVfd();
1347 vfdP = &VfdCache[file];
1349 /* Close excess kernel FDs. */
1352 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1356 int save_errno = errno;
1364 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1369 vfdP->fileName = fnamecopy;
1370 /* Saved flags are adjusted to be OK for re-opening file */
1371 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1372 vfdP->fileMode = fileMode;
1375 vfdP->fdstate = 0x0;
1376 vfdP->resowner = NULL;
1382 * Open a temporary file that will disappear when we close it.
1384 * This routine takes care of generating an appropriate tempfile name.
1385 * There's no need to pass in fileFlags or fileMode either, since only
1386 * one setting makes any sense for a temp file.
1388 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1389 * to ensure it's closed and deleted when it's no longer needed, typically at
1390 * the end-of-transaction. In most cases, you don't want temporary files to
1391 * outlive the transaction that created them, so this should be false -- but
1392 * if you need "somewhat" temporary storage, this might be useful. In either
1393 * case, the file is removed when the File is explicitly closed.
/*
 * OpenTemporaryFile: create a temp file and flag its VFD FD_TEMPORARY.
 *
 * interXact - when false and temp tablespaces are configured
 *             (numTempTableSpaces > 0), the tablespace returned by
 *             GetNextTempTableSpace() is tried first, with rejectError
 *             passed as false.
 *
 * The fallback location is MyDatabaseTableSpace, or DEFAULTTABLESPACE_OID
 * when that is zero.  The registration code visible below sets
 * FD_XACT_TEMPORARY, records the file with CurrentResourceOwner, stores
 * that owner in the VFD and sets have_xact_temporary_files.
 */
1396 OpenTemporaryFile(bool interXact)
1401 * If some temp tablespace(s) have been given to us, try to use the next
1402 * one. If a given tablespace can't be found, we silently fall back to
1403 * the database's default tablespace.
1405 * BUT: if the temp file is slated to outlive the current transaction,
1406 * force it into the database's default tablespace, so that it will not
1407 * pose a threat to possible tablespace drop attempts.
1409 if (numTempTableSpaces > 0 && !interXact)
1411 Oid tblspcOid = GetNextTempTableSpace();
1413 if (OidIsValid(tblspcOid))
1414 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1418 * If not, or if tablespace is bad, create in database's default
1419 * tablespace. MyDatabaseTableSpace should normally be set before we get
1420 * here, but just in case it isn't, fall back to pg_default tablespace.
1423 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1424 MyDatabaseTableSpace :
1425 DEFAULTTABLESPACE_OID,
1428 /* Mark it for deletion at close */
1429 VfdCache[file].fdstate |= FD_TEMPORARY;
1431 /* Register it with the current resource owner */
1434 VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
1436 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1437 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1438 VfdCache[file].resowner = CurrentResourceOwner;
1440 /* ensure cleanup happens at eoxact */
1441 have_xact_temporary_files = true;
1448 * Open a temporary file in a specific tablespace.
1449 * Subroutine for OpenTemporaryFile, which see for details.
/*
 * OpenTemporaryFileInTablespace: build the temp-file path for tblspcOid
 * and open it with O_RDWR | O_CREAT | O_TRUNC | PG_BINARY.
 *
 * tblspcOid   - tablespace to use; DEFAULTTABLESPACE_OID and
 *               GLOBALTABLESPACE_OID share the base/ branch, anything
 *               else goes under pg_tblspc/.
 * rejectError - when true, a non-positive result from the second open
 *               attempt raises ERROR.
 *
 * The file name combines PG_TEMP_FILE_PREFIX, MyProcPid and a
 * post-incremented tempFileCounter.  After mkdir() on the temp directory
 * (result deliberately ignored) the open is attempted a second time.
 */
1452 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1454 char tempdirpath[MAXPGPATH];
1455 char tempfilepath[MAXPGPATH];
1459 * Identify the tempfile directory for this tablespace.
1461 * If someone tries to specify pg_global, use pg_default instead.
1463 if (tblspcOid == DEFAULTTABLESPACE_OID ||
1464 tblspcOid == GLOBALTABLESPACE_OID)
1466 /* The default tablespace is {datadir}/base */
1467 snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
1472 /* All other tablespaces are accessed via symlinks */
1473 snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
1474 tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
1478 * Generate a tempfile name that should be unique within the current
1479 * database instance.
1481 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1482 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1485 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1486 * temp file that can be reused.
1488 file = PathNameOpenFile(tempfilepath,
1489 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1493 * We might need to create the tablespace's tempfile directory, if no
1494 * one has yet done so.
1496 * Don't check for error from mkdir; it could fail if someone else
1497 * just did the same thing. If it doesn't work then we'll bomb out on
1498 * the second create attempt, instead.
1500 mkdir(tempdirpath, S_IRWXU);
1502 file = PathNameOpenFile(tempfilepath,
1503 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1504 if (file <= 0 && rejectError)
1505 elog(ERROR, "could not create temporary file \"%s\": %m",
1513 * close a file when done with it
/*
 * FileClose: close a VFD and release everything attached to it.
 *
 * An open kernel descriptor is closed (a close() failure is only logged)
 * and fd is reset to VFD_CLOSED.  For FD_TEMPORARY files the flag is
 * cleared first, temporary_files_size is reduced by fileSize, and the file
 * is stat()ed and unlinked; an unlink() failure is only logged.  When the
 * stat succeeded (stat_errno == 0) the size goes to
 * pgstat_report_tempfile(), and with log_temp_files >= 0 a file of at
 * least that many kilobytes is also logged.  ResourceOwnerForgetFile() is
 * called for the owner stored in the VFD.
 */
1516 FileClose(File file)
1520 Assert(FileIsValid(file));
1522 DO_DB(elog(LOG, "FileClose: %d (%s)",
1523 file, VfdCache[file].fileName));
1525 vfdP = &VfdCache[file];
1527 if (!FileIsNotOpen(file))
1529 /* close the file */
1530 if (close(vfdP->fd))
1531 elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1534 vfdP->fd = VFD_CLOSED;
1536 /* remove the file from the lru ring */
1541 * Delete the file if it was temporary, and make a log entry if wanted
1543 if (vfdP->fdstate & FD_TEMPORARY)
1545 struct stat filestats;
1549 * If we get an error, as could happen within the ereport/elog calls,
1550 * we'll come right back here during transaction abort. Reset the
1551 * flag to ensure that we can't get into an infinite loop. This code
1552 * is arranged to ensure that the worst-case consequence is failing to
1553 * emit log message(s), not failing to attempt the unlink.
1555 vfdP->fdstate &= ~FD_TEMPORARY;
1557 /* Subtract its size from current usage (do first in case of error) */
1558 temporary_files_size -= vfdP->fileSize;
1561 /* first try the stat() */
1562 if (stat(vfdP->fileName, &filestats))
1567 /* in any case do the unlink */
1568 if (unlink(vfdP->fileName))
1569 elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1571 /* and last report the stat results */
1572 if (stat_errno == 0)
1574 pgstat_report_tempfile(filestats.st_size);
1576 if (log_temp_files >= 0)
1578 if ((filestats.st_size / 1024) >= log_temp_files)
1580 (errmsg("temporary file: path \"%s\", size %lu",
1582 (unsigned long) filestats.st_size)));
1588 elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1592 /* Unregister it from the resource owner */
1594 ResourceOwnerForgetFile(vfdP->resowner, file);
1597 * Return the Vfd slot to the free list
1603 * FilePrefetch - initiate asynchronous read of a given range of the file.
1604 * The logical seek position is unaffected.
1606 * Currently the only implementation of this function is using posix_fadvise
1607 * which is the simplest standardized interface that accomplishes this.
1608 * We could add an implementation using libaio in the future; but note that
1609 * this API is inappropriate for libaio, which wants to have a buffer provided
/*
 * FilePrefetch: hint the kernel that a byte range of the file will be
 * needed soon.
 *
 * file            - VFD to prefetch from; made accessible via FileAccess().
 * offset, amount  - start and length of the range given to posix_fadvise().
 * wait_event_info - wait event reported around the posix_fadvise() call.
 *
 * The posix_fadvise() path is compiled only when USE_POSIX_FADVISE and
 * POSIX_FADV_WILLNEED are both defined; its result is stored in
 * returnCode.  The second Assert below presumably belongs to the branch
 * built without that support.
 */
1613 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1615 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1618 Assert(FileIsValid(file));
1620 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1621 file, VfdCache[file].fileName,
1622 (int64) offset, amount));
1624 returnCode = FileAccess(file);
1628 pgstat_report_wait_start(wait_event_info);
1629 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1630 POSIX_FADV_WILLNEED);
1631 pgstat_report_wait_end();
1635 Assert(FileIsValid(file));
/*
 * FileWriteback: ask pg_flush_data() to flush nbytes starting at offset.
 *
 * file            - VFD to flush; made accessible with FileAccess() first.
 * offset, nbytes  - byte range handed to pg_flush_data().
 * wait_event_info - wait event reported while pg_flush_data() runs.
 */
1641 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1645 Assert(FileIsValid(file));
1647 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1648 file, VfdCache[file].fileName,
1649 (int64) offset, (int64) nbytes));
1652 * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1653 * file's seek position. We prefer to define that as a no-op here.
1658 returnCode = FileAccess(file);
1662 pgstat_report_wait_start(wait_event_info);
1663 pg_flush_data(VfdCache[file].fd, offset, nbytes);
1664 pgstat_report_wait_end();
/*
 * FileRead: read up to amount bytes into buffer from the kernel descriptor
 * behind the VFD.
 *
 * file            - VFD to read from; made accessible with FileAccess().
 * buffer, amount  - destination and byte count passed to read().
 * wait_event_info - wait event reported around the read() call.
 *
 * On a non-negative read() result a known seekPos is advanced by the
 * number of bytes read.  On the error path seekPos is set to
 * FileUnknownPos.
 */
1668 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1673 Assert(FileIsValid(file));
1675 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1676 file, VfdCache[file].fileName,
1677 (int64) VfdCache[file].seekPos,
1680 returnCode = FileAccess(file);
1684 vfdP = &VfdCache[file];
1687 pgstat_report_wait_start(wait_event_info);
1688 returnCode = read(vfdP->fd, buffer, amount);
1689 pgstat_report_wait_end();
1691 if (returnCode >= 0)
1693 /* if seekPos is unknown, leave it that way */
1694 if (!FilePosIsUnknown(vfdP->seekPos))
1695 vfdP->seekPos += returnCode;
1700 * Windows may run out of kernel buffers and return "Insufficient
1701 * system resources" error. Wait a bit and retry to solve it.
1703 * It is rumored that EINTR is also possible on some Unix filesystems,
1704 * in which case immediate retry is indicated.
1707 DWORD error = GetLastError();
1711 case ERROR_NO_SYSTEM_RESOURCES:
1720 /* OK to retry if interrupted */
1724 /* Trouble, so assume we don't know the file position anymore */
1725 vfdP->seekPos = FileUnknownPos;
/*
 * FileWrite: write amount bytes from buffer to the kernel descriptor
 * behind the VFD.
 *
 * file            - VFD to write to; made accessible with FileAccess().
 * buffer, amount  - source and byte count passed to write().
 * wait_event_info - wait event reported around the write() call.
 *
 * For FD_TEMPORARY files with temp_file_limit >= 0, the projected growth
 * (seekPos + amount beyond fileSize) is added to temporary_files_size and
 * compared against temp_file_limit * 1024 before writing; an unknown
 * seekPos is first refreshed with lseek(SEEK_CUR).  After a non-negative
 * write() result a known seekPos is advanced, and for temp files fileSize
 * and temporary_files_size grow to cover the new position.  On the error
 * path seekPos is set to FileUnknownPos.
 */
1732 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1737 Assert(FileIsValid(file));
1739 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1740 file, VfdCache[file].fileName,
1741 (int64) VfdCache[file].seekPos,
1744 returnCode = FileAccess(file);
1748 vfdP = &VfdCache[file];
1751 * If enforcing temp_file_limit and it's a temp file, check to see if the
1752 * write would overrun temp_file_limit, and throw error if so. Note: it's
1753 * really a modularity violation to throw error here; we should set errno
1754 * and return -1. However, there's no way to report a suitable error
1755 * message if we do that. All current callers would just throw error
1756 * immediately anyway, so this is safe at present.
1758 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
1763 * Normally we should know the seek position, but if for some reason
1764 * we have lost track of it, try again to get it. Here, it's fine to
1765 * throw an error if we still can't get it.
1767 if (FilePosIsUnknown(vfdP->seekPos))
1769 vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1770 if (FilePosIsUnknown(vfdP->seekPos))
1771 elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1774 newPos = vfdP->seekPos + amount;
1775 if (newPos > vfdP->fileSize)
1777 uint64 newTotal = temporary_files_size;
1779 newTotal += newPos - vfdP->fileSize;
1780 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1782 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1783 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1790 pgstat_report_wait_start(wait_event_info);
1791 returnCode = write(vfdP->fd, buffer, amount);
1792 pgstat_report_wait_end();
1794 /* if write didn't set errno, assume problem is no disk space */
1795 if (returnCode != amount && errno == 0)
1798 if (returnCode >= 0)
1800 /* if seekPos is unknown, leave it that way */
1801 if (!FilePosIsUnknown(vfdP->seekPos))
1802 vfdP->seekPos += returnCode;
1805 * Maintain fileSize and temporary_files_size if it's a temp file.
1807 * If seekPos is -1 (unknown), this will do nothing; but we could only
1808 * get here in that state if we're not enforcing temporary_files_size,
1811 if (vfdP->fdstate & FD_TEMPORARY)
1813 off_t newPos = vfdP->seekPos;
1815 if (newPos > vfdP->fileSize)
1817 temporary_files_size += newPos - vfdP->fileSize;
1818 vfdP->fileSize = newPos;
1825 * See comments in FileRead()
1828 DWORD error = GetLastError();
1832 case ERROR_NO_SYSTEM_RESOURCES:
1841 /* OK to retry if interrupted */
1845 /* Trouble, so assume we don't know the file position anymore */
1846 vfdP->seekPos = FileUnknownPos;
/*
 * FileSync: flush the file through pg_fsync().
 *
 * file            - VFD to sync; made accessible with FileAccess() first.
 * wait_event_info - wait event reported around the pg_fsync() call.
 *
 * returnCode carries the FileAccess() result and then the pg_fsync() one.
 */
1853 FileSync(File file, uint32 wait_event_info)
1857 Assert(FileIsValid(file));
1859 DO_DB(elog(LOG, "FileSync: %d (%s)",
1860 file, VfdCache[file].fileName));
1862 returnCode = FileAccess(file);
1866 pgstat_report_wait_start(wait_event_info);
1867 returnCode = pg_fsync(VfdCache[file].fd);
1868 pgstat_report_wait_end();
/*
 * FileSeek: reposition the VFD and return the resulting seekPos.
 *
 * file   - VFD to reposition.
 * offset - displacement, interpreted according to whence.
 * whence - seek origin; an unrecognized value raises ERROR in both the
 *          closed and the open branch.
 *
 * When the VFD is not physically open, seekPos is updated directly where
 * the visible code allows it (plain assignment, or addition after checking
 * for an unknown or negative result); one case first calls FileAccess()
 * and then lseek().  When the VFD is open, lseek() is used, but skipped
 * when the cached seekPos already equals the requested absolute offset or
 * when a relative offset of zero is requested with a known position.
 */
1874 FileSeek(File file, off_t offset, int whence)
1878 Assert(FileIsValid(file));
1880 DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1881 file, VfdCache[file].fileName,
1882 (int64) VfdCache[file].seekPos,
1883 (int64) offset, whence));
1885 vfdP = &VfdCache[file];
1887 if (FileIsNotOpen(file))
1897 vfdP->seekPos = offset;
1900 if (FilePosIsUnknown(vfdP->seekPos) ||
1901 vfdP->seekPos + offset < 0)
1906 vfdP->seekPos += offset;
1909 if (FileAccess(file) < 0)
1911 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1914 elog(ERROR, "invalid whence: %d", whence);
1928 if (vfdP->seekPos != offset)
1929 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1932 if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
1933 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1936 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1939 elog(ERROR, "invalid whence: %d", whence);
1944 return vfdP->seekPos;
1948 * XXX not actually used but here for completeness
/*
 * Body of FileTell (name per the debug message below): report the seek
 * position cached in the VFD; no system call is made here.
 */
1954 Assert(FileIsValid(file));
1955 DO_DB(elog(LOG, "FileTell %d (%s)",
1956 file, VfdCache[file].fileName));
1957 return VfdCache[file].seekPos;
/*
 * FileTruncate: truncate the file to offset bytes with ftruncate().
 *
 * file            - VFD to truncate; made accessible with FileAccess().
 * offset          - new length passed to ftruncate().
 * wait_event_info - wait event reported around the ftruncate() call.
 *
 * When ftruncate() succeeds and the tracked fileSize exceeds offset, the
 * file is asserted to be FD_TEMPORARY, temporary_files_size is reduced by
 * the difference and fileSize becomes offset.
 */
1962 FileTruncate(File file, off_t offset, uint32 wait_event_info)
1966 Assert(FileIsValid(file));
1968 DO_DB(elog(LOG, "FileTruncate %d (%s)",
1969 file, VfdCache[file].fileName));
1971 returnCode = FileAccess(file);
1975 pgstat_report_wait_start(wait_event_info);
1976 returnCode = ftruncate(VfdCache[file].fd, offset);
1977 pgstat_report_wait_end();
1979 if (returnCode == 0 && VfdCache[file].fileSize > offset)
1981 /* adjust our state for truncation of a temp file */
1982 Assert(VfdCache[file].fdstate & FD_TEMPORARY);
1983 temporary_files_size -= VfdCache[file].fileSize - offset;
1984 VfdCache[file].fileSize = offset;
1991 * Return the pathname associated with an open file.
1993 * The returned string points to an internal buffer, which is valid until
1994 * the file is closed.
/*
 * FilePathName: hand back the fileName pointer stored in the VFD itself
 * (not a copy), after asserting that the VFD is valid.
 */
1997 FilePathName(File file)
1999 Assert(FileIsValid(file));
2001 return VfdCache[file].fileName;
2005 * Return the raw file descriptor of an opened file.
2007 * The returned file descriptor will be valid until the file is closed, but
2008 * there are a lot of things that can make that happen. So the caller should
2009 * be careful not to do much of anything else before it finishes using the
2010 * returned file descriptor.
/*
 * FileGetRawDesc: return the fd field of the VFD as-is.  FileAccess() is
 * not called here, so nothing in this function re-opens the file.
 */
2013 FileGetRawDesc(File file)
2015 Assert(FileIsValid(file));
2016 return VfdCache[file].fd;
2020 * FileGetRawFlags - returns the open(2) flags saved in the VFD for re-opening the file
/*
 * FileGetRawFlags: return the fileFlags field of the VFD.  As stored by
 * PathNameOpenFilePerm(), that value has O_CREAT, O_TRUNC and O_EXCL
 * already masked off.
 */
2023 FileGetRawFlags(File file)
2025 Assert(FileIsValid(file));
2026 return VfdCache[file].fileFlags;
2030 * FileGetRawMode - returns the mode bitmask passed to open(2)
/*
 * FileGetRawMode: return the fileMode field of the VFD, i.e. the mode
 * recorded when the file was opened through PathNameOpenFilePerm().
 */
2033 FileGetRawMode(File file)
2035 Assert(FileIsValid(file));
2036 return VfdCache[file].fileMode;
2040 * Make room for another allocatedDescs[] array entry if needed and possible.
2041 * Returns true if an array element is available.
/*
 * reserveAllocatedDesc: make sure allocatedDescs[] has room for one more
 * entry.
 *
 * Three stages are visible: a fast check that numAllocatedDescs is below
 * maxAllocatedDescs; first-time creation of the array with FD_MINFREE / 2
 * elements via malloc (failure is reported with ERRCODE_OUT_OF_MEMORY);
 * and growth via realloc up to max_safe_fds / 2 elements.  The realloc
 * result goes into newDescs first, so allocatedDescs is only overwritten
 * after the NULL check.
 */
2044 reserveAllocatedDesc(void)
2046 AllocateDesc *newDescs;
2049 /* Quick out if array already has a free slot. */
2050 if (numAllocatedDescs < maxAllocatedDescs)
2054 * If the array hasn't yet been created in the current process, initialize
2055 * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2056 * we will ever need, anyway. We don't want to look at max_safe_fds
2057 * immediately because set_max_safe_fds() may not have run yet.
2059 if (allocatedDescs == NULL)
2061 newMax = FD_MINFREE / 2;
2062 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2063 /* Out of memory already? Treat as fatal error. */
2064 if (newDescs == NULL)
2066 (errcode(ERRCODE_OUT_OF_MEMORY),
2067 errmsg("out of memory")));
2068 allocatedDescs = newDescs;
2069 maxAllocatedDescs = newMax;
2074 * Consider enlarging the array beyond the initial allocation used above.
2075 * By the time this happens, max_safe_fds should be known accurately.
2077 * We mustn't let allocated descriptors hog all the available FDs, and in
2078 * practice we'd better leave a reasonable number of FDs for VFD use. So
2079 * set the maximum to max_safe_fds / 2. (This should certainly be at
2080 * least as large as the initial size, FD_MINFREE / 2.)
2082 newMax = max_safe_fds / 2;
2083 if (newMax > maxAllocatedDescs)
2085 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2086 newMax * sizeof(AllocateDesc));
2087 /* Treat out-of-memory as a non-fatal error. */
2088 if (newDescs == NULL)
2090 allocatedDescs = newDescs;
2091 maxAllocatedDescs = newMax;
2095 /* Can't enlarge allocatedDescs[] any more. */
2100 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2101 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2102 * necessary to open the file. When done, call FreeFile rather than fclose.
2104 * Note that files that will be open for any significant length of time
2105 * should NOT be handled this way, since they cannot share kernel file
2106 * descriptors with other files; there is grave risk of running out of FDs
2107 * if anyone locks down too many FDs. Most callers of this routine are
2108 * simply reading a config file that they will read and close immediately.
2110 * fd.c will automatically close all files opened with AllocateFile at
2111 * transaction commit or abort; this prevents FD leakage if a routine
2112 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2114 * Ideally this should be the *only* direct call of fopen() in the backend.
/*
 * AllocateFile: fopen() a file and track the stream in allocatedDescs[].
 *
 * name, mode - passed unchanged to fopen().
 *
 * A slot is reserved with reserveAllocatedDesc() before opening; failure
 * to reserve is reported with ERRCODE_INSUFFICIENT_RESOURCES.  On success
 * the next array entry is filled in (kind AllocateDescFile, the FILE
 * pointer, the current subtransaction id), numAllocatedDescs is bumped and
 * the stream is returned.  On EMFILE or ENFILE errno is saved, a message
 * is emitted and ReleaseLruFile() is consulted.
 */
2117 AllocateFile(const char *name, const char *mode)
2121 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2122 numAllocatedDescs, name));
2124 /* Can we allocate another non-virtual FD? */
2125 if (!reserveAllocatedDesc())
2127 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2128 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2129 maxAllocatedDescs, name)));
2131 /* Close excess kernel FDs. */
2135 if ((file = fopen(name, mode)) != NULL)
2137 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2139 desc->kind = AllocateDescFile;
2140 desc->desc.file = file;
2141 desc->create_subid = GetCurrentSubTransactionId();
2142 numAllocatedDescs++;
2143 return desc->desc.file;
2146 if (errno == EMFILE || errno == ENFILE)
2148 int save_errno = errno;
2151 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2152 errmsg("out of file descriptors: %m; release and retry")));
2154 if (ReleaseLruFile())
2163 * Open a file with OpenTransientFilePerm() and pass default file mode for
2164 * the fileMode parameter.
/*
 * OpenTransientFile: thin wrapper that forwards fileName and fileFlags to
 * OpenTransientFilePerm(), supplying PG_FILE_MODE_DEFAULT as the mode.
 */
2167 OpenTransientFile(const char *fileName, int fileFlags)
2169 return OpenTransientFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
2173 * Like AllocateFile, but returns an unbuffered fd like open(2)
/*
 * OpenTransientFilePerm: open a raw kernel descriptor and track it in
 * allocatedDescs[].
 *
 * fileName, fileFlags, fileMode - passed unchanged to BasicOpenFilePerm().
 *
 * A slot is reserved with reserveAllocatedDesc() before opening; failure
 * to reserve is reported with ERRCODE_INSUFFICIENT_RESOURCES.  The tracked
 * entry gets kind AllocateDescRawFD and the current subtransaction id.
 * Returns -1 on failure.
 */
2176 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2180 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2181 numAllocatedDescs, fileName));
2183 /* Can we allocate another non-virtual FD? */
2184 if (!reserveAllocatedDesc())
2186 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2187 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2188 maxAllocatedDescs, fileName)));
2190 /* Close excess kernel FDs. */
2193 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2197 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2199 desc->kind = AllocateDescRawFD;
2201 desc->create_subid = GetCurrentSubTransactionId();
2202 numAllocatedDescs++;
2207 return -1; /* failure */
2211 * Routines that want to initiate a pipe stream should use OpenPipeStream
2212 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2213 * necessary. When done, call ClosePipeStream rather than pclose.
/*
 * OpenPipeStream: popen() a command and track the stream in
 * allocatedDescs[].
 *
 * command, mode - passed unchanged to popen().
 *
 * Mirrors AllocateFile(): a slot is reserved first, a successful stream is
 * recorded with kind AllocateDescPipe and the current subtransaction id,
 * and on EMFILE or ENFILE errno is saved, a message is emitted and
 * ReleaseLruFile() is consulted.
 */
2216 OpenPipeStream(const char *command, const char *mode)
2220 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2221 numAllocatedDescs, command));
2223 /* Can we allocate another non-virtual FD? */
2224 if (!reserveAllocatedDesc())
2226 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2227 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2228 maxAllocatedDescs, command)));
2230 /* Close excess kernel FDs. */
2237 if ((file = popen(command, mode)) != NULL)
2239 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2241 desc->kind = AllocateDescPipe;
2242 desc->desc.file = file;
2243 desc->create_subid = GetCurrentSubTransactionId();
2244 numAllocatedDescs++;
2245 return desc->desc.file;
2248 if (errno == EMFILE || errno == ENFILE)
2250 int save_errno = errno;
2253 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2254 errmsg("out of file descriptors: %m; release and retry")));
2256 if (ReleaseLruFile())
2265 * Free an AllocateDesc of any type.
2267 * The argument *must* point into the allocatedDescs[] array.
/*
 * FreeDesc: close the object behind one allocatedDescs[] entry and remove
 * the entry from the array.
 *
 * desc - entry to release; its kind selects fclose(), pclose(), closedir()
 *        or close(), and an unrecognized kind raises ERROR.
 *
 * The array is compacted by copying the last live entry over *desc after
 * decrementing numAllocatedDescs, so entry order is not preserved and any
 * index at or after this slot may now refer to a different entry.
 */
2270 FreeDesc(AllocateDesc *desc)
2274 /* Close the underlying object */
2277 case AllocateDescFile:
2278 result = fclose(desc->desc.file);
2280 case AllocateDescPipe:
2281 result = pclose(desc->desc.file);
2283 case AllocateDescDir:
2284 result = closedir(desc->desc.dir);
2286 case AllocateDescRawFD:
2287 result = close(desc->desc.fd);
2290 elog(ERROR, "AllocateDesc kind not recognized");
2291 result = 0; /* keep compiler quiet */
2295 /* Compact storage in the allocatedDescs array */
2296 numAllocatedDescs--;
2297 *desc = allocatedDescs[numAllocatedDescs];
2303 * Close a file returned by AllocateFile.
2305 * Note we do not check fclose's return value --- it is up to the caller
2306 * to handle close errors.
/*
 * FreeFile: close a stream and drop its allocatedDescs[] entry.
 *
 * file - stream to close.
 *
 * The array is searched from the newest entry backwards for an
 * AllocateDescFile entry holding this stream; a match is released through
 * FreeDesc() and that result is returned.  An untracked stream triggers a
 * WARNING and is closed directly with fclose().
 */
2309 FreeFile(FILE *file)
2313 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2315 /* Remove file from list of allocated files, if it's present */
2316 for (i = numAllocatedDescs; --i >= 0;)
2318 AllocateDesc *desc = &allocatedDescs[i];
2320 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2321 return FreeDesc(desc);
2324 /* Only get here if someone passes us a file not in allocatedDescs */
2325 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2327 return fclose(file);
2331 * Close a file returned by OpenTransientFile.
2333 * Note we do not check close's return value --- it is up to the caller
2334 * to handle close errors.
/*
 * CloseTransientFile: close a raw descriptor and drop its
 * allocatedDescs[] entry.
 *
 * fd - kernel descriptor to close.
 *
 * The array is searched from the newest entry backwards for an
 * AllocateDescRawFD entry holding this fd; a match is released through
 * FreeDesc() and that result is returned.  An untracked fd triggers a
 * WARNING.
 */
2337 CloseTransientFile(int fd)
2341 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2343 /* Remove fd from list of allocated files, if it's present */
2344 for (i = numAllocatedDescs; --i >= 0;)
2346 AllocateDesc *desc = &allocatedDescs[i];
2348 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2349 return FreeDesc(desc);
2352 /* Only get here if someone passes us a file not in allocatedDescs */
2353 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2359 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2360 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2361 * necessary to open the directory, and with closing it after an elog.
2362 * When done, call FreeDir rather than closedir.
2364 * Ideally this should be the *only* direct call of opendir() in the backend.
/*
 * AllocateDir: opendir() a directory and track the handle in
 * allocatedDescs[].
 *
 * dirname - path passed unchanged to opendir().
 *
 * Mirrors AllocateFile(): a slot is reserved first, a successful handle is
 * recorded with kind AllocateDescDir and the current subtransaction id,
 * and on EMFILE or ENFILE errno is saved, a message is emitted and
 * ReleaseLruFile() is consulted.
 */
2367 AllocateDir(const char *dirname)
2371 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2372 numAllocatedDescs, dirname));
2374 /* Can we allocate another non-virtual FD? */
2375 if (!reserveAllocatedDesc())
2377 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2378 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2379 maxAllocatedDescs, dirname)));
2381 /* Close excess kernel FDs. */
2385 if ((dir = opendir(dirname)) != NULL)
2387 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2389 desc->kind = AllocateDescDir;
2390 desc->desc.dir = dir;
2391 desc->create_subid = GetCurrentSubTransactionId();
2392 numAllocatedDescs++;
2393 return desc->desc.dir;
2396 if (errno == EMFILE || errno == ENFILE)
2398 int save_errno = errno;
2401 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2402 errmsg("out of file descriptors: %m; release and retry")));
2404 if (ReleaseLruFile())
2413 * Read a directory opened with AllocateDir, ereport'ing any error.
2415 * This is easier to use than raw readdir() since it takes care of some
2416 * otherwise rather tedious and error-prone manipulation of errno. Also,
2417 * if you are happy with a generic error message for AllocateDir failure,
2420 * dir = AllocateDir(path);
2421 * while ((dirent = ReadDir(dir, path)) != NULL)
2425 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2426 * (Make sure errno hasn't been changed since AllocateDir if you use this
2429 * The pathname passed to AllocateDir must be passed to this routine too,
2430 * but it is only used for error reporting.
/*
 * ReadDir: forwards to ReadDirExtended() with elevel fixed at ERROR, so
 * both open and read failures are reported at that level.
 */
2433 ReadDir(DIR *dir, const char *dirname)
2435 return ReadDirExtended(dir, dirname, ERROR);
2439 * Alternate version that allows caller to specify the elevel for any
2440 * error report. If elevel < ERROR, returns NULL on any error.
/*
 * ReadDirExtended: fetch the next entry of a directory handle.
 *
 * dir     - handle to read with readdir().
 * dirname - path used only in the two error messages.
 * elevel  - report level requested by the caller.
 *
 * Two reports are visible, both using errcode_for_file_access(): one for
 * a directory that could not be opened and one for a failed read.
 */
2442 static struct dirent *
2443 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2445 struct dirent *dent;
2447 /* Give a generic message for AllocateDir failure, if caller didn't */
2451 (errcode_for_file_access(),
2452 errmsg("could not open directory \"%s\": %m",
2458 if ((dent = readdir(dir)) != NULL)
2463 (errcode_for_file_access(),
2464 errmsg("could not read directory \"%s\": %m",
2470 * Close a directory opened with AllocateDir.
2472 * Note we do not check closedir's return value --- it is up to the caller
2473 * to handle close errors.
/*
 * Body of FreeDir (name per the debug message below): search
 * allocatedDescs[] from the newest entry backwards for an AllocateDescDir
 * entry holding dir and release it through FreeDesc().  An untracked
 * handle triggers a WARNING and is closed directly with closedir().
 */
2480 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2482 /* Remove dir from list of allocated dirs, if it's present */
2483 for (i = numAllocatedDescs; --i >= 0;)
2485 AllocateDesc *desc = &allocatedDescs[i];
2487 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2488 return FreeDesc(desc);
2491 /* Only get here if someone passes us a dir not in allocatedDescs */
2492 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2494 return closedir(dir);
2499 * Close a pipe stream returned by OpenPipeStream.
/*
 * ClosePipeStream: close a pipe stream and drop its allocatedDescs[]
 * entry.
 *
 * file - stream to close.
 *
 * The array is searched from the newest entry backwards for an
 * AllocateDescPipe entry holding this stream; a match is released through
 * FreeDesc() and that result is returned.  An untracked stream triggers a
 * WARNING and is closed directly with pclose().
 */
2502 ClosePipeStream(FILE *file)
2506 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2508 /* Remove file from list of allocated files, if it's present */
2509 for (i = numAllocatedDescs; --i >= 0;)
2511 AllocateDesc *desc = &allocatedDescs[i];
2513 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2514 return FreeDesc(desc);
2517 /* Only get here if someone passes us a file not in allocatedDescs */
2518 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2520 return pclose(file);
2526 * Force all VFDs into the physically-closed state, so that the fewest
2527 * possible number of kernel file descriptors are in use. There is no
2528 * change in the logical state of the VFDs.
/*
 * When the VFD cache exists, walk slots 1 .. SizeVfdCache - 1 and act on
 * each one that still holds an open kernel descriptor.  Slot 0 is asserted
 * to be not open and is never visited by the loop.
 */
2535 if (SizeVfdCache > 0)
2537 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2538 for (i = 1; i < SizeVfdCache; i++)
2540 if (!FileIsNotOpen(i))
2548 * SetTempTablespaces
2550 * Define a list (actually an array) of OIDs of tablespaces to use for
2551 * temporary files. This list will be used until end of transaction,
2552 * unless this function is called again before then. It is the caller's
2553 * responsibility to ensure the passed-in array has adequate lifespan (typically
2554 * it'd be allocated in TopTransactionContext).
/*
 * SetTempTablespaces: install the list of temp tablespaces.
 *
 * tableSpaces - array of tablespace OIDs; the pointer itself is stored,
 *               no copy is made.
 * numSpaces   - number of entries; asserted to be non-negative.
 *
 * nextTempTableSpace is set either to random() % numSpaces or to 0; the
 * condition choosing between the two is not visible in this listing.
 */
2557 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2559 Assert(numSpaces >= 0);
2560 tempTableSpaces = tableSpaces;
2561 numTempTableSpaces = numSpaces;
2564 * Select a random starting point in the list. This is to minimize
2565 * conflicts between backends that are most likely sharing the same list
2566 * of temp tablespaces. Note that if we create multiple temp files in the
2567 * same transaction, we'll advance circularly through the list --- this
2568 * ensures that large temporary sort files are nicely spread across all
2569 * available tablespaces.
2572 nextTempTableSpace = random() % numSpaces;
2574 nextTempTableSpace = 0;
2578 * TempTablespacesAreSet
2580 * Returns true if SetTempTablespaces has been called in current transaction.
2581 * (This is just so that tablespaces.c doesn't need its own per-transaction
/*
 * TempTablespacesAreSet: true when numTempTableSpaces is non-negative.
 * An empty list (zero entries) therefore still counts as set, while the
 * -1 stored by AtEOXact_Files() does not.
 */
2585 TempTablespacesAreSet(void)
2587 return (numTempTableSpaces >= 0);
2591 * GetNextTempTableSpace
2593 * Select the next temp tablespace to use. A result of InvalidOid means
2594 * to use the current database's default tablespace.
/*
 * GetNextTempTableSpace: round-robin over tempTableSpaces[].
 *
 * With at least one configured tablespace, nextTempTableSpace is advanced
 * before use (wrapping to 0 at numTempTableSpaces) and the entry at the
 * new index is returned.
 */
2597 GetNextTempTableSpace(void)
2599 if (numTempTableSpaces > 0)
2601 /* Advance nextTempTableSpace counter with wraparound */
2602 if (++nextTempTableSpace >= numTempTableSpaces)
2603 nextTempTableSpace = 0;
2604 return tempTableSpaces[nextTempTableSpace];
2613 * Take care of subtransaction commit/abort. At abort, we close temp files
2614 * that the subtransaction may have opened. At commit, we reassign the
2615 * files that were opened to the parent subtransaction.
/*
 * AtEOSubXact_Files: process allocatedDescs[] entries created by the
 * ending subtransaction.
 *
 * isCommit    - selects between the two visible actions (the test itself
 *               is not visible in this listing).
 * mySubid     - id of the ending subtransaction; only entries whose
 *               create_subid matches are touched.
 * parentSubid - id the surviving entries are reassigned to.
 *
 * FreeDesc() moves the last array entry into the freed slot, so the loop
 * index is decremented in order to examine that slot again.
 */
2618 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2619 SubTransactionId parentSubid)
2623 for (i = 0; i < numAllocatedDescs; i++)
2625 if (allocatedDescs[i].create_subid == mySubid)
2628 allocatedDescs[i].create_subid = parentSubid;
2631 /* have to recheck the item after FreeDesc (ugly) */
2632 FreeDesc(&allocatedDescs[i--]);
2641 * This routine is called during transaction commit or abort (it doesn't
2642 * particularly care which). All still-open per-transaction temporary file
2643 * VFDs are closed, which also causes the underlying files to be deleted
2644 * (although they should've been closed already by the ResourceOwner
2645 * cleanup). Furthermore, all "allocated" stdio files are closed. We also
2646 * forget any transaction-local temp tablespace list.
/*
 * AtEOXact_Files: run CleanupTempFiles() in its non-exit mode, then drop
 * the temp tablespace list by clearing the pointer and setting
 * numTempTableSpaces to -1.
 */
2649 AtEOXact_Files(void)
2651 CleanupTempFiles(false);
2652 tempTableSpaces = NULL;
2653 numTempTableSpaces = -1;
2659 * on_proc_exit hook to clean up temp files during backend shutdown.
2660 * Here, we want to clean up *all* temp files including interXact ones.
/*
 * AtProcExit_Files: run CleanupTempFiles() in its process-exit mode.
 * Neither code nor arg is used in the visible body.
 */
2663 AtProcExit_Files(int code, Datum arg)
2665 CleanupTempFiles(true);
2669 * Close temporary files and delete their underlying files.
2671 * isProcExit: if true, this is being called as the backend process is
2672 * exiting. If that's the case, we should remove all temporary files; if
2673 * that's not the case, we are being called for transaction commit/abort
2674 * and should only remove transaction-local temp files. In either case,
2675 * also clean up "allocated" stdio files, dirs and fds.
/*
 * CleanupTempFiles: sweep temporary VFDs and release every
 * allocatedDescs[] entry.
 *
 * isProcExit - true when called at process exit.
 *
 * The VFD scan runs only when isProcExit is set or
 * have_xact_temporary_files is raised; it visits slots 1 and up and looks
 * at those flagged FD_TEMPORARY that still have a file name.  Entries also
 * flagged FD_XACT_TEMPORARY produce the not-closed message on one branch.
 * have_xact_temporary_files is cleared afterwards.  Finally slot 0 of
 * allocatedDescs[] is freed until the array is empty, relying on
 * FreeDesc() compacting the array.
 */
2678 CleanupTempFiles(bool isProcExit)
2683 * Careful here: at proc_exit we need extra cleanup, not just
2684 * xact_temporary files.
2686 if (isProcExit || have_xact_temporary_files)
2688 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2689 for (i = 1; i < SizeVfdCache; i++)
2691 unsigned short fdstate = VfdCache[i].fdstate;
2693 if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
2696 * If we're in the process of exiting a backend process, close
2697 * all temporary files. Otherwise, only close temporary files
2698 * local to the current transaction. They should be closed by
2699 * the ResourceOwner mechanism already, so this is just a
2700 * debugging cross-check.
2704 else if (fdstate & FD_XACT_TEMPORARY)
2707 "temporary file %s not closed at end-of-transaction",
2708 VfdCache[i].fileName);
2714 have_xact_temporary_files = false;
2717 /* Clean up "allocated" stdio files, dirs and fds. */
2718 while (numAllocatedDescs > 0)
2719 FreeDesc(&allocatedDescs[0]);
2724 * Remove temporary and temporary relation files left over from a prior
2725 * postmaster session
2727 * This should be called during postmaster startup. It will forcibly
2728 * remove any leftover files created by OpenTemporaryFile and any leftover
2729 * temporary relation files created by mdcreate.
2731 * NOTE: we could, but don't, call this during a post-backend-crash restart
2732 * cycle. The argument for not doing it is that someone might want to examine
2733 * the temp files for debugging purposes. This does however mean that
2734 * OpenTemporaryFile had better allow for collision with an existing temp
2738 RemovePgTempFiles(void)
2740 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2742 struct dirent *spc_de;
2745 * First process temp files in pg_default ($PGDATA/base)
2747 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2748 RemovePgTempFilesInDir(temp_path);
2749 RemovePgTempRelationFiles("base");
2752 * Cycle through temp directories for all non-default tablespaces.
2754 spc_dir = AllocateDir("pg_tblspc");
2756 while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
2758 if (strcmp(spc_de->d_name, ".") == 0 ||
2759 strcmp(spc_de->d_name, "..") == 0)
2762 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2763 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2764 RemovePgTempFilesInDir(temp_path);
2766 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2767 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2768 RemovePgTempRelationFiles(temp_path);
2774 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2778 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
2782 /* Process one pgsql_tmp directory for RemovePgTempFiles */
2784 RemovePgTempFilesInDir(const char *tmpdirname)
2787 struct dirent *temp_de;
2788 char rm_path[MAXPGPATH * 2];
2790 temp_dir = AllocateDir(tmpdirname);
2791 if (temp_dir == NULL)
2793 /* anything except ENOENT is fishy */
2794 if (errno != ENOENT)
2796 "could not open temporary-files directory \"%s\": %m",
2801 while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
2803 if (strcmp(temp_de->d_name, ".") == 0 ||
2804 strcmp(temp_de->d_name, "..") == 0)
2807 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2808 tmpdirname, temp_de->d_name);
2810 if (strncmp(temp_de->d_name,
2811 PG_TEMP_FILE_PREFIX,
2812 strlen(PG_TEMP_FILE_PREFIX)) == 0)
2813 unlink(rm_path); /* note we ignore any error */
2816 "unexpected file found in temporary-files directory: \"%s\"",
2823 /* Process one tablespace directory, look for per-DB subdirectories */
2825 RemovePgTempRelationFiles(const char *tsdirname)
2829 char dbspace_path[MAXPGPATH * 2];
2831 ts_dir = AllocateDir(tsdirname);
2834 /* anything except ENOENT is fishy */
2835 if (errno != ENOENT)
2837 "could not open tablespace directory \"%s\": %m",
2842 while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
2847 * We're only interested in the per-database directories, which have
2848 * numeric names. Note that this code will also (properly) ignore "."
2851 while (isdigit((unsigned char) de->d_name[i]))
2853 if (de->d_name[i] != '\0' || i == 0)
2856 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
2857 tsdirname, de->d_name);
2858 RemovePgTempRelationFilesInDbspace(dbspace_path);
2864 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
2866 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
2870 char rm_path[MAXPGPATH * 2];
2872 dbspace_dir = AllocateDir(dbspacedirname);
2873 if (dbspace_dir == NULL)
2875 /* we just saw this directory, so it really ought to be there */
2877 "could not open dbspace directory \"%s\": %m",
2882 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
2884 if (!looks_like_temp_rel_name(de->d_name))
2887 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2888 dbspacedirname, de->d_name);
2890 unlink(rm_path); /* note we ignore any error */
2893 FreeDir(dbspace_dir);
/*
 * Decide whether a file name has the shape of a temporary relation file:
 * t<digits>_<digits>, or t<digits>_<digits>_<forkname>, either of which may
 * additionally carry a ".<digits>" segment suffix.
 *
 * Returns true only if the whole name matches that pattern.
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	int			cur;
	int			digits_start;

	/* The name has to begin with a "t". */
	if (name[0] != 't')
		return false;

	/* Next: at least one digit, terminated by an underscore. */
	cur = 1;
	while (isdigit((unsigned char) name[cur]))
		cur++;
	if (cur == 1 || name[cur] != '_')
		return false;

	/* Step over the underscore; at least one more digit must follow. */
	cur++;
	digits_start = cur;
	while (isdigit((unsigned char) name[cur]))
		cur++;
	if (cur == digits_start)
		return false;

	/* Optional "_forkname" part. */
	if (name[cur] == '_')
	{
		int			forklen = forkname_chars(&name[cur + 1], NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;		/* the fork name plus its underscore */
	}

	/* Optional ".segment" part: a dot followed by at least one digit. */
	if (name[cur] == '.')
	{
		int			seglen = 1; /* counts the dot itself */

		while (isdigit((unsigned char) name[cur + seglen]))
			seglen++;
		if (seglen <= 1)
			return false;
		cur += seglen;
	}

	/* Anything left over means the name does not match. */
	return name[cur] == '\0';
}
2947 * Issue fsync recursively on PGDATA and all its contents.
2949 * We fsync regular files and directories wherever they are, but we
2950 * follow symlinks only for pg_wal and immediately under pg_tblspc.
2951 * Other symlinks are presumed to point at files we're not responsible
2952 * for fsyncing, and might not have privileges to write at all.
2954 * Errors are logged but not considered fatal; that's because this is used
2955 * only during database startup, to deal with the possibility that there are
2956 * issued-but-unsynced writes pending against the data directory. We want to
2957 * ensure that such writes reach disk before anything that's done in the new
2958 * run. However, aborting on error would result in failure to start for
2959 * harmless cases such as read-only files in the data directory, and that's
2962 * Note we assume we're chdir'd into PGDATA to begin with.
2965 SyncDataDirectory(void)
2967 bool xlog_is_symlink;
2969 /* We can skip this whole thing if fsync is disabled. */
2974 * If pg_wal is a symlink, we'll need to recurse into it separately,
2975 * because the first walkdir below will ignore it.
2977 xlog_is_symlink = false;
2983 if (lstat("pg_wal", &st) < 0)
2985 (errcode_for_file_access(),
2986 errmsg("could not stat file \"%s\": %m",
2988 else if (S_ISLNK(st.st_mode))
2989 xlog_is_symlink = true;
2992 if (pgwin32_is_junction("pg_wal"))
2993 xlog_is_symlink = true;
2997 * If possible, hint to the kernel that we're soon going to fsync the data
2998 * directory and its contents. Errors in this step are even less
2999 * interesting than normal, so log them only at DEBUG1.
3001 #ifdef PG_FLUSH_DATA_WORKS
3002 walkdir(".", pre_sync_fname, false, DEBUG1);
3003 if (xlog_is_symlink)
3004 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3005 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3009 * Now we do the fsync()s in the same order.
3011 * The main call ignores symlinks, so in addition to specially processing
3012 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3013 * process_symlinks = true. Note that if there are any plain directories
3014 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3015 * so we don't worry about optimizing it.
3017 walkdir(".", datadir_fsync_fname, false, LOG);
3018 if (xlog_is_symlink)
3019 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3020 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3024 * walkdir: recursively walk a directory, applying the action to each
3025 * regular file and directory (including the named directory itself).
3027 * If process_symlinks is true, the action and recursion are also applied
3028 * to regular files and directories that are pointed to by symlinks in the
3029 * given directory; otherwise symlinks are ignored. Symlinks are always
3030 * ignored in subdirectories, ie we intentionally don't pass down the
3031 * process_symlinks flag to recursive calls.
3033 * Errors are reported at level elevel, which might be ERROR or less.
3035 * See also walkdir in initdb.c, which is a frontend version of this logic.
3038 walkdir(const char *path,
3039 void (*action) (const char *fname, bool isdir, int elevel),
3040 bool process_symlinks,
3046 dir = AllocateDir(path);
3050 (errcode_for_file_access(),
3051 errmsg("could not open directory \"%s\": %m", path)));
3055 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3057 char subpath[MAXPGPATH * 2];
3061 CHECK_FOR_INTERRUPTS();
3063 if (strcmp(de->d_name, ".") == 0 ||
3064 strcmp(de->d_name, "..") == 0)
3067 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3069 if (process_symlinks)
3070 sret = stat(subpath, &fst);
3072 sret = lstat(subpath, &fst);
3077 (errcode_for_file_access(),
3078 errmsg("could not stat file \"%s\": %m", subpath)));
3082 if (S_ISREG(fst.st_mode))
3083 (*action) (subpath, false, elevel);
3084 else if (S_ISDIR(fst.st_mode))
3085 walkdir(subpath, action, false, elevel);
3088 FreeDir(dir); /* we ignore any error here */
3091 * It's important to fsync the destination directory itself as individual
3092 * file fsyncs don't guarantee that the directory entry for the file is
3095 (*action) (path, true, elevel);
/*
 * pre_sync_fname -- hint to the OS that it should get ready to fsync()
 * this file.
 *
 * Directories are skipped.  A file that cannot be opened because of EACCES
 * is skipped silently; any other open failure is reported at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Unreadable files are not worth a message; everything else is. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	(void) CloseTransientFile(fd);
}

#endif							/* PG_FLUSH_DATA_WORKS */
/*
 * datadir_fsync_fname -- fsync one file or directory of the data directory.
 *
 * We want errors about unreadable files to be silently ignored, so that
 * wish is handed on to fsync_fname_ext() through its ignore_perm argument.
 * The result of fsync_fname_ext() is deliberately not examined.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3150 * fsync_fname_ext -- Try to fsync a file or directory
3152 * If ignore_perm is true, ignore errors upon trying to open unreadable
3153 * files. Logs other errors at a caller-specified level.
3155 * Returns 0 if the operation succeeded, -1 otherwise.
3158 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3165 * Some OSs require directories to be opened read-only whereas other
3166 * systems don't allow us to fsync files opened read-only; so we need both
3167 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3168 * not writable by our userid, but we assume that's OK.
3176 fd = OpenTransientFile(fname, flags);
3179 * Some OSs don't allow us to open directories at all (Windows returns
3180 * EACCES), just ignore the error in that case. If desired also silently
3181 * ignoring errors about unreadable files. Log others.
3183 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3185 else if (fd < 0 && ignore_perm && errno == EACCES)
3190 (errcode_for_file_access(),
3191 errmsg("could not open file \"%s\": %m", fname)));
3195 returncode = pg_fsync(fd);
3198 * Some OSes don't allow us to fsync directories at all, so we can ignore
3199 * those errors. Anything else needs to be logged.
3201 if (returncode != 0 && !(isdir && errno == EBADF))
3205 /* close file upon error, might not be in transaction context */
3207 (void) CloseTransientFile(fd);
3211 (errcode_for_file_access(),
3212 errmsg("could not fsync file \"%s\": %m", fname)));
3216 (void) CloseTransientFile(fd);
3222 * fsync_parent_path -- fsync the parent path of a file or directory
3224 * This is aimed at making file operations persistent on disk in case of
3225 * an OS crash or power failure.
3228 fsync_parent_path(const char *fname, int elevel)
3230 char parentpath[MAXPGPATH];
3232 strlcpy(parentpath, fname, MAXPGPATH);
3233 get_parent_directory(parentpath);
3236 * get_parent_directory() returns an empty string if the input argument is
3237 * just a file name (see comments in path.c), so handle that as being the
3238 * current directory.
3240 if (strlen(parentpath) == 0)
3241 strlcpy(parentpath, ".", MAXPGPATH);
3243 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)