From dc6c4c9dc2a111519b76b22daaaac86c5608223b Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 1 Dec 2017 16:30:56 -0800 Subject: [PATCH] Add infrastructure for sharing temporary files between backends. SharedFileSet allows temporary files to be created by one backend and then exported for read-only access by other backends, with clean-up managed by reference counting associated with a DSM segment. This includes changes to fd.c and buffile.c to support the new kind of temporary file. This will be used by an upcoming patch adding support for parallel hash joins. Author: Thomas Munro Reviewed-By: Peter Geoghegan, Andres Freund, Robert Haas, Rushabh Lathia Discussion: https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com https://postgr.es/m/CAH2-WznJ_UgLux=_jTgCQ4yFz0iBntudsNKa1we3kN1BAG=88w@mail.gmail.com --- src/backend/storage/file/Makefile | 2 +- src/backend/storage/file/buffile.c | 205 +++++++++++- src/backend/storage/file/fd.c | 395 +++++++++++++++++++---- src/backend/storage/file/sharedfileset.c | 244 ++++++++++++++ src/include/storage/buffile.h | 7 + src/include/storage/fd.h | 11 +- src/include/storage/sharedfileset.h | 45 +++ src/tools/pgindent/typedefs.list | 1 + 8 files changed, 848 insertions(+), 62 deletions(-) create mode 100644 src/backend/storage/file/sharedfileset.c create mode 100644 src/include/storage/sharedfileset.h diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile index d2198f2b93..ca6a0e4f7d 100644 --- a/src/backend/storage/file/Makefile +++ b/src/backend/storage/file/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/file top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = fd.o buffile.o copydir.o reinit.o +OBJS = fd.o buffile.o copydir.o reinit.o sharedfileset.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 06bf2fadbf..fa9940da9b 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -31,12 +31,18 @@ * BufFile also supports temporary files that exceed the OS file size limit * (by opening multiple fd.c temporary files). This is an essential feature * for sorts and hashjoins on large amounts of data. + * + * BufFile supports temporary files that can be made read-only and shared with + * other backends, as infrastructure for parallel execution. Such files need + * to be created as a member of a SharedFileSet that all participants are + * attached to. *------------------------------------------------------------------------- */ #include "postgres.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "pgstat.h" #include "storage/fd.h" #include "storage/buffile.h" @@ -70,6 +76,10 @@ struct BufFile bool isInterXact; /* keep open over transactions? */ bool dirty; /* does buffer need to be written? */ + bool readOnly; /* has the file been set to read only? */ + + SharedFileSet *fileset; /* space for segment files if shared */ + const char *name; /* name of this BufFile if shared */ /* * resowner is the ResourceOwner to use for underlying temp files. (We @@ -94,6 +104,7 @@ static void extendBufFile(BufFile *file); static void BufFileLoadBuffer(BufFile *file); static void BufFileDumpBuffer(BufFile *file); static int BufFileFlush(BufFile *file); +static File MakeNewSharedSegment(BufFile *file, int segment); /* @@ -117,6 +128,9 @@ makeBufFile(File firstfile) file->curOffset = 0L; file->pos = 0; file->nbytes = 0; + file->readOnly = false; + file->fileset = NULL; + file->name = NULL; return file; } @@ -134,7 +148,11 @@ extendBufFile(BufFile *file) oldowner = CurrentResourceOwner; CurrentResourceOwner = file->resowner; - pfile = OpenTemporaryFile(file->isInterXact); + if (file->fileset == NULL) + pfile = OpenTemporaryFile(file->isInterXact); + else + pfile = MakeNewSharedSegment(file, file->numFiles); + Assert(pfile >= 0); CurrentResourceOwner = oldowner; @@ -175,6 +193,189 @@ BufFileCreateTemp(bool interXact) return file; } +/* + * Build the name for a given segment of a given BufFile. + */ +static void +SharedSegmentName(char *name, const char *buffile_name, int segment) +{ + snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment); +} + +/* + * Create a new segment file backing a shared BufFile. + */ +static File +MakeNewSharedSegment(BufFile *buffile, int segment) +{ + char name[MAXPGPATH]; + File file; + + SharedSegmentName(name, buffile->name, segment); + file = SharedFileSetCreate(buffile->fileset, name); + + /* SharedFileSetCreate would've errored out */ + Assert(file > 0); + + return file; +} + +/* + * Create a BufFile that can be discovered and opened read-only by other + * backends that are attached to the same SharedFileSet using the same name. + * + * The naming scheme for shared BufFiles is left up to the calling code. The + * name will appear as part of one or more filenames on disk, and might + * provide clues to administrators about which subsystem is generating + * temporary file data. Since each SharedFileSet object is backed by one or + * more uniquely named temporary directory, names don't conflict with + * unrelated SharedFileSet objects. + */ +BufFile * +BufFileCreateShared(SharedFileSet *fileset, const char *name) +{ + BufFile *file; + + file = (BufFile *) palloc(sizeof(BufFile)); + file->fileset = fileset; + file->name = pstrdup(name); + file->numFiles = 1; + file->files = (File *) palloc(sizeof(File)); + file->files[0] = MakeNewSharedSegment(file, 0); + file->offsets = (off_t *) palloc(sizeof(off_t)); + file->offsets[0] = 0L; + file->isInterXact = false; + file->dirty = false; + file->resowner = CurrentResourceOwner; + file->curFile = 0; + file->curOffset = 0L; + file->pos = 0; + file->nbytes = 0; + file->readOnly = false; + file->name = pstrdup(name); + + return file; +} + +/* + * Open a file that was previously created in another backend (or this one) + * with BufFileCreateShared in the same SharedFileSet using the same name. + * The backend that created the file must have called BufFileClose() or + * BufFileExport() to make sure that it is ready to be opened by other + * backends and render it read-only. + */ +BufFile * +BufFileOpenShared(SharedFileSet *fileset, const char *name) +{ + BufFile *file = (BufFile *) palloc(sizeof(BufFile)); + char segment_name[MAXPGPATH]; + Size capacity = 16; + File *files = palloc(sizeof(File) * capacity); + int nfiles = 0; + + file = (BufFile *) palloc(sizeof(BufFile)); + files = palloc(sizeof(File) * capacity); + + /* + * We don't know how many segments there are, so we'll probe the + * filesystem to find out. + */ + for (;;) + { + /* See if we need to expand our file segment array. */ + if (nfiles + 1 > capacity) + { + capacity *= 2; + files = repalloc(files, sizeof(File) * capacity); + } + /* Try to load a segment. */ + SharedSegmentName(segment_name, name, nfiles); + files[nfiles] = SharedFileSetOpen(fileset, segment_name); + if (files[nfiles] <= 0) + break; + ++nfiles; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * If we didn't find any files at all, then no BufFile exists with this + * name. + */ + if (nfiles == 0) + return NULL; + + file->numFiles = nfiles; + file->files = files; + file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles); + file->isInterXact = false; + file->dirty = false; + file->resowner = CurrentResourceOwner; /* Unused, can't extend */ + file->curFile = 0; + file->curOffset = 0L; + file->pos = 0; + file->nbytes = 0; + file->readOnly = true; /* Can't write to files opened this way */ + file->fileset = fileset; + file->name = pstrdup(name); + + return file; +} + +/* + * Delete a BufFile that was created by BufFileCreateShared in the given + * SharedFileSet using the given name. + * + * It is not necessary to delete files explicitly with this function. It is + * provided only as a way to delete files proactively, rather than waiting for + * the SharedFileSet to be cleaned up. + * + * Only one backend should attempt to delete a given name, and should know + * that it exists and has been exported or closed. + */ +void +BufFileDeleteShared(SharedFileSet *fileset, const char *name) +{ + char segment_name[MAXPGPATH]; + int segment = 0; + bool found = false; + + /* + * We don't know how many segments the file has. We'll keep deleting + * until we run out. If we don't manage to find even an initial segment, + * raise an error. + */ + for (;;) + { + SharedSegmentName(segment_name, name, segment); + if (!SharedFileSetDelete(fileset, segment_name, true)) + break; + found = true; + ++segment; + + CHECK_FOR_INTERRUPTS(); + } + + if (!found) + elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name); +} + +/* + * BufFileExportShared --- flush and make read-only, in preparation for sharing. + */ +void +BufFileExportShared(BufFile *file) +{ + /* Must be a file belonging to a SharedFileSet. */ + Assert(file->fileset != NULL); + + /* It's probably a bug if someone calls this twice. */ + Assert(!file->readOnly); + + BufFileFlush(file); + file->readOnly = true; +} + /* * Close a BufFile * @@ -390,6 +591,8 @@ BufFileWrite(BufFile *file, void *ptr, size_t size) size_t nwritten = 0; size_t nthistime; + Assert(!file->readOnly); + while (size > 0) { if (file->pos >= BLCKSZ) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index aa2fe2c6c0..2e93e4ad63 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -39,6 +39,14 @@ * for a long time, like relation files. It is the caller's responsibility * to close them, there is no automatic mechanism in fd.c for that. * + * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage + * temporary files that have names so that they can be shared between + * backends. Such files are automatically closed and count against the + * temporary file limit of the backend that creates them, but unlike anonymous + * files they are not automatically deleted. See sharedfileset.c for a shared + * ownership mechanism that provides automatic cleanup for shared files when + * the last of a group of backends detaches. + * * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively. * They behave like the corresponding native functions, except that the handle @@ -175,8 +183,9 @@ int max_safe_fds = 32; /* default if not changed */ #define FilePosIsUnknown(pos) ((pos) < 0) /* these are the assigned bits in fdstate below: */ -#define FD_TEMPORARY (1 << 0) /* T = delete when closed */ -#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */ +#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */ +#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */ +#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */ typedef struct vfd { @@ -313,7 +322,7 @@ static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel) static void AtProcExit_Files(int code, Datum arg); static void CleanupTempFiles(bool isProcExit); -static void RemovePgTempFilesInDir(const char *tmpdirname); +static void RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all); static void RemovePgTempRelationFiles(const char *tsdirname); static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); static bool looks_like_temp_rel_name(const char *name); @@ -326,6 +335,7 @@ static void walkdir(const char *path, static void pre_sync_fname(const char *fname, bool isdir, int elevel); #endif static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); +static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); static int fsync_parent_path(const char *fname, int elevel); @@ -1294,6 +1304,39 @@ FileAccess(File file) return 0; } +/* + * Called whenever a temporary file is deleted to report its size. + */ +static void +ReportTemporaryFileUsage(const char *path, off_t size) +{ + pgstat_report_tempfile(size); + + if (log_temp_files >= 0) + { + if ((size / 1024) >= log_temp_files) + ereport(LOG, + (errmsg("temporary file: path \"%s\", size %lu", + path, (unsigned long) size))); + } +} + +/* + * Called to register a temporary file for automatic close. + * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called + * before the file was opened. + */ +static void +RegisterTemporaryFile(File file) +{ + ResourceOwnerRememberFile(CurrentResourceOwner, file); + VfdCache[file].resowner = CurrentResourceOwner; + + /* Backup mechanism for closing at end of xact. */ + VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT; + have_xact_temporary_files = true; +} + /* * Called when we get a shared invalidation message on some relation. */ @@ -1378,6 +1421,67 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) return file; } +/* + * Create directory 'directory'. If necessary, create 'basedir', which must + * be the directory above it. This is designed for creating the top-level + * temporary directory on demand before creating a directory underneath it. + * Do nothing if the directory already exists. + * + * Directories created within the top-level temporary directory should begin + * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and + * deleted at startup by RemovePgTempFiles(). Further subdirectories below + * that do not need any particular prefix. +*/ +void +PathNameCreateTemporaryDir(const char *basedir, const char *directory) +{ + if (mkdir(directory, S_IRWXU) < 0) + { + if (errno == EEXIST) + return; + + /* + * Failed. Try to create basedir first in case it's missing. Tolerate + * EEXIST to close a race against another process following the same + * algorithm. + */ + if (mkdir(basedir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary directory \"%s\": %m", + basedir))); + + /* Try again. */ + if (mkdir(directory, S_IRWXU) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary subdirectory \"%s\": %m", + directory))); + } +} + +/* + * Delete a directory and everything in it, if it exists. + */ +void +PathNameDeleteTemporaryDir(const char *dirname) +{ + struct stat statbuf; + + /* Silently ignore missing directory. */ + if (stat(dirname, &statbuf) != 0 && errno == ENOENT) + return; + + /* + * Currently, walkdir doesn't offer a way for our passed in function to + * maintain state. Perhaps it should, so that we could tell the caller + * whether this operation succeeded or failed. Since this operation is + * used in a cleanup path, we wouldn't actually behave differently: we'll + * just log failures. + */ + walkdir(dirname, unlink_if_exists_fname, false, LOG); +} + /* * Open a temporary file that will disappear when we close it. * @@ -1432,53 +1536,52 @@ OpenTemporaryFile(bool interXact) DEFAULTTABLESPACE_OID, true); - /* Mark it for deletion at close */ - VfdCache[file].fdstate |= FD_TEMPORARY; + /* Mark it for deletion at close and temporary file size limit */ + VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT; /* Register it with the current resource owner */ if (!interXact) - { - VfdCache[file].fdstate |= FD_XACT_TEMPORARY; - - VfdCache[file].resowner = CurrentResourceOwner; - ResourceOwnerRememberFile(CurrentResourceOwner, file); - - /* ensure cleanup happens at eoxact */ - have_xact_temporary_files = true; - } + RegisterTemporaryFile(file); return file; } /* - * Open a temporary file in a specific tablespace. - * Subroutine for OpenTemporaryFile, which see for details. + * Return the path of the temp directory in a given tablespace. */ -static File -OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) +void +TempTablespacePath(char *path, Oid tablespace) { - char tempdirpath[MAXPGPATH]; - char tempfilepath[MAXPGPATH]; - File file; - /* * Identify the tempfile directory for this tablespace. * * If someone tries to specify pg_global, use pg_default instead. */ - if (tblspcOid == DEFAULTTABLESPACE_OID || - tblspcOid == GLOBALTABLESPACE_OID) - { - /* The default tablespace is {datadir}/base */ - snprintf(tempdirpath, sizeof(tempdirpath), "base/%s", - PG_TEMP_FILES_DIR); - } + if (tablespace == InvalidOid || + tablespace == DEFAULTTABLESPACE_OID || + tablespace == GLOBALTABLESPACE_OID) + snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR); else { /* All other tablespaces are accessed via symlinks */ - snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s", - tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); + snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s", + tablespace, TABLESPACE_VERSION_DIRECTORY, + PG_TEMP_FILES_DIR); } +} + +/* + * Open a temporary file in a specific tablespace. + * Subroutine for OpenTemporaryFile, which see for details. + */ +static File +OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) +{ + char tempdirpath[MAXPGPATH]; + char tempfilepath[MAXPGPATH]; + File file; + + TempTablespacePath(tempdirpath, tblspcOid); /* * Generate a tempfile name that should be unique within the current @@ -1515,6 +1618,130 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) return file; } + +/* + * Create a new file. The directory containing it must already exist. Files + * created this way are subject to temp_file_limit and are automatically + * closed at end of transaction, but are not automatically deleted on close + * because they are intended to be shared between cooperating backends. + * + * If the file is inside the top-level temporary directory, its name should + * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary + * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be + * inside a directory created with PathnameCreateTemporaryDir(), in which case + * the prefix isn't needed. + */ +File +PathNameCreateTemporaryFile(const char *path, bool error_on_failure) +{ + File file; + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* + * Open the file. Note: we don't use O_EXCL, in case there is an orphaned + * temp file that can be reused. + */ + file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0) + { + if (error_on_failure) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create temporary file \"%s\": %m", + path))); + else + return file; + } + + /* Mark it for temp_file_limit accounting. */ + VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT; + + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + + return file; +} + +/* + * Open a file that was created with PathNameCreateTemporaryFile, possibly in + * another backend. Files opened this way don't count against the + * temp_file_limit of the caller, are read-only and are automatically closed + * at the end of the transaction but are not deleted on close. + */ +File +PathNameOpenTemporaryFile(const char *path) +{ + File file; + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* We open the file read-only. */ + file = PathNameOpenFile(path, O_RDONLY | PG_BINARY); + + /* If no such file, then we don't raise an error. */ + if (file <= 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open temporary file \"%s\": %m", + path))); + + if (file > 0) + { + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + } + + return file; +} + +/* + * Delete a file by pathname. Return true if the file existed, false if + * didn't. + */ +bool +PathNameDeleteTemporaryFile(const char *path, bool error_on_failure) +{ + struct stat filestats; + int stat_errno; + + /* Get the final size for pgstat reporting. */ + if (stat(path, &filestats) != 0) + stat_errno = errno; + else + stat_errno = 0; + + /* + * Unlike FileClose's automatic file deletion code, we tolerate + * non-existence to support BufFileDeleteShared which doesn't know how + * many segments it has to delete until it runs out. + */ + if (stat_errno == ENOENT) + return false; + + if (unlink(path) < 0) + { + if (errno != ENOENT) + ereport(error_on_failure ? ERROR : LOG, + (errcode_for_file_access(), + errmsg("cannot unlink temporary file \"%s\": %m", + path))); + return false; + } + + if (stat_errno == 0) + ReportTemporaryFileUsage(path, filestats.st_size); + else + { + errno = stat_errno; + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + } + + return true; +} + /* * close a file when done with it */ @@ -1543,10 +1770,17 @@ FileClose(File file) Delete(file); } + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) + { + /* Subtract its size from current usage (do first in case of error) */ + temporary_files_size -= vfdP->fileSize; + vfdP->fileSize = 0; + } + /* * Delete the file if it was temporary, and make a log entry if wanted */ - if (vfdP->fdstate & FD_TEMPORARY) + if (vfdP->fdstate & FD_DELETE_AT_CLOSE) { struct stat filestats; int stat_errno; @@ -1558,11 +1792,8 @@ FileClose(File file) * is arranged to ensure that the worst-case consequence is failing to * emit log message(s), not failing to attempt the unlink. */ - vfdP->fdstate &= ~FD_TEMPORARY; + vfdP->fdstate &= ~FD_DELETE_AT_CLOSE; - /* Subtract its size from current usage (do first in case of error) */ - temporary_files_size -= vfdP->fileSize; - vfdP->fileSize = 0; /* first try the stat() */ if (stat(vfdP->fileName, &filestats)) @@ -1576,18 +1807,7 @@ FileClose(File file) /* and last report the stat results */ if (stat_errno == 0) - { - pgstat_report_tempfile(filestats.st_size); - - if (log_temp_files >= 0) - { - if ((filestats.st_size / 1024) >= log_temp_files) - ereport(LOG, - (errmsg("temporary file: path \"%s\", size %lu", - vfdP->fileName, - (unsigned long) filestats.st_size))); - } - } + ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size); else { errno = stat_errno; @@ -1761,7 +1981,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info) * message if we do that. All current callers would just throw error * immediately anyway, so this is safe at present. */ - if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY)) + if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) { off_t newPos; @@ -1814,7 +2034,7 @@ retry: * get here in that state if we're not enforcing temporary_files_size, * so we don't care. */ - if (vfdP->fdstate & FD_TEMPORARY) + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) { off_t newPos = vfdP->seekPos; @@ -1985,7 +2205,7 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info) if (returnCode == 0 && VfdCache[file].fileSize > offset) { /* adjust our state for truncation of a temp file */ - Assert(VfdCache[file].fdstate & FD_TEMPORARY); + Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT); temporary_files_size -= VfdCache[file].fileSize - offset; VfdCache[file].fileSize = offset; } @@ -2593,6 +2813,24 @@ TempTablespacesAreSet(void) return (numTempTableSpaces >= 0); } +/* + * GetTempTablespaces + * + * Populate an array with the OIDs of the tablespaces that should be used for + * temporary files. Return the number that were copied into the output array. + */ +int +GetTempTablespaces(Oid *tableSpaces, int numSpaces) +{ + int i; + + Assert(TempTablespacesAreSet()); + for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i) + tableSpaces[i] = tempTableSpaces[i]; + + return i; +} + /* * GetNextTempTableSpace * @@ -2696,7 +2934,8 @@ CleanupTempFiles(bool isProcExit) { unsigned short fdstate = VfdCache[i].fdstate; - if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL) + if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) && + VfdCache[i].fileName != NULL) { /* * If we're in the process of exiting a backend process, close @@ -2707,7 +2946,7 @@ CleanupTempFiles(bool isProcExit) */ if (isProcExit) FileClose(i); - else if (fdstate & FD_XACT_TEMPORARY) + else if (fdstate & FD_CLOSE_AT_EOXACT) { elog(WARNING, "temporary file %s not closed at end-of-transaction", @@ -2751,7 +2990,7 @@ RemovePgTempFiles(void) * First process temp files in pg_default ($PGDATA/base) */ snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR); - RemovePgTempFilesInDir(temp_path); + RemovePgTempFilesInDir(temp_path, false); RemovePgTempRelationFiles("base"); /* @@ -2767,7 +3006,7 @@ RemovePgTempFiles(void) snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s", spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); - RemovePgTempFilesInDir(temp_path); + RemovePgTempFilesInDir(temp_path, false); snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); @@ -2785,9 +3024,15 @@ RemovePgTempFiles(void) #endif } -/* Process one pgsql_tmp directory for RemovePgTempFiles */ +/* + * Process one pgsql_tmp directory for RemovePgTempFiles. At the top level in + * each tablespace, this should be called with unlink_all = false, so that + * only files matching the temporary name prefix will be unlinked. When + * recursing it will be called with unlink_all = true to unlink everything + * under a top-level temporary directory. + */ static void -RemovePgTempFilesInDir(const char *tmpdirname) +RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all) { DIR *temp_dir; struct dirent *temp_de; @@ -2813,10 +3058,25 @@ RemovePgTempFilesInDir(const char *tmpdirname) snprintf(rm_path, sizeof(rm_path), "%s/%s", tmpdirname, temp_de->d_name); - if (strncmp(temp_de->d_name, + if (unlink_all || + strncmp(temp_de->d_name, PG_TEMP_FILE_PREFIX, strlen(PG_TEMP_FILE_PREFIX)) == 0) - unlink(rm_path); /* note we ignore any error */ + { + struct stat statbuf; + + /* note that we ignore any error here and below */ + if (lstat(rm_path, &statbuf) < 0) + continue; + + if (S_ISDIR(statbuf.st_mode)) + { + RemovePgTempFilesInDir(rm_path, true); + rmdir(rm_path); + } + else + unlink(rm_path); + } else elog(LOG, "unexpected file found in temporary-files directory: \"%s\"", @@ -3152,6 +3412,23 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel) fsync_fname_ext(fname, isdir, true, elevel); } +static void +unlink_if_exists_fname(const char *fname, bool isdir, int elevel) +{ + if (isdir) + { + if (rmdir(fname) != 0 && errno != ENOENT) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rmdir directory \"%s\": %m", fname))); + } + else + { + /* Use PathNameDeleteTemporaryFile to report filesize */ + PathNameDeleteTemporaryFile(fname, false); + } +} + /* * fsync_fname_ext -- Try to fsync a file or directory * diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c new file mode 100644 index 0000000000..343b2283f0 --- /dev/null +++ b/src/backend/storage/file/sharedfileset.c @@ -0,0 +1,244 @@ +/*------------------------------------------------------------------------- + * + * sharedfileset.c + * Shared temporary file management. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/sharedfileset.c + * + * SharefFileSets provide a temporary namespace (think directory) so that + * files can be discovered by name, and a shared ownership semantics so that + * shared files survive until the last user detaches. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/hash.h" +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "miscadmin.h" +#include "storage/dsm.h" +#include "storage/sharedfileset.h" +#include "utils/builtins.h" + +static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum); +static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace); +static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name); +static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name); + +/* + * Initialize a space for temporary files that can be opened for read-only + * access by other backends. Other backends must attach to it before + * accessing it. Associate this SharedFileSet with 'seg'. Any contained + * files will be deleted when the last backend detaches. + * + * Files will be distributed over the tablespaces configured in + * temp_tablespaces. + * + * Under the covers the set is one or more directories which will eventually + * be deleted when there are no backends attached. + */ +void +SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg) +{ + static uint32 counter = 0; + + SpinLockInit(&fileset->mutex); + fileset->refcnt = 1; + fileset->creator_pid = MyProcPid; + fileset->number = counter; + counter = (counter + 1) % INT_MAX; + + /* Capture the tablespace OIDs so that all backends agree on them. */ + PrepareTempTablespaces(); + fileset->ntablespaces = + GetTempTablespaces(&fileset->tablespaces[0], + lengthof(fileset->tablespaces)); + if (fileset->ntablespaces == 0) + { + fileset->tablespaces[0] = DEFAULTTABLESPACE_OID; + fileset->ntablespaces = 1; + } + + /* Register our cleanup callback. */ + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); +} + +/* + * Attach to a set of directories that was created with SharedFileSetInit. + */ +void +SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg) +{ + bool success; + + SpinLockAcquire(&fileset->mutex); + if (fileset->refcnt == 0) + success = false; + else + { + ++fileset->refcnt; + success = true; + } + SpinLockRelease(&fileset->mutex); + + if (!success) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to a SharedFileSet that is already destroyed"))); + + /* Register our cleanup callback. */ + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); +} + +/* + * Create a new file in the given set. + */ +File +SharedFileSetCreate(SharedFileSet *fileset, const char *name) +{ + char path[MAXPGPATH]; + File file; + + SharedFilePath(path, fileset, name); + file = PathNameCreateTemporaryFile(path, false); + + /* If we failed, see if we need to create the directory on demand. */ + if (file <= 0) + { + char tempdirpath[MAXPGPATH]; + char filesetpath[MAXPGPATH]; + Oid tablespace = ChooseTablespace(fileset, name); + + TempTablespacePath(tempdirpath, tablespace); + SharedFileSetPath(filesetpath, fileset, tablespace); + PathNameCreateTemporaryDir(tempdirpath, filesetpath); + file = PathNameCreateTemporaryFile(path, true); + } + + return file; +} + +/* + * Open a file that was created with SharedFileSetCreate(), possibly in + * another backend. + */ +File +SharedFileSetOpen(SharedFileSet *fileset, const char *name) +{ + char path[MAXPGPATH]; + File file; + + SharedFilePath(path, fileset, name); + file = PathNameOpenTemporaryFile(path); + + return file; +} + +/* + * Delete a file that was created with PathNameCreateShared(). + * Return true if the file existed, false if didn't. + */ +bool +SharedFileSetDelete(SharedFileSet *fileset, const char *name, + bool error_on_failure) +{ + char path[MAXPGPATH]; + + SharedFilePath(path, fileset, name); + + return PathNameDeleteTemporaryFile(path, error_on_failure); +} + +/* + * Delete all files in the set. + */ +void +SharedFileSetDeleteAll(SharedFileSet *fileset) +{ + char dirpath[MAXPGPATH]; + int i; + + /* + * Delete the directory we created in each tablespace. Doesn't fail + * because we use this in error cleanup paths, but can generate LOG + * message on IO error. + */ + for (i = 0; i < fileset->ntablespaces; ++i) + { + SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]); + PathNameDeleteTemporaryDir(dirpath); + } +} + +/* + * Callback function that will be invoked when this backend detaches from a + * DSM segment holding a SharedFileSet that it has created or attached to. If + * we are the last to detach, then try to remove the directories and + * everything in them. We can't raise an error on failures, because this runs + * in error cleanup paths. + */ +static void +SharedFileSetOnDetach(dsm_segment *segment, Datum datum) +{ + bool unlink_all = false; + SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum); + + SpinLockAcquire(&fileset->mutex); + Assert(fileset->refcnt > 0); + if (--fileset->refcnt == 0) + unlink_all = true; + SpinLockRelease(&fileset->mutex); + + /* + * If we are the last to detach, we delete the directory in all + * tablespaces. Note that we are still actually attached for the rest of + * this function so we can safely access its data. + */ + if (unlink_all) + SharedFileSetDeleteAll(fileset); +} + +/* + * Build the path for the directory holding the files backing a SharedFileSet + * in a given tablespace. + */ +static void +SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace) +{ + char tempdirpath[MAXPGPATH]; + + TempTablespacePath(tempdirpath, tablespace); + snprintf(path, MAXPGPATH, "%s/%s%d.%u.sharedfileset", + tempdirpath, PG_TEMP_FILE_PREFIX, + fileset->creator_pid, fileset->number); +} + +/* + * Sorting hat to determine which tablespace a given shared temporary file + * belongs in. + */ +static Oid +ChooseTablespace(const SharedFileSet *fileset, const char *name) +{ + uint32 hash = hash_any((const unsigned char *) name, strlen(name)); + + return fileset->tablespaces[hash % fileset->ntablespaces]; +} + +/* + * Compute the full path of a file in a SharedFileSet. + */ +static void +SharedFilePath(char *path, SharedFileSet *fileset, const char *name) +{ + char dirpath[MAXPGPATH]; + + SharedFileSetPath(dirpath, fileset, ChooseTablespace(fileset, name)); + snprintf(path, MAXPGPATH, "%s/%s", dirpath, name); +} diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index 640908717d..c3d7a61b64 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -26,6 +26,8 @@ #ifndef BUFFILE_H #define BUFFILE_H +#include "storage/sharedfileset.h" + /* BufFile is an opaque type whose details are not known outside buffile.c. */ typedef struct BufFile BufFile; @@ -42,4 +44,9 @@ extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence); extern void BufFileTell(BufFile *file, int *fileno, off_t *offset); extern int BufFileSeekBlock(BufFile *file, long blknum); +extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name); +extern void BufFileExportShared(BufFile *file); +extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name); +extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name); + #endif /* BUFFILE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 6ea26e63b8..9829281509 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -79,6 +79,14 @@ extern int FileGetRawDesc(File file); extern int FileGetRawFlags(File file); extern mode_t FileGetRawMode(File file); +/* Operations used for sharing named temporary files */ +extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure); +extern File PathNameOpenTemporaryFile(const char *name); +extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure); +extern void PathNameCreateTemporaryDir(const char *base, const char *name); +extern void PathNameDeleteTemporaryDir(const char *name); +extern void TempTablespacePath(char *path, Oid tablespace); + /* Operations that allow use of regular stdio --- USE WITH CAUTION */ extern FILE *AllocateFile(const char *name, const char *mode); extern int FreeFile(FILE *file); @@ -107,6 +115,7 @@ extern void set_max_safe_fds(void); extern void closeAllVfds(void); extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces); extern bool TempTablespacesAreSet(void); +extern int GetTempTablespaces(Oid *tableSpaces, int numSpaces); extern Oid GetNextTempTableSpace(void); extern void AtEOXact_Files(void); extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, @@ -124,7 +133,7 @@ extern int durable_unlink(const char *fname, int loglevel); extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); extern void SyncDataDirectory(void); -/* Filename components for OpenTemporaryFile */ +/* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILE_PREFIX "pgsql_tmp" diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h new file mode 100644 index 0000000000..20651bb93b --- /dev/null +++ b/src/include/storage/sharedfileset.h @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * sharedfileset.h + * Shared temporary file management. + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/sharedfilespace.h + * + *------------------------------------------------------------------------- + */ + +#ifndef SHAREDFILESET_H +#define SHAREDFILESET_H + +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/spin.h" + +/* + * A set of temporary files that can be shared by multiple backends. + */ +typedef struct SharedFileSet +{ + pid_t creator_pid; /* PID of the creating process */ + uint32 number; /* per-PID identifier */ + slock_t mutex; /* mutex protecting the reference count */ + int refcnt; /* number of attached backends */ + int ntablespaces; /* number of tablespaces to use */ + Oid tablespaces[8]; /* OIDs of tablespaces to use. Assumes that + * it's rare that there more than temp + * tablespaces. */ +} SharedFileSet; + +extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg); +extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg); +extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name); +extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name); +extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name, + bool error_on_failure); +extern void SharedFileSetDeleteAll(SharedFileSet *fileset); + +#endif diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3e84720038..72eb9fd390 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2026,6 +2026,7 @@ SharedBitmapState SharedDependencyObjectType SharedDependencyType SharedExecutorInstrumentation +SharedFileSet SharedInvalCatalogMsg SharedInvalCatcacheMsg SharedInvalRelcacheMsg -- 2.40.0