top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = fd.o buffile.o copydir.o reinit.o
+OBJS = fd.o buffile.o copydir.o reinit.o sharedfileset.o
include $(top_srcdir)/src/backend/common.mk
* BufFile also supports temporary files that exceed the OS file size limit
* (by opening multiple fd.c temporary files). This is an essential feature
* for sorts and hashjoins on large amounts of data.
+ *
+ * BufFile supports temporary files that can be made read-only and shared with
+ * other backends, as infrastructure for parallel execution. Such files need
+ * to be created as a member of a SharedFileSet that all participants are
+ * attached to.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "executor/instrument.h"
+#include "miscadmin.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/buffile.h"
bool isInterXact; /* keep open over transactions? */
bool dirty; /* does buffer need to be written? */
+ bool readOnly; /* has the file been set to read only? */
+
+ SharedFileSet *fileset; /* space for segment files if shared */
+ const char *name; /* name of this BufFile if shared */
/*
* resowner is the ResourceOwner to use for underlying temp files. (We
static void BufFileLoadBuffer(BufFile *file);
static void BufFileDumpBuffer(BufFile *file);
static int BufFileFlush(BufFile *file);
+static File MakeNewSharedSegment(BufFile *file, int segment);
/*
file->curOffset = 0L;
file->pos = 0;
file->nbytes = 0;
+ file->readOnly = false;
+ file->fileset = NULL;
+ file->name = NULL;
return file;
}
oldowner = CurrentResourceOwner;
CurrentResourceOwner = file->resowner;
- pfile = OpenTemporaryFile(file->isInterXact);
+ if (file->fileset == NULL)
+ pfile = OpenTemporaryFile(file->isInterXact);
+ else
+ pfile = MakeNewSharedSegment(file, file->numFiles);
+
Assert(pfile >= 0);
CurrentResourceOwner = oldowner;
return file;
}
+/*
+ * Build the name for a given segment of a given BufFile.
+ */
+static void
+SharedSegmentName(char *name, const char *buffile_name, int segment)
+{
+ snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
+}
+
+/*
+ * Create a new segment file backing a shared BufFile.
+ */
+static File
+MakeNewSharedSegment(BufFile *buffile, int segment)
+{
+ char name[MAXPGPATH];
+ File file;
+
+ SharedSegmentName(name, buffile->name, segment);
+ file = SharedFileSetCreate(buffile->fileset, name);
+
+ /* SharedFileSetCreate would've errored out */
+ Assert(file > 0);
+
+ return file;
+}
+
+/*
+ * Create a BufFile that can be discovered and opened read-only by other
+ * backends that are attached to the same SharedFileSet using the same name.
+ *
+ * The naming scheme for shared BufFiles is left up to the calling code. The
+ * name will appear as part of one or more filenames on disk, and might
+ * provide clues to administrators about which subsystem is generating
+ * temporary file data. Since each SharedFileSet object is backed by one or
+ * more uniquely named temporary directory, names don't conflict with
+ * unrelated SharedFileSet objects.
+ */
+BufFile *
+BufFileCreateShared(SharedFileSet *fileset, const char *name)
+{
+ BufFile *file;
+
+ file = (BufFile *) palloc(sizeof(BufFile));
+ file->fileset = fileset;
+ file->name = pstrdup(name);
+ file->numFiles = 1;
+ file->files = (File *) palloc(sizeof(File));
+ file->files[0] = MakeNewSharedSegment(file, 0);
+ file->offsets = (off_t *) palloc(sizeof(off_t));
+ file->offsets[0] = 0L;
+ file->isInterXact = false;
+ file->dirty = false;
+ file->resowner = CurrentResourceOwner;
+ file->curFile = 0;
+ file->curOffset = 0L;
+ file->pos = 0;
+ file->nbytes = 0;
+ file->readOnly = false;
+ file->name = pstrdup(name);
+
+ return file;
+}
+
+/*
+ * Open a file that was previously created in another backend (or this one)
+ * with BufFileCreateShared in the same SharedFileSet using the same name.
+ * The backend that created the file must have called BufFileClose() or
+ * BufFileExport() to make sure that it is ready to be opened by other
+ * backends and render it read-only.
+ */
+BufFile *
+BufFileOpenShared(SharedFileSet *fileset, const char *name)
+{
+ BufFile *file = (BufFile *) palloc(sizeof(BufFile));
+ char segment_name[MAXPGPATH];
+ Size capacity = 16;
+ File *files = palloc(sizeof(File) * capacity);
+ int nfiles = 0;
+
+ file = (BufFile *) palloc(sizeof(BufFile));
+ files = palloc(sizeof(File) * capacity);
+
+ /*
+ * We don't know how many segments there are, so we'll probe the
+ * filesystem to find out.
+ */
+ for (;;)
+ {
+ /* See if we need to expand our file segment array. */
+ if (nfiles + 1 > capacity)
+ {
+ capacity *= 2;
+ files = repalloc(files, sizeof(File) * capacity);
+ }
+ /* Try to load a segment. */
+ SharedSegmentName(segment_name, name, nfiles);
+ files[nfiles] = SharedFileSetOpen(fileset, segment_name);
+ if (files[nfiles] <= 0)
+ break;
+ ++nfiles;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If we didn't find any files at all, then no BufFile exists with this
+ * name.
+ */
+ if (nfiles == 0)
+ return NULL;
+
+ file->numFiles = nfiles;
+ file->files = files;
+ file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles);
+ file->isInterXact = false;
+ file->dirty = false;
+ file->resowner = CurrentResourceOwner; /* Unused, can't extend */
+ file->curFile = 0;
+ file->curOffset = 0L;
+ file->pos = 0;
+ file->nbytes = 0;
+ file->readOnly = true; /* Can't write to files opened this way */
+ file->fileset = fileset;
+ file->name = pstrdup(name);
+
+ return file;
+}
+
+/*
+ * Delete a BufFile that was created by BufFileCreateShared in the given
+ * SharedFileSet using the given name.
+ *
+ * It is not necessary to delete files explicitly with this function. It is
+ * provided only as a way to delete files proactively, rather than waiting for
+ * the SharedFileSet to be cleaned up.
+ *
+ * Only one backend should attempt to delete a given name, and should know
+ * that it exists and has been exported or closed.
+ */
+void
+BufFileDeleteShared(SharedFileSet *fileset, const char *name)
+{
+ char segment_name[MAXPGPATH];
+ int segment = 0;
+ bool found = false;
+
+ /*
+ * We don't know how many segments the file has. We'll keep deleting
+ * until we run out. If we don't manage to find even an initial segment,
+ * raise an error.
+ */
+ for (;;)
+ {
+ SharedSegmentName(segment_name, name, segment);
+ if (!SharedFileSetDelete(fileset, segment_name, true))
+ break;
+ found = true;
+ ++segment;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (!found)
+ elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
+}
+
+/*
+ * BufFileExportShared --- flush and make read-only, in preparation for sharing.
+ */
+void
+BufFileExportShared(BufFile *file)
+{
+ /* Must be a file belonging to a SharedFileSet. */
+ Assert(file->fileset != NULL);
+
+ /* It's probably a bug if someone calls this twice. */
+ Assert(!file->readOnly);
+
+ BufFileFlush(file);
+ file->readOnly = true;
+}
+
/*
* Close a BufFile
*
size_t nwritten = 0;
size_t nthistime;
+ Assert(!file->readOnly);
+
while (size > 0)
{
if (file->pos >= BLCKSZ)
* for a long time, like relation files. It is the caller's responsibility
* to close them, there is no automatic mechanism in fd.c for that.
*
+ * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
+ * temporary files that have names so that they can be shared between
+ * backends. Such files are automatically closed and count against the
+ * temporary file limit of the backend that creates them, but unlike anonymous
+ * files they are not automatically deleted. See sharedfileset.c for a shared
+ * ownership mechanism that provides automatic cleanup for shared files when
+ * the last of a group of backends detaches.
+ *
* AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
* wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
* They behave like the corresponding native functions, except that the handle
#define FilePosIsUnknown(pos) ((pos) < 0)
/* these are the assigned bits in fdstate below: */
-#define FD_TEMPORARY (1 << 0) /* T = delete when closed */
-#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
+#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
+#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
+#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
typedef struct vfd
{
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
-static void RemovePgTempFilesInDir(const char *tmpdirname);
+static void RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name);
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int fsync_parent_path(const char *fname, int elevel);
return 0;
}
+/*
+ * Called whenever a temporary file is deleted to report its size.
+ */
+static void
+ReportTemporaryFileUsage(const char *path, off_t size)
+{
+ pgstat_report_tempfile(size);
+
+ if (log_temp_files >= 0)
+ {
+ if ((size / 1024) >= log_temp_files)
+ ereport(LOG,
+ (errmsg("temporary file: path \"%s\", size %lu",
+ path, (unsigned long) size)));
+ }
+}
+
+/*
+ * Called to register a temporary file for automatic close.
+ * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
+ * before the file was opened.
+ */
+static void
+RegisterTemporaryFile(File file)
+{
+ ResourceOwnerRememberFile(CurrentResourceOwner, file);
+ VfdCache[file].resowner = CurrentResourceOwner;
+
+ /* Backup mechanism for closing at end of xact. */
+ VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
+ have_xact_temporary_files = true;
+}
+
/*
* Called when we get a shared invalidation message on some relation.
*/
return file;
}
+/*
+ * Create directory 'directory'. If necessary, create 'basedir', which must
+ * be the directory above it. This is designed for creating the top-level
+ * temporary directory on demand before creating a directory underneath it.
+ * Do nothing if the directory already exists.
+ *
+ * Directories created within the top-level temporary directory should begin
+ * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
+ * deleted at startup by RemovePgTempFiles(). Further subdirectories below
+ * that do not need any particular prefix.
+*/
+void
+PathNameCreateTemporaryDir(const char *basedir, const char *directory)
+{
+ if (mkdir(directory, S_IRWXU) < 0)
+ {
+ if (errno == EEXIST)
+ return;
+
+ /*
+ * Failed. Try to create basedir first in case it's missing. Tolerate
+ * EEXIST to close a race against another process following the same
+ * algorithm.
+ */
+ if (mkdir(basedir, S_IRWXU) < 0 && errno != EEXIST)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("cannot create temporary directory \"%s\": %m",
+ basedir)));
+
+ /* Try again. */
+ if (mkdir(directory, S_IRWXU) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("cannot create temporary subdirectory \"%s\": %m",
+ directory)));
+ }
+}
+
+/*
+ * Delete a directory and everything in it, if it exists.
+ */
+void
+PathNameDeleteTemporaryDir(const char *dirname)
+{
+ struct stat statbuf;
+
+ /* Silently ignore missing directory. */
+ if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
+ return;
+
+ /*
+ * Currently, walkdir doesn't offer a way for our passed in function to
+ * maintain state. Perhaps it should, so that we could tell the caller
+ * whether this operation succeeded or failed. Since this operation is
+ * used in a cleanup path, we wouldn't actually behave differently: we'll
+ * just log failures.
+ */
+ walkdir(dirname, unlink_if_exists_fname, false, LOG);
+}
+
/*
* Open a temporary file that will disappear when we close it.
*
DEFAULTTABLESPACE_OID,
true);
- /* Mark it for deletion at close */
- VfdCache[file].fdstate |= FD_TEMPORARY;
+ /* Mark it for deletion at close and temporary file size limit */
+ VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
/* Register it with the current resource owner */
if (!interXact)
- {
- VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
-
- VfdCache[file].resowner = CurrentResourceOwner;
- ResourceOwnerRememberFile(CurrentResourceOwner, file);
-
- /* ensure cleanup happens at eoxact */
- have_xact_temporary_files = true;
- }
+ RegisterTemporaryFile(file);
return file;
}
/*
- * Open a temporary file in a specific tablespace.
- * Subroutine for OpenTemporaryFile, which see for details.
+ * Return the path of the temp directory in a given tablespace.
*/
-static File
-OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+void
+TempTablespacePath(char *path, Oid tablespace)
{
- char tempdirpath[MAXPGPATH];
- char tempfilepath[MAXPGPATH];
- File file;
-
/*
* Identify the tempfile directory for this tablespace.
*
* If someone tries to specify pg_global, use pg_default instead.
*/
- if (tblspcOid == DEFAULTTABLESPACE_OID ||
- tblspcOid == GLOBALTABLESPACE_OID)
- {
- /* The default tablespace is {datadir}/base */
- snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
- PG_TEMP_FILES_DIR);
- }
+ if (tablespace == InvalidOid ||
+ tablespace == DEFAULTTABLESPACE_OID ||
+ tablespace == GLOBALTABLESPACE_OID)
+ snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
else
{
/* All other tablespaces are accessed via symlinks */
- snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
- tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
+ snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
+ tablespace, TABLESPACE_VERSION_DIRECTORY,
+ PG_TEMP_FILES_DIR);
}
+}
+
+/*
+ * Open a temporary file in a specific tablespace.
+ * Subroutine for OpenTemporaryFile, which see for details.
+ */
+static File
+OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+{
+ char tempdirpath[MAXPGPATH];
+ char tempfilepath[MAXPGPATH];
+ File file;
+
+ TempTablespacePath(tempdirpath, tblspcOid);
/*
* Generate a tempfile name that should be unique within the current
return file;
}
+
+/*
+ * Create a new file. The directory containing it must already exist. Files
+ * created this way are subject to temp_file_limit and are automatically
+ * closed at end of transaction, but are not automatically deleted on close
+ * because they are intended to be shared between cooperating backends.
+ *
+ * If the file is inside the top-level temporary directory, its name should
+ * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
+ * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
+ * inside a directory created with PathnameCreateTemporaryDir(), in which case
+ * the prefix isn't needed.
+ */
+File
+PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
+{
+ File file;
+
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ /*
+ * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
+ * temp file that can be reused.
+ */
+ file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ if (file <= 0)
+ {
+ if (error_on_failure)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create temporary file \"%s\": %m",
+ path)));
+ else
+ return file;
+ }
+
+ /* Mark it for temp_file_limit accounting. */
+ VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
+
+ /* Register it for automatic close. */
+ RegisterTemporaryFile(file);
+
+ return file;
+}
+
+/*
+ * Open a file that was created with PathNameCreateTemporaryFile, possibly in
+ * another backend. Files opened this way don't count against the
+ * temp_file_limit of the caller, are read-only and are automatically closed
+ * at the end of the transaction but are not deleted on close.
+ */
+File
+PathNameOpenTemporaryFile(const char *path)
+{
+ File file;
+
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ /* We open the file read-only. */
+ file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
+
+ /* If no such file, then we don't raise an error. */
+ if (file <= 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary file \"%s\": %m",
+ path)));
+
+ if (file > 0)
+ {
+ /* Register it for automatic close. */
+ RegisterTemporaryFile(file);
+ }
+
+ return file;
+}
+
+/*
+ * Delete a file by pathname. Return true if the file existed, false if
+ * didn't.
+ */
+bool
+PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
+{
+ struct stat filestats;
+ int stat_errno;
+
+ /* Get the final size for pgstat reporting. */
+ if (stat(path, &filestats) != 0)
+ stat_errno = errno;
+ else
+ stat_errno = 0;
+
+ /*
+ * Unlike FileClose's automatic file deletion code, we tolerate
+ * non-existence to support BufFileDeleteShared which doesn't know how
+ * many segments it has to delete until it runs out.
+ */
+ if (stat_errno == ENOENT)
+ return false;
+
+ if (unlink(path) < 0)
+ {
+ if (errno != ENOENT)
+ ereport(error_on_failure ? ERROR : LOG,
+ (errcode_for_file_access(),
+ errmsg("cannot unlink temporary file \"%s\": %m",
+ path)));
+ return false;
+ }
+
+ if (stat_errno == 0)
+ ReportTemporaryFileUsage(path, filestats.st_size);
+ else
+ {
+ errno = stat_errno;
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", path)));
+ }
+
+ return true;
+}
+
/*
* close a file when done with it
*/
Delete(file);
}
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+ {
+ /* Subtract its size from current usage (do first in case of error) */
+ temporary_files_size -= vfdP->fileSize;
+ vfdP->fileSize = 0;
+ }
+
/*
* Delete the file if it was temporary, and make a log entry if wanted
*/
- if (vfdP->fdstate & FD_TEMPORARY)
+ if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
{
struct stat filestats;
int stat_errno;
* is arranged to ensure that the worst-case consequence is failing to
* emit log message(s), not failing to attempt the unlink.
*/
- vfdP->fdstate &= ~FD_TEMPORARY;
+ vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
- /* Subtract its size from current usage (do first in case of error) */
- temporary_files_size -= vfdP->fileSize;
- vfdP->fileSize = 0;
/* first try the stat() */
if (stat(vfdP->fileName, &filestats))
/* and last report the stat results */
if (stat_errno == 0)
- {
- pgstat_report_tempfile(filestats.st_size);
-
- if (log_temp_files >= 0)
- {
- if ((filestats.st_size / 1024) >= log_temp_files)
- ereport(LOG,
- (errmsg("temporary file: path \"%s\", size %lu",
- vfdP->fileName,
- (unsigned long) filestats.st_size)));
- }
- }
+ ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
else
{
errno = stat_errno;
* message if we do that. All current callers would just throw error
* immediately anyway, so this is safe at present.
*/
- if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
+ if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
{
off_t newPos;
* get here in that state if we're not enforcing temporary_files_size,
* so we don't care.
*/
- if (vfdP->fdstate & FD_TEMPORARY)
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
{
off_t newPos = vfdP->seekPos;
if (returnCode == 0 && VfdCache[file].fileSize > offset)
{
/* adjust our state for truncation of a temp file */
- Assert(VfdCache[file].fdstate & FD_TEMPORARY);
+ Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
temporary_files_size -= VfdCache[file].fileSize - offset;
VfdCache[file].fileSize = offset;
}
return (numTempTableSpaces >= 0);
}
+/*
+ * GetTempTablespaces
+ *
+ * Populate an array with the OIDs of the tablespaces that should be used for
+ * temporary files. Return the number that were copied into the output array.
+ */
+int
+GetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+ int i;
+
+ Assert(TempTablespacesAreSet());
+ for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
+ tableSpaces[i] = tempTableSpaces[i];
+
+ return i;
+}
+
/*
* GetNextTempTableSpace
*
{
unsigned short fdstate = VfdCache[i].fdstate;
- if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
+ if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
+ VfdCache[i].fileName != NULL)
{
/*
* If we're in the process of exiting a backend process, close
*/
if (isProcExit)
FileClose(i);
- else if (fdstate & FD_XACT_TEMPORARY)
+ else if (fdstate & FD_CLOSE_AT_EOXACT)
{
elog(WARNING,
"temporary file %s not closed at end-of-transaction",
* First process temp files in pg_default ($PGDATA/base)
*/
snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
- RemovePgTempFilesInDir(temp_path);
+ RemovePgTempFilesInDir(temp_path, false);
RemovePgTempRelationFiles("base");
/*
snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
- RemovePgTempFilesInDir(temp_path);
+ RemovePgTempFilesInDir(temp_path, false);
snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
#endif
}
-/* Process one pgsql_tmp directory for RemovePgTempFiles */
+/*
+ * Process one pgsql_tmp directory for RemovePgTempFiles. At the top level in
+ * each tablespace, this should be called with unlink_all = false, so that
+ * only files matching the temporary name prefix will be unlinked. When
+ * recursing it will be called with unlink_all = true to unlink everything
+ * under a top-level temporary directory.
+ */
static void
-RemovePgTempFilesInDir(const char *tmpdirname)
+RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all)
{
DIR *temp_dir;
struct dirent *temp_de;
snprintf(rm_path, sizeof(rm_path), "%s/%s",
tmpdirname, temp_de->d_name);
- if (strncmp(temp_de->d_name,
+ if (unlink_all ||
+ strncmp(temp_de->d_name,
PG_TEMP_FILE_PREFIX,
strlen(PG_TEMP_FILE_PREFIX)) == 0)
- unlink(rm_path); /* note we ignore any error */
+ {
+ struct stat statbuf;
+
+ /* note that we ignore any error here and below */
+ if (lstat(rm_path, &statbuf) < 0)
+ continue;
+
+ if (S_ISDIR(statbuf.st_mode))
+ {
+ RemovePgTempFilesInDir(rm_path, true);
+ rmdir(rm_path);
+ }
+ else
+ unlink(rm_path);
+ }
else
elog(LOG,
"unexpected file found in temporary-files directory: \"%s\"",
fsync_fname_ext(fname, isdir, true, elevel);
}
+static void
+unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
+{
+ if (isdir)
+ {
+ if (rmdir(fname) != 0 && errno != ENOENT)
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not rmdir directory \"%s\": %m", fname)));
+ }
+ else
+ {
+ /* Use PathNameDeleteTemporaryFile to report filesize */
+ PathNameDeleteTemporaryFile(fname, false);
+ }
+}
+
/*
* fsync_fname_ext -- Try to fsync a file or directory
*
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.c
+ * Shared temporary file management.
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/sharedfileset.c
+ *
+ * SharefFileSets provide a temporary namespace (think directory) so that
+ * files can be discovered by name, and a shared ownership semantics so that
+ * shared files survive until the last user detaches.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "storage/dsm.h"
+#include "storage/sharedfileset.h"
+#include "utils/builtins.h"
+
+static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum);
+static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace);
+static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name);
+static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name);
+
+/*
+ * Initialize a space for temporary files that can be opened for read-only
+ * access by other backends. Other backends must attach to it before
+ * accessing it. Associate this SharedFileSet with 'seg'. Any contained
+ * files will be deleted when the last backend detaches.
+ *
+ * Files will be distributed over the tablespaces configured in
+ * temp_tablespaces.
+ *
+ * Under the covers the set is one or more directories which will eventually
+ * be deleted when there are no backends attached.
+ */
+void
+SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
+{
+ static uint32 counter = 0;
+
+ SpinLockInit(&fileset->mutex);
+ fileset->refcnt = 1;
+ fileset->creator_pid = MyProcPid;
+ fileset->number = counter;
+ counter = (counter + 1) % INT_MAX;
+
+ /* Capture the tablespace OIDs so that all backends agree on them. */
+ PrepareTempTablespaces();
+ fileset->ntablespaces =
+ GetTempTablespaces(&fileset->tablespaces[0],
+ lengthof(fileset->tablespaces));
+ if (fileset->ntablespaces == 0)
+ {
+ fileset->tablespaces[0] = DEFAULTTABLESPACE_OID;
+ fileset->ntablespaces = 1;
+ }
+
+ /* Register our cleanup callback. */
+ on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Attach to a set of directories that was created with SharedFileSetInit.
+ */
+void
+SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
+{
+ bool success;
+
+ SpinLockAcquire(&fileset->mutex);
+ if (fileset->refcnt == 0)
+ success = false;
+ else
+ {
+ ++fileset->refcnt;
+ success = true;
+ }
+ SpinLockRelease(&fileset->mutex);
+
+ if (!success)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not attach to a SharedFileSet that is already destroyed")));
+
+ /* Register our cleanup callback. */
+ on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Create a new file in the given set.
+ */
+File
+SharedFileSetCreate(SharedFileSet *fileset, const char *name)
+{
+ char path[MAXPGPATH];
+ File file;
+
+ SharedFilePath(path, fileset, name);
+ file = PathNameCreateTemporaryFile(path, false);
+
+ /* If we failed, see if we need to create the directory on demand. */
+ if (file <= 0)
+ {
+ char tempdirpath[MAXPGPATH];
+ char filesetpath[MAXPGPATH];
+ Oid tablespace = ChooseTablespace(fileset, name);
+
+ TempTablespacePath(tempdirpath, tablespace);
+ SharedFileSetPath(filesetpath, fileset, tablespace);
+ PathNameCreateTemporaryDir(tempdirpath, filesetpath);
+ file = PathNameCreateTemporaryFile(path, true);
+ }
+
+ return file;
+}
+
+/*
+ * Open a file that was created with SharedFileSetCreate(), possibly in
+ * another backend.
+ */
+File
+SharedFileSetOpen(SharedFileSet *fileset, const char *name)
+{
+ char path[MAXPGPATH];
+ File file;
+
+ SharedFilePath(path, fileset, name);
+ file = PathNameOpenTemporaryFile(path);
+
+ return file;
+}
+
+/*
+ * Delete a file that was created with PathNameCreateShared().
+ * Return true if the file existed, false if didn't.
+ */
+bool
+SharedFileSetDelete(SharedFileSet *fileset, const char *name,
+ bool error_on_failure)
+{
+ char path[MAXPGPATH];
+
+ SharedFilePath(path, fileset, name);
+
+ return PathNameDeleteTemporaryFile(path, error_on_failure);
+}
+
+/*
+ * Delete all files in the set.
+ */
+void
+SharedFileSetDeleteAll(SharedFileSet *fileset)
+{
+ char dirpath[MAXPGPATH];
+ int i;
+
+ /*
+ * Delete the directory we created in each tablespace. Doesn't fail
+ * because we use this in error cleanup paths, but can generate LOG
+ * message on IO error.
+ */
+ for (i = 0; i < fileset->ntablespaces; ++i)
+ {
+ SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]);
+ PathNameDeleteTemporaryDir(dirpath);
+ }
+}
+
+/*
+ * Callback function that will be invoked when this backend detaches from a
+ * DSM segment holding a SharedFileSet that it has created or attached to. If
+ * we are the last to detach, then try to remove the directories and
+ * everything in them. We can't raise an error on failures, because this runs
+ * in error cleanup paths.
+ */
+static void
+SharedFileSetOnDetach(dsm_segment *segment, Datum datum)
+{
+ bool unlink_all = false;
+ SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum);
+
+ SpinLockAcquire(&fileset->mutex);
+ Assert(fileset->refcnt > 0);
+ if (--fileset->refcnt == 0)
+ unlink_all = true;
+ SpinLockRelease(&fileset->mutex);
+
+ /*
+ * If we are the last to detach, we delete the directory in all
+ * tablespaces. Note that we are still actually attached for the rest of
+ * this function so we can safely access its data.
+ */
+ if (unlink_all)
+ SharedFileSetDeleteAll(fileset);
+}
+
+/*
+ * Build the path for the directory holding the files backing a SharedFileSet
+ * in a given tablespace.
+ */
+static void
+SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace)
+{
+ char tempdirpath[MAXPGPATH];
+
+ TempTablespacePath(tempdirpath, tablespace);
+ snprintf(path, MAXPGPATH, "%s/%s%d.%u.sharedfileset",
+ tempdirpath, PG_TEMP_FILE_PREFIX,
+ fileset->creator_pid, fileset->number);
+}
+
+/*
+ * Sorting hat to determine which tablespace a given shared temporary file
+ * belongs in.
+ */
+static Oid
+ChooseTablespace(const SharedFileSet *fileset, const char *name)
+{
+ uint32 hash = hash_any((const unsigned char *) name, strlen(name));
+
+ return fileset->tablespaces[hash % fileset->ntablespaces];
+}
+
+/*
+ * Compute the full path of a file in a SharedFileSet.
+ */
+static void
+SharedFilePath(char *path, SharedFileSet *fileset, const char *name)
+{
+ char dirpath[MAXPGPATH];
+
+ SharedFileSetPath(dirpath, fileset, ChooseTablespace(fileset, name));
+ snprintf(path, MAXPGPATH, "%s/%s", dirpath, name);
+}
#ifndef BUFFILE_H
#define BUFFILE_H
+#include "storage/sharedfileset.h"
+
/* BufFile is an opaque type whose details are not known outside buffile.c. */
typedef struct BufFile BufFile;
extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
extern int BufFileSeekBlock(BufFile *file, long blknum);
+extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
+extern void BufFileExportShared(BufFile *file);
+extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name);
+extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name);
+
#endif /* BUFFILE_H */
extern int FileGetRawFlags(File file);
extern mode_t FileGetRawMode(File file);
+/* Operations used for sharing named temporary files */
+extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
+extern File PathNameOpenTemporaryFile(const char *name);
+extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure);
+extern void PathNameCreateTemporaryDir(const char *base, const char *name);
+extern void PathNameDeleteTemporaryDir(const char *name);
+extern void TempTablespacePath(char *path, Oid tablespace);
+
/* Operations that allow use of regular stdio --- USE WITH CAUTION */
extern FILE *AllocateFile(const char *name, const char *mode);
extern int FreeFile(FILE *file);
extern void closeAllVfds(void);
extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces);
extern bool TempTablespacesAreSet(void);
+extern int GetTempTablespaces(Oid *tableSpaces, int numSpaces);
extern Oid GetNextTempTableSpace(void);
extern void AtEOXact_Files(void);
extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
extern void SyncDataDirectory(void);
-/* Filename components for OpenTemporaryFile */
+/* Filename components */
#define PG_TEMP_FILES_DIR "pgsql_tmp"
#define PG_TEMP_FILE_PREFIX "pgsql_tmp"
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.h
+ * Shared temporary file management.
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sharedfilespace.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SHAREDFILESET_H
+#define SHAREDFILESET_H
+
+#include "storage/dsm.h"
+#include "storage/fd.h"
+#include "storage/spin.h"
+
+/*
+ * A set of temporary files that can be shared by multiple backends.
+ */
+typedef struct SharedFileSet
+{
+ pid_t creator_pid; /* PID of the creating process */
+ uint32 number; /* per-PID identifier */
+ slock_t mutex; /* mutex protecting the reference count */
+ int refcnt; /* number of attached backends */
+ int ntablespaces; /* number of tablespaces to use */
+ Oid tablespaces[8]; /* OIDs of tablespaces to use. Assumes that
+ * it's rare that there more than temp
+ * tablespaces. */
+} SharedFileSet;
+
+extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg);
+extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg);
+extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name);
+extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name);
+extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name,
+ bool error_on_failure);
+extern void SharedFileSetDeleteAll(SharedFileSet *fileset);
+
+#endif
SharedDependencyObjectType
SharedDependencyType
SharedExecutorInstrumentation
+SharedFileSet
SharedInvalCatalogMsg
SharedInvalCatcacheMsg
SharedInvalRelcacheMsg