]> granicus.if.org Git - postgresql/commitdiff
Add infrastructure for sharing temporary files between backends.
authorAndres Freund <andres@anarazel.de>
Sat, 2 Dec 2017 00:30:56 +0000 (16:30 -0800)
committerAndres Freund <andres@anarazel.de>
Sat, 2 Dec 2017 00:30:56 +0000 (16:30 -0800)
SharedFileSet allows temporary files to be created by one backend and
then exported for read-only access by other backends, with clean-up
managed by reference counting associated with a DSM segment.  This
includes changes to fd.c and buffile.c to support the new kind of
temporary file.

This will be used by an upcoming patch adding support for parallel
hash joins.

Author: Thomas Munro
Reviewed-By: Peter Geoghegan, Andres Freund, Robert Haas, Rushabh Lathia
Discussion:
    https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com
    https://postgr.es/m/CAH2-WznJ_UgLux=_jTgCQ4yFz0iBntudsNKa1we3kN1BAG=88w@mail.gmail.com

src/backend/storage/file/Makefile
src/backend/storage/file/buffile.c
src/backend/storage/file/fd.c
src/backend/storage/file/sharedfileset.c [new file with mode: 0644]
src/include/storage/buffile.h
src/include/storage/fd.h
src/include/storage/sharedfileset.h [new file with mode: 0644]
src/tools/pgindent/typedefs.list

index d2198f2b93e8fda8cbd015e98d1ccce6b1412105..ca6a0e4f7d5b48bba61a73a3b42245dd718616ba 100644 (file)
@@ -12,6 +12,6 @@ subdir = src/backend/storage/file
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = fd.o buffile.o copydir.o reinit.o
+OBJS = fd.o buffile.o copydir.o reinit.o sharedfileset.o
 
 include $(top_srcdir)/src/backend/common.mk
index 06bf2fadbf17a1ad38c40f0a06e4e038761a7f6b..fa9940da9b3a6b71eca475b9b90b575a56c5d5ed 100644 (file)
  * BufFile also supports temporary files that exceed the OS file size limit
  * (by opening multiple fd.c temporary files).  This is an essential feature
  * for sorts and hashjoins on large amounts of data.
+ *
+ * BufFile supports temporary files that can be made read-only and shared with
+ * other backends, as infrastructure for parallel execution.  Such files need
+ * to be created as a member of a SharedFileSet that all participants are
+ * attached to.
  *-------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
 #include "executor/instrument.h"
+#include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/fd.h"
 #include "storage/buffile.h"
@@ -70,6 +76,10 @@ struct BufFile
 
        bool            isInterXact;    /* keep open over transactions? */
        bool            dirty;                  /* does buffer need to be written? */
+       bool            readOnly;               /* has the file been set to read only? */
+
+       SharedFileSet *fileset;         /* space for segment files if shared */
+       const char *name;                       /* name of this BufFile if shared */
 
        /*
         * resowner is the ResourceOwner to use for underlying temp files.  (We
@@ -94,6 +104,7 @@ static void extendBufFile(BufFile *file);
 static void BufFileLoadBuffer(BufFile *file);
 static void BufFileDumpBuffer(BufFile *file);
 static int     BufFileFlush(BufFile *file);
+static File MakeNewSharedSegment(BufFile *file, int segment);
 
 
 /*
@@ -117,6 +128,9 @@ makeBufFile(File firstfile)
        file->curOffset = 0L;
        file->pos = 0;
        file->nbytes = 0;
+       file->readOnly = false;
+       file->fileset = NULL;
+       file->name = NULL;
 
        return file;
 }
@@ -134,7 +148,11 @@ extendBufFile(BufFile *file)
        oldowner = CurrentResourceOwner;
        CurrentResourceOwner = file->resowner;
 
-       pfile = OpenTemporaryFile(file->isInterXact);
+       if (file->fileset == NULL)
+               pfile = OpenTemporaryFile(file->isInterXact);
+       else
+               pfile = MakeNewSharedSegment(file, file->numFiles);
+
        Assert(pfile >= 0);
 
        CurrentResourceOwner = oldowner;
@@ -175,6 +193,189 @@ BufFileCreateTemp(bool interXact)
        return file;
 }
 
+/*
+ * Build the name for a given segment of a given BufFile.
+ */
+static void
+SharedSegmentName(char *name, const char *buffile_name, int segment)
+{
+       snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
+}
+
+/*
+ * Create a new segment file backing a shared BufFile.
+ */
+static File
+MakeNewSharedSegment(BufFile *buffile, int segment)
+{
+       char            name[MAXPGPATH];
+       File            file;
+
+       SharedSegmentName(name, buffile->name, segment);
+       file = SharedFileSetCreate(buffile->fileset, name);
+
+       /* SharedFileSetCreate would've errored out */
+       Assert(file > 0);
+
+       return file;
+}
+
+/*
+ * Create a BufFile that can be discovered and opened read-only by other
+ * backends that are attached to the same SharedFileSet using the same name.
+ *
+ * The naming scheme for shared BufFiles is left up to the calling code.  The
+ * name will appear as part of one or more filenames on disk, and might
+ * provide clues to administrators about which subsystem is generating
+ * temporary file data.  Since each SharedFileSet object is backed by one or
+ * more uniquely named temporary directory, names don't conflict with
+ * unrelated SharedFileSet objects.
+ */
+BufFile *
+BufFileCreateShared(SharedFileSet *fileset, const char *name)
+{
+       BufFile    *file;
+
+       file = (BufFile *) palloc(sizeof(BufFile));
+       file->fileset = fileset;
+       file->name = pstrdup(name);
+       file->numFiles = 1;
+       file->files = (File *) palloc(sizeof(File));
+       file->files[0] = MakeNewSharedSegment(file, 0);
+       file->offsets = (off_t *) palloc(sizeof(off_t));
+       file->offsets[0] = 0L;
+       file->isInterXact = false;
+       file->dirty = false;
+       file->resowner = CurrentResourceOwner;
+       file->curFile = 0;
+       file->curOffset = 0L;
+       file->pos = 0;
+       file->nbytes = 0;
+       file->readOnly = false;
+       file->name = pstrdup(name);
+
+       return file;
+}
+
+/*
+ * Open a file that was previously created in another backend (or this one)
+ * with BufFileCreateShared in the same SharedFileSet using the same name.
+ * The backend that created the file must have called BufFileClose() or
+ * BufFileExport() to make sure that it is ready to be opened by other
+ * backends and render it read-only.
+ */
+BufFile *
+BufFileOpenShared(SharedFileSet *fileset, const char *name)
+{
+       BufFile    *file = (BufFile *) palloc(sizeof(BufFile));
+       char            segment_name[MAXPGPATH];
+       Size            capacity = 16;
+       File       *files = palloc(sizeof(File) * capacity);
+       int                     nfiles = 0;
+
+       file = (BufFile *) palloc(sizeof(BufFile));
+       files = palloc(sizeof(File) * capacity);
+
+       /*
+        * We don't know how many segments there are, so we'll probe the
+        * filesystem to find out.
+        */
+       for (;;)
+       {
+               /* See if we need to expand our file segment array. */
+               if (nfiles + 1 > capacity)
+               {
+                       capacity *= 2;
+                       files = repalloc(files, sizeof(File) * capacity);
+               }
+               /* Try to load a segment. */
+               SharedSegmentName(segment_name, name, nfiles);
+               files[nfiles] = SharedFileSetOpen(fileset, segment_name);
+               if (files[nfiles] <= 0)
+                       break;
+               ++nfiles;
+
+               CHECK_FOR_INTERRUPTS();
+       }
+
+       /*
+        * If we didn't find any files at all, then no BufFile exists with this
+        * name.
+        */
+       if (nfiles == 0)
+               return NULL;
+
+       file->numFiles = nfiles;
+       file->files = files;
+       file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles);
+       file->isInterXact = false;
+       file->dirty = false;
+       file->resowner = CurrentResourceOwner;  /* Unused, can't extend */
+       file->curFile = 0;
+       file->curOffset = 0L;
+       file->pos = 0;
+       file->nbytes = 0;
+       file->readOnly = true;          /* Can't write to files opened this way */
+       file->fileset = fileset;
+       file->name = pstrdup(name);
+
+       return file;
+}
+
+/*
+ * Delete a BufFile that was created by BufFileCreateShared in the given
+ * SharedFileSet using the given name.
+ *
+ * It is not necessary to delete files explicitly with this function.  It is
+ * provided only as a way to delete files proactively, rather than waiting for
+ * the SharedFileSet to be cleaned up.
+ *
+ * Only one backend should attempt to delete a given name, and should know
+ * that it exists and has been exported or closed.
+ */
+void
+BufFileDeleteShared(SharedFileSet *fileset, const char *name)
+{
+       char            segment_name[MAXPGPATH];
+       int                     segment = 0;
+       bool            found = false;
+
+       /*
+        * We don't know how many segments the file has.  We'll keep deleting
+        * until we run out.  If we don't manage to find even an initial segment,
+        * raise an error.
+        */
+       for (;;)
+       {
+               SharedSegmentName(segment_name, name, segment);
+               if (!SharedFileSetDelete(fileset, segment_name, true))
+                       break;
+               found = true;
+               ++segment;
+
+               CHECK_FOR_INTERRUPTS();
+       }
+
+       if (!found)
+               elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
+}
+
+/*
+ * BufFileExportShared --- flush and make read-only, in preparation for sharing.
+ */
+void
+BufFileExportShared(BufFile *file)
+{
+       /* Must be a file belonging to a SharedFileSet. */
+       Assert(file->fileset != NULL);
+
+       /* It's probably a bug if someone calls this twice. */
+       Assert(!file->readOnly);
+
+       BufFileFlush(file);
+       file->readOnly = true;
+}
+
 /*
  * Close a BufFile
  *
@@ -390,6 +591,8 @@ BufFileWrite(BufFile *file, void *ptr, size_t size)
        size_t          nwritten = 0;
        size_t          nthistime;
 
+       Assert(!file->readOnly);
+
        while (size > 0)
        {
                if (file->pos >= BLCKSZ)
index aa2fe2c6c04d6c86c19f0607f0ef053f2291b8dd..2e93e4ad632c1160c2ea615b14bc7c6929379230 100644 (file)
  * for a long time, like relation files. It is the caller's responsibility
  * to close them, there is no automatic mechanism in fd.c for that.
  *
+ * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
+ * temporary files that have names so that they can be shared between
+ * backends.  Such files are automatically closed and count against the
+ * temporary file limit of the backend that creates them, but unlike anonymous
+ * files they are not automatically deleted.  See sharedfileset.c for a shared
+ * ownership mechanism that provides automatic cleanup for shared files when
+ * the last of a group of backends detaches.
+ *
  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
  * They behave like the corresponding native functions, except that the handle
@@ -175,8 +183,9 @@ int                 max_safe_fds = 32;      /* default if not changed */
 #define FilePosIsUnknown(pos) ((pos) < 0)
 
 /* these are the assigned bits in fdstate below: */
-#define FD_TEMPORARY           (1 << 0)        /* T = delete when closed */
-#define FD_XACT_TEMPORARY      (1 << 1)        /* T = delete at eoXact */
+#define FD_DELETE_AT_CLOSE     (1 << 0)        /* T = delete when closed */
+#define FD_CLOSE_AT_EOXACT     (1 << 1)        /* T = close at eoXact */
+#define FD_TEMP_FILE_LIMIT     (1 << 2)        /* T = respect temp_file_limit */
 
 typedef struct vfd
 {
@@ -313,7 +322,7 @@ static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel)
 
 static void AtProcExit_Files(int code, Datum arg);
 static void CleanupTempFiles(bool isProcExit);
-static void RemovePgTempFilesInDir(const char *tmpdirname);
+static void RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all);
 static void RemovePgTempRelationFiles(const char *tsdirname);
 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
 static bool looks_like_temp_rel_name(const char *name);
@@ -326,6 +335,7 @@ static void walkdir(const char *path,
 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 #endif
 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
 
 static int     fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
 static int     fsync_parent_path(const char *fname, int elevel);
@@ -1294,6 +1304,39 @@ FileAccess(File file)
        return 0;
 }
 
+/*
+ * Called whenever a temporary file is deleted to report its size.
+ */
+static void
+ReportTemporaryFileUsage(const char *path, off_t size)
+{
+       pgstat_report_tempfile(size);
+
+       if (log_temp_files >= 0)
+       {
+               if ((size / 1024) >= log_temp_files)
+                       ereport(LOG,
+                                       (errmsg("temporary file: path \"%s\", size %lu",
+                                                       path, (unsigned long) size)));
+       }
+}
+
+/*
+ * Called to register a temporary file for automatic close.
+ * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
+ * before the file was opened.
+ */
+static void
+RegisterTemporaryFile(File file)
+{
+       ResourceOwnerRememberFile(CurrentResourceOwner, file);
+       VfdCache[file].resowner = CurrentResourceOwner;
+
+       /* Backup mechanism for closing at end of xact. */
+       VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
+       have_xact_temporary_files = true;
+}
+
 /*
  *     Called when we get a shared invalidation message on some relation.
  */
@@ -1378,6 +1421,67 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
        return file;
 }
 
+/*
+ * Create directory 'directory'.  If necessary, create 'basedir', which must
+ * be the directory above it.  This is designed for creating the top-level
+ * temporary directory on demand before creating a directory underneath it.
+ * Do nothing if the directory already exists.
+ *
+ * Directories created within the top-level temporary directory should begin
+ * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
+ * deleted at startup by RemovePgTempFiles().  Further subdirectories below
+ * that do not need any particular prefix.
+*/
+void
+PathNameCreateTemporaryDir(const char *basedir, const char *directory)
+{
+       if (mkdir(directory, S_IRWXU) < 0)
+       {
+               if (errno == EEXIST)
+                       return;
+
+               /*
+                * Failed.  Try to create basedir first in case it's missing. Tolerate
+                * EEXIST to close a race against another process following the same
+                * algorithm.
+                */
+               if (mkdir(basedir, S_IRWXU) < 0 && errno != EEXIST)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("cannot create temporary directory \"%s\": %m",
+                                                       basedir)));
+
+               /* Try again. */
+               if (mkdir(directory, S_IRWXU) < 0)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("cannot create temporary subdirectory \"%s\": %m",
+                                                       directory)));
+       }
+}
+
+/*
+ * Delete a directory and everything in it, if it exists.
+ */
+void
+PathNameDeleteTemporaryDir(const char *dirname)
+{
+       struct stat statbuf;
+
+       /* Silently ignore missing directory. */
+       if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
+               return;
+
+       /*
+        * Currently, walkdir doesn't offer a way for our passed in function to
+        * maintain state.  Perhaps it should, so that we could tell the caller
+        * whether this operation succeeded or failed.  Since this operation is
+        * used in a cleanup path, we wouldn't actually behave differently: we'll
+        * just log failures.
+        */
+       walkdir(dirname, unlink_if_exists_fname, false, LOG);
+}
+
 /*
  * Open a temporary file that will disappear when we close it.
  *
@@ -1432,53 +1536,52 @@ OpenTemporaryFile(bool interXact)
                                                                                         DEFAULTTABLESPACE_OID,
                                                                                         true);
 
-       /* Mark it for deletion at close */
-       VfdCache[file].fdstate |= FD_TEMPORARY;
+       /* Mark it for deletion at close and temporary file size limit */
+       VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
 
        /* Register it with the current resource owner */
        if (!interXact)
-       {
-               VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
-
-               VfdCache[file].resowner = CurrentResourceOwner;
-               ResourceOwnerRememberFile(CurrentResourceOwner, file);
-
-               /* ensure cleanup happens at eoxact */
-               have_xact_temporary_files = true;
-       }
+               RegisterTemporaryFile(file);
 
        return file;
 }
 
 /*
- * Open a temporary file in a specific tablespace.
- * Subroutine for OpenTemporaryFile, which see for details.
+ * Return the path of the temp directory in a given tablespace.
  */
-static File
-OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+void
+TempTablespacePath(char *path, Oid tablespace)
 {
-       char            tempdirpath[MAXPGPATH];
-       char            tempfilepath[MAXPGPATH];
-       File            file;
-
        /*
         * Identify the tempfile directory for this tablespace.
         *
         * If someone tries to specify pg_global, use pg_default instead.
         */
-       if (tblspcOid == DEFAULTTABLESPACE_OID ||
-               tblspcOid == GLOBALTABLESPACE_OID)
-       {
-               /* The default tablespace is {datadir}/base */
-               snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
-                                PG_TEMP_FILES_DIR);
-       }
+       if (tablespace == InvalidOid ||
+               tablespace == DEFAULTTABLESPACE_OID ||
+               tablespace == GLOBALTABLESPACE_OID)
+               snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
        else
        {
                /* All other tablespaces are accessed via symlinks */
-               snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
-                                tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
+               snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
+                                tablespace, TABLESPACE_VERSION_DIRECTORY,
+                                PG_TEMP_FILES_DIR);
        }
+}
+
+/*
+ * Open a temporary file in a specific tablespace.
+ * Subroutine for OpenTemporaryFile, which see for details.
+ */
+static File
+OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+{
+       char            tempdirpath[MAXPGPATH];
+       char            tempfilepath[MAXPGPATH];
+       File            file;
+
+       TempTablespacePath(tempdirpath, tblspcOid);
 
        /*
         * Generate a tempfile name that should be unique within the current
@@ -1515,6 +1618,130 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
        return file;
 }
 
+
+/*
+ * Create a new file.  The directory containing it must already exist.  Files
+ * created this way are subject to temp_file_limit and are automatically
+ * closed at end of transaction, but are not automatically deleted on close
+ * because they are intended to be shared between cooperating backends.
+ *
+ * If the file is inside the top-level temporary directory, its name should
+ * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
+ * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
+ * inside a directory created with PathnameCreateTemporaryDir(), in which case
+ * the prefix isn't needed.
+ */
+File
+PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
+{
+       File            file;
+
+       ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+       /*
+        * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
+        * temp file that can be reused.
+        */
+       file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+       if (file <= 0)
+       {
+               if (error_on_failure)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not create temporary file \"%s\": %m",
+                                                       path)));
+               else
+                       return file;
+       }
+
+       /* Mark it for temp_file_limit accounting. */
+       VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
+
+       /* Register it for automatic close. */
+       RegisterTemporaryFile(file);
+
+       return file;
+}
+
+/*
+ * Open a file that was created with PathNameCreateTemporaryFile, possibly in
+ * another backend.  Files opened this way don't count against the
+ * temp_file_limit of the caller, are read-only and are automatically closed
+ * at the end of the transaction but are not deleted on close.
+ */
+File
+PathNameOpenTemporaryFile(const char *path)
+{
+       File            file;
+
+       ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+       /* We open the file read-only. */
+       file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
+
+       /* If no such file, then we don't raise an error. */
+       if (file <= 0 && errno != ENOENT)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not open temporary file \"%s\": %m",
+                                               path)));
+
+       if (file > 0)
+       {
+               /* Register it for automatic close. */
+               RegisterTemporaryFile(file);
+       }
+
+       return file;
+}
+
+/*
+ * Delete a file by pathname.  Return true if the file existed, false if
+ * didn't.
+ */
+bool
+PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
+{
+       struct stat filestats;
+       int                     stat_errno;
+
+       /* Get the final size for pgstat reporting. */
+       if (stat(path, &filestats) != 0)
+               stat_errno = errno;
+       else
+               stat_errno = 0;
+
+       /*
+        * Unlike FileClose's automatic file deletion code, we tolerate
+        * non-existence to support BufFileDeleteShared which doesn't know how
+        * many segments it has to delete until it runs out.
+        */
+       if (stat_errno == ENOENT)
+               return false;
+
+       if (unlink(path) < 0)
+       {
+               if (errno != ENOENT)
+                       ereport(error_on_failure ? ERROR : LOG,
+                                       (errcode_for_file_access(),
+                                        errmsg("cannot unlink temporary file \"%s\": %m",
+                                                       path)));
+               return false;
+       }
+
+       if (stat_errno == 0)
+               ReportTemporaryFileUsage(path, filestats.st_size);
+       else
+       {
+               errno = stat_errno;
+               ereport(LOG,
+                               (errcode_for_file_access(),
+                                errmsg("could not stat file \"%s\": %m", path)));
+       }
+
+       return true;
+}
+
 /*
  * close a file when done with it
  */
@@ -1543,10 +1770,17 @@ FileClose(File file)
                Delete(file);
        }
 
+       if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+       {
+               /* Subtract its size from current usage (do first in case of error) */
+               temporary_files_size -= vfdP->fileSize;
+               vfdP->fileSize = 0;
+       }
+
        /*
         * Delete the file if it was temporary, and make a log entry if wanted
         */
-       if (vfdP->fdstate & FD_TEMPORARY)
+       if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
        {
                struct stat filestats;
                int                     stat_errno;
@@ -1558,11 +1792,8 @@ FileClose(File file)
                 * is arranged to ensure that the worst-case consequence is failing to
                 * emit log message(s), not failing to attempt the unlink.
                 */
-               vfdP->fdstate &= ~FD_TEMPORARY;
+               vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
 
-               /* Subtract its size from current usage (do first in case of error) */
-               temporary_files_size -= vfdP->fileSize;
-               vfdP->fileSize = 0;
 
                /* first try the stat() */
                if (stat(vfdP->fileName, &filestats))
@@ -1576,18 +1807,7 @@ FileClose(File file)
 
                /* and last report the stat results */
                if (stat_errno == 0)
-               {
-                       pgstat_report_tempfile(filestats.st_size);
-
-                       if (log_temp_files >= 0)
-                       {
-                               if ((filestats.st_size / 1024) >= log_temp_files)
-                                       ereport(LOG,
-                                                       (errmsg("temporary file: path \"%s\", size %lu",
-                                                                       vfdP->fileName,
-                                                                       (unsigned long) filestats.st_size)));
-                       }
-               }
+                       ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
                else
                {
                        errno = stat_errno;
@@ -1761,7 +1981,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
         * message if we do that.  All current callers would just throw error
         * immediately anyway, so this is safe at present.
         */
-       if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
+       if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
        {
                off_t           newPos;
 
@@ -1814,7 +2034,7 @@ retry:
                 * get here in that state if we're not enforcing temporary_files_size,
                 * so we don't care.
                 */
-               if (vfdP->fdstate & FD_TEMPORARY)
+               if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
                {
                        off_t           newPos = vfdP->seekPos;
 
@@ -1985,7 +2205,7 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info)
        if (returnCode == 0 && VfdCache[file].fileSize > offset)
        {
                /* adjust our state for truncation of a temp file */
-               Assert(VfdCache[file].fdstate & FD_TEMPORARY);
+               Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
                temporary_files_size -= VfdCache[file].fileSize - offset;
                VfdCache[file].fileSize = offset;
        }
@@ -2593,6 +2813,24 @@ TempTablespacesAreSet(void)
        return (numTempTableSpaces >= 0);
 }
 
+/*
+ * GetTempTablespaces
+ *
+ * Populate an array with the OIDs of the tablespaces that should be used for
+ * temporary files.  Return the number that were copied into the output array.
+ */
+int
+GetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+       int                     i;
+
+       Assert(TempTablespacesAreSet());
+       for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
+               tableSpaces[i] = tempTableSpaces[i];
+
+       return i;
+}
+
 /*
  * GetNextTempTableSpace
  *
@@ -2696,7 +2934,8 @@ CleanupTempFiles(bool isProcExit)
                {
                        unsigned short fdstate = VfdCache[i].fdstate;
 
-                       if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
+                       if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
+                               VfdCache[i].fileName != NULL)
                        {
                                /*
                                 * If we're in the process of exiting a backend process, close
@@ -2707,7 +2946,7 @@ CleanupTempFiles(bool isProcExit)
                                 */
                                if (isProcExit)
                                        FileClose(i);
-                               else if (fdstate & FD_XACT_TEMPORARY)
+                               else if (fdstate & FD_CLOSE_AT_EOXACT)
                                {
                                        elog(WARNING,
                                                 "temporary file %s not closed at end-of-transaction",
@@ -2751,7 +2990,7 @@ RemovePgTempFiles(void)
         * First process temp files in pg_default ($PGDATA/base)
         */
        snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
-       RemovePgTempFilesInDir(temp_path);
+       RemovePgTempFilesInDir(temp_path, false);
        RemovePgTempRelationFiles("base");
 
        /*
@@ -2767,7 +3006,7 @@ RemovePgTempFiles(void)
 
                snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
                                 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
-               RemovePgTempFilesInDir(temp_path);
+               RemovePgTempFilesInDir(temp_path, false);
 
                snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
                                 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
@@ -2785,9 +3024,15 @@ RemovePgTempFiles(void)
 #endif
 }
 
-/* Process one pgsql_tmp directory for RemovePgTempFiles */
+/*
+ * Process one pgsql_tmp directory for RemovePgTempFiles.  At the top level in
+ * each tablespace, this should be called with unlink_all = false, so that
+ * only files matching the temporary name prefix will be unlinked.  When
+ * recursing it will be called with unlink_all = true to unlink everything
+ * under a top-level temporary directory.
+ */
 static void
-RemovePgTempFilesInDir(const char *tmpdirname)
+RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all)
 {
        DIR                *temp_dir;
        struct dirent *temp_de;
@@ -2813,10 +3058,25 @@ RemovePgTempFilesInDir(const char *tmpdirname)
                snprintf(rm_path, sizeof(rm_path), "%s/%s",
                                 tmpdirname, temp_de->d_name);
 
-               if (strncmp(temp_de->d_name,
+               if (unlink_all ||
+                       strncmp(temp_de->d_name,
                                        PG_TEMP_FILE_PREFIX,
                                        strlen(PG_TEMP_FILE_PREFIX)) == 0)
-                       unlink(rm_path);        /* note we ignore any error */
+               {
+                       struct stat statbuf;
+
+                       /* note that we ignore any error here and below */
+                       if (lstat(rm_path, &statbuf) < 0)
+                               continue;
+
+                       if (S_ISDIR(statbuf.st_mode))
+                       {
+                               RemovePgTempFilesInDir(rm_path, true);
+                               rmdir(rm_path);
+                       }
+                       else
+                               unlink(rm_path);
+               }
                else
                        elog(LOG,
                                 "unexpected file found in temporary-files directory: \"%s\"",
@@ -3152,6 +3412,23 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel)
        fsync_fname_ext(fname, isdir, true, elevel);
 }
 
+static void
+unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
+{
+       if (isdir)
+       {
+               if (rmdir(fname) != 0 && errno != ENOENT)
+                       ereport(elevel,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not rmdir directory \"%s\": %m", fname)));
+       }
+       else
+       {
+               /* Use PathNameDeleteTemporaryFile to report filesize */
+               PathNameDeleteTemporaryFile(fname, false);
+       }
+}
+
 /*
  * fsync_fname_ext -- Try to fsync a file or directory
  *
diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c
new file mode 100644 (file)
index 0000000..343b228
--- /dev/null
@@ -0,0 +1,244 @@
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.c
+ *       Shared temporary file management.
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/storage/file/sharedfileset.c
+ *
+ * SharefFileSets provide a temporary namespace (think directory) so that
+ * files can be discovered by name, and a shared ownership semantics so that
+ * shared files survive until the last user detaches.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "storage/dsm.h"
+#include "storage/sharedfileset.h"
+#include "utils/builtins.h"
+
+static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum);
+static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace);
+static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name);
+static Oid     ChooseTablespace(const SharedFileSet *fileset, const char *name);
+
+/*
+ * Initialize a space for temporary files that can be opened for read-only
+ * access by other backends.  Other backends must attach to it before
+ * accessing it.  Associate this SharedFileSet with 'seg'.  Any contained
+ * files will be deleted when the last backend detaches.
+ *
+ * Files will be distributed over the tablespaces configured in
+ * temp_tablespaces.
+ *
+ * Under the covers the set is one or more directories which will eventually
+ * be deleted when there are no backends attached.
+ */
+void
+SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
+{
+       static uint32 counter = 0;
+
+       SpinLockInit(&fileset->mutex);
+       fileset->refcnt = 1;
+       fileset->creator_pid = MyProcPid;
+       fileset->number = counter;
+       counter = (counter + 1) % INT_MAX;
+
+       /* Capture the tablespace OIDs so that all backends agree on them. */
+       PrepareTempTablespaces();
+       fileset->ntablespaces =
+               GetTempTablespaces(&fileset->tablespaces[0],
+                                                  lengthof(fileset->tablespaces));
+       if (fileset->ntablespaces == 0)
+       {
+               fileset->tablespaces[0] = DEFAULTTABLESPACE_OID;
+               fileset->ntablespaces = 1;
+       }
+
+       /* Register our cleanup callback. */
+       on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Attach to a set of directories that was created with SharedFileSetInit.
+ */
+void
+SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
+{
+       bool            success;
+
+       SpinLockAcquire(&fileset->mutex);
+       if (fileset->refcnt == 0)
+               success = false;
+       else
+       {
+               ++fileset->refcnt;
+               success = true;
+       }
+       SpinLockRelease(&fileset->mutex);
+
+       if (!success)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("could not attach to a SharedFileSet that is already destroyed")));
+
+       /* Register our cleanup callback. */
+       on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Create a new file in the given set.
+ */
+File
+SharedFileSetCreate(SharedFileSet *fileset, const char *name)
+{
+       char            path[MAXPGPATH];
+       File            file;
+
+       SharedFilePath(path, fileset, name);
+       file = PathNameCreateTemporaryFile(path, false);
+
+       /* If we failed, see if we need to create the directory on demand. */
+       if (file <= 0)
+       {
+               char            tempdirpath[MAXPGPATH];
+               char            filesetpath[MAXPGPATH];
+               Oid                     tablespace = ChooseTablespace(fileset, name);
+
+               TempTablespacePath(tempdirpath, tablespace);
+               SharedFileSetPath(filesetpath, fileset, tablespace);
+               PathNameCreateTemporaryDir(tempdirpath, filesetpath);
+               file = PathNameCreateTemporaryFile(path, true);
+       }
+
+       return file;
+}
+
+/*
+ * Open a file that was created with SharedFileSetCreate(), possibly in
+ * another backend.
+ */
+File
+SharedFileSetOpen(SharedFileSet *fileset, const char *name)
+{
+       char            path[MAXPGPATH];
+       File            file;
+
+       SharedFilePath(path, fileset, name);
+       file = PathNameOpenTemporaryFile(path);
+
+       return file;
+}
+
+/*
+ * Delete a file that was created with PathNameCreateShared().
+ * Return true if the file existed, false if didn't.
+ */
+bool
+SharedFileSetDelete(SharedFileSet *fileset, const char *name,
+                                       bool error_on_failure)
+{
+       char            path[MAXPGPATH];
+
+       SharedFilePath(path, fileset, name);
+
+       return PathNameDeleteTemporaryFile(path, error_on_failure);
+}
+
+/*
+ * Delete all files in the set.
+ */
+void
+SharedFileSetDeleteAll(SharedFileSet *fileset)
+{
+       char            dirpath[MAXPGPATH];
+       int                     i;
+
+       /*
+        * Delete the directory we created in each tablespace.  Doesn't fail
+        * because we use this in error cleanup paths, but can generate LOG
+        * message on IO error.
+        */
+       for (i = 0; i < fileset->ntablespaces; ++i)
+       {
+               SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]);
+               PathNameDeleteTemporaryDir(dirpath);
+       }
+}
+
+/*
+ * Callback function that will be invoked when this backend detaches from a
+ * DSM segment holding a SharedFileSet that it has created or attached to.  If
+ * we are the last to detach, then try to remove the directories and
+ * everything in them.  We can't raise an error on failures, because this runs
+ * in error cleanup paths.
+ */
+static void
+SharedFileSetOnDetach(dsm_segment *segment, Datum datum)
+{
+       bool            unlink_all = false;
+       SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum);
+
+       SpinLockAcquire(&fileset->mutex);
+       Assert(fileset->refcnt > 0);
+       if (--fileset->refcnt == 0)
+               unlink_all = true;
+       SpinLockRelease(&fileset->mutex);
+
+       /*
+        * If we are the last to detach, we delete the directory in all
+        * tablespaces.  Note that we are still actually attached for the rest of
+        * this function so we can safely access its data.
+        */
+       if (unlink_all)
+               SharedFileSetDeleteAll(fileset);
+}
+
+/*
+ * Build the path for the directory holding the files backing a SharedFileSet
+ * in a given tablespace.
+ */
+static void
+SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace)
+{
+       char            tempdirpath[MAXPGPATH];
+
+       TempTablespacePath(tempdirpath, tablespace);
+       snprintf(path, MAXPGPATH, "%s/%s%d.%u.sharedfileset",
+                        tempdirpath, PG_TEMP_FILE_PREFIX,
+                        fileset->creator_pid, fileset->number);
+}
+
+/*
+ * Sorting hat to determine which tablespace a given shared temporary file
+ * belongs in.
+ */
+static Oid
+ChooseTablespace(const SharedFileSet *fileset, const char *name)
+{
+       uint32          hash = hash_any((const unsigned char *) name, strlen(name));
+
+       return fileset->tablespaces[hash % fileset->ntablespaces];
+}
+
+/*
+ * Compute the full path of a file in a SharedFileSet.
+ */
+static void
+SharedFilePath(char *path, SharedFileSet *fileset, const char *name)
+{
+       char            dirpath[MAXPGPATH];
+
+       SharedFileSetPath(dirpath, fileset, ChooseTablespace(fileset, name));
+       snprintf(path, MAXPGPATH, "%s/%s", dirpath, name);
+}
index 640908717d9258cd4849f442b50bde063d48fb54..c3d7a61b64c92d3a5f778728d6464760e3572080 100644 (file)
@@ -26,6 +26,8 @@
 #ifndef BUFFILE_H
 #define BUFFILE_H
 
+#include "storage/sharedfileset.h"
+
 /* BufFile is an opaque type whose details are not known outside buffile.c. */
 
 typedef struct BufFile BufFile;
@@ -42,4 +44,9 @@ extern int    BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
 extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
 extern int     BufFileSeekBlock(BufFile *file, long blknum);
 
+extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
+extern void BufFileExportShared(BufFile *file);
+extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name);
+extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name);
+
 #endif                                                 /* BUFFILE_H */
index 6ea26e63b847352613fb68703ec350199dffc57d..982928150927f2f66932b8f774b3448620f1efd8 100644 (file)
@@ -79,6 +79,14 @@ extern int   FileGetRawDesc(File file);
 extern int     FileGetRawFlags(File file);
 extern mode_t FileGetRawMode(File file);
 
+/* Operations used for sharing named temporary files */
+extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
+extern File PathNameOpenTemporaryFile(const char *name);
+extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure);
+extern void PathNameCreateTemporaryDir(const char *base, const char *name);
+extern void PathNameDeleteTemporaryDir(const char *name);
+extern void TempTablespacePath(char *path, Oid tablespace);
+
 /* Operations that allow use of regular stdio --- USE WITH CAUTION */
 extern FILE *AllocateFile(const char *name, const char *mode);
 extern int     FreeFile(FILE *file);
@@ -107,6 +115,7 @@ extern void set_max_safe_fds(void);
 extern void closeAllVfds(void);
 extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces);
 extern bool TempTablespacesAreSet(void);
+extern int     GetTempTablespaces(Oid *tableSpaces, int numSpaces);
 extern Oid     GetNextTempTableSpace(void);
 extern void AtEOXact_Files(void);
 extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
@@ -124,7 +133,7 @@ extern int  durable_unlink(const char *fname, int loglevel);
 extern int     durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
 extern void SyncDataDirectory(void);
 
-/* Filename components for OpenTemporaryFile */
+/* Filename components */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
 #define PG_TEMP_FILE_PREFIX "pgsql_tmp"
 
diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h
new file mode 100644 (file)
index 0000000..20651bb
--- /dev/null
@@ -0,0 +1,45 @@
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.h
+ *       Shared temporary file management.
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sharedfilespace.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SHAREDFILESET_H
+#define SHAREDFILESET_H
+
+#include "storage/dsm.h"
+#include "storage/fd.h"
+#include "storage/spin.h"
+
+/*
+ * A set of temporary files that can be shared by multiple backends.
+ */
+typedef struct SharedFileSet
+{
+       pid_t           creator_pid;    /* PID of the creating process */
+       uint32          number;                 /* per-PID identifier */
+       slock_t         mutex;                  /* mutex protecting the reference count */
+       int                     refcnt;                 /* number of attached backends */
+       int                     ntablespaces;   /* number of tablespaces to use */
+       Oid                     tablespaces[8]; /* OIDs of tablespaces to use. Assumes that
+                                                                * it's rare that there more than temp
+                                                                * tablespaces. */
+} SharedFileSet;
+
+extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg);
+extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg);
+extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name);
+extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name);
+extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name,
+                                       bool error_on_failure);
+extern void SharedFileSetDeleteAll(SharedFileSet *fileset);
+
+#endif
index 3e84720038610ddb1d4a804768082a7224342953..72eb9fd390cd07f7130cc1ae2e50f21d5ce5c316 100644 (file)
@@ -2026,6 +2026,7 @@ SharedBitmapState
 SharedDependencyObjectType
 SharedDependencyType
 SharedExecutorInstrumentation
+SharedFileSet
 SharedInvalCatalogMsg
 SharedInvalCatcacheMsg
 SharedInvalRelcacheMsg