]> granicus.if.org Git - postgresql/commitdiff
Split 'BufFile' routines out of fd.c into a new module, buffile.c. Extend
authorTom Lane <tgl@sss.pgh.pa.us>
Wed, 13 Oct 1999 15:02:32 +0000 (15:02 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Wed, 13 Oct 1999 15:02:32 +0000 (15:02 +0000)
BufFile so that it handles multi-segment temporary files transparently.
This allows sorts and hashes to work with data exceeding 2Gig (or whatever
the local limit on file size is).  Change psort.c to use relative seeks
instead of absolute seeks for backwards scanning, so that it won't fail
when the data volume exceeds 2Gig.

12 files changed:
src/backend/executor/nodeHash.c
src/backend/executor/nodeHashjoin.c
src/backend/storage/file/Makefile
src/backend/storage/file/buffile.c [new file with mode: 0644]
src/backend/storage/file/fd.c
src/backend/storage/large_object/inv_api.c
src/backend/utils/sort/psort.c
src/include/executor/hashjoin.h
src/include/executor/nodeHashjoin.h
src/include/storage/buffile.h [new file with mode: 0644]
src/include/storage/fd.h
src/include/utils/psort.h

index 214cc7f06964dbf8c3af866a5b27949bdb37663e..c6295acf7237a23947202302a1298ce135065f2c 100644 (file)
@@ -6,7 +6,7 @@
  * Copyright (c) 1994, Regents of the University of California
  *
  *
- *     $Id: nodeHash.c,v 1.38 1999/07/17 20:16:58 momjian Exp $
+ *     $Id: nodeHash.c,v 1.39 1999/10/13 15:02:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -75,12 +75,7 @@ ExecHash(Hash *node)
                 * ----------------
                 */
                for (i = 0; i < nbatch; i++)
-               {
-                       File            tfile = OpenTemporaryFile();
-
-                       Assert(tfile >= 0);
-                       hashtable->innerBatchFile[i] = BufFileCreate(tfile);
-               }
+                       hashtable->innerBatchFile[i] = BufFileCreateTemp();
        }
 
        /* ----------------
index 439b8634cda1eb81a772129394634903f84cd729..ffda9723182a7b04594b223f642cd2c497f3911f 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.26 1999/07/17 20:16:58 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.27 1999/10/13 15:02:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -129,12 +129,7 @@ ExecHashJoin(HashJoin *node)
                 * ----------------
                 */
                for (i = 0; i < hashtable->nbatch; i++)
-               {
-                       File            tfile = OpenTemporaryFile();
-
-                       Assert(tfile >= 0);
-                       hashtable->outerBatchFile[i] = BufFileCreate(tfile);
-               }
+                       hashtable->outerBatchFile[i] = BufFileCreateTemp();
        }
        else if (hashtable == NULL)
                return NULL;
@@ -551,13 +546,12 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
         * Rewind inner and outer batch files for this batch, so that we can
         * start reading them.
         */
-       if (BufFileSeek(hashtable->outerBatchFile[newbatch - 1], 0L,
-                                       SEEK_SET) != 0L)
+       if (BufFileSeek(hashtable->outerBatchFile[newbatch - 1], 0, 0L, SEEK_SET))
                elog(ERROR, "Failed to rewind hash temp file");
 
        innerFile = hashtable->innerBatchFile[newbatch - 1];
 
-       if (BufFileSeek(innerFile, 0L, SEEK_SET) != 0L)
+       if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
                elog(ERROR, "Failed to rewind hash temp file");
 
        /*
index 9f321ff57fb6f8d0d14372fba92af17532436a5b..766a0c1d1c39763c2e15379a8779605fd88b654b 100644 (file)
@@ -4,7 +4,7 @@
 #    Makefile for storage/file
 #
 # IDENTIFICATION
-#    $Header: /cvsroot/pgsql/src/backend/storage/file/Makefile,v 1.5 1998/04/06 00:25:05 momjian Exp $
+#    $Header: /cvsroot/pgsql/src/backend/storage/file/Makefile,v 1.6 1999/10/13 15:02:29 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -13,7 +13,7 @@ include ../../../Makefile.global
 
 CFLAGS += -I../..
 
-OBJS = fd.o
+OBJS = fd.o buffile.o
 
 all: SUBSYS.o
 
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
new file mode 100644 (file)
index 0000000..cd7da90
--- /dev/null
@@ -0,0 +1,556 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffile.c
+ *       Management of large buffered files, primarily temporary files.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       $Header: /cvsroot/pgsql/src/backend/storage/file/buffile.c,v 1.1 1999/10/13 15:02:29 tgl Exp $
+ *
+ * NOTES:
+ *
+ * BufFiles provide a very incomplete emulation of stdio atop virtual Files
+ * (as managed by fd.c).  Currently, we only support the buffered-I/O
+ * aspect of stdio: a read or write of the low-level File occurs only
+ * when the buffer is filled or emptied.  This is an even bigger win
+ * for virtual Files than for ordinary kernel files, since reducing the
+ * frequency with which a virtual File is touched reduces "thrashing"
+ * of opening/closing file descriptors.
+ *
+ * Note that BufFile structs are allocated with palloc(), and therefore
+ * will go away automatically at transaction end.  If the underlying
+ * virtual File is made with OpenTemporaryFile, then all resources for
+ * the file are certain to be cleaned up even if processing is aborted
+ * by elog(ERROR).  To avoid confusion, the caller should take care that
+ * all calls for a single BufFile are made in the same palloc context.
+ *
+ * BufFile also supports temporary files that exceed the OS file size limit
+ * (by opening multiple fd.c temporary files).  This is an essential feature
+ * for sorts and hashjoins on large amounts of data.  It is possible to have
+ * more than one BufFile reading/writing the same temp file, although the
+ * caller is responsible for avoiding ill effects from buffer overlap when
+ * this is done.
+ *-------------------------------------------------------------------------
+ */
+
+#include <errno.h>
+
+#include "postgres.h"
+
+#include "storage/buffile.h"
+
+/*
+ * The maximum safe file size is presumed to be RELSEG_SIZE * BLCKSZ.
+ * Note we adhere to this limit whether or not LET_OS_MANAGE_FILESIZE
+ * is defined, although md.c ignores it when that symbol is defined.
+ */
+#define MAX_PHYSICAL_FILESIZE  (RELSEG_SIZE * BLCKSZ)
+
+/*
+ * To handle multiple BufFiles on a single logical temp file, we use this
+ * data structure representing a logical file (which can be made up of
+ * multiple physical files to get around the OS file size limit).
+ *
+ * A LogicalFile is shared by reference count: BufFileReaccess increments
+ * refCount, and BufFileClose closes every component file and frees the
+ * struct once the count drops to zero.
+ */
+typedef struct LogicalFile
+{
+       int                     refCount;               /* number of BufFiles using me */
+       bool            isTemp;                 /* can only add files if this is TRUE */
+       int                     numFiles;               /* number of physical files in set */
+       /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
+
+       File       *files;                      /* palloc'd array with numFiles entries */
+       long       *offsets;            /* palloc'd array with numFiles entries */
+       /* offsets[i] is the current seek position of files[i].  We use this
+        * to avoid making redundant FileSeek calls.  It is updated after
+        * every successful FileSeek, FileRead and FileWrite on files[i].
+        */
+} LogicalFile;
+
+/*
+ * A single file buffer looks like this.
+ *
+ * curOffset is declared "long" because it is compared against the "long"
+ * entries of LogicalFile.offsets[], handed to FileSeek(), and combined
+ * with the "long" offsets taken and returned by BufFileSeek/BufFileTell.
+ * A non-temp file attached with BufFileCreate is not split into segments,
+ * so its offset is not bounded by MAX_PHYSICAL_FILESIZE and must not be
+ * squeezed into an "int".
+ */
+struct BufFile
+{
+       LogicalFile *logFile;           /* the underlying LogicalFile */
+       bool            dirty;                  /* does buffer need to be written? */
+       /*
+        * "current pos" is position of start of buffer within LogicalFile.
+        * Position as seen by user of BufFile is (curFile, curOffset + pos).
+        */
+       int                     curFile;                /* file index (0..n) part of current pos */
+       long            curOffset;              /* offset part of current pos */
+       int                     pos;                    /* next read/write position in buffer */
+       int                     nbytes;                 /* total # of valid bytes in buffer */
+       char            buffer[BLCKSZ];
+};
+
+static LogicalFile *makeLogicalFile(File firstfile);
+static void extendLogicalFile(LogicalFile *file);
+static void deleteLogicalFile(LogicalFile *file);
+static void BufFileLoadBuffer(BufFile *file);
+static void BufFileDumpBuffer(BufFile *file);
+static int     BufFileFlush(BufFile *file);
+
+
+/*
+ * Build a LogicalFile wrapping a single, already-open component file.
+ *
+ * The result has refCount 1 and isTemp false; a caller that is creating a
+ * temp file must set isTemp itself afterwards.
+ */
+static LogicalFile *
+makeLogicalFile(File firstfile)
+{
+       LogicalFile *lfile;
+
+       lfile = (LogicalFile *) palloc(sizeof(LogicalFile));
+       lfile->files = (File *) palloc(sizeof(File));
+       lfile->offsets = (long *) palloc(sizeof(long));
+
+       /* exactly one component so far, positioned at its start */
+       lfile->numFiles = 1;
+       lfile->files[0] = firstfile;
+       lfile->offsets[0] = 0L;
+
+       /* one BufFile refers to us; not marked as a temp file */
+       lfile->refCount = 1;
+       lfile->isTemp = false;
+
+       return lfile;
+}
+
+/*
+ * Append one more component temp file to a (temporary) LogicalFile.
+ *
+ * Both parallel arrays grow by one slot; the new component starts out
+ * with a recorded seek position of zero.
+ */
+static void
+extendLogicalFile(LogicalFile *file)
+{
+       int                     newcount;
+       File            newfile;
+
+       Assert(file->isTemp);
+
+       newfile = OpenTemporaryFile();
+       Assert(newfile >= 0);
+
+       newcount = file->numFiles + 1;
+       file->files = (File *) repalloc(file->files, newcount * sizeof(File));
+       file->offsets = (long *) repalloc(file->offsets, newcount * sizeof(long));
+
+       /* the new component occupies the last slot */
+       file->files[newcount - 1] = newfile;
+       file->offsets[newcount - 1] = 0L;
+       file->numFiles = newcount;
+}
+
+/*
+ * Tear down a LogicalFile whose refCount has reached zero: FileClose each
+ * component file, then release the arrays and the struct itself.
+ */
+static void
+deleteLogicalFile(LogicalFile *file)
+{
+       int                     segno = 0;
+
+       while (segno < file->numFiles)
+       {
+               FileClose(file->files[segno]);
+               segno++;
+       }
+
+       pfree(file->files);
+       pfree(file->offsets);
+       pfree(file);
+}
+
+/*
+ * Create a BufFile backed by a brand-new temporary file.
+ *
+ * The logical file is marked isTemp, so it grows into additional
+ * temporary files once more than MAX_PHYSICAL_FILESIZE bytes have been
+ * written to it.
+ */
+BufFile *
+BufFileCreateTemp(void)
+{
+       BufFile    *result;
+       File            tempfile;
+
+       result = (BufFile *) palloc(sizeof(BufFile));
+
+       tempfile = OpenTemporaryFile();
+       Assert(tempfile >= 0);
+
+       result->logFile = makeLogicalFile(tempfile);
+       result->logFile->isTemp = true;
+
+       /* start with an empty, clean buffer at the beginning of the file */
+       result->curFile = 0;
+       result->curOffset = 0L;
+       result->pos = 0;
+       result->nbytes = 0;
+       result->dirty = false;
+
+       return result;
+}
+
+/*
+ * Wrap an already-opened virtual File in a new BufFile.
+ *
+ * This plays the role of fdopen() in stdio, and is currently the only way
+ * to put a BufFile on top of a non-temporary file.  The logical file keeps
+ * isTemp false, so a BufFile made this way can NOT spread over multiple
+ * physical files.
+ */
+BufFile *
+BufFileCreate(File file)
+{
+       BufFile    *result;
+
+       result = (BufFile *) palloc(sizeof(BufFile));
+       result->logFile = makeLogicalFile(file);
+
+       /* start with an empty, clean buffer at the beginning of the file */
+       result->curFile = 0;
+       result->curOffset = 0L;
+       result->pos = 0;
+       result->nbytes = 0;
+       result->dirty = false;
+
+       return result;
+}
+
+/*
+ * Open a second BufFile onto the logical file already used by "file".
+ *
+ * The new BufFile has its own buffer and its own position (initially the
+ * start of the file) but shares the LogicalFile, whose refCount is bumped.
+ * This gives multiple independent read/write positions in one temporary
+ * file.  Keeping the buffers of the different BufFiles from overlapping
+ * is the caller's job!  (Caller may assume that buffer size is BLCKSZ...)
+ */
+BufFile *
+BufFileReaccess(BufFile *file)
+{
+       LogicalFile *shared = file->logFile;
+       BufFile    *clone;
+
+       clone = (BufFile *) palloc(sizeof(BufFile));
+
+       shared->refCount++;
+       clone->logFile = shared;
+
+       /* start with an empty, clean buffer at the beginning of the file */
+       clone->curFile = 0;
+       clone->curOffset = 0L;
+       clone->pos = 0;
+       clone->nbytes = 0;
+       clone->dirty = false;
+
+       return clone;
+}
+
+/*
+ * Close a BufFile
+ *
+ * Pending buffered data is flushed first.  As with fclose(), the
+ * underlying File(s) are closed too, but only when this was the last
+ * BufFile referencing the shared LogicalFile.
+ */
+void
+BufFileClose(BufFile *file)
+{
+       LogicalFile *lfile = file->logFile;
+
+       /* push out anything still sitting in the buffer */
+       BufFileFlush(file);
+
+       /* drop our reference; last one out closes the physical file(s) */
+       lfile->refCount--;
+       if (lfile->refCount <= 0)
+               deleteLogicalFile(lfile);
+
+       /* finally give back the BufFile (and its buffer) */
+       pfree(file);
+}
+
+/* BufFileLoadBuffer
+ *
+ * Load some data into buffer, if possible, starting from curOffset.
+ * At call, must have dirty = false, pos and nbytes = 0.
+ * On exit, nbytes is number of bytes loaded.
+ *
+ * Failure is reported only through nbytes: if the physical seek fails we
+ * return without touching nbytes (so it keeps the 0 required at call), and
+ * a negative FileRead result is clamped to 0.  The physical position
+ * cached in the LogicalFile's offsets[] is advanced by the number of bytes
+ * actually read.
+ */
+static void
+BufFileLoadBuffer(BufFile *file)
+{
+       LogicalFile *lfile = file->logFile;
+       File    thisfile;
+
+       /*
+        * Advance to next component file if necessary and possible.
+        *
+        * This path can only be taken if there is more than one component,
+        * so it won't interfere with reading a non-temp file that is over
+        * MAX_PHYSICAL_FILESIZE.
+        */
+       if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
+               file->curFile+1 < lfile->numFiles)
+       {
+               file->curFile++;
+               file->curOffset = 0L;
+       }
+       thisfile = lfile->files[file->curFile];
+       /*
+        * May need to reposition physical file, if more than one BufFile
+        * is using it.
+        */
+       if (file->curOffset != lfile->offsets[file->curFile])
+       {
+               if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
+                       return;                         /* seek failed, read nothing */
+               lfile->offsets[file->curFile] = file->curOffset;
+       }
+       file->nbytes = FileRead(thisfile, file->buffer, sizeof(file->buffer));
+       if (file->nbytes < 0)
+               file->nbytes = 0;
+       lfile->offsets[file->curFile] += file->nbytes;
+       /* we choose not to advance curOffset here */
+}
+
+/* BufFileDumpBuffer
+ *
+ * Dump buffer contents starting at curOffset.
+ * At call, should have dirty = true, nbytes > 0.
+ * On exit, dirty is cleared if successful write, and curOffset is advanced.
+ *
+ * The buffer may have to go out in several pieces, either because it
+ * straddles a component-file boundary of a temp file or because FileWrite
+ * accepted fewer bytes than requested.  "wpos" counts the bytes already
+ * written, and each FileWrite starts at buffer + wpos so that a later
+ * piece never re-sends the beginning of the buffer.
+ *
+ * On a seek or write failure we return early with dirty still set; that is
+ * how callers (BufFileFlush, BufFileWrite) detect the error.
+ */
+static void
+BufFileDumpBuffer(BufFile *file)
+{
+       LogicalFile *lfile = file->logFile;
+       int                     wpos = 0;
+       int                     bytestowrite;
+       File            thisfile;
+
+       /*
+        * Unlike BufFileLoadBuffer, we must dump the whole buffer even if
+        * it crosses a component-file boundary; so we need a loop.
+        */
+       while (wpos < file->nbytes)
+       {
+               /*
+                * Advance to next component file if necessary and possible.
+                */
+               if (file->curOffset >= MAX_PHYSICAL_FILESIZE && lfile->isTemp)
+               {
+                       while (file->curFile+1 >= lfile->numFiles)
+                               extendLogicalFile(lfile);
+                       file->curFile++;
+                       file->curOffset = 0L;
+               }
+               /*
+                * Enforce per-file size limit only for temp files, else just try
+                * to write as much as asked...
+                */
+               bytestowrite = file->nbytes - wpos;
+               if (lfile->isTemp)
+               {
+                       long    availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
+
+                       if ((long) bytestowrite > availbytes)
+                               bytestowrite = (int) availbytes;
+               }
+               thisfile = lfile->files[file->curFile];
+               /*
+                * May need to reposition physical file, if more than one BufFile
+                * is using it.
+                */
+               if (file->curOffset != lfile->offsets[file->curFile])
+               {
+                       if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
+                               return;                 /* seek failed, give up */
+                       lfile->offsets[file->curFile] = file->curOffset;
+               }
+               /* write the not-yet-written tail of the buffer, not its start */
+               bytestowrite = FileWrite(thisfile, file->buffer + wpos, bytestowrite);
+               if (bytestowrite <= 0)
+                       return;                         /* failed to write */
+               lfile->offsets[file->curFile] += bytestowrite;
+               file->curOffset += bytestowrite;
+               wpos += bytestowrite;
+       }
+       file->dirty = false;
+       /*
+        * At this point, curOffset has been advanced to the end of the buffer,
+        * ie, its original value + nbytes.  We need to make it point to the
+        * logical file position, ie, original value + pos, in case that is less
+        * (as could happen due to a small backwards seek in a dirty buffer!)
+        */
+       file->curOffset -= (file->nbytes - file->pos);
+       if (file->curOffset < 0)        /* handle possible segment crossing */
+       {
+               file->curFile--;
+               Assert(file->curFile >= 0);
+               file->curOffset += MAX_PHYSICAL_FILESIZE;
+       }
+       /* Now we can set the buffer empty without changing the logical position */
+       file->pos = 0;
+       file->nbytes = 0;
+}
+
+/* BufFileRead
+ *
+ * Like fread() except we assume 1-byte element size.
+ *
+ * Copies up to "size" bytes into "ptr" and returns the number of bytes
+ * actually copied.  A short count (possibly 0) means that no more data
+ * could be loaded into the buffer, or that pending dirty data could not
+ * be flushed before reading began.
+ */
+size_t
+BufFileRead(BufFile *file, void *ptr, size_t size)
+{
+       size_t          nread = 0;
+       size_t          nthistime;
+
+       if (file->dirty)
+       {
+               if (BufFileFlush(file) != 0)
+                       return 0;                       /* could not flush... */
+               Assert(! file->dirty);
+       }
+
+       while (size > 0)
+       {
+               if (file->pos >= file->nbytes)
+               {
+                       /* Try to load more data into buffer. */
+                       file->curOffset += file->pos;
+                       file->pos = 0;
+                       file->nbytes = 0;
+                       BufFileLoadBuffer(file);
+                       if (file->nbytes <= 0)
+                               break;                  /* no more data available */
+               }
+
+               nthistime = file->nbytes - file->pos;
+               if (nthistime > size)
+                       nthistime = size;
+               Assert(nthistime > 0);
+
+               memcpy(ptr, file->buffer + file->pos, nthistime);
+
+               file->pos += nthistime;
+               ptr = (void *) ((char *) ptr + nthistime);
+               size -= nthistime;
+               nread += nthistime;
+       }
+
+       return nread;
+}
+
+/* BufFileWrite
+ *
+ * Like fwrite() except we assume 1-byte element size.
+ *
+ * Copies up to "size" bytes from "ptr" into the buffer, dumping the buffer
+ * to the underlying file whenever it is full and dirty.  Returns the
+ * number of bytes accepted, which is less than "size" only if a full
+ * buffer could not be dumped.  Bytes accepted into the buffer are merely
+ * marked dirty here; they are written out by a later dump or flush.
+ */
+size_t
+BufFileWrite(BufFile *file, void *ptr, size_t size)
+{
+       size_t          nwritten = 0;
+       size_t          nthistime;
+
+       while (size > 0)
+       {
+               if (file->pos >= BLCKSZ)
+               {
+                       /* Buffer full, dump it out */
+                       if (file->dirty)
+                       {
+                               BufFileDumpBuffer(file);
+                               if (file->dirty)
+                                       break;          /* I/O error */
+                       }
+                       else
+                       {
+                               /* Hmm, went directly from reading to writing? */
+                               file->curOffset += file->pos;
+                               file->pos = 0;
+                               file->nbytes = 0;
+                       }
+               }
+
+               nthistime = BLCKSZ - file->pos;
+               if (nthistime > size)
+                       nthistime = size;
+               Assert(nthistime > 0);
+
+               memcpy(file->buffer + file->pos, ptr, nthistime);
+
+               file->dirty = true;
+               file->pos += nthistime;
+               if (file->nbytes < file->pos)
+                       file->nbytes = file->pos;
+               ptr = (void *) ((char *) ptr + nthistime);
+               size -= nthistime;
+               nwritten += nthistime;
+       }
+
+       return nwritten;
+}
+
+/* BufFileFlush
+ *
+ * Like fflush(): write out the buffer if it holds unwritten data.
+ * Returns 0 on success (or if there was nothing to write), EOF if the
+ * buffer is still dirty after the dump attempt.
+ */
+static int
+BufFileFlush(BufFile *file)
+{
+       /* a clean buffer needs no work */
+       if (!file->dirty)
+               return 0;
+
+       BufFileDumpBuffer(file);
+
+       /* BufFileDumpBuffer clears dirty only when everything was written */
+       return file->dirty ? EOF : 0;
+}
+
+/* BufFileSeek
+ *
+ * Like fseek().  Result is 0 if OK, EOF if not.
+ *
+ * The target is given as a (fileno, offset) pair.  With SEEK_SET both
+ * parts are used as-is; fileno must name an existing component file and
+ * offset must not be negative.  With SEEK_CUR only the signed offset is
+ * used, relative to the current logical position.  SEEK_END is not
+ * supported, and any other whence value raises elog(ERROR).
+ *
+ * The requested position is then normalized across component files: a
+ * negative offset steps back to earlier files, and (for temp files only)
+ * an offset beyond MAX_PHYSICAL_FILESIZE steps forward to later files.
+ * Stepping outside the existing set of files fails with EOF.
+ */
+int
+BufFileSeek(BufFile *file, int fileno, long offset, int whence)
+{
+       int newFile;
+       long newOffset;
+       switch (whence)
+       {
+               case SEEK_SET:
+                       if (fileno < 0 || fileno >= file->logFile->numFiles ||
+                               offset < 0)
+                               return EOF;
+                       newFile = fileno;
+                       newOffset = offset;
+                       break;
+               case SEEK_CUR:
+                       /*
+                        * Relative seek considers only the signed offset, ignoring fileno.
+                        * Note that large offsets (> 1 gig) risk overflow.
+                        */
+                       newFile = file->curFile;
+                       newOffset = (file->curOffset + file->pos) + offset;
+                       break;
+#ifdef NOT_USED
+               case SEEK_END:
+                       /* could be implemented, not needed currently */
+                       break;
+#endif
+               default:
+                       elog(ERROR, "BufFileSeek: invalid whence: %d", whence);
+                       return EOF;
+       }
+       while (newOffset < 0)
+       {
+               if (--newFile < 0)
+                       return EOF;
+               newOffset += MAX_PHYSICAL_FILESIZE;
+       }
+       if (file->logFile->isTemp)
+       {
+               while (newOffset > MAX_PHYSICAL_FILESIZE)
+               {
+                       if (++newFile >= file->logFile->numFiles)
+                               return EOF;
+                       newOffset -= MAX_PHYSICAL_FILESIZE;
+               }
+       }
+       if (newFile == file->curFile &&
+               newOffset >= file->curOffset &&
+               newOffset <= file->curOffset + file->nbytes)
+       {
+               /*
+                * Seek is to a point within existing buffer; we can just adjust
+                * pos-within-buffer, without flushing buffer.  Note this is OK
+                * whether reading or writing, but buffer remains dirty if we
+                * were writing.
+                */
+               file->pos = (int) (newOffset - file->curOffset);
+               return 0;
+       }
+       /* Otherwise, must reposition buffer, so flush any dirty data */
+       if (BufFileFlush(file) != 0)
+               return EOF;
+       file->curFile = newFile;
+       file->curOffset = newOffset;
+       file->pos = 0;
+       file->nbytes = 0;
+       return 0;
+}
+
+/* BufFileTell
+ *
+ * Like ftell(), except that the current logical position is reported as a
+ * (file index, offset) pair through *fileno and *offset, in the same form
+ * that BufFileSeek() takes.  The offset includes the position within the
+ * buffer, so no flush is needed to obtain it.
+ */
+void
+BufFileTell(BufFile *file, int *fileno, long *offset)
+{
+       *fileno = file->curFile;
+       *offset = file->curOffset + file->pos;
+}
index 4cdb638819e81b2be731b4d61b1da56b0dc241d8..2fce82ecfd6aaaff658ffd89392fdcaaa9af3234 100644 (file)
@@ -6,7 +6,7 @@
  * Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Id: fd.c,v 1.48 1999/09/27 15:47:49 vadim Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.49 1999/10/13 15:02:29 tgl Exp $
  *
  * NOTES:
  *
@@ -49,7 +49,6 @@
 #include "miscadmin.h"
 #include "storage/fd.h"
 
-bool   ReleaseDataFile(void);
 /*
  * Problem: Postgres does a system(ld...) to do dynamic loading.
  * This will open several extra files in addition to those used by
@@ -188,7 +187,6 @@ static int  FileAccess(File file);
 static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
 static char *filepath(char *filename);
 static long pg_nofile(void);
-static int     BufFileFlush(BufFile *file);
 
 /*
  * pg_fsync --- same as fsync except does nothing if -F switch was given
@@ -411,6 +409,9 @@ ReleaseLruFile()
        LruDelete(VfdCache[0].lruMoreRecently);
 }
 
+/*
+ * Force one kernel file descriptor to be released (temporarily).
+ */
 bool
 ReleaseDataFile()
 {
@@ -506,8 +507,11 @@ FreeVfd(File file)
 
 /* filepath()
  * Convert given pathname to absolute.
- * (Is this actually necessary, considering that we should be cd'd
- * into the database directory??)
+ *
+ * (Generally, this isn't actually necessary, considering that we
+ * should be cd'd into the database directory.  Presently it is only
+ * necessary to do it in "bootstrap" mode.  Maybe we should change
+ * bootstrap mode to do the cd, and save a few cycles/bytes here.)
  */
 static char *
 filepath(char *filename)
@@ -851,7 +855,7 @@ FileTell(File file)
 #endif
 
 int
-FileTruncate(File file, int offset)
+FileTruncate(File file, long offset)
 {
        int                     returnCode;
 
@@ -862,7 +866,7 @@ FileTruncate(File file, int offset)
 
        FileSync(file);
        FileAccess(file);
-       returnCode = ftruncate(VfdCache[file].fd, offset);
+       returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
        return returnCode;
 }
 
@@ -890,18 +894,6 @@ FileSync(File file)
        return returnCode;
 }
 
-int
-FileNameUnlink(char *filename)
-{
-       int                     retval;
-       char       *fname;
-
-       fname = filepath(filename);
-       retval = unlink(fname);
-       pfree(fname);
-       return retval;
-}
-
 /*
  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
@@ -1023,186 +1015,3 @@ AtEOXact_Files(void)
         */
        tempFileCounter = 0;
 }
-
-
-/*
- * Operations on BufFiles --- a very incomplete emulation of stdio
- * atop virtual Files. Currently, we only support the buffered-I/O
- * aspect of stdio: a read or write of the low-level File occurs only
- * when the buffer is filled or emptied.  This is an even bigger win
- * for virtual Files than ordinary kernel files, since reducing the
- * frequency with which a virtual File is touched reduces "thrashing"
- * of opening/closing file descriptors.
- *
- * Note that BufFile structs are allocated with palloc(), and therefore
- * will go away automatically at transaction end.  If the underlying
- * virtual File is made with OpenTemporaryFile, then all resources for
- * the file are certain to be cleaned up even if processing is aborted
- * by elog(ERROR).
- */
-
-struct BufFile
-{
-       File            file;                   /* the underlying virtual File */
-       bool            dirty;                  /* does buffer need to be written? */
-       int                     pos;                    /* next read/write position in buffer */
-       int                     nbytes;                 /* total # of valid bytes in buffer */
-       char            buffer[BLCKSZ];
-};
-
-
-/*
- * Create a BufFile and attach it to an (already opened) virtual File.
- *
- * This is comparable to fdopen() in stdio.
- */
-BufFile    *
-BufFileCreate(File file)
-{
-       BufFile    *bfile = (BufFile *) palloc(sizeof(BufFile));
-
-       bfile->file = file;
-       bfile->dirty = false;
-       bfile->pos = 0;
-       bfile->nbytes = 0;
-
-       return bfile;
-}
-
-/*
- * Close a BufFile
- *
- * Like fclose(), this also implicitly FileCloses the underlying File.
- */
-void
-BufFileClose(BufFile *file)
-{
-       /* flush any unwritten data */
-       BufFileFlush(file);
-       /* close the underlying (with delete if it's a temp file) */
-       FileClose(file->file);
-       /* release the buffer space */
-       pfree(file);
-}
-
-/* BufFileRead
- *
- * Like fread() except we assume 1-byte element size.
- */
-size_t
-BufFileRead(BufFile *file, void *ptr, size_t size)
-{
-       size_t          nread = 0;
-       size_t          nthistime;
-
-       if (file->dirty)
-       {
-               elog(NOTICE, "BufFileRead: should have flushed after writing");
-               BufFileFlush(file);
-       }
-
-       while (size > 0)
-       {
-               if (file->pos >= file->nbytes)
-               {
-                       /* Try to load more data into buffer */
-                       file->pos = 0;
-                       file->nbytes = FileRead(file->file, file->buffer,
-                                                                       sizeof(file->buffer));
-                       if (file->nbytes < 0)
-                               file->nbytes = 0;
-                       if (file->nbytes <= 0)
-                               break;                  /* no more data available */
-               }
-
-               nthistime = file->nbytes - file->pos;
-               if (nthistime > size)
-                       nthistime = size;
-               Assert(nthistime > 0);
-
-               memcpy(ptr, file->buffer + file->pos, nthistime);
-
-               file->pos += nthistime;
-               ptr = (void *) ((char *) ptr + nthistime);
-               size -= nthistime;
-               nread += nthistime;
-       }
-
-       return nread;
-}
-
-/* BufFileWrite
- *
- * Like fwrite() except we assume 1-byte element size.
- */
-size_t
-BufFileWrite(BufFile *file, void *ptr, size_t size)
-{
-       size_t          nwritten = 0;
-       size_t          nthistime;
-
-       while (size > 0)
-       {
-               if (file->pos >= BLCKSZ)
-               {
-                       /* Buffer full, dump it out */
-                       if (file->dirty)
-                       {
-                               if (FileWrite(file->file, file->buffer, file->nbytes) < 0)
-                                       break;          /* I/O error */
-                               file->dirty = false;
-                       }
-                       file->pos = 0;
-                       file->nbytes = 0;
-               }
-
-               nthistime = BLCKSZ - file->pos;
-               if (nthistime > size)
-                       nthistime = size;
-               Assert(nthistime > 0);
-
-               memcpy(file->buffer + file->pos, ptr, nthistime);
-
-               file->dirty = true;
-               file->pos += nthistime;
-               if (file->nbytes < file->pos)
-                       file->nbytes = file->pos;
-               ptr = (void *) ((char *) ptr + nthistime);
-               size -= nthistime;
-               nwritten += nthistime;
-       }
-
-       return nwritten;
-}
-
-/* BufFileFlush
- *
- * Like fflush()
- */
-static int
-BufFileFlush(BufFile *file)
-{
-       if (file->dirty)
-       {
-               if (FileWrite(file->file, file->buffer, file->nbytes) < 0)
-                       return EOF;
-               file->dirty = false;
-       }
-
-       return 0;
-}
-
-/* BufFileSeek
- *
- * Like fseek(), or really more like lseek() since the return value is
- * the new file offset (or -1 in case of error).
- */
-long
-BufFileSeek(BufFile *file, long offset, int whence)
-{
-       if (BufFileFlush(file) < 0)
-               return -1L;
-       file->pos = 0;
-       file->nbytes = 0;
-       return FileSeek(file->file, offset, whence);
-}
index f59e99d7fbe90fd5b0156749b01ef84c7c83a053..555768a0f3e51439276b38b17d0bc5d62fb48343 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/storage/large_object/inv_api.c,v 1.59 1999/09/18 19:07:32 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/storage/large_object/inv_api.c,v 1.60 1999/10/13 15:02:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -300,6 +300,21 @@ inv_destroy(Oid lobjId)
  *             end of relations.  Once clustering works, we should fix this.
  */
 #ifdef NOT_USED
+
+struct pgstat
+{                                                              /* just the fields we need from stat
+                                                                * structure */
+       int                     st_ino;
+       int                     st_mode;
+       unsigned int st_size;
+       unsigned int st_sizehigh;       /* high order bits */
+/* 2^64 == 1.8 x 10^20 bytes */
+       int                     st_uid;
+       int                     st_atime_s;             /* just the seconds */
+       int                     st_mtime_s;             /* since SysV and the new BSD both have */
+       int                     st_ctime_s;             /* usec fields.. */
+};
+
 int
 inv_stat(LargeObjectDesc *obj_desc, struct pgstat * stbuf)
 {
index b1ac25280756552f089488c2d5778cc475bb45ff..14db10c119837607cdce1bfa043faf793697bbb9 100644 (file)
@@ -4,7 +4,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- *       $Id: psort.c,v 1.56 1999/07/17 20:18:16 momjian Exp $
+ *       $Id: psort.c,v 1.57 1999/10/13 15:02:31 tgl Exp $
  *
  * NOTES
  *             Sorts the first relation into the second relation.
@@ -142,7 +142,8 @@ psort_begin(Sort *node, int nkeys, ScanKey key)
                PS(node)->psort_grab_file = mergeruns(node);
 
        PS(node)->psort_current = 0;
-       PS(node)->psort_saved = 0;
+       PS(node)->psort_saved_fileno = 0;
+       PS(node)->psort_saved = 0L;
 
        return true;
 }
@@ -227,7 +228,7 @@ inittapes(Sort *node)
 
 #define SETTUPLEN(TUP, LEN)            ((TUP)->t_len = (LEN) - HEAPTUPLESIZE)
 
-#define rewind(FP)             BufFileSeek(FP, 0L, SEEK_SET)
+#define rewind(FP)             BufFileSeek(FP, 0, 0L, SEEK_SET)
 
  /*
   * USEMEM                     - record use of memory FREEMEM             - record
@@ -764,9 +765,6 @@ psort_grabtuple(Sort *node, bool *should_free)
                                tup = ALLOCTUP(tuplen);
                                SETTUPLEN(tup, tuplen);
                                GETTUP(node, tup, tuplen, PS(node)->psort_grab_file);
-
-                               /* Update current merged sort file position */
-                               PS(node)->psort_current += tuplen + sizeof(tlendummy);
                                return tup;
                        }
                        else
@@ -775,70 +773,67 @@ psort_grabtuple(Sort *node, bool *should_free)
                                return NULL;
                        }
                }
-               /* Backward */
-               if (PS(node)->psort_current <= sizeof(tlendummy))
-                       return NULL;
-
-               /*
+               /* Backward.
+                *
                 * if all tuples are fetched already then we return last tuple,
                 * else - tuple before last returned.
                 */
                if (PS(node)->all_fetched)
                {
-
                        /*
-                        * psort_current is pointing to the zero tuplen at the end of
-                        * file
+                        * Assume seek position is pointing just past the zero tuplen
+                        * at the end of file; back up and fetch last tuple's ending
+                        * length word.  If seek fails we must have a completely empty
+                        * file.
                         */
-                       BufFileSeek(PS(node)->psort_grab_file,
-                                 PS(node)->psort_current - sizeof(tlendummy), SEEK_SET);
+                       if (BufFileSeek(PS(node)->psort_grab_file, 0,
+                                                       - (long) (2 * sizeof(tlendummy)), SEEK_CUR))
+                               return NULL;
                        GETLEN(tuplen, PS(node)->psort_grab_file);
-                       if (PS(node)->psort_current < tuplen)
-                               elog(ERROR, "psort_grabtuple: too big last tuple len in backward scan");
                        PS(node)->all_fetched = false;
                }
                else
                {
-                       /* move to position of end tlen of prev tuple */
-                       PS(node)->psort_current -= sizeof(tlendummy);
-                       BufFileSeek(PS(node)->psort_grab_file,
-                                               PS(node)->psort_current, SEEK_SET);
-                       GETLEN(tuplen, PS(node)->psort_grab_file);      /* get tlen of prev
-                                                                                                                * tuple */
+                       /*
+                        * Back up and fetch prev tuple's ending length word.
+                        * If seek fails, assume we are at start of file.
+                        */
+                       if (BufFileSeek(PS(node)->psort_grab_file, 0,
+                                                       - (long) sizeof(tlendummy), SEEK_CUR))
+                               return NULL;
+                       GETLEN(tuplen, PS(node)->psort_grab_file);
                        if (tuplen == 0)
                                elog(ERROR, "psort_grabtuple: tuplen is 0 in backward scan");
-                       if (PS(node)->psort_current <= tuplen + sizeof(tlendummy))
-                       {                                       /* prev tuple should be first one */
-                               if (PS(node)->psort_current != tuplen)
-                                       elog(ERROR, "psort_grabtuple: first tuple expected in backward scan");
-                               PS(node)->psort_current = 0;
-                               BufFileSeek(PS(node)->psort_grab_file,
-                                                       PS(node)->psort_current, SEEK_SET);
-                               return NULL;
-                       }
-
                        /*
-                        * Get position of prev tuple. This tuple becomes current
-                        * tuple now and we have to return previous one.
+                        * Back up to get ending length word of tuple before it.
                         */
-                       PS(node)->psort_current -= tuplen;
-                       /* move to position of end tlen of prev tuple */
-                       BufFileSeek(PS(node)->psort_grab_file,
-                                 PS(node)->psort_current - sizeof(tlendummy), SEEK_SET);
+                       if (BufFileSeek(PS(node)->psort_grab_file, 0,
+                                                       - (long) (tuplen + 2*sizeof(tlendummy)), SEEK_CUR))
+                       {
+                               /* If that fails, presumably the prev tuple is the first in the file.
+                                * Back up so that it becomes next to read in forward direction
+                                * (not obviously right, but that is what in-memory case does)
+                                */
+                               if (BufFileSeek(PS(node)->psort_grab_file, 0,
+                                                               - (long) (tuplen + sizeof(tlendummy)), SEEK_CUR))
+                                       elog(ERROR, "psort_grabtuple: too big last tuple len in backward scan");
+                               return NULL;
+                       }
                        GETLEN(tuplen, PS(node)->psort_grab_file);
-                       if (PS(node)->psort_current < tuplen + sizeof(tlendummy))
-                               elog(ERROR, "psort_grabtuple: too big tuple len in backward scan");
                }
 
                /*
-                * move to prev (or last) tuple start position + sizeof(t_len)
+                * Now we have the length of the prior tuple, back up and read it.
+                * Note: GETTUP expects we are positioned after the initial length
+                * word of the tuple, so back up to that point.
                 */
-               BufFileSeek(PS(node)->psort_grab_file,
-                                       PS(node)->psort_current - tuplen, SEEK_SET);
+               if (BufFileSeek(PS(node)->psort_grab_file, 0,
+                                               - (long) tuplen, SEEK_CUR))
+                       elog(ERROR, "psort_grabtuple: too big tuple len in backward scan");
                tup = ALLOCTUP(tuplen);
                SETTUPLEN(tup, tuplen);
                GETTUP(node, tup, tuplen, PS(node)->psort_grab_file);
-               return tup;                             /* file position is equal to psort_current */
+               return tup;
        }
        else
        {
@@ -875,6 +870,8 @@ psort_grabtuple(Sort *node, bool *should_free)
 
 /*
  *             psort_markpos   - saves current position in the merged sort file
+ *
+ * XXX I suspect these need to save & restore the all_fetched flag as well!
  */
 void
 psort_markpos(Sort *node)
@@ -882,7 +879,12 @@ psort_markpos(Sort *node)
        Assert(node != (Sort *) NULL);
        Assert(PS(node) != (Psortstate *) NULL);
 
-       PS(node)->psort_saved = PS(node)->psort_current;
+       if (PS(node)->using_tape_files == true)
+               BufFileTell(PS(node)->psort_grab_file,
+                                       & PS(node)->psort_saved_fileno,
+                                       & PS(node)->psort_saved);
+       else
+               PS(node)->psort_saved = PS(node)->psort_current;
 }
 
 /*
@@ -897,8 +899,11 @@ psort_restorepos(Sort *node)
 
        if (PS(node)->using_tape_files == true)
                BufFileSeek(PS(node)->psort_grab_file,
-                                       PS(node)->psort_saved, SEEK_SET);
-       PS(node)->psort_current = PS(node)->psort_saved;
+                                       PS(node)->psort_saved_fileno,
+                                       PS(node)->psort_saved,
+                                       SEEK_SET);
+       else
+               PS(node)->psort_current = PS(node)->psort_saved;
 }
 
 /*
@@ -952,7 +957,8 @@ psort_rescan(Sort *node)
        {
                PS(node)->all_fetched = false;
                PS(node)->psort_current = 0;
-               PS(node)->psort_saved = 0;
+               PS(node)->psort_saved_fileno = 0;
+               PS(node)->psort_saved = 0L;
                if (PS(node)->using_tape_files == true)
                        rewind(PS(node)->psort_grab_file);
        }
@@ -973,11 +979,7 @@ psort_rescan(Sort *node)
 static BufFile *
 gettape()
 {
-       File            tfile;
-
-       tfile = OpenTemporaryFile();
-       Assert(tfile >= 0);
-       return BufFileCreate(tfile);
+       return BufFileCreateTemp();
 }
 
 /*
index 724a7ddd313f3854a4e7ecc1440415b6a2efc2a3..f501cd0365417d9a5f2752d8fd862952df5954d4 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: hashjoin.h,v 1.14 1999/07/15 15:21:08 momjian Exp $
+ * $Id: hashjoin.h,v 1.15 1999/10/13 15:02:26 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -14,6 +14,7 @@
 #define HASHJOIN_H
 
 #include "access/htup.h"
+#include "storage/buffile.h"
 
 /* ----------------------------------------------------------------
  *                             hash-join hash table structures
index 2061ac7bdcaf78310627ade614dc4b614b8100db..9d5390f79c690713d2f60569fb2416c2815545d4 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: nodeHashjoin.h,v 1.15 1999/07/15 15:21:12 momjian Exp $
+ * $Id: nodeHashjoin.h,v 1.16 1999/10/13 15:02:26 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -14,6 +14,7 @@
 #define NODEHASHJOIN_H
 
 #include "nodes/plannodes.h"
+#include "storage/buffile.h"
 
 extern TupleTableSlot *ExecHashJoin(HashJoin *node);
 extern bool ExecInitHashJoin(HashJoin *node, EState *estate, Plan *parent);
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
new file mode 100644 (file)
index 0000000..2416d64
--- /dev/null
@@ -0,0 +1,47 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffile.h
+ *       Management of large buffered files, primarily temporary files.
+ *
+ * The BufFile routines provide a partial replacement for stdio atop
+ * virtual file descriptors managed by fd.c.  Currently they only support
+ * buffered access to a virtual file, without any of stdio's formatting
+ * features.  That's enough for immediate needs, but the set of facilities
+ * could be expanded if necessary.
+ *
+ * BufFile also supports working with temporary files that exceed the OS
+ * file size limit and/or the largest offset representable in an int.
+ * It might be better to split that out as a separately accessible module,
+ * but currently we have no need for oversize temp files without buffered
+ * access.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: buffile.h,v 1.1 1999/10/13 15:02:32 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef BUFFILE_H
+#define BUFFILE_H
+
+#include "storage/fd.h"
+
+/* BufFile is an opaque type whose details are not known outside buffile.c. */
+
+typedef struct BufFile BufFile;
+
+/*
+ * prototypes for functions in buffile.c
+ */
+
+extern BufFile *BufFileCreateTemp(void);
+extern BufFile *BufFileCreate(File file);
+extern BufFile *BufFileReaccess(BufFile *file);
+extern void BufFileClose(BufFile *file);
+extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
+extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
+extern int BufFileSeek(BufFile *file, int fileno, long offset, int whence);
+extern void BufFileTell(BufFile *file, int *fileno, long *offset);
+
+#endif  /* BUFFILE_H */
index beb93bf699c69d9f2817592cf1a1e6803cdb2770..42d1f46579759937fc665a11e119e7224bf3924a 100644 (file)
@@ -6,10 +6,11 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: fd.h,v 1.17 1999/07/17 20:18:34 momjian Exp $
+ * $Id: fd.h,v 1.18 1999/10/13 15:02:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
+
 /*
  * calls:
  *
  * use FreeFile, not fclose, to close it.  AVOID using stdio for files
  * that you intend to hold open for any length of time, since there is
  * no way for them to share kernel file descriptors with other files.
- *
- * The BufFile routines provide a partial replacement for stdio.  Currently
- * they only support buffered access to a virtual file, without any of
- * stdio's formatting features.  That's enough for immediate needs, but
- * the set of facilities could be expanded if necessary.
  */
 #ifndef FD_H
 #define FD_H
@@ -46,25 +42,6 @@ typedef char *FileName;
 
 typedef int File;
 
-/* BufFile is an opaque type whose details are not known outside fd.c. */
-
-typedef struct BufFile BufFile;
-
-/* why is this here? fd.c doesn't want it ... */
-struct pgstat
-{                                                              /* just the fields we need from stat
-                                                                * structure */
-       int                     st_ino;
-       int                     st_mode;
-       unsigned int st_size;
-       unsigned int st_sizehigh;       /* high order bits */
-/* 2^64 == 1.8 x 10^20 bytes */
-       int                     st_uid;
-       int                     st_atime_s;             /* just the seconds */
-       int                     st_mtime_s;             /* since SysV and the new BSD both have */
-       int                     st_ctime_s;             /* usec fields.. */
-};
-
 /*
  * prototypes for functions in fd.c
  */
@@ -78,24 +55,15 @@ extern void FileUnlink(File file);
 extern int     FileRead(File file, char *buffer, int amount);
 extern int     FileWrite(File file, char *buffer, int amount);
 extern long FileSeek(File file, long offset, int whence);
-extern int     FileTruncate(File file, int offset);
+extern int     FileTruncate(File file, long offset);
 extern int     FileSync(File file);
 
 /* Operations that allow use of regular stdio --- USE WITH CAUTION */
 extern FILE *AllocateFile(char *name, char *mode);
 extern void FreeFile(FILE *);
 
-/* Operations on BufFiles --- a very incomplete emulation of stdio
- * atop virtual Files...
- */
-extern BufFile *BufFileCreate(File file);
-extern void BufFileClose(BufFile *file);
-extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
-extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
-extern long BufFileSeek(BufFile *file, long offset, int whence);
-
 /* Miscellaneous support routines */
-extern int     FileNameUnlink(char *filename);
+extern bool    ReleaseDataFile(void);
 extern void closeAllVfds(void);
 extern void AtEOXact_Files(void);
 extern int     pg_fsync(int fd);
index 0deac024154413aa0b1799295da2e18adecf76a2..9a100bad0d8b6ae5a5c38620ebe60bb0fe39e8cb 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: psort.h,v 1.21 1999/07/16 17:07:39 momjian Exp $
+ * $Id: psort.h,v 1.22 1999/10/13 15:02:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -15,7 +15,7 @@
 
 #include "access/relscan.h"
 #include "nodes/plannodes.h"
-#include "storage/fd.h"
+#include "storage/buffile.h"
 #include "utils/lselect.h"
 
 #define MAXTAPES               7               /* See Knuth Fig. 70, p273 */
@@ -57,7 +57,8 @@ typedef struct Psortstate
        struct leftist *Tuples;
 
        BufFile    *psort_grab_file;
-       long            psort_current;  /* could be file offset, or array index */
+       long            psort_current;  /* array index (only used if not tape) */
+       int                     psort_saved_fileno;     /* upper bits of psort_saved, if tape */
        long            psort_saved;    /* could be file offset, or array index */
        bool            using_tape_files;
        bool            all_fetched;    /* this is for cursors */