granicus.if.org Git - postgresql/blobdiff - src/backend/storage/smgr/md.c
Update copyright for 2019
[postgresql] / src / backend / storage / smgr / md.c
index b0b596d6d9f2266a298ed17dddbd5bebb5c9f5a7..e4501ff9bc909712c7316c0937537991f3e5734a 100644 (file)
@@ -10,7 +10,7 @@
  * It doesn't matter whether the bits are on spinning rust or some other
  * storage technology.
  *
- * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
@@ -26,8 +26,8 @@
 #include <sys/file.h>
 
 #include "miscadmin.h"
+#include "access/xlogutils.h"
 #include "access/xlog.h"
-#include "catalog/catalog.h"
 #include "pgstat.h"
 #include "portability/instr_time.h"
 #include "postmaster/bgwriter.h"
@@ -154,7 +154,7 @@ typedef struct
 
 static HTAB *pendingOpsTable = NULL;
 static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt;            /* context for the above  */
+static MemoryContext pendingOpsCxt; /* context for the above  */
 
 static CycleCtr mdsync_cycle_ctr = 0;
 static CycleCtr mdckpt_cycle_ctr = 0;
@@ -304,7 +304,7 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 
        path = relpath(reln->smgr_rnode, forkNum);
 
-       fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
+       fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
 
        if (fd < 0)
        {
@@ -317,7 +317,7 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
                 * already, even if isRedo is not set.  (See also mdopen)
                 */
                if (isRedo || IsBootstrapProcessingMode())
-                       fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
+                       fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
                if (fd < 0)
                {
                        /* be sure to report the error reported by create, not open */
@@ -430,7 +430,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
                /* truncate(2) would be easier here, but Windows hasn't got it */
                int                     fd;
 
-               fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);
+               fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
                if (fd >= 0)
                {
                        int                     save_errno;
@@ -472,7 +472,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
                                if (errno != ENOENT)
                                        ereport(WARNING,
                                                        (errcode_for_file_access(),
-                                          errmsg("could not remove file \"%s\": %m", segpath)));
+                                                        errmsg("could not remove file \"%s\": %m", segpath)));
                                break;
                        }
                }
@@ -518,26 +518,11 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
        v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
 
-       seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 
        Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-       /*
-        * Note: because caller usually obtained blocknum by calling mdnblocks,
-        * which did a seek(SEEK_END), this seek is often redundant and will be
-        * optimized away by fd.c.  It's not redundant, however, if there is a
-        * partial page at the end of the file. In that case we want to try to
-        * overwrite the partial page with a full page.  It's also not redundant
-        * if bufmgr.c had to dump another buffer of the same file to make room
-        * for the new page's buffer.
-        */
-       if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not seek to block %u in file \"%s\": %m",
-                                               blocknum, FilePathName(v->mdfd_vfd))));
-
-       if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+       if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
        {
                if (nbytes < 0)
                        ereport(ERROR,
@@ -583,7 +568,7 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 
        path = relpath(reln->smgr_rnode, forknum);
 
-       fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
+       fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
 
        if (fd < 0)
        {
@@ -594,7 +579,7 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
                 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
                 */
                if (IsBootstrapProcessingMode())
-                       fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
+                       fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
                if (fd < 0)
                {
                        if ((behavior & EXTENSION_RETURN_NULL) &&
@@ -664,12 +649,12 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
        v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
 
-       seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 
        Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
        (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
-#endif   /* USE_PREFETCH */
+#endif                                                 /* USE_PREFETCH */
 }
 
 /*
@@ -715,7 +700,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
                Assert(nflush >= 1);
                Assert(nflush <= nblocks);
 
-               seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+               seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 
                FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
 
@@ -744,17 +729,11 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
        v = _mdfd_getseg(reln, forknum, blocknum, false,
                                         EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
 
-       seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 
        Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-       if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not seek to block %u in file \"%s\": %m",
-                                               blocknum, FilePathName(v->mdfd_vfd))));
-
-       nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);
+       nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
 
        TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
                                                                           reln->smgr_rnode.node.spcNode,
@@ -820,17 +799,11 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
        v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
                                         EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
 
-       seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 
        Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-       if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not seek to block %u in file \"%s\": %m",
-                                               blocknum, FilePathName(v->mdfd_vfd))));
-
-       nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);
+       nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
 
        TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
                                                                                reln->smgr_rnode.node.spcNode,
@@ -997,9 +970,9 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
                        if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
                                ereport(ERROR,
                                                (errcode_for_file_access(),
-                                       errmsg("could not truncate file \"%s\" to %u blocks: %m",
-                                                  FilePathName(v->mdfd_vfd),
-                                                  nblocks)));
+                                                errmsg("could not truncate file \"%s\" to %u blocks: %m",
+                                                               FilePathName(v->mdfd_vfd),
+                                                               nblocks)));
                        if (!SmgrIsTemp(reln))
                                register_dirty_segment(reln, forknum, v);
                }
@@ -1039,7 +1012,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
                MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
 
                if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                        (errcode_for_file_access(),
                                         errmsg("could not fsync file \"%s\": %m",
                                                        FilePathName(v->mdfd_vfd))));
@@ -1150,10 +1123,8 @@ mdsync(void)
                 * The bitmap manipulations are slightly tricky, because we can call
                 * AbsorbFsyncRequests() inside the loop and that could result in
                 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
-                * This is okay because we unlink each bitmapset from the hashtable
-                * entry before scanning it.  That means that any incoming fsync
-                * requests will be processed now if they reach the table before we
-                * begin to scan their fork.
+                * So we detach each bitmapset first; if the fsync fails, we merge it
+                * back with any new requests that have arrived in the meantime.
                 */
                for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
                {
@@ -1163,7 +1134,8 @@ mdsync(void)
                        entry->requests[forknum] = NULL;
                        entry->canceled[forknum] = false;
 
-                       while ((segno = bms_first_member(requests)) >= 0)
+                       segno = -1;
+                       while ((segno = bms_next_member(requests, segno)) >= 0)
                        {
                                int                     failures;
 
@@ -1225,7 +1197,7 @@ mdsync(void)
 
                                        /* Attempt to open and fsync the target segment */
                                        seg = _mdfd_getseg(reln, forknum,
-                                                        (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
+                                                                          (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
                                                                           false,
                                                                           EXTENSION_RETURN_NULL
                                                                           | EXTENSION_DONT_CHECK_SIZE);
@@ -1244,6 +1216,7 @@ mdsync(void)
                                                        longest = elapsed;
                                                total_elapsed += elapsed;
                                                processed++;
+                                               requests = bms_del_member(requests, segno);
                                                if (log_checkpoints)
                                                        elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
                                                                 processed,
@@ -1272,15 +1245,28 @@ mdsync(void)
                                         */
                                        if (!FILE_POSSIBLY_DELETED(errno) ||
                                                failures > 0)
-                                               ereport(ERROR,
+                                       {
+                                               Bitmapset  *new_requests;
+
+                                               /*
+                                                * We need to merge these unsatisfied requests with
+                                                * any others that have arrived since we started.
+                                                */
+                                               new_requests = entry->requests[forknum];
+                                               entry->requests[forknum] =
+                                                       bms_join(new_requests, requests);
+
+                                               errno = save_errno;
+                                               ereport(data_sync_elevel(ERROR),
                                                                (errcode_for_file_access(),
                                                                 errmsg("could not fsync file \"%s\": %m",
                                                                                path)));
+                                       }
                                        else
                                                ereport(DEBUG1,
                                                                (errcode_for_file_access(),
-                                               errmsg("could not fsync file \"%s\" but retrying: %m",
-                                                          path)));
+                                                                errmsg("could not fsync file \"%s\" but retrying: %m",
+                                                                               path)));
                                        pfree(path);
 
                                        /*
@@ -1445,7 +1431,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
                                (errmsg("could not forward fsync request because request queue is full")));
 
                if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                        (errcode_for_file_access(),
                                         errmsg("could not fsync file \"%s\": %m",
                                                        FilePathName(seg->mdfd_vfd))));
@@ -1704,6 +1690,43 @@ ForgetDatabaseFsyncRequests(Oid dbid)
        }
 }
 
+/*
+ * DropRelationFiles -- drop the files of all ndelrels relations in delrels[]
+ */
+void
+DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
+{
+       SMgrRelation *srels;
+       int                     i;
+
+       srels = palloc(sizeof(SMgrRelation) * ndelrels);
+       for (i = 0; i < ndelrels; i++)
+       {
+               SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
+
+               if (isRedo)
+               {
+                       ForkNumber      fork;
+
+                       for (fork = 0; fork <= MAX_FORKNUM; fork++)
+                               XLogDropRelation(delrels[i], fork);
+               }
+               srels[i] = srel;
+       }
+
+       smgrdounlinkall(srels, ndelrels, isRedo);
+
+       /*
+        * Close the relations in the reverse of the order in which smgropen()
+        * was called on them above.  Per the original note, this lets
+        * remove_from_unowned_list() in smgrclose() find each SMgrRelation in
+        * the unowned list with O(1) performance.
+        */
+       for (i = ndelrels - 1; i >= 0; i--)
+               smgrclose(srels[i]);
+       pfree(srels);
+}
+
 
 /*
  *     _fdvec_resize() -- Resize the fork's open segments array
@@ -1780,7 +1803,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
        fullpath = _mdfd_segpath(reln, forknum, segno);
 
        /* open the file */
-       fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
+       fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
 
        pfree(fullpath);
 
@@ -1925,9 +1948,9 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
                                return NULL;
                        ereport(ERROR,
                                        (errcode_for_file_access(),
-                                  errmsg("could not open file \"%s\" (target block %u): %m",
-                                                 _mdfd_segpath(reln, forknum, nextsegno),
-                                                 blkno)));
+                                        errmsg("could not open file \"%s\" (target block %u): %m",
+                                                       _mdfd_segpath(reln, forknum, nextsegno),
+                                                       blkno)));
                }
        }
 
@@ -1942,7 +1965,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 {
        off_t           len;
 
-       len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
+       len = FileSize(seg->mdfd_vfd);
        if (len < 0)
                ereport(ERROR,
                                (errcode_for_file_access(),