* It doesn't matter whether the bits are on spinning rust or some other
* storage technology.
*
- * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include <sys/file.h>
#include "miscadmin.h"
+#include "access/xlogutils.h"
#include "access/xlog.h"
-#include "catalog/catalog.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt; /* context for the above */
+static MemoryContext pendingOpsCxt; /* context for the above */
static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
path = relpath(reln->smgr_rnode, forkNum);
- fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
+ fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
{
* already, even if isRedo is not set. (See also mdopen)
*/
if (isRedo || IsBootstrapProcessingMode())
- fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
+ fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
if (fd < 0)
{
/* be sure to report the error reported by create, not open */
/* truncate(2) would be easier here, but Windows hasn't got it */
int fd;
- fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);
+ fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
if (fd >= 0)
{
int save_errno;
if (errno != ENOENT)
ereport(WARNING,
(errcode_for_file_access(),
- errmsg("could not remove file \"%s\": %m", segpath)));
+ errmsg("could not remove file \"%s\": %m", segpath)));
break;
}
}
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
- seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
- /*
- * Note: because caller usually obtained blocknum by calling mdnblocks,
- * which did a seek(SEEK_END), this seek is often redundant and will be
- * optimized away by fd.c. It's not redundant, however, if there is a
- * partial page at the end of the file. In that case we want to try to
- * overwrite the partial page with a full page. It's also not redundant
- * if bufmgr.c had to dump another buffer of the same file to make room
- * for the new page's buffer.
- */
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek to block %u in file \"%s\": %m",
- blocknum, FilePathName(v->mdfd_vfd))));
-
- if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+ if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
{
if (nbytes < 0)
ereport(ERROR,
path = relpath(reln->smgr_rnode, forknum);
- fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
+ fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
if (fd < 0)
{
* substitute for mdcreate() in bootstrap mode only. (See mdcreate)
*/
if (IsBootstrapProcessingMode())
- fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
+ fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
{
if ((behavior & EXTENSION_RETURN_NULL) &&
v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
- seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
(void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
-#endif /* USE_PREFETCH */
+#endif /* USE_PREFETCH */
}
/*
Assert(nflush >= 1);
Assert(nflush <= nblocks);
- seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
v = _mdfd_getseg(reln, forknum, blocknum, false,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
- seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek to block %u in file \"%s\": %m",
- blocknum, FilePathName(v->mdfd_vfd))));
-
- nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);
+ nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
- seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek to block %u in file \"%s\": %m",
- blocknum, FilePathName(v->mdfd_vfd))));
-
- nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);
+ nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not truncate file \"%s\" to %u blocks: %m",
- FilePathName(v->mdfd_vfd),
- nblocks)));
+ errmsg("could not truncate file \"%s\" to %u blocks: %m",
+ FilePathName(v->mdfd_vfd),
+ nblocks)));
if (!SmgrIsTemp(reln))
register_dirty_segment(reln, forknum, v);
}
MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
FilePathName(v->mdfd_vfd))));
* The bitmap manipulations are slightly tricky, because we can call
* AbsorbFsyncRequests() inside the loop and that could result in
* bms_add_member() modifying and even re-palloc'ing the bitmapsets.
- * This is okay because we unlink each bitmapset from the hashtable
- * entry before scanning it. That means that any incoming fsync
- * requests will be processed now if they reach the table before we
- * begin to scan their fork.
+ * So we detach it, but if we fail we'll merge it with any new
+ * requests that have arrived in the meantime.
*/
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
entry->requests[forknum] = NULL;
entry->canceled[forknum] = false;
- while ((segno = bms_first_member(requests)) >= 0)
+ segno = -1;
+ while ((segno = bms_next_member(requests, segno)) >= 0)
{
int failures;
/* Attempt to open and fsync the target segment */
seg = _mdfd_getseg(reln, forknum,
- (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
+ (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
false,
EXTENSION_RETURN_NULL
| EXTENSION_DONT_CHECK_SIZE);
longest = elapsed;
total_elapsed += elapsed;
processed++;
+ requests = bms_del_member(requests, segno);
if (log_checkpoints)
elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
processed,
*/
if (!FILE_POSSIBLY_DELETED(errno) ||
failures > 0)
- ereport(ERROR,
+ {
+ Bitmapset *new_requests;
+
+ /*
+ * We need to merge these unsatisfied requests with
+ * any others that have arrived since we started.
+ */
+ new_requests = entry->requests[forknum];
+ entry->requests[forknum] =
+ bms_join(new_requests, requests);
+
+ errno = save_errno;
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
path)));
+ }
else
ereport(DEBUG1,
(errcode_for_file_access(),
- errmsg("could not fsync file \"%s\" but retrying: %m",
- path)));
+ errmsg("could not fsync file \"%s\" but retrying: %m",
+ path)));
pfree(path);
/*
(errmsg("could not forward fsync request because request queue is full")));
if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
FilePathName(seg->mdfd_vfd))));
}
}
+/*
+ * DropRelationFiles -- drop files of all given relations
+ *
+ * delrels points to an array of ndelrels relation file nodes.  A storage
+ * manager handle is opened for each one; when isRedo is true,
+ * XLogDropRelation() is additionally invoked for every fork of that
+ * relation.  The whole set of handles is then passed to smgrdounlinkall()
+ * in a single call (isRedo is forwarded unchanged), after which every
+ * handle is closed and the temporary handle array is freed.
+ */
+void
+DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
+{
+	SMgrRelation *opened = palloc(sizeof(SMgrRelation) * ndelrels);
+	int			idx;
+
+	/* Open each relation, remembering the handles in input order. */
+	for (idx = 0; idx < ndelrels; idx++)
+	{
+		ForkNumber	forknum;
+
+		opened[idx] = smgropen(delrels[idx], InvalidBackendId);
+
+		/* The per-fork XLogDropRelation() calls happen only during redo. */
+		if (!isRedo)
+			continue;
+
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+			XLogDropRelation(delrels[idx], forknum);
+	}
+
+	/* Remove the files of all collected relations in one batch. */
+	smgrdounlinkall(opened, ndelrels, isRedo);
+
+	/*
+	 * Close the handles in the opposite order from which they were opened.
+	 * Walking backwards lets remove_from_unowned_list(), called from
+	 * smgrclose(), locate each SMgrRelation in the unowned list with O(1)
+	 * performance.
+	 */
+	idx = ndelrels;
+	while (--idx >= 0)
+		smgrclose(opened[idx]);
+
+	pfree(opened);
+}
+
/*
* _fdvec_resize() -- Resize the fork's open segments array
fullpath = _mdfd_segpath(reln, forknum, segno);
/* open the file */
- fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
+ fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
pfree(fullpath);
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not open file \"%s\" (target block %u): %m",
- _mdfd_segpath(reln, forknum, nextsegno),
- blkno)));
+ errmsg("could not open file \"%s\" (target block %u): %m",
+ _mdfd_segpath(reln, forknum, nextsegno),
+ blkno)));
}
}
{
off_t len;
- len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
+ len = FileSize(seg->mdfd_vfd);
if (len < 0)
ereport(ERROR,
(errcode_for_file_access(),