X-Git-Url: https://granicus.if.org/sourcecode?a=blobdiff_plain;f=src%2Fbackend%2Fstorage%2Fsmgr%2Fmd.c;h=e4501ff9bc909712c7316c0937537991f3e5734a;hb=97c39498e5ca9208d3de5a443a2282923619bf91;hp=7f44606c1a9c6d78ebd022343b65c1969744d0dd;hpb=fba105b1099f4f5fa7283bb17cba6fed2baa8d0c;p=postgresql diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 7f44606c1a..e4501ff9bc 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -3,7 +3,14 @@ * md.c * This code manages relations that reside on magnetic disk. * - * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Or at least, that was what the Berkeley folk had in mind when they named + * this file. In reality, what this code provides is an interface from + * the smgr API to Unix-like filesystem APIs, so it will work with any type + * of device for which the operating system provides filesystem support. + * It doesn't matter whether the bits are on spinning rust or some other + * storage technology. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -18,8 +25,10 @@ #include <fcntl.h> #include <sys/file.h> -#include "catalog/catalog.h" #include "miscadmin.h" +#include "access/xlogutils.h" +#include "access/xlog.h" +#include "pgstat.h" #include "portability/instr_time.h" #include "postmaster/bgwriter.h" #include "storage/fd.h" @@ -31,13 +40,14 @@ #include "pg_trace.h" -/* interval for calling AbsorbFsyncRequests in mdsync */ +/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */ #define FSYNCS_PER_ABSORB 10 +#define UNLINKS_PER_ABSORB 10 /* * Special values for the segno arg to RememberFsyncRequest. * - * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an + * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an * fsync request from the queue if an identical, subsequent request is found. 
* See comments there before making changes here. */ @@ -50,7 +60,7 @@ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform, * that's what you get. Ugh. This code is designed so that we don't * actually believe these cases are okay without further evidence (namely, - * a pending fsync request getting revoked ... see mdsync). + * a pending fsync request getting canceled ... see mdsync). */ #ifndef WIN32 #define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT) @@ -76,103 +86,115 @@ * Inactive segments are those that once contained data but are currently * not needed because of an mdtruncate() operation. The reason for leaving * them present at size zero, rather than unlinking them, is that other - * backends and/or the bgwriter might be holding open file references to - * such segments. If the relation expands again after mdtruncate(), such + * backends and/or the checkpointer might be holding open file references to + * such segments. If the relation expands again after mdtruncate(), such * that a deactivated segment becomes active again, it is important that * such file references still be valid --- else data might get written * out to an unlinked old copy of a segment file that will eventually * disappear. * - * The file descriptor pointer (md_fd field) stored in the SMgrRelation - * cache is, therefore, just the head of a list of MdfdVec objects, one - * per segment. But note the md_fd pointer can be NULL, indicating - * relation not open. - * - * Also note that mdfd_chain == NULL does not necessarily mean the relation - * doesn't have another segment after this one; we may just not have - * opened the next segment yet. (We could not have "all segments are - * in the chain" as an invariant anyway, since another backend could - * extend the relation when we weren't looking.) We do not make chain + * File descriptors are stored in the per-fork md_seg_fds arrays inside + * SMgrRelation. The length of these arrays is stored in md_num_open_segs. 
+ * Note that a fork's md_num_open_segs having a specific value does not + * necessarily mean the relation doesn't have additional segments; we may + * just not have opened the next segment yet. (We could not have "all + * segments are in the array" as an invariant anyway, since another backend + * could extend the relation while we aren't looking.) We do not have * entries for inactive segments, however; as soon as we find a partial * segment, we assume that any subsequent segments are inactive. * - * All MdfdVec objects are palloc'd in the MdCxt memory context. + * The entire MdfdVec array is palloc'd in the MdCxt memory context. */ typedef struct _MdfdVec { File mdfd_vfd; /* fd number in fd.c's pool */ BlockNumber mdfd_segno; /* segment number, from 0 */ - struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ } MdfdVec; -static MemoryContext MdCxt; /* context for all md.c allocations */ +static MemoryContext MdCxt; /* context for all MdfdVec objects */ /* - * In some contexts (currently, standalone backends and the bgwriter process) + * In some contexts (currently, standalone backends and the checkpointer) * we keep track of pending fsync operations: we need to remember all relation * segments that have been written since the last checkpoint, so that we can * fsync them down to disk before completing the next checkpoint. This hash - * table remembers the pending operations. We use a hash table mostly as - * a convenient way of eliminating duplicate requests. + * table remembers the pending operations. We use a hash table mostly as + * a convenient way of merging duplicate requests. * * We use a similar mechanism to remember no-longer-needed files that can * be deleted after the next checkpoint, but we use a linked list instead of * a hash table, because we don't expect there to be any duplicate requests. 
* + * These mechanisms are only used for non-temp relations; we never fsync + * temp rels, nor do we need to postpone their deletion (see comments in + * mdunlink). + * * (Regular backends do not track pending operations locally, but forward - * them to the bgwriter.) + * them to the checkpointer.) */ -typedef struct -{ - RelFileNodeBackend rnode; /* the targeted relation */ - ForkNumber forknum; - BlockNumber segno; /* which segment */ -} PendingOperationTag; - typedef uint16 CycleCtr; /* can be any convenient integer size */ typedef struct { - PendingOperationTag tag; /* hash table key (must be first!) */ - bool canceled; /* T => request canceled, not yet removed */ - CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */ + RelFileNode rnode; /* hash table key (must be first!) */ + CycleCtr cycle_ctr; /* mdsync_cycle_ctr of oldest request */ + /* requests[f] has bit n set if we need to fsync segment n of fork f */ + Bitmapset *requests[MAX_FORKNUM + 1]; + /* canceled[f] is true if we canceled fsyncs for fork "recently" */ + bool canceled[MAX_FORKNUM + 1]; } PendingOperationEntry; typedef struct { - RelFileNodeBackend rnode; /* the dead relation to delete */ + RelFileNode rnode; /* the dead relation to delete */ CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */ } PendingUnlinkEntry; static HTAB *pendingOpsTable = NULL; static List *pendingUnlinks = NIL; +static MemoryContext pendingOpsCxt; /* context for the above */ static CycleCtr mdsync_cycle_ctr = 0; static CycleCtr mdckpt_cycle_ctr = 0; -typedef enum /* behavior for mdopen & _mdfd_getseg */ -{ - EXTENSION_FAIL, /* ereport if segment not present */ - EXTENSION_RETURN_NULL, /* return NULL if not present */ - EXTENSION_CREATE /* create new segments as needed */ -} ExtensionBehavior; +/*** behavior for mdopen & _mdfd_getseg ***/ +/* ereport if segment not present */ +#define EXTENSION_FAIL (1 << 0) +/* return NULL if segment not present */ +#define EXTENSION_RETURN_NULL (1 << 1) +/* 
create new segments as needed */ +#define EXTENSION_CREATE (1 << 2) +/* create new segments if needed during recovery */ +#define EXTENSION_CREATE_RECOVERY (1 << 3) +/* + * Allow opening segments which are preceded by segments smaller than + * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks + * mdnblocks() and related functionality henceforth - which currently is ok, + * because this is only required in the checkpointer which never uses + * mdnblocks(). + */ +#define EXTENSION_DONT_CHECK_SIZE (1 << 4) + /* local routines */ -static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, - ExtensionBehavior behavior); +static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, + bool isRedo); +static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior); static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); static void register_unlink(RelFileNodeBackend rnode); -static MdfdVec *_fdvec_alloc(void); +static void _fdvec_resize(SMgrRelation reln, + ForkNumber forknum, + int nseg); static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno); static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno, BlockNumber segno, int oflags); static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, - BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior); + BlockNumber blkno, bool skipFsync, int behavior); static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); @@ -185,46 +207,65 @@ mdinit(void) { MdCxt = AllocSetContextCreate(TopMemoryContext, "MdSmgr", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); /* * Create pending-operations hashtable if we need it. Currently, we need - * it if we are standalone (not under a postmaster) OR if we are a - * bootstrap-mode subprocess of a postmaster (that is, a startup or - * bgwriter process). 
+ * it if we are standalone (not under a postmaster) or if we are a startup + * or checkpointer auxiliary process. */ - if (!IsUnderPostmaster || IsBootstrapProcessingMode()) + if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess()) { HASHCTL hash_ctl; + /* + * XXX: The checkpointer needs to add entries to the pending ops table + * when absorbing fsync requests. That is done within a critical + * section, which isn't usually allowed, but we make an exception. It + * means that there's a theoretical possibility that you run out of + * memory while absorbing fsync requests, which leads to a PANIC. + * Fortunately the hash table is small so that's unlikely to happen in + * practice. + */ + pendingOpsCxt = AllocSetContextCreate(MdCxt, + "Pending ops context", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(pendingOpsCxt, true); + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(PendingOperationTag); + hash_ctl.keysize = sizeof(RelFileNode); hash_ctl.entrysize = sizeof(PendingOperationEntry); - hash_ctl.hash = tag_hash; - hash_ctl.hcxt = MdCxt; + hash_ctl.hcxt = pendingOpsCxt; pendingOpsTable = hash_create("Pending Ops Table", 100L, &hash_ctl, - HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); pendingUnlinks = NIL; } } /* - * In archive recovery, we rely on bgwriter to do fsyncs, but we will have + * In archive recovery, we rely on checkpointer to do fsyncs, but we will have * already created the pendingOpsTable during initialization of the startup * process. Calling this function drops the local pendingOpsTable so that - * subsequent requests will be forwarded to bgwriter. + * subsequent requests will be forwarded to checkpointer. 
*/ void SetForwardFsyncRequests(void) { - /* Perform any pending ops we may have queued up */ + /* Perform any pending fsyncs we may have queued up, then drop table */ if (pendingOpsTable) + { mdsync(); + hash_destroy(pendingOpsTable); + } pendingOpsTable = NULL; + + /* + * We should not have any pending unlink requests, since mdunlink doesn't + * queue unlink requests when isRedo. + */ + Assert(pendingUnlinks == NIL); } /* @@ -252,17 +293,18 @@ mdexists(SMgrRelation reln, ForkNumber forkNum) void mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { + MdfdVec *mdfd; char *path; File fd; - if (isRedo && reln->md_fd[forkNum] != NULL) + if (isRedo && reln->md_num_open_segs[forkNum] > 0) return; /* created and opened already... */ - Assert(reln->md_fd[forkNum] == NULL); + Assert(reln->md_num_open_segs[forkNum] == 0); path = relpath(reln->smgr_rnode, forkNum); - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600); + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); if (fd < 0) { @@ -272,10 +314,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * During bootstrap, there are cases where a system relation will be * accessed (by internal backend processes) before the bootstrap * script nominally creates it. Therefore, allow the file to exist - * already, even if isRedo is not set. (See also mdopen) + * already, even if isRedo is not set. 
(See also mdopen) */ if (isRedo || IsBootstrapProcessingMode()) - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600); + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); if (fd < 0) { /* be sure to report the error reported by create, not open */ @@ -288,24 +330,23 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) pfree(path); - if (reln->smgr_transient) - FileSetTransient(fd); - - reln->md_fd[forkNum] = _fdvec_alloc(); - - reln->md_fd[forkNum]->mdfd_vfd = fd; - reln->md_fd[forkNum]->mdfd_segno = 0; - reln->md_fd[forkNum]->mdfd_chain = NULL; + _fdvec_resize(reln, forkNum, 1); + mdfd = &reln->md_seg_fds[forkNum][0]; + mdfd->mdfd_vfd = fd; + mdfd->mdfd_segno = 0; } /* * mdunlink() -- Unlink a relation. * - * Note that we're passed a RelFileNode --- by the time this is called, + * Note that we're passed a RelFileNodeBackend --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. * - * Actually, we don't unlink the first segment file of the relation, but - * just truncate it to zero length, and record a request to unlink it after + * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * For regular relations, we don't unlink the first segment file of the rel, + * but just truncate it to zero length, and record a request to unlink it after * the next checkpoint. Additional segments can be unlinked immediately, * however. Leaving the empty file in place prevents that relfilenode * number from being reused. The scenario this protects us from is: @@ -317,12 +358,24 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * if the contents of the file were repopulated by subsequent WAL entries. * But if we didn't WAL-log insertions, but instead relied on fsyncing the * file after populating it (as for instance CLUSTER and CREATE INDEX do), - * the contents of the file would be lost forever. 
By leaving the empty file + * the contents of the file would be lost forever. By leaving the empty file * until after the next checkpoint, we prevent reassignment of the relfilenode * number until it's safe, because relfilenode assignment skips over any * existing file. * - * If isRedo is true, it's okay for the relation to be already gone. + * We do not need to go through this dance for temp relations, though, because + * we never make WAL entries for temp rels, and so a temp rel poses no threat + * to the health of a regular rel that has taken over its relfilenode number. + * The fact that temp rels and regular rels have different file naming + * patterns provides additional safety. + * + * All the above applies only to the relation's main fork; other forks can + * just be removed immediately, since they are not needed to prevent the + * relfilenode number from being recycled. Also, we do not carefully + * track whether other forks have been created or not, but just attempt to + * unlink them unconditionally; so we should never complain about ENOENT. + * + * If isRedo is true, it's unsurprising for the relation to be already gone. * Also, we should remove the file immediately instead of queuing a request * for later, since during redo there's no possibility of creating a * conflicting relation. @@ -333,44 +386,58 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) void mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { - char *path; - int ret; - /* * We have to clean out any pending fsync requests for the doomed - * relation, else the next mdsync() will fail. + * relation, else the next mdsync() will fail. There can't be any such + * requests for a temp relation, though. We can send just one request + * even when deleting multiple forks, since the fsync queuing code accepts + * the "InvalidForkNumber = all forks" convention. 
*/ - ForgetRelationFsyncRequests(rnode, forkNum); + if (!RelFileNodeBackendIsTemp(rnode)) + ForgetRelationFsyncRequests(rnode.node, forkNum); + + /* Now do the per-fork work */ + if (forkNum == InvalidForkNumber) + { + for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) + mdunlinkfork(rnode, forkNum, isRedo); + } + else + mdunlinkfork(rnode, forkNum, isRedo); +} + +static void +mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +{ + char *path; + int ret; path = relpath(rnode, forkNum); /* * Delete or truncate the first segment. */ - if (isRedo || forkNum != MAIN_FORKNUM) + if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode)) { ret = unlink(path); - if (ret < 0) - { - if (!isRedo || errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - } + if (ret < 0 && errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); } else { /* truncate(2) would be easier here, but Windows hasn't got it */ int fd; - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0); + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); if (fd >= 0) { int save_errno; ret = ftruncate(fd, 0); save_errno = errno; - close(fd); + CloseTransientFile(fd); errno = save_errno; } else @@ -379,6 +446,9 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) ereport(WARNING, (errcode_for_file_access(), errmsg("could not truncate file \"%s\": %m", path))); + + /* Register request to unlink first segment later */ + register_unlink(rnode); } /* @@ -402,7 +472,7 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) if (errno != ENOENT) ereport(WARNING, (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", segpath))); + errmsg("could not remove file \"%s\": %m", segpath))); break; } } @@ -410,10 +480,6 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) } pfree(path); - - /* Register 
request to unlink first segment later */ - if (!isRedo && forkNum == MAIN_FORKNUM) - register_unlink(rnode); } /* @@ -452,26 +518,11 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - /* - * Note: because caller usually obtained blocknum by calling mdnblocks, - * which did a seek(SEEK_END), this seek is often redundant and will be - * optimized away by fd.c. It's not redundant, however, if there is a - * partial page at the end of the file. In that case we want to try to - * overwrite the partial page with a full page. It's also not redundant - * if bufmgr.c had to dump another buffer of the same file to make room - * for the new page's buffer. - */ - if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not seek to block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); - - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) + if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { if (nbytes < 0) ereport(ERROR, @@ -505,19 +556,19 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * invent one out of whole cloth. 
*/ static MdfdVec * -mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior) +mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) { MdfdVec *mdfd; char *path; File fd; /* No work if already open */ - if (reln->md_fd[forknum]) - return reln->md_fd[forknum]; + if (reln->md_num_open_segs[forknum] > 0) + return &reln->md_seg_fds[forknum][0]; path = relpath(reln->smgr_rnode, forknum); - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600); + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); if (fd < 0) { @@ -528,10 +579,10 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior) * substitute for mdcreate() in bootstrap mode only. (See mdcreate) */ if (IsBootstrapProcessingMode()) - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600); + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); if (fd < 0) { - if (behavior == EXTENSION_RETURN_NULL && + if ((behavior & EXTENSION_RETURN_NULL) && FILE_POSSIBLY_DELETED(errno)) { pfree(path); @@ -545,14 +596,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior) pfree(path); - if (reln->smgr_transient) - FileSetTransient(fd); - - reln->md_fd[forknum] = mdfd = _fdvec_alloc(); - + _fdvec_resize(reln, forknum, 1); + mdfd = &reln->md_seg_fds[forknum][0]; mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; - mdfd->mdfd_chain = NULL; + Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE)); return mdfd; @@ -564,25 +612,29 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior) void mdclose(SMgrRelation reln, ForkNumber forknum) { - MdfdVec *v = reln->md_fd[forknum]; + int nopensegs = reln->md_num_open_segs[forknum]; /* No work if already closed */ - if (v == NULL) + if (nopensegs == 0) return; - reln->md_fd[forknum] = NULL; /* prevent dangling pointer after error */ - - while (v != NULL) + /* close segments starting from the end */ + while (nopensegs > 0) { - MdfdVec *ov = v; + MdfdVec *v = 
&reln->md_seg_fds[forknum][nopensegs - 1]; /* if not closed already */ if (v->mdfd_vfd >= 0) + { FileClose(v->mdfd_vfd); - /* Now free vector */ - v = v->mdfd_chain; - pfree(ov); + v->mdfd_vfd = -1; + } + + nopensegs--; } + + /* resize just once, avoids pointless reallocations */ + _fdvec_resize(reln, forknum, 0); } /* @@ -597,14 +649,65 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL); - seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ); -#endif /* USE_PREFETCH */ + (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); +#endif /* USE_PREFETCH */ } +/* + * mdwriteback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + /* + * Issue flush requests in as few requests as possible; have to split at + * segment boundaries though, since those are actually separate files. + */ + while (nblocks > 0) + { + BlockNumber nflush = nblocks; + off_t seekpos; + MdfdVec *v; + int segnum_start, + segnum_end; + + v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , + EXTENSION_RETURN_NULL); + + /* + * We might be flushing buffers of already removed relations, that's + * ok, just ignore that case. 
+ */ + if (!v) + return; + + /* compute the segment number containing the first block to flush */ + segnum_start = blocknum / RELSEG_SIZE; + + /* compute number of desired writes within the current segment */ + segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; + if (segnum_start != segnum_end) + nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(nflush >= 1); + Assert(nflush <= nblocks); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); + + nblocks -= nflush; + blocknum += nflush; + } +} /* * mdread() -- Read the specified block from a relation. @@ -623,19 +726,14 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend); - v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL); + v = _mdfd_getseg(reln, forknum, blocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not seek to block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); - - nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ); + nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -698,19 +796,14 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend); - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL); + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) 
RELSEG_SIZE)); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not seek to block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); - - nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ); + nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -745,9 +838,9 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * mdnblocks() -- Get the number of blocks stored in a relation. * * Important side effect: all active segments of the relation are opened - * and added to the mdfd_chain list. If this routine has not been + * and added to the md_seg_fds array. If this routine has not been * called, then only segments up to the last one actually touched - * are present in the chain. + * are present in the array. */ BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum) @@ -756,23 +849,24 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) BlockNumber nblocks; BlockNumber segno = 0; + /* mdopen has opened the first segment */ + Assert(reln->md_num_open_segs[forknum] > 0); + /* - * Skip through any segments that aren't the last one, to avoid redundant - * seeks on them. We have previously verified that these segments are - * exactly RELSEG_SIZE long, and it's useless to recheck that each time. + * Start from the last open segments, to avoid redundant seeks. We have + * previously verified that these segments are exactly RELSEG_SIZE long, + * and it's useless to recheck that each time. * * NOTE: this assumption could only be wrong if another backend has - * truncated the relation. We rely on higher code levels to handle that + * truncated the relation. 
We rely on higher code levels to handle that * scenario by closing and re-opening the md fd, which is handled via - * relcache flush. (Since the bgwriter doesn't participate in relcache - * flush, it could have segment chain entries for inactive segments; - * that's OK because the bgwriter never needs to compute relation size.) + * relcache flush. (Since the checkpointer doesn't participate in + * relcache flush, it could have segment entries for inactive segments; + * that's OK because the checkpointer never needs to compute relation + * size.) */ - while (v->mdfd_chain != NULL) - { - segno++; - v = v->mdfd_chain; - } + segno = reln->md_num_open_segs[forknum] - 1; + v = &reln->md_seg_fds[forknum][segno]; for (;;) { @@ -787,23 +881,16 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) */ segno++; - if (v->mdfd_chain == NULL) - { - /* - * Because we pass O_CREAT, we will create the next segment (with - * zero length) immediately, if the last segment is of length - * RELSEG_SIZE. While perhaps not strictly necessary, this keeps - * the logic simple. - */ - v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT); - if (v->mdfd_chain == NULL) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - _mdfd_segpath(reln, forknum, segno)))); - } - - v = v->mdfd_chain; + /* + * We used to pass O_CREAT here, but that's has the disadvantage that + * it might create a segment which has vanished through some operating + * system misadventure. In such a case, creating the segment here + * undermines _mdfd_getseg's attempts to notice and report an error + * upon access to a missing segment. 
+ */ + v = _mdfd_openseg(reln, forknum, segno, 0); + if (v == NULL) + return segno * ((BlockNumber) RELSEG_SIZE); } } @@ -813,9 +900,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { - MdfdVec *v; BlockNumber curnblk; BlockNumber priorblocks; + int curopensegs; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -835,21 +922,26 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) if (nblocks == curnblk) return; /* no work */ - v = mdopen(reln, forknum, EXTENSION_FAIL); - - priorblocks = 0; - while (v != NULL) + /* + * Truncate segments, starting at the last one. Starting at the end makes + * managing the memory for the fd array easier, should there be errors. + */ + curopensegs = reln->md_num_open_segs[forknum]; + while (curopensegs > 0) { - MdfdVec *ov = v; + MdfdVec *v; + + priorblocks = (curopensegs - 1) * RELSEG_SIZE; + + v = &reln->md_seg_fds[forknum][curopensegs - 1]; if (priorblocks > nblocks) { /* - * This segment is no longer active (and has already been unlinked - * from the mdfd_chain). We truncate the file, but do not delete - * it, for reasons explained in the header comments. + * This segment is no longer active. We truncate the file, but do + * not delete it, for reasons explained in the header comments. 
*/ - if (FileTruncate(v->mdfd_vfd, 0) < 0) + if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\": %m", @@ -857,43 +949,42 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) if (!SmgrIsTemp(reln)) register_dirty_segment(reln, forknum, v); - v = v->mdfd_chain; - Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st - * segment */ - pfree(ov); + + /* we never drop the 1st segment */ + Assert(v != &reln->md_seg_fds[forknum][0]); + + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, curopensegs - 1); } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) { /* * This is the last segment we want to keep. Truncate the file to - * the right length, and clear chain link that points to any - * remaining segments (which we shall zap). NOTE: if nblocks is - * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st - * segment to 0 length but keep it. This adheres to the invariant - * given in the header comments. + * the right length. NOTE: if nblocks is exactly a multiple K of + * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but + * keep it. This adheres to the invariant given in the header + * comments. */ BlockNumber lastsegblocks = nblocks - priorblocks; - if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0) + if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not truncate file \"%s\" to %u blocks: %m", - FilePathName(v->mdfd_vfd), - nblocks))); + errmsg("could not truncate file \"%s\" to %u blocks: %m", + FilePathName(v->mdfd_vfd), + nblocks))); if (!SmgrIsTemp(reln)) register_dirty_segment(reln, forknum, v); - v = v->mdfd_chain; - ov->mdfd_chain = NULL; } else { /* - * We still need this segment and 0 or more blocks beyond it, so - * nothing to do here. 
+ * We still need this segment, so nothing to do for this and any + * earlier segment. */ - v = v->mdfd_chain; + break; } - priorblocks += RELSEG_SIZE; + curopensegs--; } } @@ -906,7 +997,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) void mdimmedsync(SMgrRelation reln, ForkNumber forknum) { - MdfdVec *v; + int segno; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -914,16 +1005,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) */ mdnblocks(reln, forknum); - v = mdopen(reln, forknum, EXTENSION_FAIL); + segno = reln->md_num_open_segs[forknum]; - while (v != NULL) + while (segno > 0) { - if (FileSync(v->mdfd_vfd) < 0) - ereport(ERROR, + MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + + if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) + ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(v->mdfd_vfd)))); - v = v->mdfd_chain; + segno--; } } @@ -956,8 +1049,8 @@ mdsync(void) elog(ERROR, "cannot sync without a pendingOpsTable"); /* - * If we are in the bgwriter, the sync had better include all fsync - * requests that were queued by backends up to this point. The tightest + * If we are in the checkpointer, the sync had better include all fsync + * requests that were queued by backends up to this point. The tightest * race condition that could occur is that a buffer that must be written * and fsync'd for the checkpoint could have been dumped by a backend just * before it was visited by BufferSync(). We know the backend will have @@ -1011,8 +1104,11 @@ mdsync(void) hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { + ForkNumber forknum; + /* - * If the entry is new then don't process it this time. Note that + * If the entry is new then don't process it this time; it might + * contain multiple fsync-request bits, but they are all new. 
Note * "continue" bypasses the hash-remove call at the bottom of the loop. */ if (entry->cycle_ctr == mdsync_cycle_ctr) @@ -1022,85 +1118,96 @@ mdsync(void) Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr); /* - * If fsync is off then we don't have to bother opening the file at - * all. (We delay checking until this point so that changing fsync on - * the fly behaves sensibly.) Also, if the entry is marked canceled, - * fall through to delete it. + * Scan over the forks and segments represented by the entry. + * + * The bitmap manipulations are slightly tricky, because we can call + * AbsorbFsyncRequests() inside the loop and that could result in + * bms_add_member() modifying and even re-palloc'ing the bitmapsets. + * So we detach it, but if we fail we'll merge it with any new + * requests that have arrived in the meantime. */ - if (enableFsync && !entry->canceled) + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - int failures; + Bitmapset *requests = entry->requests[forknum]; + int segno; - /* - * If in bgwriter, we want to absorb pending requests every so - * often to prevent overflow of the fsync request queue. It is - * unspecified whether newly-added entries will be visited by - * hash_seq_search, but we don't care since we don't need to - * process them anyway. - */ - if (--absorb_counter <= 0) - { - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; - } + entry->requests[forknum] = NULL; + entry->canceled[forknum] = false; - /* - * The fsync table could contain requests to fsync segments that - * have been deleted (unlinked) by the time we get to them. Rather - * than just hoping an ENOENT (or EACCES on Windows) error can be - * ignored, what we do on error is absorb pending requests and - * then retry. Since mdunlink() queues a "revoke" message before - * actually unlinking, the fsync request is guaranteed to be - * marked canceled after the absorb if it really was this case. 
- * DROP DATABASE likewise has to tell us to forget fsync requests - * before it starts deletions. - */ - for (failures = 0;; failures++) /* loop exits at "break" */ + segno = -1; + while ((segno = bms_next_member(requests, segno)) >= 0) { - SMgrRelation reln; - MdfdVec *seg; - char *path; + int failures; + + /* + * If fsync is off then we don't have to bother opening the + * file at all. (We delay checking until this point so that + * changing fsync on the fly behaves sensibly.) + */ + if (!enableFsync) + continue; /* - * Find or create an smgr hash entry for this relation. This - * may seem a bit unclean -- md calling smgr? But it's really - * the best solution. It ensures that the open file reference - * isn't permanently leaked if we get an error here. (You may - * say "but an unreferenced SMgrRelation is still a leak!" Not - * really, because the only case in which a checkpoint is done - * by a process that isn't about to shut down is in the - * bgwriter, and it will periodically do smgrcloseall(). This - * fact justifies our not closing the reln in the success path - * either, which is a good thing since in non-bgwriter cases - * we couldn't safely do that.) Furthermore, in many cases - * the relation will have been dirtied through this same smgr - * relation, and so we can save a file open/close cycle. + * If in checkpointer, we want to absorb pending requests + * every so often to prevent overflow of the fsync request + * queue. It is unspecified whether newly-added entries will + * be visited by hash_seq_search, but we don't care since we + * don't need to process them anyway. */ - reln = smgropen(entry->tag.rnode.node, - entry->tag.rnode.backend); + if (--absorb_counter <= 0) + { + AbsorbFsyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; + } /* - * It is possible that the relation has been dropped or - * truncated since the fsync request was entered. Therefore, - * allow ENOENT, but only if we didn't fail already on this - * file. 
This applies both during _mdfd_getseg() and during - * FileSync, since fd.c might have closed the file behind our - * back. + * The fsync table could contain requests to fsync segments + * that have been deleted (unlinked) by the time we get to + * them. Rather than just hoping an ENOENT (or EACCES on + * Windows) error can be ignored, what we do on error is + * absorb pending requests and then retry. Since mdunlink() + * queues a "cancel" message before actually unlinking, the + * fsync request is guaranteed to be marked canceled after the + * absorb if it really was this case. DROP DATABASE likewise + * has to tell us to forget fsync requests before it starts + * deletions. */ - seg = _mdfd_getseg(reln, entry->tag.forknum, - entry->tag.segno * ((BlockNumber) RELSEG_SIZE), - false, EXTENSION_RETURN_NULL); + for (failures = 0;; failures++) /* loop exits at "break" */ + { + SMgrRelation reln; + MdfdVec *seg; + char *path; + int save_errno; + + /* + * Find or create an smgr hash entry for this relation. + * This may seem a bit unclean -- md calling smgr? But + * it's really the best solution. It ensures that the + * open file reference isn't permanently leaked if we get + * an error here. (You may say "but an unreferenced + * SMgrRelation is still a leak!" Not really, because the + * only case in which a checkpoint is done by a process + * that isn't about to shut down is in the checkpointer, + * and it will periodically do smgrcloseall(). This fact + * justifies our not closing the reln in the success path + * either, which is a good thing since in non-checkpointer + * cases we couldn't safely do that.) 
+ */ + reln = smgropen(entry->rnode, InvalidBackendId); + + /* Attempt to open and fsync the target segment */ + seg = _mdfd_getseg(reln, forknum, + (BlockNumber) segno * (BlockNumber) RELSEG_SIZE, + false, + EXTENSION_RETURN_NULL + | EXTENSION_DONT_CHECK_SIZE); - if (log_checkpoints) INSTR_TIME_SET_CURRENT(sync_start); - else - INSTR_TIME_SET_ZERO(sync_start); - if (seg != NULL && - FileSync(seg->mdfd_vfd) >= 0) - { - if (log_checkpoints && (!INSTR_TIME_IS_ZERO(sync_start))) + if (seg != NULL && + FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0) { + /* Success; update statistics about sync timing */ INSTR_TIME_SET_CURRENT(sync_end); sync_diff = sync_end; INSTR_TIME_SUBTRACT(sync_diff, sync_start); @@ -1109,51 +1216,93 @@ mdsync(void) longest = elapsed; total_elapsed += elapsed; processed++; - elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec", - processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000); + requests = bms_del_member(requests, segno); + if (log_checkpoints) + elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec", + processed, + FilePathName(seg->mdfd_vfd), + (double) elapsed / 1000); + + break; /* out of retry loop */ } - break; /* success; break out of retry loop */ - } - - /* - * XXX is there any point in allowing more than one retry? - * Don't see one at the moment, but easy to change the test - * here if so. - */ - path = _mdfd_segpath(reln, entry->tag.forknum, - entry->tag.segno); - if (!FILE_POSSIBLY_DELETED(errno) || - failures > 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", path))); - else - ereport(DEBUG1, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\" but retrying: %m", - path))); - pfree(path); - - /* - * Absorb incoming requests and check to see if canceled. - */ - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; /* might as well... 
*/ - - if (entry->canceled) - break; - } /* end retry loop */ + /* Compute file name for use in message */ + save_errno = errno; + path = _mdfd_segpath(reln, forknum, (BlockNumber) segno); + errno = save_errno; + + /* + * It is possible that the relation has been dropped or + * truncated since the fsync request was entered. + * Therefore, allow ENOENT, but only if we didn't fail + * already on this file. This applies both for + * _mdfd_getseg() and for FileSync, since fd.c might have + * closed the file behind our back. + * + * XXX is there any point in allowing more than one retry? + * Don't see one at the moment, but easy to change the + * test here if so. + */ + if (!FILE_POSSIBLY_DELETED(errno) || + failures > 0) + { + Bitmapset *new_requests; + + /* + * We need to merge these unsatisfied requests with + * any others that have arrived since we started. + */ + new_requests = entry->requests[forknum]; + entry->requests[forknum] = + bms_join(new_requests, requests); + + errno = save_errno; + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + } + else + ereport(DEBUG1, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\" but retrying: %m", + path))); + pfree(path); + + /* + * Absorb incoming requests and check to see if a cancel + * arrived for this relation fork. + */ + AbsorbFsyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ + + if (entry->canceled[forknum]) + break; + } /* end retry loop */ + } + bms_free(requests); } /* - * If we get here, either we fsync'd successfully, or we don't have to - * because enableFsync is off, or the entry is (now) marked canceled. - * Okay to delete it. + * We've finished everything that was requested before we started to + * scan the entry. If no new requests have been inserted meanwhile, + * remove the entry. Otherwise, update its cycle counter, as all the + * requests now in it must have arrived during this cycle. 
*/ - if (hash_search(pendingOpsTable, &entry->tag, - HASH_REMOVE, NULL) == NULL) - elog(ERROR, "pendingOpsTable corrupted"); + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + if (entry->requests[forknum] != NULL) + break; + } + if (forknum <= MAX_FORKNUM) + entry->cycle_ctr = mdsync_cycle_ctr; + else + { + /* Okay to remove it */ + if (hash_search(pendingOpsTable, &entry->rnode, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); + } } /* end loop over hashtable entries */ /* Return sync performance metrics for report at checkpoint end */ @@ -1182,21 +1331,6 @@ mdsync(void) void mdpreckpt(void) { - ListCell *cell; - - /* - * In case the prior checkpoint wasn't completed, stamp all entries in the - * list with the current cycle counter. Anything that's in the list at - * the start of checkpoint can surely be deleted after the checkpoint is - * finished, regardless of when the request was made. - */ - foreach(cell, pendingUnlinks) - { - PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); - - entry->cycle_ctr = mdckpt_cycle_ctr; - } - /* * Any unlink requests arriving after this point will be assigned the next * cycle counter, and won't be unlinked until next checkpoint. @@ -1212,6 +1346,9 @@ mdpreckpt(void) void mdpostckpt(void) { + int absorb_counter; + + absorb_counter = UNLINKS_PER_ABSORB; while (pendingUnlinks != NIL) { PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks); @@ -1220,15 +1357,17 @@ mdpostckpt(void) /* * New entries are appended to the end, so if the entry is new we've * reached the end of old entries. + * + * Note: if just the right number of consecutive checkpoints fail, we + * could be fooled here by cycle_ctr wraparound. However, the only + * consequence is that we'd delay unlinking for one more checkpoint, + * which is perfectly tolerable. 
*/ if (entry->cycle_ctr == mdckpt_cycle_ctr) break; - /* Else assert we haven't missed it */ - Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr); - /* Unlink the file */ - path = relpath(entry->rnode, MAIN_FORKNUM); + path = relpathperm(entry->rnode, MAIN_FORKNUM); if (unlink(path) < 0) { /* @@ -1245,8 +1384,21 @@ mdpostckpt(void) } pfree(path); + /* And remove the list entry */ pendingUnlinks = list_delete_first(pendingUnlinks); pfree(entry); + + /* + * As in mdsync, we don't want to stop absorbing fsync requests for a + * long time when there are many deletions to be done. We can safely + * call AbsorbFsyncRequests() at this point in the loop (note it might + * try to delete list entries). + */ + if (--absorb_counter <= 0) + { + AbsorbFsyncRequests(); + absorb_counter = UNLINKS_PER_ABSORB; + } } } @@ -1255,28 +1407,31 @@ mdpostckpt(void) * * If there is a local pending-ops table, just make an entry in it for * mdsync to process later. Otherwise, try to pass off the fsync request - * to the background writer process. If that fails, just do the fsync - * locally before returning (we expect this will not happen often enough + * to the checkpointer process. If that fails, just do the fsync + * locally before returning (we hope this will not happen often enough * to be a performance problem). 
*/ static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { + /* Temp relations should never be fsync'd */ + Assert(!SmgrIsTemp(reln)); + if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno); + RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno); } else { - if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno)) + if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno)) return; /* passed it off successfully */ ereport(DEBUG1, (errmsg("could not forward fsync request because request queue is full"))); - if (FileSync(seg->mdfd_vfd) < 0) - ereport(ERROR, + if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) + ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); @@ -1286,72 +1441,100 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) /* * register_unlink() -- Schedule a file to be deleted after next checkpoint * + * We don't bother passing in the fork number, because this is only used + * with main forks. + * * As with register_dirty_segment, this could involve either a local or * a remote pending-ops table. */ static void register_unlink(RelFileNodeBackend rnode) { + /* Should never be used with temp relations */ + Assert(!RelFileNodeBackendIsTemp(rnode)); + if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST); + RememberFsyncRequest(rnode.node, MAIN_FORKNUM, + UNLINK_RELATION_REQUEST); } else { /* - * Notify the bgwriter about it. If we fail to queue the request + * Notify the checkpointer about it. If we fail to queue the request * message, we have to sleep and try again, because we can't simply * delete the file now. Ugly, but hopefully won't happen often. * * XXX should we just leave the file orphaned instead? 
*/ Assert(IsUnderPostmaster); - while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM, + while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM, UNLINK_RELATION_REQUEST)) pg_usleep(10000L); /* 10 msec seems a good number */ } } /* - * RememberFsyncRequest() -- callback from bgwriter side of fsync request + * RememberFsyncRequest() -- callback from checkpointer side of fsync request * - * We stuff most fsync requests into the local hash table for execution - * during the bgwriter's next checkpoint. UNLINK requests go into a + * We stuff fsync requests into the local hash table for execution + * during the checkpointer's next checkpoint. UNLINK requests go into a * separate linked list, however, because they get processed separately. * * The range of possible segment numbers is way less than the range of * BlockNumber, so we can reserve high values of segno for special purposes. * We define three: - * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation + * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation, + * either for one fork, or all forks if forknum is InvalidForkNumber * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database * - UNLINK_RELATION_REQUEST is a request to delete the file after the next * checkpoint. + * Note also that we're assuming real segment numbers don't exceed INT_MAX. * - * (Handling the FORGET_* requests is a tad slow because the hash table has - * to be searched linearly, but it doesn't seem worth rethinking the table - * structure for them.) + * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash + * table has to be searched linearly, but dropping a database is a pretty + * heavyweight operation anyhow, so we'll live with it.) 
*/ void -RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, - BlockNumber segno) +RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) { Assert(pendingOpsTable); if (segno == FORGET_RELATION_FSYNC) { - /* Remove any pending requests for the entire relation */ - HASH_SEQ_STATUS hstat; + /* Remove any pending requests for the relation (one or all forks) */ PendingOperationEntry *entry; - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &rnode, + HASH_FIND, + NULL); + if (entry) { - if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) && - entry->tag.forknum == forknum) + /* + * We can't just delete the entry since mdsync could have an + * active hashtable scan. Instead we delete the bitmapsets; this + * is safe because of the way mdsync is coded. We also set the + * "canceled" flags so that mdsync can tell that a cancel arrived + * for the fork(s). 
+ */ + if (forknum == InvalidForkNumber) + { + /* remove requests for all forks */ + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + bms_free(entry->requests[forknum]); + entry->requests[forknum] = NULL; + entry->canceled[forknum] = true; + } + } + else { - /* Okay, cancel this entry */ - entry->canceled = true; + /* remove requests for single fork */ + bms_free(entry->requests[forknum]); + entry->requests[forknum] = NULL; + entry->canceled[forknum] = true; } } } @@ -1368,10 +1551,15 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { - if (entry->tag.rnode.node.dbNode == rnode.node.dbNode) + if (entry->rnode.dbNode == rnode.dbNode) { - /* Okay, cancel this entry */ - entry->canceled = true; + /* remove requests for all forks */ + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + bms_free(entry->requests[forknum]); + entry->requests[forknum] = NULL; + entry->canceled[forknum] = true; + } } } @@ -1382,7 +1570,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); next = lnext(cell); - if (entry->rnode.node.dbNode == rnode.node.dbNode) + if (entry->rnode.dbNode == rnode.dbNode) { pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev); pfree(entry); @@ -1394,9 +1582,12 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, else if (segno == UNLINK_RELATION_REQUEST) { /* Unlink request: put it in the linked list */ - MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt); + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); PendingUnlinkEntry *entry; + /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */ + Assert(forknum == MAIN_FORKNUM); + entry = palloc(sizeof(PendingUnlinkEntry)); entry->rnode = rnode; entry->cycle_ctr = mdckpt_cycle_ctr; @@ -1408,48 +1599,43 @@ 
RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, else { /* Normal case: enter a request to fsync this segment */ - PendingOperationTag key; + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); PendingOperationEntry *entry; bool found; - /* ensure any pad bytes in the hash key are zeroed */ - MemSet(&key, 0, sizeof(key)); - key.rnode = rnode; - key.forknum = forknum; - key.segno = segno; - entry = (PendingOperationEntry *) hash_search(pendingOpsTable, - &key, + &rnode, HASH_ENTER, &found); - /* if new or previously canceled entry, initialize it */ - if (!found || entry->canceled) + /* if new entry, initialize it */ + if (!found) { - entry->canceled = false; entry->cycle_ctr = mdsync_cycle_ctr; + MemSet(entry->requests, 0, sizeof(entry->requests)); + MemSet(entry->canceled, 0, sizeof(entry->canceled)); } /* * NB: it's intentional that we don't change cycle_ctr if the entry - * already exists. The fsync request must be treated as old, even - * though the new request will be satisfied too by any subsequent - * fsync. - * - * However, if the entry is present but is marked canceled, we should - * act just as though it wasn't there. The only case where this could - * happen would be if a file had been deleted, we received but did not - * yet act on the cancel request, and the same relfilenode was then - * assigned to a new file. We mustn't lose the new request, but it - * should be considered new not old. + * already exists. The cycle_ctr must represent the oldest fsync + * request that could be in the entry. */ + + entry->requests[forknum] = bms_add_member(entry->requests[forknum], + (int) segno); + + MemoryContextSwitchTo(oldcxt); } } /* - * ForgetRelationFsyncRequests -- forget any fsyncs for a rel + * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork + * + * forknum == InvalidForkNumber means all forks, although this code doesn't + * actually know that, since it's just forwarding the request elsewhere. 
*/ void -ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) +ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum) { if (pendingOpsTable) { @@ -1459,21 +1645,21 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) else if (IsUnderPostmaster) { /* - * Notify the bgwriter about it. If we fail to queue the revoke + * Notify the checkpointer about it. If we fail to queue the cancel * message, we have to sleep and try again ... ugly, but hopefully * won't happen often. * * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an * error would leave the no-longer-used file still present on disk, - * which would be bad, so I'm inclined to assume that the bgwriter + * which would be bad, so I'm inclined to assume that the checkpointer * will always empty the queue soon. */ while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC)) pg_usleep(10000L); /* 10 msec seems a good number */ /* - * Note we don't wait for the bgwriter to actually absorb the revoke - * message; see mdsync() for the implications. + * Note we don't wait for the checkpointer to actually absorb the + * cancel message; see mdsync() for the implications. 
*/ } } @@ -1484,12 +1670,11 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) void ForgetDatabaseFsyncRequests(Oid dbid) { - RelFileNodeBackend rnode; + RelFileNode rnode; - rnode.node.dbNode = dbid; - rnode.node.spcNode = 0; - rnode.node.relNode = 0; - rnode.backend = InvalidBackendId; + rnode.dbNode = dbid; + rnode.spcNode = 0; + rnode.relNode = 0; if (pendingOpsTable) { @@ -1505,14 +1690,79 @@ ForgetDatabaseFsyncRequests(Oid dbid) } } +/* + * DropRelationFiles -- drop files of all given relations + */ +void +DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) +{ + SMgrRelation *srels; + int i; + + srels = palloc(sizeof(SMgrRelation) * ndelrels); + for (i = 0; i < ndelrels; i++) + { + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); + + if (isRedo) + { + ForkNumber fork; + + for (fork = 0; fork <= MAX_FORKNUM; fork++) + XLogDropRelation(delrels[i], fork); + } + srels[i] = srel; + } + + smgrdounlinkall(srels, ndelrels, isRedo); + + /* + * Call smgrclose() in reverse order as when smgropen() is called. + * This trick enables remove_from_unowned_list() in smgrclose() + * to search the SMgrRelation from the unowned list, + * with O(1) performance. + */ + for (i = ndelrels - 1; i >= 0; i--) + smgrclose(srels[i]); + pfree(srels); +} + /* - * _fdvec_alloc() -- Make a MdfdVec object. 
+ * _fdvec_resize() -- Resize the fork's open segments array */ -static MdfdVec * -_fdvec_alloc(void) +static void +_fdvec_resize(SMgrRelation reln, + ForkNumber forknum, + int nseg) { - return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); + if (nseg == 0) + { + if (reln->md_num_open_segs[forknum] > 0) + { + pfree(reln->md_seg_fds[forknum]); + reln->md_seg_fds[forknum] = NULL; + } + } + else if (reln->md_num_open_segs[forknum] == 0) + { + reln->md_seg_fds[forknum] = + MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg); + } + else + { + /* + * It doesn't seem worthwhile complicating the code by having a more + * aggressive growth strategy here; the number of segments doesn't + * grow that fast, and the memory context internally will sometimes + * avoid doing an actual reallocation. + */ + reln->md_seg_fds[forknum] = + repalloc(reln->md_seg_fds[forknum], + sizeof(MdfdVec) * nseg); + } + + reln->md_num_open_segs[forknum] = nseg; } /* @@ -1529,9 +1779,7 @@ _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) if (segno > 0) { - /* be sure we have enough space for the '.segno' */ - fullpath = (char *) palloc(strlen(path) + 12); - sprintf(fullpath, "%s.%u", path, segno); + fullpath = psprintf("%s.%u", path, segno); pfree(path); } else @@ -1555,23 +1803,21 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, fullpath = _mdfd_segpath(reln, forknum, segno); /* open the file */ - fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600); + fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); pfree(fullpath); if (fd < 0) return NULL; - if (reln->smgr_transient) - FileSetTransient(fd); - - /* allocate an mdfdvec entry for it */ - v = _fdvec_alloc(); + if (segno <= reln->md_num_open_segs[forknum]) + _fdvec_resize(reln, forknum, segno + 1); /* fill the entry */ + v = &reln->md_seg_fds[forknum][segno]; v->mdfd_vfd = fd; v->mdfd_segno = segno; - v->mdfd_chain = NULL; + Assert(_mdnblocks(reln, forknum, v) <= 
((BlockNumber) RELSEG_SIZE)); /* all done */ @@ -1588,68 +1834,126 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, */ static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, - bool skipFsync, ExtensionBehavior behavior) + bool skipFsync, int behavior) { - MdfdVec *v = mdopen(reln, forknum, behavior); + MdfdVec *v; BlockNumber targetseg; BlockNumber nextsegno; - if (!v) - return NULL; /* only possible if EXTENSION_RETURN_NULL */ + /* some way to handle non-existent segments needs to be specified */ + Assert(behavior & + (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL)); targetseg = blkno / ((BlockNumber) RELSEG_SIZE); - for (nextsegno = 1; nextsegno <= targetseg; nextsegno++) + + /* if an existing and opened segment, we're done */ + if (targetseg < reln->md_num_open_segs[forknum]) + { + v = &reln->md_seg_fds[forknum][targetseg]; + return v; + } + + /* + * The target segment is not yet open. Iterate over all the segments + * between the last opened and the target segment. This way missing + * segments either raise an error, or get created (according to + * 'behavior'). Start with either the last opened, or the first segment if + * none was opened before. 
+ */ + if (reln->md_num_open_segs[forknum] > 0) + v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1]; + else { + v = mdopen(reln, forknum, behavior); + if (!v) + return NULL; /* if behavior & EXTENSION_RETURN_NULL */ + } + + for (nextsegno = reln->md_num_open_segs[forknum]; + nextsegno <= targetseg; nextsegno++) + { + BlockNumber nblocks = _mdnblocks(reln, forknum, v); + int flags = 0; + Assert(nextsegno == v->mdfd_segno + 1); - if (v->mdfd_chain == NULL) + if (nblocks > ((BlockNumber) RELSEG_SIZE)) + elog(FATAL, "segment too big"); + + if ((behavior & EXTENSION_CREATE) || + (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY))) { /* * Normally we will create new segments only if authorized by the - * caller (i.e., we are doing mdextend()). But when doing WAL + * caller (i.e., we are doing mdextend()). But when doing WAL * recovery, create segments anyway; this allows cases such as * replaying WAL data that has a write into a high-numbered - * segment of a relation that was later deleted. We want to go + * segment of a relation that was later deleted. We want to go * ahead and create the segments so we can finish out the replay. + * However if the caller has specified + * EXTENSION_REALLY_RETURN_NULL, then extension is not desired + * even in recovery; we won't reach this point in that case. * * We have to maintain the invariant that segments before the last - * active segment are of size RELSEG_SIZE; therefore, pad them out - * with zeroes if needed. (This only matters if caller is - * extending the relation discontiguously, but that can happen in - * hash indexes.) + * active segment are of size RELSEG_SIZE; therefore, if + * extending, pad them out with zeroes if needed. (This only + * matters if in recovery, or if the caller is extending the + * relation discontiguously, but that can happen in hash indexes.) 
*/ - if (behavior == EXTENSION_CREATE || InRecovery) + if (nblocks < ((BlockNumber) RELSEG_SIZE)) { - if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE) - { - char *zerobuf = palloc0(BLCKSZ); + char *zerobuf = palloc0(BLCKSZ); - mdextend(reln, forknum, - nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, - zerobuf, skipFsync); - pfree(zerobuf); - } - v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT); + mdextend(reln, forknum, + nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, + zerobuf, skipFsync); + pfree(zerobuf); } - else - { - /* We won't create segment if not existent */ - v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0); - } - if (v->mdfd_chain == NULL) + flags = O_CREAT; + } + else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) && + nblocks < ((BlockNumber) RELSEG_SIZE)) + { + /* + * When not extending (or explicitly including truncated + * segments), only open the next segment if the current one is + * exactly RELSEG_SIZE. If not (this branch), either return NULL + * or fail. + */ + if (behavior & EXTENSION_RETURN_NULL) { - if (behavior == EXTENSION_RETURN_NULL && - FILE_POSSIBLY_DELETED(errno)) - return NULL; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" (target block %u): %m", - _mdfd_segpath(reln, forknum, nextsegno), - blkno))); + /* + * Some callers discern between reasons for _mdfd_getseg() + * returning NULL based on errno. As there's no failing + * syscall involved in this case, explicitly set errno to + * ENOENT, as that seems the closest interpretation. 
+ */ + errno = ENOENT; + return NULL; } + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks", + _mdfd_segpath(reln, forknum, nextsegno), + blkno, nblocks))); + } + + v = _mdfd_openseg(reln, forknum, nextsegno, flags); + + if (v == NULL) + { + if ((behavior & EXTENSION_RETURN_NULL) && + FILE_POSSIBLY_DELETED(errno)) + return NULL; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" (target block %u): %m", + _mdfd_segpath(reln, forknum, nextsegno), + blkno))); } - v = v->mdfd_chain; } + return v; } @@ -1661,7 +1965,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { off_t len; - len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END); + len = FileSize(seg->mdfd_vfd); if (len < 0) ereport(ERROR, (errcode_for_file_access(),