]> granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c
Send only one FORGET_RELATION_FSYNC request when dropping a relation.
[postgresql] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/storage/smgr/md.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "miscadmin.h"
22 #include "access/xlog.h"
23 #include "catalog/catalog.h"
24 #include "portability/instr_time.h"
25 #include "postmaster/bgwriter.h"
26 #include "storage/fd.h"
27 #include "storage/bufmgr.h"
28 #include "storage/relfilenode.h"
29 #include "storage/smgr.h"
30 #include "utils/hsearch.h"
31 #include "utils/memutils.h"
32 #include "pg_trace.h"
33
34
35 /* interval for calling AbsorbFsyncRequests in mdsync */
36 #define FSYNCS_PER_ABSORB               10
37
38 /*
39  * Special values for the segno arg to RememberFsyncRequest.
40  *
41  * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
42  * fsync request from the queue if an identical, subsequent request is found.
43  * See comments there before making changes here.
44  */
45 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
46 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
47 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
48
49 /*
50  * On Windows, we have to interpret EACCES as possibly meaning the same as
51  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
52  * that's what you get.  Ugh.  This code is designed so that we don't
53  * actually believe these cases are okay without further evidence (namely,
54  * a pending fsync request getting revoked ... see mdsync).
55  */
56 #ifndef WIN32
57 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
58 #else
59 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
60 #endif
61
62 /*
63  *      The magnetic disk storage manager keeps track of open file
64  *      descriptors in its own descriptor pool.  This is done to make it
65  *      easier to support relations that are larger than the operating
66  *      system's file size limit (often 2GBytes).  In order to do that,
67  *      we break relations up into "segment" files that are each shorter than
68  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
69  *      configuration constant in pg_config.h.
70  *
71  *      On disk, a relation must consist of consecutively numbered segment
72  *      files in the pattern
73  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
74  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
75  *              -- Optionally, any number of inactive segments of size 0 blocks.
76  *      The full and partial segments are collectively the "active" segments.
77  *      Inactive segments are those that once contained data but are currently
78  *      not needed because of an mdtruncate() operation.  The reason for leaving
79  *      them present at size zero, rather than unlinking them, is that other
80  *      backends and/or the checkpointer might be holding open file references to
81  *      such segments.  If the relation expands again after mdtruncate(), such
82  *      that a deactivated segment becomes active again, it is important that
83  *      such file references still be valid --- else data might get written
84  *      out to an unlinked old copy of a segment file that will eventually
85  *      disappear.
86  *
87  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
88  *      cache is, therefore, just the head of a list of MdfdVec objects, one
89  *      per segment.  But note the md_fd pointer can be NULL, indicating
90  *      relation not open.
91  *
92  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
93  *      doesn't have another segment after this one; we may just not have
94  *      opened the next segment yet.  (We could not have "all segments are
95  *      in the chain" as an invariant anyway, since another backend could
96  *      extend the relation when we weren't looking.)  We do not make chain
97  *      entries for inactive segments, however; as soon as we find a partial
98  *      segment, we assume that any subsequent segments are inactive.
99  *
100  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
101  */
102
103 typedef struct _MdfdVec
104 {
105         File            mdfd_vfd;               /* fd number in fd.c's pool */
106         BlockNumber mdfd_segno;         /* segment number, from 0 */
107         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
108 } MdfdVec;
109
110 static MemoryContext MdCxt;             /* context for all md.c allocations */
111
112
113 /*
114  * In some contexts (currently, standalone backends and the checkpointer process)
115  * we keep track of pending fsync operations: we need to remember all relation
116  * segments that have been written since the last checkpoint, so that we can
117  * fsync them down to disk before completing the next checkpoint.  This hash
118  * table remembers the pending operations.      We use a hash table mostly as
119  * a convenient way of eliminating duplicate requests.
120  *
121  * We use a similar mechanism to remember no-longer-needed files that can
122  * be deleted after the next checkpoint, but we use a linked list instead of
123  * a hash table, because we don't expect there to be any duplicate requests.
124  *
125  * These mechanisms are only used for non-temp relations; we never fsync
126  * temp rels, nor do we need to postpone their deletion (see comments in
127  * mdunlink).
128  *
129  * (Regular backends do not track pending operations locally, but forward
130  * them to the checkpointer.)
131  */
132 typedef struct
133 {
134         RelFileNode     rnode;                  /* the targeted relation */
135         ForkNumber      forknum;                /* which fork */
136         BlockNumber segno;                      /* which segment */
137 } PendingOperationTag;
138
139 typedef uint16 CycleCtr;                /* can be any convenient integer size */
140
141 typedef struct
142 {
143         PendingOperationTag tag;        /* hash table key (must be first!) */
144         bool            canceled;               /* T => request canceled, not yet removed */
145         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
146 } PendingOperationEntry;
147
148 typedef struct
149 {
150         RelFileNode     rnode;                  /* the dead relation to delete */
151         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
152 } PendingUnlinkEntry;
153
154 static HTAB *pendingOpsTable = NULL;
155 static List *pendingUnlinks = NIL;
156
157 static CycleCtr mdsync_cycle_ctr = 0;
158 static CycleCtr mdckpt_cycle_ctr = 0;
159
160
161 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
162 {
163         EXTENSION_FAIL,                         /* ereport if segment not present */
164         EXTENSION_RETURN_NULL,          /* return NULL if not present */
165         EXTENSION_CREATE                        /* create new segments as needed */
166 } ExtensionBehavior;
167
168 /* local routines */
169 static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
170                                                  bool isRedo);
171 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
172            ExtensionBehavior behavior);
173 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
174                                            MdfdVec *seg);
175 static void register_unlink(RelFileNodeBackend rnode);
176 static MdfdVec *_fdvec_alloc(void);
177 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
178                           BlockNumber segno);
179 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
180                           BlockNumber segno, int oflags);
181 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
182                          BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
183 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
184                    MdfdVec *seg);
185
186
187 /*
188  *      mdinit() -- Initialize private state for magnetic disk storage manager.
189  */
190 void
191 mdinit(void)
192 {
193         MdCxt = AllocSetContextCreate(TopMemoryContext,
194                                                                   "MdSmgr",
195                                                                   ALLOCSET_DEFAULT_MINSIZE,
196                                                                   ALLOCSET_DEFAULT_INITSIZE,
197                                                                   ALLOCSET_DEFAULT_MAXSIZE);
198
199         /*
200          * Create pending-operations hashtable if we need it.  Currently, we need
201          * it if we are standalone (not under a postmaster) or if we are a startup
202          * or checkpointer auxiliary process.
203          */
204         if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
205         {
206                 HASHCTL         hash_ctl;
207
208                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
209                 hash_ctl.keysize = sizeof(PendingOperationTag);
210                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
211                 hash_ctl.hash = tag_hash;
212                 hash_ctl.hcxt = MdCxt;
213                 pendingOpsTable = hash_create("Pending Ops Table",
214                                                                           100L,
215                                                                           &hash_ctl,
216                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
217                 pendingUnlinks = NIL;
218         }
219 }
220
221 /*
222  * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
223  * already created the pendingOpsTable during initialization of the startup
224  * process.  Calling this function drops the local pendingOpsTable so that
225  * subsequent requests will be forwarded to checkpointer.
226  */
227 void
228 SetForwardFsyncRequests(void)
229 {
230         /* Perform any pending ops we may have queued up */
231         if (pendingOpsTable)
232                 mdsync();
233         pendingOpsTable = NULL;
234 }
235
236 /*
237  *      mdexists() -- Does the physical file exist?
238  *
239  * Note: this will return true for lingering files, with pending deletions
240  */
241 bool
242 mdexists(SMgrRelation reln, ForkNumber forkNum)
243 {
244         /*
245          * Close it first, to ensure that we notice if the fork has been unlinked
246          * since we opened it.
247          */
248         mdclose(reln, forkNum);
249
250         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
251 }
252
/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 *
 * On success, the fork's first segment is left open, with its MdfdVec
 * installed in reln->md_fd[forkNum].
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	File		fd;

	/* During redo the fork may already have been created and opened */
	if (isRedo && reln->md_fd[forkNum] != NULL)
		return;					/* created and opened already... */

	Assert(reln->md_fd[forkNum] == NULL);

	path = relpath(reln->smgr_rnode, forkNum);

	/* O_EXCL: normally we insist on creating the file ourselves */
	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		/* Save errno now: the retry open below could overwrite it */
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m", path)));
		}
	}

	pfree(path);

	if (reln->smgr_transient)
		FileSetTransient(fd);

	/* Install the descriptor as segment 0 of this fork */
	reln->md_fd[forkNum] = _fdvec_alloc();

	reln->md_fd[forkNum]->mdfd_vfd = fd;
	reln->md_fd[forkNum]->mdfd_segno = 0;
	reln->md_fd[forkNum]->mdfd_chain = NULL;
}
306
307 /*
308  *      mdunlink() -- Unlink a relation.
309  *
310  * Note that we're passed a RelFileNodeBackend --- by the time this is called,
311  * there won't be an SMgrRelation hashtable entry anymore.
312  *
313  * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
314  * to delete all forks.
315  *
316  * For regular relations, we don't unlink the first segment file of the rel,
317  * but just truncate it to zero length, and record a request to unlink it after
318  * the next checkpoint.  Additional segments can be unlinked immediately,
319  * however.  Leaving the empty file in place prevents that relfilenode
320  * number from being reused.  The scenario this protects us from is:
321  * 1. We delete a relation (and commit, and actually remove its file).
322  * 2. We create a new relation, which by chance gets the same relfilenode as
323  *        the just-deleted one (OIDs must've wrapped around for that to happen).
324  * 3. We crash before another checkpoint occurs.
325  * During replay, we would delete the file and then recreate it, which is fine
326  * if the contents of the file were repopulated by subsequent WAL entries.
327  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
328  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
329  * the contents of the file would be lost forever.      By leaving the empty file
330  * until after the next checkpoint, we prevent reassignment of the relfilenode
331  * number until it's safe, because relfilenode assignment skips over any
332  * existing file.
333  *
334  * We do not need to go through this dance for temp relations, though, because
335  * we never make WAL entries for temp rels, and so a temp rel poses no threat
336  * to the health of a regular rel that has taken over its relfilenode number.
337  * The fact that temp rels and regular rels have different file naming
338  * patterns provides additional safety.
339  *
340  * All the above applies only to the relation's main fork; other forks can
341  * just be removed immediately, since they are not needed to prevent the
342  * relfilenode number from being recycled.      Also, we do not carefully
343  * track whether other forks have been created or not, but just attempt to
344  * unlink them unconditionally; so we should never complain about ENOENT.
345  *
346  * If isRedo is true, it's unsurprising for the relation to be already gone.
347  * Also, we should remove the file immediately instead of queuing a request
348  * for later, since during redo there's no possibility of creating a
349  * conflicting relation.
350  *
351  * Note: any failure should be reported as WARNING not ERROR, because
352  * we are usually not in a transaction anymore when this is called.
353  */
354 void
355 mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
356 {
357         /*
358          * We have to clean out any pending fsync requests for the doomed
359          * relation, else the next mdsync() will fail.  There can't be any such
360          * requests for a temp relation, though.  We can send just one request
361          * even when deleting multiple forks, since the fsync queuing code accepts
362          * the "InvalidForkNumber = all forks" convention.
363          */
364         if (!RelFileNodeBackendIsTemp(rnode))
365                 ForgetRelationFsyncRequests(rnode.node, forkNum);
366
367         /* Now do the per-fork work */
368         if (forkNum == InvalidForkNumber)
369         {
370                 for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
371                         mdunlinkfork(rnode, forkNum, isRedo);
372         }
373         else
374                 mdunlinkfork(rnode, forkNum, isRedo);
375 }
376
/*
 * mdunlinkfork() -- Work horse for mdunlink: remove one fork of a relation.
 *
 * Per the rules described in mdunlink's header comment, the main fork's
 * first segment is normally truncated to zero length (and queued for unlink
 * after the next checkpoint) rather than removed immediately; redo and temp
 * relations, and non-main forks, are unlinked outright.  Failures are
 * reported as WARNING, not ERROR, since we may not be in a transaction.
 */
static void
mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
	{
		/* These cases are safe to unlink immediately */
		ret = unlink(path);
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m", path)));
	}
	else
	{
		/* truncate(2) would be easier here, but Windows hasn't got it */
		int			fd;

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd >= 0)
		{
			int			save_errno;

			ret = ftruncate(fd, 0);
			save_errno = errno; /* close() must not clobber ftruncate's errno */
			close(fd);
			errno = save_errno;
		}
		else
			ret = -1;			/* report the open failure below */
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not truncate file \"%s\": %m", path)));

		/* Register request to unlink first segment later */
		register_unlink(rnode);
	}

	/*
	 * Delete any additional segments.
	 */
	if (ret >= 0)
	{
		/* +12 leaves room for ".", up to 10 segno digits, and the NUL */
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
						   errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);
}
452
/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 *
 *		If skipFsync is true (or the relation is temp), the dirty segment
 *		is not registered for fsync at the next checkpoint.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln, forknum));
#endif

	/*
	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
	 * more --- we mustn't create a block whose number actually is
	 * InvalidBlockNumber.
	 */
	if (blocknum == InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rnode, forknum),
						InvalidBlockNumber)));

	/* EXTENSION_CREATE: make new segment files as needed */
	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

	/* Byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/*
	 * Note: because caller usually obtained blocknum by calling mdnblocks,
	 * which did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.	It's not redundant, however, if there is a
	 * partial page at the end of the file. In that case we want to try to
	 * overwrite the partial page with a full page.  It's also not redundant
	 * if bufmgr.c had to dump another buffer of the same file to make room
	 * for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		/* nbytes < 0 means an outright I/O error */
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend file \"%s\": %m",
							FilePathName(v->mdfd_vfd)),
					 errhint("Check free disk space.")));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ, blocknum),
				 errhint("Check free disk space.")));
	}

	/* Remember to fsync this segment unless caller opted out or rel is temp */
	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
530
/*
 *	mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 *
 * On success, the MdfdVec for segment 0 is cached in reln->md_fd[forknum]
 * and returned.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_fd[forknum])
		return reln->md_fd[forknum];

	path = relpath(reln->smgr_rnode, forknum);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* FILE_POSSIBLY_DELETED: on Windows, EACCES may mean ENOENT */
			if (behavior == EXTENSION_RETURN_NULL &&
				FILE_POSSIBLY_DELETED(errno))
			{
				pfree(path);
				return NULL;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
	}

	pfree(path);

	if (reln->smgr_transient)
		FileSetTransient(fd);

	/* Cache the open segment-0 descriptor for future calls */
	reln->md_fd[forknum] = mdfd = _fdvec_alloc();

	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
	mdfd->mdfd_chain = NULL;
	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

	return mdfd;
}
594
595 /*
596  *      mdclose() -- Close the specified relation, if it isn't closed already.
597  */
598 void
599 mdclose(SMgrRelation reln, ForkNumber forknum)
600 {
601         MdfdVec    *v = reln->md_fd[forknum];
602
603         /* No work if already closed */
604         if (v == NULL)
605                 return;
606
607         reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
608
609         while (v != NULL)
610         {
611                 MdfdVec    *ov = v;
612
613                 /* if not closed already */
614                 if (v->mdfd_vfd >= 0)
615                         FileClose(v->mdfd_vfd);
616                 /* Now free vector */
617                 v = v->mdfd_chain;
618                 pfree(ov);
619         }
620 }
621
622 /*
623  *      mdprefetch() -- Initiate asynchronous read of the specified block of a relation
624  */
625 void
626 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
627 {
628 #ifdef USE_PREFETCH
629         off_t           seekpos;
630         MdfdVec    *v;
631
632         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
633
634         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
635
636         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
637
638         (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
639 #endif   /* USE_PREFETCH */
640 }
641
642
/*
 *	mdread() -- Read the specified block from a relation.
 *
 * The block is read into "buffer", which must be at least BLCKSZ bytes.
 * A short read at EOF is tolerated (returning zeroes) only when
 * zero_damaged_pages is on or we are in recovery; otherwise it's an error.
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
	   char *buffer)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* DTrace/systemtap probe: read starting */
	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

	/* Byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);

	/* DTrace/systemtap probe: read done (fires even on short read) */
	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
									   reln->smgr_rnode.node.spcNode,
									   reln->smgr_rnode.node.dbNode,
									   reln->smgr_rnode.node.relNode,
									   reln->smgr_rnode.backend,
									   nbytes,
									   BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		/* negative result is a hard I/O error */
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));

		/*
		 * Short read: we are at or past EOF, or we read a partial block at
		 * EOF.  Normally this is an error; upper levels should never try to
		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
		 * we are InRecovery, we should instead return zeroes without
		 * complaining.  This allows, for example, the case of trying to
		 * update a block that was later truncated away.
		 */
		if (zero_damaged_pages || InRecovery)
			MemSet(buffer, 0, BLCKSZ);
		else
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
							blocknum, FilePathName(v->mdfd_vfd),
							nbytes, BLCKSZ)));
	}
}
708
709 /*
710  *      mdwrite() -- Write the supplied block at the appropriate location.
711  *
712  *              This is to be used only for updating already-existing blocks of a
713  *              relation (ie, those before the current EOF).  To extend a relation,
714  *              use mdextend().
715  */
716 void
717 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
718                 char *buffer, bool skipFsync)
719 {
720         off_t           seekpos;
721         int                     nbytes;
722         MdfdVec    *v;
723
724         /* This assert is too expensive to have on normally ... */
725 #ifdef CHECK_WRITE_VS_EXTEND
726         Assert(blocknum < mdnblocks(reln, forknum));
727 #endif
728
729         TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
730                                                                                  reln->smgr_rnode.node.spcNode,
731                                                                                  reln->smgr_rnode.node.dbNode,
732                                                                                  reln->smgr_rnode.node.relNode,
733                                                                                  reln->smgr_rnode.backend);
734
735         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);
736
737         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
738
739         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
740
741         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
742                 ereport(ERROR,
743                                 (errcode_for_file_access(),
744                                  errmsg("could not seek to block %u in file \"%s\": %m",
745                                                 blocknum, FilePathName(v->mdfd_vfd))));
746
747         nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
748
749         TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
750                                                                                 reln->smgr_rnode.node.spcNode,
751                                                                                 reln->smgr_rnode.node.dbNode,
752                                                                                 reln->smgr_rnode.node.relNode,
753                                                                                 reln->smgr_rnode.backend,
754                                                                                 nbytes,
755                                                                                 BLCKSZ);
756
757         if (nbytes != BLCKSZ)
758         {
759                 if (nbytes < 0)
760                         ereport(ERROR,
761                                         (errcode_for_file_access(),
762                                          errmsg("could not write block %u in file \"%s\": %m",
763                                                         blocknum, FilePathName(v->mdfd_vfd))));
764                 /* short write: complain appropriately */
765                 ereport(ERROR,
766                                 (errcode(ERRCODE_DISK_FULL),
767                                  errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
768                                                 blocknum,
769                                                 FilePathName(v->mdfd_vfd),
770                                                 nbytes, BLCKSZ),
771                                  errhint("Check free disk space.")));
772         }
773
774         if (!skipFsync && !SmgrIsTemp(reln))
775                 register_dirty_segment(reln, forknum, v);
776 }
777
778 /*
779  *      mdnblocks() -- Get the number of blocks stored in a relation.
780  *
781  *              Important side effect: all active segments of the relation are opened
782  *              and added to the mdfd_chain list.  If this routine has not been
783  *              called, then only segments up to the last one actually touched
784  *              are present in the chain.
785  */
786 BlockNumber
787 mdnblocks(SMgrRelation reln, ForkNumber forknum)
788 {
789         MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
790         BlockNumber nblocks;
791         BlockNumber segno = 0;
792
793         /*
794          * Skip through any segments that aren't the last one, to avoid redundant
795          * seeks on them.  We have previously verified that these segments are
796          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
797          *
798          * NOTE: this assumption could only be wrong if another backend has
799          * truncated the relation.      We rely on higher code levels to handle that
800          * scenario by closing and re-opening the md fd, which is handled via
801          * relcache flush.      (Since the checkpointer doesn't participate in
802          * relcache flush, it could have segment chain entries for inactive
803          * segments; that's OK because the checkpointer never needs to compute
804          * relation size.)
805          */
806         while (v->mdfd_chain != NULL)
807         {
808                 segno++;
809                 v = v->mdfd_chain;
810         }
811
812         for (;;)
813         {
814                 nblocks = _mdnblocks(reln, forknum, v);
815                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
816                         elog(FATAL, "segment too big");
817                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
818                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
819
820                 /*
821                  * If segment is exactly RELSEG_SIZE, advance to next one.
822                  */
823                 segno++;
824
825                 if (v->mdfd_chain == NULL)
826                 {
827                         /*
828                          * Because we pass O_CREAT, we will create the next segment (with
829                          * zero length) immediately, if the last segment is of length
830                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
831                          * the logic simple.
832                          */
833                         v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
834                         if (v->mdfd_chain == NULL)
835                                 ereport(ERROR,
836                                                 (errcode_for_file_access(),
837                                                  errmsg("could not open file \"%s\": %m",
838                                                                 _mdfd_segpath(reln, forknum, segno))));
839                 }
840
841                 v = v->mdfd_chain;
842         }
843 }
844
/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 *
 * Walks the segment chain, zero-truncating segments wholly past the new
 * EOF (but keeping their files on disk) and shortening the segment that
 * contains the new EOF.  A request to grow the relation is an error,
 * except during recovery, where it is silently ignored.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	MdfdVec    *v;
	BlockNumber curnblk;
	BlockNumber priorblocks;	/* blocks in segments before the current one */

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	v = mdopen(reln, forknum, EXTENSION_FAIL);

	priorblocks = 0;
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active (and has already been unlinked
			 * from the mdfd_chain). We truncate the file, but do not delete
			 * it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			/* ensure the truncation reaches disk at the next checkpoint */
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			/* advance before freeing the MdfdVec we are standing on */
			v = v->mdfd_chain;
			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
												 * segment */
			pfree(ov);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length, and clear chain link that points to any
			 * remaining segments (which we shall zap). NOTE: if nblocks is
			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
			 * segment to 0 length but keep it. This adheres to the invariant
			 * given in the header comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
					errmsg("could not truncate file \"%s\" to %u blocks: %m",
						   FilePathName(v->mdfd_vfd),
						   nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			/* detach the rest of the chain; the loop above will zap them */
			v = v->mdfd_chain;
			ov->mdfd_chain = NULL;
		}
		else
		{
			/*
			 * We still need this segment and 0 or more blocks beyond it, so
			 * nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
	}
}
934
935 /*
936  *      mdimmedsync() -- Immediately sync a relation to stable storage.
937  *
938  * Note that only writes already issued are synced; this routine knows
939  * nothing of dirty buffers that may exist inside the buffer manager.
940  */
941 void
942 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
943 {
944         MdfdVec    *v;
945
946         /*
947          * NOTE: mdnblocks makes sure we have opened all active segments, so that
948          * fsync loop will get them all!
949          */
950         mdnblocks(reln, forknum);
951
952         v = mdopen(reln, forknum, EXTENSION_FAIL);
953
954         while (v != NULL)
955         {
956                 if (FileSync(v->mdfd_vfd) < 0)
957                         ereport(ERROR,
958                                         (errcode_for_file_access(),
959                                          errmsg("could not fsync file \"%s\": %m",
960                                                         FilePathName(v->mdfd_vfd))));
961                 v = v->mdfd_chain;
962         }
963 }
964
/*
 *	mdsync() -- Sync previous writes to stable storage.
 *
 * Processes every entry accumulated in pendingOpsTable, fsync'ing the
 * corresponding segment files.  Entries added after this cycle starts are
 * deferred to the next call (distinguished via mdsync_cycle_ctr).  On any
 * ERROR exit, mdsync_in_progress stays true so the next call can repair
 * stale cycle counters before retrying.
 */
void
mdsync(void)
{
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in checkpointer, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.  Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * checkpointer, and it will periodically do smgrcloseall().
				 * This fact justifies our not closing the reln in the success
				 * path either, which is a good thing since in
				 * non-checkpointer cases we couldn't safely do that.)
				 */
				reln = smgropen(entry->tag.rnode, InvalidBackendId);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
						  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);

				INSTR_TIME_SET_CURRENT(sync_start);

				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
				{
					/* success: accumulate per-checkpoint sync statistics */
					INSTR_TIME_SET_CURRENT(sync_end);
					sync_diff = sync_end;
					INSTR_TIME_SUBTRACT(sync_diff, sync_start);
					elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
					if (elapsed > longest)
						longest = elapsed;
					total_elapsed += elapsed;
					processed++;
					if (log_checkpoints)
						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
							 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);

					break;		/* success; break out of retry loop */
				}

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
1194
1195 /*
1196  * mdpreckpt() -- Do pre-checkpoint work
1197  *
1198  * To distinguish unlink requests that arrived before this checkpoint
1199  * started from those that arrived during the checkpoint, we use a cycle
1200  * counter similar to the one we use for fsync requests. That cycle
1201  * counter is incremented here.
1202  *
1203  * This must be called *before* the checkpoint REDO point is determined.
1204  * That ensures that we won't delete files too soon.
1205  *
1206  * Note that we can't do anything here that depends on the assumption
1207  * that the checkpoint will be completed.
1208  */
1209 void
1210 mdpreckpt(void)
1211 {
1212         ListCell   *cell;
1213
1214         /*
1215          * In case the prior checkpoint wasn't completed, stamp all entries in the
1216          * list with the current cycle counter.  Anything that's in the list at
1217          * the start of checkpoint can surely be deleted after the checkpoint is
1218          * finished, regardless of when the request was made.
1219          */
1220         foreach(cell, pendingUnlinks)
1221         {
1222                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1223
1224                 entry->cycle_ctr = mdckpt_cycle_ctr;
1225         }
1226
1227         /*
1228          * Any unlink requests arriving after this point will be assigned the next
1229          * cycle counter, and won't be unlinked until next checkpoint.
1230          */
1231         mdckpt_cycle_ctr++;
1232 }
1233
1234 /*
1235  * mdpostckpt() -- Do post-checkpoint work
1236  *
1237  * Remove any lingering files that can now be safely removed.
1238  */
1239 void
1240 mdpostckpt(void)
1241 {
1242         while (pendingUnlinks != NIL)
1243         {
1244                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1245                 char       *path;
1246
1247                 /*
1248                  * New entries are appended to the end, so if the entry is new we've
1249                  * reached the end of old entries.
1250                  */
1251                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1252                         break;
1253
1254                 /* Else assert we haven't missed it */
1255                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1256
1257                 /* Unlink the file */
1258                 path = relpathperm(entry->rnode, MAIN_FORKNUM);
1259                 if (unlink(path) < 0)
1260                 {
1261                         /*
1262                          * There's a race condition, when the database is dropped at the
1263                          * same time that we process the pending unlink requests. If the
1264                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1265                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1266                          * the possibility that we delete the file first.
1267                          */
1268                         if (errno != ENOENT)
1269                                 ereport(WARNING,
1270                                                 (errcode_for_file_access(),
1271                                                  errmsg("could not remove file \"%s\": %m", path)));
1272                 }
1273                 pfree(path);
1274
1275                 pendingUnlinks = list_delete_first(pendingUnlinks);
1276                 pfree(entry);
1277         }
1278 }
1279
1280 /*
1281  * register_dirty_segment() -- Mark a relation segment as needing fsync
1282  *
1283  * If there is a local pending-ops table, just make an entry in it for
1284  * mdsync to process later.  Otherwise, try to pass off the fsync request
1285  * to the checkpointer process.  If that fails, just do the fsync
1286  * locally before returning (we hope this will not happen often enough
1287  * to be a performance problem).
1288  */
1289 static void
1290 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1291 {
1292         /* Temp relations should never be fsync'd */
1293         Assert(!SmgrIsTemp(reln));
1294
1295         if (pendingOpsTable)
1296         {
1297                 /* push it into local pending-ops table */
1298                 RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
1299         }
1300         else
1301         {
1302                 if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
1303                         return;                         /* passed it off successfully */
1304
1305                 ereport(DEBUG1,
1306                                 (errmsg("could not forward fsync request because request queue is full")));
1307
1308                 if (FileSync(seg->mdfd_vfd) < 0)
1309                         ereport(ERROR,
1310                                         (errcode_for_file_access(),
1311                                          errmsg("could not fsync file \"%s\": %m",
1312                                                         FilePathName(seg->mdfd_vfd))));
1313         }
1314 }
1315
1316 /*
1317  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1318  *
1319  * We don't bother passing in the fork number, because this is only used
1320  * with main forks.
1321  *
1322  * As with register_dirty_segment, this could involve either a local or
1323  * a remote pending-ops table.
1324  */
1325 static void
1326 register_unlink(RelFileNodeBackend rnode)
1327 {
1328         /* Should never be used with temp relations */
1329         Assert(!RelFileNodeBackendIsTemp(rnode));
1330
1331         if (pendingOpsTable)
1332         {
1333                 /* push it into local pending-ops table */
1334                 RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
1335                                                          UNLINK_RELATION_REQUEST);
1336         }
1337         else
1338         {
1339                 /*
1340                  * Notify the checkpointer about it.  If we fail to queue the request
1341                  * message, we have to sleep and try again, because we can't simply
1342                  * delete the file now.  Ugly, but hopefully won't happen often.
1343                  *
1344                  * XXX should we just leave the file orphaned instead?
1345                  */
1346                 Assert(IsUnderPostmaster);
1347                 while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
1348                                                                         UNLINK_RELATION_REQUEST))
1349                         pg_usleep(10000L);      /* 10 msec seems a good number */
1350         }
1351 }
1352
/*
 * RememberFsyncRequest() -- callback from checkpointer side of fsync request
 *
 * We stuff most fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define three:
 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
 *   either for one fork, or all forks if forknum is InvalidForkNumber
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *   checkpoint.
 *
 * (Handling the FORGET_* requests is a tad slow because the hash table has
 * to be searched linearly, but it doesn't seem worth rethinking the table
 * structure for them.)
 *
 * Note: this must only run in a process that owns a pendingOpsTable
 * (the checkpointer, a standalone backend, or the startup process).
 */
void
RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the relation (one or all forks) */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;

		/*
		 * Linear scan of the whole table; entries are only marked canceled,
		 * not removed, so it's safe to do this inside a seq scan.
		 */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
				(entry->tag.forknum == forknum ||
				 forknum == InvalidForkNumber))
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests (match on dbNode only) */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.rnode.dbNode == rnode.dbNode)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}

		/*
		 * Remove unlink requests.  We must track the previous cell so
		 * list_delete_cell can splice the list without a rescan; fetch
		 * "next" before deleting since deletion invalidates the cell.
		 */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.dbNode == rnode.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/* Unlink request: put it in the linked list */
		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
		PendingUnlinkEntry *entry;

		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
		Assert(forknum == MAIN_FORKNUM);

		/* allocated in MdCxt so it survives until the next checkpoint */
		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		PendingOperationTag key;
		PendingOperationEntry *entry;
		bool		found;

		/* ensure any pad bytes in the hash key are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.rnode = rnode;
		key.forknum = forknum;
		key.segno = segno;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &key,
													  HASH_ENTER,
													  &found);
		/* if new or previously canceled entry, initialize it */
		if (!found || entry->canceled)
		{
			entry->canceled = false;
			entry->cycle_ctr = mdsync_cycle_ctr;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.  The fsync request must be treated as old, even
		 * though the new request will be satisfied too by any subsequent
		 * fsync.
		 *
		 * However, if the entry is present but is marked canceled, we should
		 * act just as though it wasn't there.  The only case where this could
		 * happen would be if a file had been deleted, we received but did not
		 * yet act on the cancel request, and the same relfilenode was then
		 * assigned to a new file.  We mustn't lose the new request, but it
		 * should be considered new not old.
		 */
	}
}
1488
1489 /*
1490  * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
1491  *
1492  * forknum == InvalidForkNumber means all forks, although this code doesn't
1493  * actually know that, since it's just forwarding the request elsewhere.
1494  */
1495 void
1496 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
1497 {
1498         if (pendingOpsTable)
1499         {
1500                 /* standalone backend or startup process: fsync state is local */
1501                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1502         }
1503         else if (IsUnderPostmaster)
1504         {
1505                 /*
1506                  * Notify the checkpointer about it.  If we fail to queue the revoke
1507                  * message, we have to sleep and try again ... ugly, but hopefully
1508                  * won't happen often.
1509                  *
1510                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1511                  * error would leave the no-longer-used file still present on disk,
1512                  * which would be bad, so I'm inclined to assume that the checkpointer
1513                  * will always empty the queue soon.
1514                  */
1515                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1516                         pg_usleep(10000L);      /* 10 msec seems a good number */
1517
1518                 /*
1519                  * Note we don't wait for the checkpointer to actually absorb the
1520                  * revoke message; see mdsync() for the implications.
1521                  */
1522         }
1523 }
1524
1525 /*
1526  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1527  */
1528 void
1529 ForgetDatabaseFsyncRequests(Oid dbid)
1530 {
1531         RelFileNode rnode;
1532
1533         rnode.dbNode = dbid;
1534         rnode.spcNode = 0;
1535         rnode.relNode = 0;
1536
1537         if (pendingOpsTable)
1538         {
1539                 /* standalone backend or startup process: fsync state is local */
1540                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1541         }
1542         else if (IsUnderPostmaster)
1543         {
1544                 /* see notes in ForgetRelationFsyncRequests */
1545                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1546                                                                         FORGET_DATABASE_FSYNC))
1547                         pg_usleep(10000L);      /* 10 msec seems a good number */
1548         }
1549 }
1550
1551
1552 /*
1553  *      _fdvec_alloc() -- Make a MdfdVec object.
1554  */
1555 static MdfdVec *
1556 _fdvec_alloc(void)
1557 {
1558         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1559 }
1560
1561 /*
1562  * Return the filename for the specified segment of the relation. The
1563  * returned string is palloc'd.
1564  */
1565 static char *
1566 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1567 {
1568         char       *path,
1569                            *fullpath;
1570
1571         path = relpath(reln->smgr_rnode, forknum);
1572
1573         if (segno > 0)
1574         {
1575                 /* be sure we have enough space for the '.segno' */
1576                 fullpath = (char *) palloc(strlen(path) + 12);
1577                 sprintf(fullpath, "%s.%u", path, segno);
1578                 pfree(path);
1579         }
1580         else
1581                 fullpath = path;
1582
1583         return fullpath;
1584 }
1585
1586 /*
1587  * Open the specified segment of the relation,
1588  * and make a MdfdVec object for it.  Returns NULL on failure.
1589  */
1590 static MdfdVec *
1591 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1592                           int oflags)
1593 {
1594         MdfdVec    *v;
1595         int                     fd;
1596         char       *fullpath;
1597
1598         fullpath = _mdfd_segpath(reln, forknum, segno);
1599
1600         /* open the file */
1601         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1602
1603         pfree(fullpath);
1604
1605         if (fd < 0)
1606                 return NULL;
1607
1608         if (reln->smgr_transient)
1609                 FileSetTransient(fd);
1610
1611         /* allocate an mdfdvec entry for it */
1612         v = _fdvec_alloc();
1613
1614         /* fill the entry */
1615         v->mdfd_vfd = fd;
1616         v->mdfd_segno = segno;
1617         v->mdfd_chain = NULL;
1618         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1619
1620         /* all done */
1621         return v;
1622 }
1623
1624 /*
1625  *      _mdfd_getseg() -- Find the segment of the relation holding the
1626  *              specified block.
1627  *
1628  * If the segment doesn't exist, we ereport, return NULL, or create the
1629  * segment, according to "behavior".  Note: skipFsync is only used in the
1630  * EXTENSION_CREATE case.
1631  */
1632 static MdfdVec *
1633 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1634                          bool skipFsync, ExtensionBehavior behavior)
1635 {
1636         MdfdVec    *v = mdopen(reln, forknum, behavior);
1637         BlockNumber targetseg;
1638         BlockNumber nextsegno;
1639
1640         if (!v)
1641                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1642
1643         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1644         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1645         {
1646                 Assert(nextsegno == v->mdfd_segno + 1);
1647
1648                 if (v->mdfd_chain == NULL)
1649                 {
1650                         /*
1651                          * Normally we will create new segments only if authorized by the
1652                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1653                          * recovery, create segments anyway; this allows cases such as
1654                          * replaying WAL data that has a write into a high-numbered
1655                          * segment of a relation that was later deleted.  We want to go
1656                          * ahead and create the segments so we can finish out the replay.
1657                          *
1658                          * We have to maintain the invariant that segments before the last
1659                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1660                          * with zeroes if needed.  (This only matters if caller is
1661                          * extending the relation discontiguously, but that can happen in
1662                          * hash indexes.)
1663                          */
1664                         if (behavior == EXTENSION_CREATE || InRecovery)
1665                         {
1666                                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
1667                                 {
1668                                         char       *zerobuf = palloc0(BLCKSZ);
1669
1670                                         mdextend(reln, forknum,
1671                                                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1672                                                          zerobuf, skipFsync);
1673                                         pfree(zerobuf);
1674                                 }
1675                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
1676                         }
1677                         else
1678                         {
1679                                 /* We won't create segment if not existent */
1680                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
1681                         }
1682                         if (v->mdfd_chain == NULL)
1683                         {
1684                                 if (behavior == EXTENSION_RETURN_NULL &&
1685                                         FILE_POSSIBLY_DELETED(errno))
1686                                         return NULL;
1687                                 ereport(ERROR,
1688                                                 (errcode_for_file_access(),
1689                                    errmsg("could not open file \"%s\" (target block %u): %m",
1690                                                   _mdfd_segpath(reln, forknum, nextsegno),
1691                                                   blkno)));
1692                         }
1693                 }
1694                 v = v->mdfd_chain;
1695         }
1696         return v;
1697 }
1698
1699 /*
1700  * Get number of blocks present in a single disk file
1701  */
1702 static BlockNumber
1703 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1704 {
1705         off_t           len;
1706
1707         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1708         if (len < 0)
1709                 ereport(ERROR,
1710                                 (errcode_for_file_access(),
1711                                  errmsg("could not seek to end of file \"%s\": %m",
1712                                                 FilePathName(seg->mdfd_vfd))));
1713         /* note that this calculation will ignore any partial block at EOF */
1714         return (BlockNumber) (len / BLCKSZ);
1715 }