]> granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c
e5dec9d2a329b1a36e82c4ed62f2ba6be48217c6
[postgresql] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/storage/smgr/md.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "miscadmin.h"
22 #include "access/xlog.h"
23 #include "catalog/catalog.h"
24 #include "portability/instr_time.h"
25 #include "postmaster/bgwriter.h"
26 #include "storage/fd.h"
27 #include "storage/bufmgr.h"
28 #include "storage/relfilenode.h"
29 #include "storage/smgr.h"
30 #include "utils/hsearch.h"
31 #include "utils/memutils.h"
32 #include "pg_trace.h"
33
34
35 /* interval for calling AbsorbFsyncRequests in mdsync */
36 #define FSYNCS_PER_ABSORB               10
37
38 /*
39  * Special values for the segno arg to RememberFsyncRequest.
40  *
41  * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
42  * fsync request from the queue if an identical, subsequent request is found.
43  * See comments there before making changes here.
44  */
45 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
46 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
47 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
48
49 /*
50  * On Windows, we have to interpret EACCES as possibly meaning the same as
51  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
52  * that's what you get.  Ugh.  This code is designed so that we don't
53  * actually believe these cases are okay without further evidence (namely,
54  * a pending fsync request getting revoked ... see mdsync).
55  */
56 #ifndef WIN32
57 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
58 #else
59 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
60 #endif
61
62 /*
63  *      The magnetic disk storage manager keeps track of open file
64  *      descriptors in its own descriptor pool.  This is done to make it
65  *      easier to support relations that are larger than the operating
66  *      system's file size limit (often 2GBytes).  In order to do that,
67  *      we break relations up into "segment" files that are each shorter than
68  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
69  *      configuration constant in pg_config.h.
70  *
71  *      On disk, a relation must consist of consecutively numbered segment
72  *      files in the pattern
73  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
74  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
75  *              -- Optionally, any number of inactive segments of size 0 blocks.
76  *      The full and partial segments are collectively the "active" segments.
77  *      Inactive segments are those that once contained data but are currently
78  *      not needed because of an mdtruncate() operation.  The reason for leaving
79  *      them present at size zero, rather than unlinking them, is that other
80  *      backends and/or the checkpointer might be holding open file references to
81  *      such segments.  If the relation expands again after mdtruncate(), such
82  *      that a deactivated segment becomes active again, it is important that
83  *      such file references still be valid --- else data might get written
84  *      out to an unlinked old copy of a segment file that will eventually
85  *      disappear.
86  *
87  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
88  *      cache is, therefore, just the head of a list of MdfdVec objects, one
89  *      per segment.  But note the md_fd pointer can be NULL, indicating
90  *      relation not open.
91  *
92  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
93  *      doesn't have another segment after this one; we may just not have
94  *      opened the next segment yet.  (We could not have "all segments are
95  *      in the chain" as an invariant anyway, since another backend could
96  *      extend the relation when we weren't looking.)  We do not make chain
97  *      entries for inactive segments, however; as soon as we find a partial
98  *      segment, we assume that any subsequent segments are inactive.
99  *
100  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
101  */
102
103 typedef struct _MdfdVec
104 {
105         File            mdfd_vfd;               /* fd number in fd.c's pool */
106         BlockNumber mdfd_segno;         /* segment number, from 0 */
107         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
108 } MdfdVec;
109
110 static MemoryContext MdCxt;             /* context for all md.c allocations */
111
112
113 /*
114  * In some contexts (currently, standalone backends and the checkpointer process)
115  * we keep track of pending fsync operations: we need to remember all relation
116  * segments that have been written since the last checkpoint, so that we can
117  * fsync them down to disk before completing the next checkpoint.  This hash
118  * table remembers the pending operations.      We use a hash table mostly as
119  * a convenient way of eliminating duplicate requests.
120  *
121  * We use a similar mechanism to remember no-longer-needed files that can
122  * be deleted after the next checkpoint, but we use a linked list instead of
123  * a hash table, because we don't expect there to be any duplicate requests.
124  *
125  * (Regular backends do not track pending operations locally, but forward
126  * them to the checkpointer.)
127  */
128 typedef struct
129 {
130         RelFileNodeBackend rnode;       /* the targeted relation */
131         ForkNumber      forknum;
132         BlockNumber segno;                      /* which segment */
133 } PendingOperationTag;
134
135 typedef uint16 CycleCtr;                /* can be any convenient integer size */
136
137 typedef struct
138 {
139         PendingOperationTag tag;        /* hash table key (must be first!) */
140         bool            canceled;               /* T => request canceled, not yet removed */
141         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
142 } PendingOperationEntry;
143
144 typedef struct
145 {
146         RelFileNodeBackend rnode;       /* the dead relation to delete */
147         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
148 } PendingUnlinkEntry;
149
150 static HTAB *pendingOpsTable = NULL;
151 static List *pendingUnlinks = NIL;
152
153 static CycleCtr mdsync_cycle_ctr = 0;
154 static CycleCtr mdckpt_cycle_ctr = 0;
155
156
157 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
158 {
159         EXTENSION_FAIL,                         /* ereport if segment not present */
160         EXTENSION_RETURN_NULL,          /* return NULL if not present */
161         EXTENSION_CREATE                        /* create new segments as needed */
162 } ExtensionBehavior;
163
164 /* local routines */
165 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
166            ExtensionBehavior behavior);
167 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
168                                            MdfdVec *seg);
169 static void register_unlink(RelFileNodeBackend rnode);
170 static MdfdVec *_fdvec_alloc(void);
171 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
172                           BlockNumber segno);
173 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
174                           BlockNumber segno, int oflags);
175 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
176                          BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
177 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
178                    MdfdVec *seg);
179
180
181 /*
182  *      mdinit() -- Initialize private state for magnetic disk storage manager.
183  */
184 void
185 mdinit(void)
186 {
187         MdCxt = AllocSetContextCreate(TopMemoryContext,
188                                                                   "MdSmgr",
189                                                                   ALLOCSET_DEFAULT_MINSIZE,
190                                                                   ALLOCSET_DEFAULT_INITSIZE,
191                                                                   ALLOCSET_DEFAULT_MAXSIZE);
192
193         /*
194          * Create pending-operations hashtable if we need it.  Currently, we need
195          * it if we are standalone (not under a postmaster) OR if we are a
196          * bootstrap-mode subprocess of a postmaster (that is, a startup or
197          * checkpointer process).
198          */
199         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
200         {
201                 HASHCTL         hash_ctl;
202
203                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
204                 hash_ctl.keysize = sizeof(PendingOperationTag);
205                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
206                 hash_ctl.hash = tag_hash;
207                 hash_ctl.hcxt = MdCxt;
208                 pendingOpsTable = hash_create("Pending Ops Table",
209                                                                           100L,
210                                                                           &hash_ctl,
211                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
212                 pendingUnlinks = NIL;
213         }
214 }
215
216 /*
217  * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
218  * already created the pendingOpsTable during initialization of the startup
219  * process.  Calling this function drops the local pendingOpsTable so that
220  * subsequent requests will be forwarded to checkpointer.
221  */
222 void
223 SetForwardFsyncRequests(void)
224 {
225         /* Perform any pending ops we may have queued up */
226         if (pendingOpsTable)
227                 mdsync();
228         pendingOpsTable = NULL;
229 }
230
231 /*
232  *      mdexists() -- Does the physical file exist?
233  *
234  * Note: this will return true for lingering files, with pending deletions
235  */
236 bool
237 mdexists(SMgrRelation reln, ForkNumber forkNum)
238 {
239         /*
240          * Close it first, to ensure that we notice if the fork has been unlinked
241          * since we opened it.
242          */
243         mdclose(reln, forkNum);
244
245         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
246 }
247
248 /*
249  *      mdcreate() -- Create a new relation on magnetic disk.
250  *
251  * If isRedo is true, it's okay for the relation to exist already.
252  */
253 void
254 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
255 {
256         char       *path;
257         File            fd;
258
259         if (isRedo && reln->md_fd[forkNum] != NULL)
260                 return;                                 /* created and opened already... */
261
262         Assert(reln->md_fd[forkNum] == NULL);
263
264         path = relpath(reln->smgr_rnode, forkNum);
265
266         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
267
268         if (fd < 0)
269         {
270                 int                     save_errno = errno;
271
272                 /*
273                  * During bootstrap, there are cases where a system relation will be
274                  * accessed (by internal backend processes) before the bootstrap
275                  * script nominally creates it.  Therefore, allow the file to exist
276                  * already, even if isRedo is not set.  (See also mdopen)
277                  */
278                 if (isRedo || IsBootstrapProcessingMode())
279                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
280                 if (fd < 0)
281                 {
282                         /* be sure to report the error reported by create, not open */
283                         errno = save_errno;
284                         ereport(ERROR,
285                                         (errcode_for_file_access(),
286                                          errmsg("could not create file \"%s\": %m", path)));
287                 }
288         }
289
290         pfree(path);
291
292         if (reln->smgr_transient)
293                 FileSetTransient(fd);
294
295         reln->md_fd[forkNum] = _fdvec_alloc();
296
297         reln->md_fd[forkNum]->mdfd_vfd = fd;
298         reln->md_fd[forkNum]->mdfd_segno = 0;
299         reln->md_fd[forkNum]->mdfd_chain = NULL;
300 }
301
302 /*
303  *      mdunlink() -- Unlink a relation.
304  *
305  * Note that we're passed a RelFileNode --- by the time this is called,
306  * there won't be an SMgrRelation hashtable entry anymore.
307  *
308  * Actually, we don't unlink the first segment file of the relation, but
309  * just truncate it to zero length, and record a request to unlink it after
310  * the next checkpoint.  Additional segments can be unlinked immediately,
311  * however.  Leaving the empty file in place prevents that relfilenode
312  * number from being reused.  The scenario this protects us from is:
313  * 1. We delete a relation (and commit, and actually remove its file).
314  * 2. We create a new relation, which by chance gets the same relfilenode as
315  *        the just-deleted one (OIDs must've wrapped around for that to happen).
316  * 3. We crash before another checkpoint occurs.
317  * During replay, we would delete the file and then recreate it, which is fine
318  * if the contents of the file were repopulated by subsequent WAL entries.
319  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
320  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
321  * the contents of the file would be lost forever.      By leaving the empty file
322  * until after the next checkpoint, we prevent reassignment of the relfilenode
323  * number until it's safe, because relfilenode assignment skips over any
324  * existing file.
325  *
326  * All the above applies only to the relation's main fork; other forks can
327  * just be removed immediately, since they are not needed to prevent the
328  * relfilenode number from being recycled.      Also, we do not carefully
329  * track whether other forks have been created or not, but just attempt to
330  * unlink them unconditionally; so we should never complain about ENOENT.
331  *
332  * If isRedo is true, it's unsurprising for the relation to be already gone.
333  * Also, we should remove the file immediately instead of queuing a request
334  * for later, since during redo there's no possibility of creating a
335  * conflicting relation.
336  *
337  * Note: any failure should be reported as WARNING not ERROR, because
338  * we are usually not in a transaction anymore when this is called.
339  */
340 void
341 mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
342 {
343         char       *path;
344         int                     ret;
345
346         /*
347          * We have to clean out any pending fsync requests for the doomed
348          * relation, else the next mdsync() will fail.
349          */
350         ForgetRelationFsyncRequests(rnode, forkNum);
351
352         path = relpath(rnode, forkNum);
353
354         /*
355          * Delete or truncate the first segment.
356          */
357         if (isRedo || forkNum != MAIN_FORKNUM)
358         {
359                 ret = unlink(path);
360                 if (ret < 0 && errno != ENOENT)
361                         ereport(WARNING,
362                                         (errcode_for_file_access(),
363                                          errmsg("could not remove file \"%s\": %m", path)));
364         }
365         else
366         {
367                 /* truncate(2) would be easier here, but Windows hasn't got it */
368                 int                     fd;
369
370                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
371                 if (fd >= 0)
372                 {
373                         int                     save_errno;
374
375                         ret = ftruncate(fd, 0);
376                         save_errno = errno;
377                         close(fd);
378                         errno = save_errno;
379                 }
380                 else
381                         ret = -1;
382                 if (ret < 0 && errno != ENOENT)
383                         ereport(WARNING,
384                                         (errcode_for_file_access(),
385                                          errmsg("could not truncate file \"%s\": %m", path)));
386
387                 /* Register request to unlink first segment later */
388                 register_unlink(rnode);
389         }
390
391         /*
392          * Delete any additional segments.
393          */
394         if (ret >= 0)
395         {
396                 char       *segpath = (char *) palloc(strlen(path) + 12);
397                 BlockNumber segno;
398
399                 /*
400                  * Note that because we loop until getting ENOENT, we will correctly
401                  * remove all inactive segments as well as active ones.
402                  */
403                 for (segno = 1;; segno++)
404                 {
405                         sprintf(segpath, "%s.%u", path, segno);
406                         if (unlink(segpath) < 0)
407                         {
408                                 /* ENOENT is expected after the last segment... */
409                                 if (errno != ENOENT)
410                                         ereport(WARNING,
411                                                         (errcode_for_file_access(),
412                                            errmsg("could not remove file \"%s\": %m", segpath)));
413                                 break;
414                         }
415                 }
416                 pfree(segpath);
417         }
418
419         pfree(path);
420 }
421
422 /*
423  *      mdextend() -- Add a block to the specified relation.
424  *
425  *              The semantics are nearly the same as mdwrite(): write at the
426  *              specified position.  However, this is to be used for the case of
427  *              extending a relation (i.e., blocknum is at or beyond the current
428  *              EOF).  Note that we assume writing a block beyond current EOF
429  *              causes intervening file space to become filled with zeroes.
430  */
431 void
432 mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
433                  char *buffer, bool skipFsync)
434 {
435         off_t           seekpos;
436         int                     nbytes;
437         MdfdVec    *v;
438
439         /* This assert is too expensive to have on normally ... */
440 #ifdef CHECK_WRITE_VS_EXTEND
441         Assert(blocknum >= mdnblocks(reln, forknum));
442 #endif
443
444         /*
445          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
446          * more --- we mustn't create a block whose number actually is
447          * InvalidBlockNumber.
448          */
449         if (blocknum == InvalidBlockNumber)
450                 ereport(ERROR,
451                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
452                                  errmsg("cannot extend file \"%s\" beyond %u blocks",
453                                                 relpath(reln->smgr_rnode, forknum),
454                                                 InvalidBlockNumber)));
455
456         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
457
458         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
459
460         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
461
462         /*
463          * Note: because caller usually obtained blocknum by calling mdnblocks,
464          * which did a seek(SEEK_END), this seek is often redundant and will be
465          * optimized away by fd.c.      It's not redundant, however, if there is a
466          * partial page at the end of the file. In that case we want to try to
467          * overwrite the partial page with a full page.  It's also not redundant
468          * if bufmgr.c had to dump another buffer of the same file to make room
469          * for the new page's buffer.
470          */
471         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
472                 ereport(ERROR,
473                                 (errcode_for_file_access(),
474                                  errmsg("could not seek to block %u in file \"%s\": %m",
475                                                 blocknum, FilePathName(v->mdfd_vfd))));
476
477         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
478         {
479                 if (nbytes < 0)
480                         ereport(ERROR,
481                                         (errcode_for_file_access(),
482                                          errmsg("could not extend file \"%s\": %m",
483                                                         FilePathName(v->mdfd_vfd)),
484                                          errhint("Check free disk space.")));
485                 /* short write: complain appropriately */
486                 ereport(ERROR,
487                                 (errcode(ERRCODE_DISK_FULL),
488                                  errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
489                                                 FilePathName(v->mdfd_vfd),
490                                                 nbytes, BLCKSZ, blocknum),
491                                  errhint("Check free disk space.")));
492         }
493
494         if (!skipFsync && !SmgrIsTemp(reln))
495                 register_dirty_segment(reln, forknum, v);
496
497         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
498 }
499
500 /*
501  *      mdopen() -- Open the specified relation.
502  *
503  * Note we only open the first segment, when there are multiple segments.
504  *
505  * If first segment is not present, either ereport or return NULL according
506  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
507  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
508  * invent one out of whole cloth.
509  */
510 static MdfdVec *
511 mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
512 {
513         MdfdVec    *mdfd;
514         char       *path;
515         File            fd;
516
517         /* No work if already open */
518         if (reln->md_fd[forknum])
519                 return reln->md_fd[forknum];
520
521         path = relpath(reln->smgr_rnode, forknum);
522
523         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
524
525         if (fd < 0)
526         {
527                 /*
528                  * During bootstrap, there are cases where a system relation will be
529                  * accessed (by internal backend processes) before the bootstrap
530                  * script nominally creates it.  Therefore, accept mdopen() as a
531                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
532                  */
533                 if (IsBootstrapProcessingMode())
534                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
535                 if (fd < 0)
536                 {
537                         if (behavior == EXTENSION_RETURN_NULL &&
538                                 FILE_POSSIBLY_DELETED(errno))
539                         {
540                                 pfree(path);
541                                 return NULL;
542                         }
543                         ereport(ERROR,
544                                         (errcode_for_file_access(),
545                                          errmsg("could not open file \"%s\": %m", path)));
546                 }
547         }
548
549         pfree(path);
550
551         if (reln->smgr_transient)
552                 FileSetTransient(fd);
553
554         reln->md_fd[forknum] = mdfd = _fdvec_alloc();
555
556         mdfd->mdfd_vfd = fd;
557         mdfd->mdfd_segno = 0;
558         mdfd->mdfd_chain = NULL;
559         Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
560
561         return mdfd;
562 }
563
564 /*
565  *      mdclose() -- Close the specified relation, if it isn't closed already.
566  */
567 void
568 mdclose(SMgrRelation reln, ForkNumber forknum)
569 {
570         MdfdVec    *v = reln->md_fd[forknum];
571
572         /* No work if already closed */
573         if (v == NULL)
574                 return;
575
576         reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
577
578         while (v != NULL)
579         {
580                 MdfdVec    *ov = v;
581
582                 /* if not closed already */
583                 if (v->mdfd_vfd >= 0)
584                         FileClose(v->mdfd_vfd);
585                 /* Now free vector */
586                 v = v->mdfd_chain;
587                 pfree(ov);
588         }
589 }
590
591 /*
592  *      mdprefetch() -- Initiate asynchronous read of the specified block of a relation
593  */
594 void
595 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
596 {
597 #ifdef USE_PREFETCH
598         off_t           seekpos;
599         MdfdVec    *v;
600
601         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
602
603         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
604
605         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
606
607         (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
608 #endif   /* USE_PREFETCH */
609 }
610
611
612 /*
613  *      mdread() -- Read the specified block from a relation.
614  */
615 void
616 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
617            char *buffer)
618 {
619         off_t           seekpos;
620         int                     nbytes;
621         MdfdVec    *v;
622
623         TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
624                                                                                 reln->smgr_rnode.node.spcNode,
625                                                                                 reln->smgr_rnode.node.dbNode,
626                                                                                 reln->smgr_rnode.node.relNode,
627                                                                                 reln->smgr_rnode.backend);
628
629         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
630
631         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
632
633         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
634
635         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
636                 ereport(ERROR,
637                                 (errcode_for_file_access(),
638                                  errmsg("could not seek to block %u in file \"%s\": %m",
639                                                 blocknum, FilePathName(v->mdfd_vfd))));
640
641         nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
642
643         TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
644                                                                            reln->smgr_rnode.node.spcNode,
645                                                                            reln->smgr_rnode.node.dbNode,
646                                                                            reln->smgr_rnode.node.relNode,
647                                                                            reln->smgr_rnode.backend,
648                                                                            nbytes,
649                                                                            BLCKSZ);
650
651         if (nbytes != BLCKSZ)
652         {
653                 if (nbytes < 0)
654                         ereport(ERROR,
655                                         (errcode_for_file_access(),
656                                          errmsg("could not read block %u in file \"%s\": %m",
657                                                         blocknum, FilePathName(v->mdfd_vfd))));
658
659                 /*
660                  * Short read: we are at or past EOF, or we read a partial block at
661                  * EOF.  Normally this is an error; upper levels should never try to
662                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
663                  * we are InRecovery, we should instead return zeroes without
664                  * complaining.  This allows, for example, the case of trying to
665                  * update a block that was later truncated away.
666                  */
667                 if (zero_damaged_pages || InRecovery)
668                         MemSet(buffer, 0, BLCKSZ);
669                 else
670                         ereport(ERROR,
671                                         (errcode(ERRCODE_DATA_CORRUPTED),
672                                          errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
673                                                         blocknum, FilePathName(v->mdfd_vfd),
674                                                         nbytes, BLCKSZ)));
675         }
676 }
677
678 /*
679  *      mdwrite() -- Write the supplied block at the appropriate location.
680  *
681  *              This is to be used only for updating already-existing blocks of a
682  *              relation (ie, those before the current EOF).  To extend a relation,
683  *              use mdextend().
684  */
685 void
686 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
687                 char *buffer, bool skipFsync)
688 {
689         off_t           seekpos;
690         int                     nbytes;
691         MdfdVec    *v;
692
693         /* This assert is too expensive to have on normally ... */
694 #ifdef CHECK_WRITE_VS_EXTEND
695         Assert(blocknum < mdnblocks(reln, forknum));
696 #endif
697
698         TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
699                                                                                  reln->smgr_rnode.node.spcNode,
700                                                                                  reln->smgr_rnode.node.dbNode,
701                                                                                  reln->smgr_rnode.node.relNode,
702                                                                                  reln->smgr_rnode.backend);
703
704         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);
705
706         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
707
708         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
709
710         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
711                 ereport(ERROR,
712                                 (errcode_for_file_access(),
713                                  errmsg("could not seek to block %u in file \"%s\": %m",
714                                                 blocknum, FilePathName(v->mdfd_vfd))));
715
716         nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
717
718         TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
719                                                                                 reln->smgr_rnode.node.spcNode,
720                                                                                 reln->smgr_rnode.node.dbNode,
721                                                                                 reln->smgr_rnode.node.relNode,
722                                                                                 reln->smgr_rnode.backend,
723                                                                                 nbytes,
724                                                                                 BLCKSZ);
725
726         if (nbytes != BLCKSZ)
727         {
728                 if (nbytes < 0)
729                         ereport(ERROR,
730                                         (errcode_for_file_access(),
731                                          errmsg("could not write block %u in file \"%s\": %m",
732                                                         blocknum, FilePathName(v->mdfd_vfd))));
733                 /* short write: complain appropriately */
734                 ereport(ERROR,
735                                 (errcode(ERRCODE_DISK_FULL),
736                                  errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
737                                                 blocknum,
738                                                 FilePathName(v->mdfd_vfd),
739                                                 nbytes, BLCKSZ),
740                                  errhint("Check free disk space.")));
741         }
742
743         if (!skipFsync && !SmgrIsTemp(reln))
744                 register_dirty_segment(reln, forknum, v);
745 }
746
/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Important side effect: all active segments of the relation are opened
 *		and added to the mdfd_chain list.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the chain.
 */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
	BlockNumber nblocks;
	BlockNumber segno = 0;		/* zero-based index of the segment in 'v' */

	/*
	 * Skip through any segments that aren't the last one, to avoid redundant
	 * seeks on them.  We have previously verified that these segments are
	 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
	 *
	 * NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.	We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd, which is handled via
	 * relcache flush.	(Since the checkpointer doesn't participate in
	 * relcache flush, it could have segment chain entries for inactive
	 * segments; that's OK because the checkpointer never needs to compute
	 * relation size.)
	 */
	while (v->mdfd_chain != NULL)
	{
		segno++;
		v = v->mdfd_chain;
	}

	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v);
		/* An over-long segment indicates on-disk corruption; bail out hard. */
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		/* A partly-full segment must be the last one; total is now known. */
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;

		if (v->mdfd_chain == NULL)
		{
			/*
			 * Because we pass O_CREAT, we will create the next segment (with
			 * zero length) immediately, if the last segment is of length
			 * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
			 * the logic simple.
			 */
			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
			if (v->mdfd_chain == NULL)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m",
								_mdfd_segpath(reln, forknum, segno))));
		}

		v = v->mdfd_chain;
	}
}
813
/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 *
 * 'nblocks' is the desired new relation length in blocks.  Requests to
 * "truncate" to more blocks than currently exist raise an error, except
 * during WAL recovery where they are silently ignored.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	MdfdVec    *v;
	BlockNumber curnblk;
	BlockNumber priorblocks;	/* blocks contained in segments before 'v' */

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	v = mdopen(reln, forknum, EXTENSION_FAIL);

	/* Walk the whole segment chain, handling each segment by position. */
	priorblocks = 0;
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active (and has already been unlinked
			 * from the mdfd_chain). We truncate the file, but do not delete
			 * it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			/* advance before freeing, since pfree invalidates ov->mdfd_chain */
			v = v->mdfd_chain;
			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
												 * segment */
			pfree(ov);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length, and clear chain link that points to any
			 * remaining segments (which we shall zap). NOTE: if nblocks is
			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
			 * segment to 0 length but keep it. This adheres to the invariant
			 * given in the header comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
					errmsg("could not truncate file \"%s\" to %u blocks: %m",
						   FilePathName(v->mdfd_vfd),
						   nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			v = v->mdfd_chain;
			ov->mdfd_chain = NULL;
		}
		else
		{
			/*
			 * We still need this segment and 0 or more blocks beyond it, so
			 * nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
	}
}
903
904 /*
905  *      mdimmedsync() -- Immediately sync a relation to stable storage.
906  *
907  * Note that only writes already issued are synced; this routine knows
908  * nothing of dirty buffers that may exist inside the buffer manager.
909  */
910 void
911 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
912 {
913         MdfdVec    *v;
914
915         /*
916          * NOTE: mdnblocks makes sure we have opened all active segments, so that
917          * fsync loop will get them all!
918          */
919         mdnblocks(reln, forknum);
920
921         v = mdopen(reln, forknum, EXTENSION_FAIL);
922
923         while (v != NULL)
924         {
925                 if (FileSync(v->mdfd_vfd) < 0)
926                         ereport(ERROR,
927                                         (errcode_for_file_access(),
928                                          errmsg("could not fsync file \"%s\": %m",
929                                                         FilePathName(v->mdfd_vfd))));
930                 v = v->mdfd_chain;
931         }
932 }
933
/*
 *	mdsync() -- Sync previous writes to stable storage.
 *
 * Executes all fsync requests accumulated in pendingOpsTable, absorbing
 * further requests from the shared queue along the way.  On successful
 * return, every request that was present at entry has been fsync'd or
 * canceled.  Sync timing statistics are reported via CheckpointStats.
 */
void
mdsync(void)
{
	/* true if a previous mdsync() attempt errored out partway through */
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter; /* countdown to next AbsorbFsyncRequests() */

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.	The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in checkpointer, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.	Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * checkpointer, and it will periodically do smgrcloseall().
				 * This fact justifies our not closing the reln in the success
				 * path either, which is a good thing since in
				 * non-checkpointer cases we couldn't safely do that.)
				 * Furthermore, in many cases the relation will have been
				 * dirtied through this same smgr relation, and so we can save
				 * a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode.node,
								entry->tag.rnode.backend);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
						  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);

				INSTR_TIME_SET_CURRENT(sync_start);

				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
				{
					/* success: accumulate timing statistics */
					INSTR_TIME_SET_CURRENT(sync_end);
					sync_diff = sync_end;
					INSTR_TIME_SUBTRACT(sync_diff, sync_start);
					elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
					if (elapsed > longest)
						longest = elapsed;
					total_elapsed += elapsed;
					processed++;
					if (log_checkpoints)
						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
							 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);

					break;		/* success; break out of retry loop */
				}

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
1167
1168 /*
1169  * mdpreckpt() -- Do pre-checkpoint work
1170  *
1171  * To distinguish unlink requests that arrived before this checkpoint
1172  * started from those that arrived during the checkpoint, we use a cycle
1173  * counter similar to the one we use for fsync requests. That cycle
1174  * counter is incremented here.
1175  *
1176  * This must be called *before* the checkpoint REDO point is determined.
1177  * That ensures that we won't delete files too soon.
1178  *
1179  * Note that we can't do anything here that depends on the assumption
1180  * that the checkpoint will be completed.
1181  */
1182 void
1183 mdpreckpt(void)
1184 {
1185         ListCell   *cell;
1186
1187         /*
1188          * In case the prior checkpoint wasn't completed, stamp all entries in the
1189          * list with the current cycle counter.  Anything that's in the list at
1190          * the start of checkpoint can surely be deleted after the checkpoint is
1191          * finished, regardless of when the request was made.
1192          */
1193         foreach(cell, pendingUnlinks)
1194         {
1195                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1196
1197                 entry->cycle_ctr = mdckpt_cycle_ctr;
1198         }
1199
1200         /*
1201          * Any unlink requests arriving after this point will be assigned the next
1202          * cycle counter, and won't be unlinked until next checkpoint.
1203          */
1204         mdckpt_cycle_ctr++;
1205 }
1206
1207 /*
1208  * mdpostckpt() -- Do post-checkpoint work
1209  *
1210  * Remove any lingering files that can now be safely removed.
1211  */
1212 void
1213 mdpostckpt(void)
1214 {
1215         while (pendingUnlinks != NIL)
1216         {
1217                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1218                 char       *path;
1219
1220                 /*
1221                  * New entries are appended to the end, so if the entry is new we've
1222                  * reached the end of old entries.
1223                  */
1224                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1225                         break;
1226
1227                 /* Else assert we haven't missed it */
1228                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1229
1230                 /* Unlink the file */
1231                 path = relpath(entry->rnode, MAIN_FORKNUM);
1232                 if (unlink(path) < 0)
1233                 {
1234                         /*
1235                          * There's a race condition, when the database is dropped at the
1236                          * same time that we process the pending unlink requests. If the
1237                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1238                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1239                          * the possibility that we delete the file first.
1240                          */
1241                         if (errno != ENOENT)
1242                                 ereport(WARNING,
1243                                                 (errcode_for_file_access(),
1244                                                  errmsg("could not remove file \"%s\": %m", path)));
1245                 }
1246                 pfree(path);
1247
1248                 pendingUnlinks = list_delete_first(pendingUnlinks);
1249                 pfree(entry);
1250         }
1251 }
1252
1253 /*
1254  * register_dirty_segment() -- Mark a relation segment as needing fsync
1255  *
1256  * If there is a local pending-ops table, just make an entry in it for
1257  * mdsync to process later.  Otherwise, try to pass off the fsync request
1258  * to the background writer process.  If that fails, just do the fsync
1259  * locally before returning (we expect this will not happen often enough
1260  * to be a performance problem).
1261  */
1262 static void
1263 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1264 {
1265         if (pendingOpsTable)
1266         {
1267                 /* push it into local pending-ops table */
1268                 RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
1269         }
1270         else
1271         {
1272                 if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
1273                         return;                         /* passed it off successfully */
1274
1275                 ereport(DEBUG1,
1276                                 (errmsg("could not forward fsync request because request queue is full")));
1277
1278                 if (FileSync(seg->mdfd_vfd) < 0)
1279                         ereport(ERROR,
1280                                         (errcode_for_file_access(),
1281                                          errmsg("could not fsync file \"%s\": %m",
1282                                                         FilePathName(seg->mdfd_vfd))));
1283         }
1284 }
1285
1286 /*
1287  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1288  *
1289  * As with register_dirty_segment, this could involve either a local or
1290  * a remote pending-ops table.
1291  */
1292 static void
1293 register_unlink(RelFileNodeBackend rnode)
1294 {
1295         if (pendingOpsTable)
1296         {
1297                 /* push it into local pending-ops table */
1298                 RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
1299         }
1300         else
1301         {
1302                 /*
1303                  * Notify the checkpointer about it.  If we fail to queue the request
1304                  * message, we have to sleep and try again, because we can't simply
1305                  * delete the file now.  Ugly, but hopefully won't happen often.
1306                  *
1307                  * XXX should we just leave the file orphaned instead?
1308                  */
1309                 Assert(IsUnderPostmaster);
1310                 while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
1311                                                                         UNLINK_RELATION_REQUEST))
1312                         pg_usleep(10000L);      /* 10 msec seems a good number */
1313         }
1314 }
1315
1316 /*
1317  * RememberFsyncRequest() -- callback from checkpointer side of fsync request
1318  *
1319  * We stuff most fsync requests into the local hash table for execution
1320  * during the checkpointer's next checkpoint.  UNLINK requests go into a
1321  * separate linked list, however, because they get processed separately.
1322  *
1323  * The range of possible segment numbers is way less than the range of
1324  * BlockNumber, so we can reserve high values of segno for special purposes.
1325  * We define three:
1326  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1327  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1328  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1329  *       checkpoint.
1330  *
1331  * (Handling the FORGET_* requests is a tad slow because the hash table has
1332  * to be searched linearly, but it doesn't seem worth rethinking the table
1333  * structure for them.)
1334  */
void
RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
					 BlockNumber segno)
{
	/* Only the process that owns the fsync state may be called here */
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the entire relation */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;

		/*
		 * Linear scan of the whole table; entries are only flagged as
		 * canceled, never removed, so continuing the seq scan while we
		 * modify them is fine.
		 */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) &&
				entry->tag.forknum == forknum)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests: flag every entry in the target database */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.rnode.node.dbNode == rnode.node.dbNode)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}

		/*
		 * Remove unlink requests.  Here we really delete list cells, so we
		 * must track the previous cell for list_delete_cell, and fetch the
		 * next cell before the current one can be freed.
		 */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.node.dbNode == rnode.node.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/*
		 * Unlink request: put it in the linked list.  The entry must live in
		 * MdCxt so it survives until the post-checkpoint unlink happens.
		 */
		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
		PendingUnlinkEntry *entry;

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		/* stamp with the current checkpoint cycle so mdpostckpt knows when */
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		PendingOperationTag key;
		PendingOperationEntry *entry;
		bool		found;

		/* ensure any pad bytes in the hash key are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.rnode = rnode;
		key.forknum = forknum;
		key.segno = segno;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &key,
													  HASH_ENTER,
													  &found);
		/* if new or previously canceled entry, initialize it */
		if (!found || entry->canceled)
		{
			entry->canceled = false;
			entry->cycle_ctr = mdsync_cycle_ctr;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.      The fsync request must be treated as old, even
		 * though the new request will be satisfied too by any subsequent
		 * fsync.
		 *
		 * However, if the entry is present but is marked canceled, we should
		 * act just as though it wasn't there.  The only case where this could
		 * happen would be if a file had been deleted, we received but did not
		 * yet act on the cancel request, and the same relfilenode was then
		 * assigned to a new file.      We mustn't lose the new request, but it
		 * should be considered new not old.
		 */
	}
}
1447
1448 /*
1449  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1450  */
1451 void
1452 ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
1453 {
1454         if (pendingOpsTable)
1455         {
1456                 /* standalone backend or startup process: fsync state is local */
1457                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1458         }
1459         else if (IsUnderPostmaster)
1460         {
1461                 /*
1462                  * Notify the checkpointer about it.  If we fail to queue the revoke
1463                  * message, we have to sleep and try again ... ugly, but hopefully
1464                  * won't happen often.
1465                  *
1466                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1467                  * error would leave the no-longer-used file still present on disk,
1468                  * which would be bad, so I'm inclined to assume that the checkpointer
1469                  * will always empty the queue soon.
1470                  */
1471                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1472                         pg_usleep(10000L);      /* 10 msec seems a good number */
1473
1474                 /*
1475                  * Note we don't wait for the checkpointer to actually absorb the
1476                  * revoke message; see mdsync() for the implications.
1477                  */
1478         }
1479 }
1480
1481 /*
1482  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1483  */
1484 void
1485 ForgetDatabaseFsyncRequests(Oid dbid)
1486 {
1487         RelFileNodeBackend rnode;
1488
1489         rnode.node.dbNode = dbid;
1490         rnode.node.spcNode = 0;
1491         rnode.node.relNode = 0;
1492         rnode.backend = InvalidBackendId;
1493
1494         if (pendingOpsTable)
1495         {
1496                 /* standalone backend or startup process: fsync state is local */
1497                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1498         }
1499         else if (IsUnderPostmaster)
1500         {
1501                 /* see notes in ForgetRelationFsyncRequests */
1502                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1503                                                                         FORGET_DATABASE_FSYNC))
1504                         pg_usleep(10000L);      /* 10 msec seems a good number */
1505         }
1506 }
1507
1508
1509 /*
1510  *      _fdvec_alloc() -- Make a MdfdVec object.
1511  */
1512 static MdfdVec *
1513 _fdvec_alloc(void)
1514 {
1515         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1516 }
1517
1518 /*
1519  * Return the filename for the specified segment of the relation. The
1520  * returned string is palloc'd.
1521  */
1522 static char *
1523 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1524 {
1525         char       *path,
1526                            *fullpath;
1527
1528         path = relpath(reln->smgr_rnode, forknum);
1529
1530         if (segno > 0)
1531         {
1532                 /* be sure we have enough space for the '.segno' */
1533                 fullpath = (char *) palloc(strlen(path) + 12);
1534                 sprintf(fullpath, "%s.%u", path, segno);
1535                 pfree(path);
1536         }
1537         else
1538                 fullpath = path;
1539
1540         return fullpath;
1541 }
1542
1543 /*
1544  * Open the specified segment of the relation,
1545  * and make a MdfdVec object for it.  Returns NULL on failure.
1546  */
1547 static MdfdVec *
1548 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1549                           int oflags)
1550 {
1551         MdfdVec    *v;
1552         int                     fd;
1553         char       *fullpath;
1554
1555         fullpath = _mdfd_segpath(reln, forknum, segno);
1556
1557         /* open the file */
1558         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1559
1560         pfree(fullpath);
1561
1562         if (fd < 0)
1563                 return NULL;
1564
1565         if (reln->smgr_transient)
1566                 FileSetTransient(fd);
1567
1568         /* allocate an mdfdvec entry for it */
1569         v = _fdvec_alloc();
1570
1571         /* fill the entry */
1572         v->mdfd_vfd = fd;
1573         v->mdfd_segno = segno;
1574         v->mdfd_chain = NULL;
1575         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1576
1577         /* all done */
1578         return v;
1579 }
1580
1581 /*
1582  *      _mdfd_getseg() -- Find the segment of the relation holding the
1583  *              specified block.
1584  *
1585  * If the segment doesn't exist, we ereport, return NULL, or create the
1586  * segment, according to "behavior".  Note: skipFsync is only used in the
1587  * EXTENSION_CREATE case.
1588  */
1589 static MdfdVec *
1590 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1591                          bool skipFsync, ExtensionBehavior behavior)
1592 {
1593         MdfdVec    *v = mdopen(reln, forknum, behavior);
1594         BlockNumber targetseg;
1595         BlockNumber nextsegno;
1596
1597         if (!v)
1598                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1599
1600         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1601         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1602         {
1603                 Assert(nextsegno == v->mdfd_segno + 1);
1604
1605                 if (v->mdfd_chain == NULL)
1606                 {
1607                         /*
1608                          * Normally we will create new segments only if authorized by the
1609                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1610                          * recovery, create segments anyway; this allows cases such as
1611                          * replaying WAL data that has a write into a high-numbered
1612                          * segment of a relation that was later deleted.  We want to go
1613                          * ahead and create the segments so we can finish out the replay.
1614                          *
1615                          * We have to maintain the invariant that segments before the last
1616                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1617                          * with zeroes if needed.  (This only matters if caller is
1618                          * extending the relation discontiguously, but that can happen in
1619                          * hash indexes.)
1620                          */
1621                         if (behavior == EXTENSION_CREATE || InRecovery)
1622                         {
1623                                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
1624                                 {
1625                                         char       *zerobuf = palloc0(BLCKSZ);
1626
1627                                         mdextend(reln, forknum,
1628                                                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1629                                                          zerobuf, skipFsync);
1630                                         pfree(zerobuf);
1631                                 }
1632                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
1633                         }
1634                         else
1635                         {
1636                                 /* We won't create segment if not existent */
1637                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
1638                         }
1639                         if (v->mdfd_chain == NULL)
1640                         {
1641                                 if (behavior == EXTENSION_RETURN_NULL &&
1642                                         FILE_POSSIBLY_DELETED(errno))
1643                                         return NULL;
1644                                 ereport(ERROR,
1645                                                 (errcode_for_file_access(),
1646                                    errmsg("could not open file \"%s\" (target block %u): %m",
1647                                                   _mdfd_segpath(reln, forknum, nextsegno),
1648                                                   blkno)));
1649                         }
1650                 }
1651                 v = v->mdfd_chain;
1652         }
1653         return v;
1654 }
1655
1656 /*
1657  * Get number of blocks present in a single disk file
1658  */
1659 static BlockNumber
1660 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1661 {
1662         off_t           len;
1663
1664         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1665         if (len < 0)
1666                 ereport(ERROR,
1667                                 (errcode_for_file_access(),
1668                                  errmsg("could not seek to end of file \"%s\": %m",
1669                                                 FilePathName(seg->mdfd_vfd))));
1670         /* note that this calculation will ignore any partial block at EOF */
1671         return (BlockNumber) (len / BLCKSZ);
1672 }