granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * md.c
   4  *        This code manages relations that reside on magnetic disk.
   5  *
   6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.143 2009/01/01 17:23:48 momjian Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <unistd.h>
  18 #include <fcntl.h>
  19 #include <sys/file.h>
  20
  21 #include "catalog/catalog.h"
  22 #include "miscadmin.h"
  23 #include "postmaster/bgwriter.h"
  24 #include "storage/fd.h"
  25 #include "storage/bufmgr.h"
  26 #include "storage/relfilenode.h"
  27 #include "storage/smgr.h"
  28 #include "utils/hsearch.h"
  29 #include "utils/memutils.h"
  30 #include "pg_trace.h"
  31
  32
/* interval for calling AbsorbFsyncRequests in mdsync */
#define FSYNCS_PER_ABSORB               10

/*
 * special values for the segno arg to RememberFsyncRequest
 *
 * These occupy the top three values of the BlockNumber range
 * (InvalidBlockNumber and the two values just below it), so a segno equal
 * to one of them is a request code rather than a segment number.
 */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
  40
  41 /*
  42  * On Windows, we have to interpret EACCES as possibly meaning the same as
  43  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  44  * that's what you get.  Ugh.  This code is designed so that we don't
  45  * actually believe these cases are okay without further evidence (namely,
  46  * a pending fsync request getting revoked ... see mdsync).
  47  */
  48 #ifndef WIN32
  49 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
  50 #else
  51 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
  52 #endif
  53
  54 /*
  55  *      The magnetic disk storage manager keeps track of open file
  56  *      descriptors in its own descriptor pool.  This is done to make it
  57  *      easier to support relations that are larger than the operating
  58  *      system's file size limit (often 2GBytes).  In order to do that,
  59  *      we break relations up into "segment" files that are each shorter than
  60  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
  61  *      configuration constant in pg_config.h.
  62  *
  63  *      On disk, a relation must consist of consecutively numbered segment
  64  *      files in the pattern
  65  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
  66  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
  67  *              -- Optionally, any number of inactive segments of size 0 blocks.
  68  *      The full and partial segments are collectively the "active" segments.
  69  *      Inactive segments are those that once contained data but are currently
  70  *      not needed because of an mdtruncate() operation.  The reason for leaving
  71  *      them present at size zero, rather than unlinking them, is that other
  72  *      backends and/or the bgwriter might be holding open file references to
  73  *      such segments.  If the relation expands again after mdtruncate(), such
  74  *      that a deactivated segment becomes active again, it is important that
  75  *      such file references still be valid --- else data might get written
  76  *      out to an unlinked old copy of a segment file that will eventually
  77  *      disappear.
  78  *
  79  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
  80  *      cache is, therefore, just the head of a list of MdfdVec objects, one
  81  *      per segment.  But note the md_fd pointer can be NULL, indicating
  82  *      relation not open.
  83  *
  84  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
  85  *      doesn't have another segment after this one; we may just not have
  86  *      opened the next segment yet.  (We could not have "all segments are
  87  *      in the chain" as an invariant anyway, since another backend could
  88  *      extend the relation when we weren't looking.)  We do not make chain
  89  *      entries for inactive segments, however; as soon as we find a partial
  90  *      segment, we assume that any subsequent segments are inactive.
  91  *
  92  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
  93  */
  94
  95 typedef struct _MdfdVec
  96 {
  97         File            mdfd_vfd;               /* fd number in fd.c's pool */
  98         BlockNumber mdfd_segno;         /* segment number, from 0 */
  99         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
 100 } MdfdVec;
 101
 102 static MemoryContext MdCxt;             /* context for all md.c allocations */
 103
 104
 105 /*
 106  * In some contexts (currently, standalone backends and the bgwriter process)
 107  * we keep track of pending fsync operations: we need to remember all relation
 108  * segments that have been written since the last checkpoint, so that we can
 109  * fsync them down to disk before completing the next checkpoint.  This hash
 110  * table remembers the pending operations.      We use a hash table mostly as
 111  * a convenient way of eliminating duplicate requests.
 112  *
 113  * We use a similar mechanism to remember no-longer-needed files that can
 114  * be deleted after the next checkpoint, but we use a linked list instead of
 115  * a hash table, because we don't expect there to be any duplicate requests.
 116  *
 117  * (Regular backends do not track pending operations locally, but forward
 118  * them to the bgwriter.)
 119  */
/*
 * Identifies one relation segment that has a pending operation.  This is
 * the hash key of pendingOpsTable (mdinit sets keysize to sizeof this
 * struct and hashes it with tag_hash).
 */
typedef struct
{
        RelFileNode rnode;                      /* the targeted relation */
        ForkNumber forknum;                     /* which fork of that relation */
        BlockNumber segno;                      /* which segment */
} PendingOperationTag;
 126
/*
 * Counter type used to stamp requests with the sync/checkpoint cycle in
 * which they were made.
 */
typedef uint16 CycleCtr;                /* can be any convenient integer size */

/* One entry of pendingOpsTable: a remembered operation on one segment. */
typedef struct
{
        PendingOperationTag tag;        /* hash table key (must be first!) */
        bool            canceled;               /* T => request canceled, not yet removed */
        CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;
 135
/*
 * A remembered request to delete a no-longer-needed relation file after
 * the next checkpoint; presumably these are the elements kept in the
 * pendingUnlinks list (confirm against register_unlink).
 */
typedef struct
{
        RelFileNode rnode;                      /* the dead relation to delete */
        CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;
 141
/*
 * Pending-operations hash table.  mdinit() creates it only in processes
 * that track pending operations locally; elsewhere it stays NULL.
 */
static HTAB *pendingOpsTable = NULL;
/* List of pending unlink requests; mdinit() resets it to NIL. */
static List *pendingUnlinks = NIL;

/*
 * Cycle counters.  Their current values are recorded in the cycle_ctr
 * field of PendingOperationEntry (mdsync_cycle_ctr) and PendingUnlinkEntry
 * (mdckpt_cycle_ctr) when a request is made.
 */
static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
 147
 148
/*
 * What to do when a needed segment file is not present.  Note that mdopen()
 * treats EXTENSION_CREATE like EXTENSION_FAIL: it will not create the first
 * segment of a relation that does not exist (outside bootstrap mode).
 */
typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
{
        EXTENSION_FAIL,                         /* ereport if segment not present */
        EXTENSION_RETURN_NULL,          /* return NULL if not present */
        EXTENSION_CREATE                        /* create new segments as needed */
} ExtensionBehavior;
 155
/*
 * local routines
 *
 * Forward declarations of the file-private (static) helpers used by the
 * public md*() entry points.
 */
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
                                           ExtensionBehavior behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
                                                                   MdfdVec *seg);
static void register_unlink(RelFileNode rnode);
static MdfdVec *_fdvec_alloc(void);
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
                                                          BlockNumber segno, int oflags);
static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
                         BlockNumber blkno, bool isTemp, ExtensionBehavior behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
                                                          MdfdVec *seg);
 169
 170
 171 /*
 172  *      mdinit() -- Initialize private state for magnetic disk storage manager.
 173  */
 174 void
 175 mdinit(void)
 176 {
 177         MdCxt = AllocSetContextCreate(TopMemoryContext,
 178                                                                   "MdSmgr",
 179                                                                   ALLOCSET_DEFAULT_MINSIZE,
 180                                                                   ALLOCSET_DEFAULT_INITSIZE,
 181                                                                   ALLOCSET_DEFAULT_MAXSIZE);
 182
 183         /*
 184          * Create pending-operations hashtable if we need it.  Currently, we need
 185          * it if we are standalone (not under a postmaster) OR if we are a
 186          * bootstrap-mode subprocess of a postmaster (that is, a startup or
 187          * bgwriter process).
 188          */
 189         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
 190         {
 191                 HASHCTL         hash_ctl;
 192
 193                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 194                 hash_ctl.keysize = sizeof(PendingOperationTag);
 195                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
 196                 hash_ctl.hash = tag_hash;
 197                 hash_ctl.hcxt = MdCxt;
 198                 pendingOpsTable = hash_create("Pending Ops Table",
 199                                                                           100L,
 200                                                                           &hash_ctl,
 201                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 202                 pendingUnlinks = NIL;
 203         }
 204 }
 205
 206 /*
 207  *  mdexists() -- Does the physical file exist?
 208  *
 209  * Note: this will return true for lingering files, with pending deletions
 210  */
 211 bool
 212 mdexists(SMgrRelation reln, ForkNumber forkNum)
 213 {
 214         /*
 215          * Close it first, to ensure that we notice if the fork has been
 216          * unlinked since we opened it.
 217          */
 218         mdclose(reln, forkNum);
 219
 220         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
 221 }
 222
 223 /*
 224  *      mdcreate() -- Create a new relation on magnetic disk.
 225  *
 226  * If isRedo is true, it's okay for the relation to exist already.
 227  */
 228 void
 229 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 230 {
 231         char       *path;
 232         File            fd;
 233
 234         if (isRedo && reln->md_fd[forkNum] != NULL)
 235                 return;                                 /* created and opened already... */
 236
 237         Assert(reln->md_fd[forkNum] == NULL);
 238
 239         path = relpath(reln->smgr_rnode, forkNum);
 240
 241         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 242
 243         if (fd < 0)
 244         {
 245                 int                     save_errno = errno;
 246
 247                 /*
 248                  * During bootstrap, there are cases where a system relation will be
 249                  * accessed (by internal backend processes) before the bootstrap
 250                  * script nominally creates it.  Therefore, allow the file to exist
 251                  * already, even if isRedo is not set.  (See also mdopen)
 252                  */
 253                 if (isRedo || IsBootstrapProcessingMode())
 254                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 255                 if (fd < 0)
 256                 {
 257                         /* be sure to report the error reported by create, not open */
 258                         errno = save_errno;
 259                         ereport(ERROR,
 260                                         (errcode_for_file_access(),
 261                                          errmsg("could not create relation %s: %m", path)));
 262                 }
 263         }
 264
 265         pfree(path);
 266
 267         reln->md_fd[forkNum] = _fdvec_alloc();
 268
 269         reln->md_fd[forkNum]->mdfd_vfd = fd;
 270         reln->md_fd[forkNum]->mdfd_segno = 0;
 271         reln->md_fd[forkNum]->mdfd_chain = NULL;
 272 }
 273
 274 /*
 275  *      mdunlink() -- Unlink a relation.
 276  *
 277  * Note that we're passed a RelFileNode --- by the time this is called,
 278  * there won't be an SMgrRelation hashtable entry anymore.
 279  *
 280  * Actually, we don't unlink the first segment file of the relation, but
 281  * just truncate it to zero length, and record a request to unlink it after
 282  * the next checkpoint.  Additional segments can be unlinked immediately,
 283  * however.  Leaving the empty file in place prevents that relfilenode
 284  * number from being reused.  The scenario this protects us from is:
 285  * 1. We delete a relation (and commit, and actually remove its file).
 286  * 2. We create a new relation, which by chance gets the same relfilenode as
 287  *        the just-deleted one (OIDs must've wrapped around for that to happen).
 288  * 3. We crash before another checkpoint occurs.
 289  * During replay, we would delete the file and then recreate it, which is fine
 290  * if the contents of the file were repopulated by subsequent WAL entries.
 291  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 292  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 293  * the contents of the file would be lost forever.      By leaving the empty file
 294  * until after the next checkpoint, we prevent reassignment of the relfilenode
 295  * number until it's safe, because relfilenode assignment skips over any
 296  * existing file.
 297  *
 298  * If isRedo is true, it's okay for the relation to be already gone.
 299  * Also, we should remove the file immediately instead of queuing a request
 300  * for later, since during redo there's no possibility of creating a
 301  * conflicting relation.
 302  *
 303  * Note: any failure should be reported as WARNING not ERROR, because
 304  * we are usually not in a transaction anymore when this is called.
 305  */
 306 void
 307 mdunlink(RelFileNode rnode, ForkNumber forkNum, bool isRedo)
 308 {
 309         char       *path;
 310         int                     ret;
 311
 312         /*
 313          * We have to clean out any pending fsync requests for the doomed
 314          * relation, else the next mdsync() will fail.
 315          */
 316         ForgetRelationFsyncRequests(rnode, forkNum);
 317
 318         path = relpath(rnode, forkNum);
 319
 320         /*
 321          * Delete or truncate the first segment.
 322          */
 323         if (isRedo || forkNum != MAIN_FORKNUM)
 324                 ret = unlink(path);
 325         else
 326         {
 327                 /* truncate(2) would be easier here, but Windows hasn't got it */
 328                 int                     fd;
 329
 330                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
 331                 if (fd >= 0)
 332                 {
 333                         int                     save_errno;
 334
 335                         ret = ftruncate(fd, 0);
 336                         save_errno = errno;
 337                         close(fd);
 338                         errno = save_errno;
 339                 }
 340                 else
 341                         ret = -1;
 342         }
 343         if (ret < 0)
 344         {
 345                 if (!isRedo || errno != ENOENT)
 346                         ereport(WARNING,
 347                                         (errcode_for_file_access(),
 348                                          errmsg("could not remove relation %s: %m", path)));
 349         }
 350
 351         /*
 352          * Delete any additional segments.
 353          */
 354         else
 355         {
 356                 char       *segpath = (char *) palloc(strlen(path) + 12);
 357                 BlockNumber segno;
 358
 359                 /*
 360                  * Note that because we loop until getting ENOENT, we will correctly
 361                  * remove all inactive segments as well as active ones.
 362                  */
 363                 for (segno = 1;; segno++)
 364                 {
 365                         sprintf(segpath, "%s.%u", path, segno);
 366                         if (unlink(segpath) < 0)
 367                         {
 368                                 /* ENOENT is expected after the last segment... */
 369                                 if (errno != ENOENT)
 370                                         ereport(WARNING,
 371                                                         (errcode_for_file_access(),
 372                                                          errmsg("could not remove segment %u of relation %s: %m",
 373                                                                         segno, path)));
 374                                 break;
 375                         }
 376                 }
 377                 pfree(segpath);
 378         }
 379
 380         pfree(path);
 381
 382         /* Register request to unlink first segment later */
 383         if (!isRedo && forkNum == MAIN_FORKNUM)
 384                 register_unlink(rnode);
 385 }
 386
 387 /*
 388  *      mdextend() -- Add a block to the specified relation.
 389  *
 390  *              The semantics are nearly the same as mdwrite(): write at the
 391  *              specified position.  However, this is to be used for the case of
 392  *              extending a relation (i.e., blocknum is at or beyond the current
 393  *              EOF).  Note that we assume writing a block beyond current EOF
 394  *              causes intervening file space to become filled with zeroes.
 395  */
 396 void
 397 mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 398                  char *buffer, bool isTemp)
 399 {
 400         off_t           seekpos;
 401         int                     nbytes;
 402         MdfdVec    *v;
 403
 404         /* This assert is too expensive to have on normally ... */
 405 #ifdef CHECK_WRITE_VS_EXTEND
 406         Assert(blocknum >= mdnblocks(reln, forknum));
 407 #endif
 408
 409         /*
 410          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
 411          * more --- we mustn't create a block whose number actually is
 412          * InvalidBlockNumber.
 413          */
 414         if (blocknum == InvalidBlockNumber)
 415                 ereport(ERROR,
 416                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 417                                  errmsg("cannot extend relation %s beyond %u blocks",
 418                                                 relpath(reln->smgr_rnode, forknum),
 419                                                 InvalidBlockNumber)));
 420
 421         v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_CREATE);
 422
 423         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 424         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 425
 426         /*
 427          * Note: because caller usually obtained blocknum by calling mdnblocks,
 428          * which did a seek(SEEK_END), this seek is often redundant and will be
 429          * optimized away by fd.c.      It's not redundant, however, if there is a
 430          * partial page at the end of the file. In that case we want to try to
 431          * overwrite the partial page with a full page.  It's also not redundant
 432          * if bufmgr.c had to dump another buffer of the same file to make room
 433          * for the new page's buffer.
 434          */
 435         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 436                 ereport(ERROR,
 437                                 (errcode_for_file_access(),
 438                                  errmsg("could not seek to block %u of relation %s: %m",
 439                                                 blocknum,
 440                                                 relpath(reln->smgr_rnode, forknum))));
 441
 442         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 443         {
 444                 if (nbytes < 0)
 445                         ereport(ERROR,
 446                                         (errcode_for_file_access(),
 447                                          errmsg("could not extend relation %s: %m",
 448                                                         relpath(reln->smgr_rnode, forknum)),
 449                                          errhint("Check free disk space.")));
 450                 /* short write: complain appropriately */
 451                 ereport(ERROR,
 452                                 (errcode(ERRCODE_DISK_FULL),
 453                                  errmsg("could not extend relation %s: wrote only %d of %d bytes at block %u",
 454                                                 relpath(reln->smgr_rnode, forknum),
 455                                                 nbytes, BLCKSZ, blocknum),
 456                                  errhint("Check free disk space.")));
 457         }
 458
 459         if (!isTemp)
 460                 register_dirty_segment(reln, forknum, v);
 461
 462         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 463 }
 464
 465 /*
 466  *      mdopen() -- Open the specified relation.
 467  *
 468  * Note we only open the first segment, when there are multiple segments.
 469  *
 470  * If first segment is not present, either ereport or return NULL according
 471  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 472  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 473  * invent one out of whole cloth.
 474  */
 475 static MdfdVec *
 476 mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 477 {
 478         MdfdVec    *mdfd;
 479         char       *path;
 480         File            fd;
 481
 482         /* No work if already open */
 483         if (reln->md_fd[forknum])
 484                 return reln->md_fd[forknum];
 485
 486         path = relpath(reln->smgr_rnode, forknum);
 487
 488         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 489
 490         if (fd < 0)
 491         {
 492                 /*
 493                  * During bootstrap, there are cases where a system relation will be
 494                  * accessed (by internal backend processes) before the bootstrap
 495                  * script nominally creates it.  Therefore, accept mdopen() as a
 496                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
 497                  */
 498                 if (IsBootstrapProcessingMode())
 499                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 500                 if (fd < 0)
 501                 {
 502                         if (behavior == EXTENSION_RETURN_NULL &&
 503                                 FILE_POSSIBLY_DELETED(errno))
 504                         {
 505                                 pfree(path);
 506                                 return NULL;
 507                         }
 508                         ereport(ERROR,
 509                                         (errcode_for_file_access(),
 510                                          errmsg("could not open relation %s: %m", path)));
 511                 }
 512         }
 513
 514         pfree(path);
 515
 516         reln->md_fd[forknum] = mdfd = _fdvec_alloc();
 517
 518         mdfd->mdfd_vfd = fd;
 519         mdfd->mdfd_segno = 0;
 520         mdfd->mdfd_chain = NULL;
 521         Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 522
 523         return mdfd;
 524 }
 525
 526 /*
 527  *      mdclose() -- Close the specified relation, if it isn't closed already.
 528  */
 529 void
 530 mdclose(SMgrRelation reln, ForkNumber forknum)
 531 {
 532         MdfdVec    *v = reln->md_fd[forknum];
 533
 534         /* No work if already closed */
 535         if (v == NULL)
 536                 return;
 537
 538         reln->md_fd[forknum] = NULL;                    /* prevent dangling pointer after error */
 539
 540         while (v != NULL)
 541         {
 542                 MdfdVec    *ov = v;
 543
 544                 /* if not closed already */
 545                 if (v->mdfd_vfd >= 0)
 546                         FileClose(v->mdfd_vfd);
 547                 /* Now free vector */
 548                 v = v->mdfd_chain;
 549                 pfree(ov);
 550         }
 551 }
 552
 553 /*
 554  *      mdread() -- Read the specified block from a relation.
 555  */
 556 void
 557 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 558            char *buffer)
 559 {
 560         off_t           seekpos;
 561         int                     nbytes;
 562         MdfdVec    *v;
 563
 564         TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode);
 565
 566         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
 567
 568         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 569         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 570
 571         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 572                 ereport(ERROR,
 573                                 (errcode_for_file_access(),
 574                                  errmsg("could not seek to block %u of relation %s: %m",
 575                                                 blocknum, relpath(reln->smgr_rnode, forknum))));
 576
 577         nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
 578
 579         TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, relpath(reln->smgr_rnode, forknum), nbytes, BLCKSZ);
 580
 581         if (nbytes != BLCKSZ)
 582         {
 583                 if (nbytes < 0)
 584                         ereport(ERROR,
 585                                         (errcode_for_file_access(),
 586                                    errmsg("could not read block %u of relation %s: %m",
 587                                                   blocknum, relpath(reln->smgr_rnode, forknum))));
 588
 589                 /*
 590                  * Short read: we are at or past EOF, or we read a partial block at
 591                  * EOF.  Normally this is an error; upper levels should never try to
 592                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
 593                  * we are InRecovery, we should instead return zeroes without
 594                  * complaining.  This allows, for example, the case of trying to
 595                  * update a block that was later truncated away.
 596                  */
 597                 if (zero_damaged_pages || InRecovery)
 598                         MemSet(buffer, 0, BLCKSZ);
 599                 else
 600                         ereport(ERROR,
 601                                         (errcode(ERRCODE_DATA_CORRUPTED),
 602                                          errmsg("could not read block %u of relation %s: read only %d of %d bytes",
 603                                                         blocknum, relpath(reln->smgr_rnode, forknum),
 604                                                         nbytes, BLCKSZ)));
 605         }
 606 }
 607
 608 /*
 609  *      mdwrite() -- Write the supplied block at the appropriate location.
 610  *
 611  *              This is to be used only for updating already-existing blocks of a
 612  *              relation (ie, those before the current EOF).  To extend a relation,
 613  *              use mdextend().
 614  */
 615 void
 616 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 617                 char *buffer, bool isTemp)
 618 {
 619         off_t           seekpos;
 620         int                     nbytes;
 621         MdfdVec    *v;
 622
 623         /* This assert is too expensive to have on normally ... */
 624 #ifdef CHECK_WRITE_VS_EXTEND
 625         Assert(blocknum < mdnblocks(reln, forknum));
 626 #endif
 627
 628         TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode);
 629
 630         v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_FAIL);
 631
 632         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 633         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 634
 635         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 636                 ereport(ERROR,
 637                                 (errcode_for_file_access(),
 638                                  errmsg("could not seek to block %u of relation %s: %m",
 639                                                 blocknum, relpath(reln->smgr_rnode, forknum))));
 640
 641         nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
 642
 643         TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, relpath(reln->smgr_rnode, forknum), nbytes, BLCKSZ);
 644
 645         if (nbytes != BLCKSZ)
 646         {
 647                 if (nbytes < 0)
 648                         ereport(ERROR,
 649                                         (errcode_for_file_access(),
 650                                   errmsg("could not write block %u of relation %s: %m",
 651                                                  blocknum, relpath(reln->smgr_rnode, forknum))));
 652                 /* short write: complain appropriately */
 653                 ereport(ERROR,
 654                                 (errcode(ERRCODE_DISK_FULL),
 655                                  errmsg("could not write block %u of relation %s: wrote only %d of %d bytes",
 656                                                 blocknum,
 657                                                 relpath(reln->smgr_rnode, forknum),
 658                                                 nbytes, BLCKSZ),
 659                                  errhint("Check free disk space.")));
 660         }
 661
 662         if (!isTemp)
 663                 register_dirty_segment(reln, forknum, v);
 664 }
 665
 666 /*
 667  *      mdnblocks() -- Get the number of blocks stored in a relation.
 668  *
 669  *              Important side effect: all active segments of the relation are opened
 670  *              and added to the mdfd_chain list.  If this routine has not been
 671  *              called, then only segments up to the last one actually touched
 672  *              are present in the chain.
 673  */
 674 BlockNumber
 675 mdnblocks(SMgrRelation reln, ForkNumber forknum)
 676 {
 677         MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
 678         BlockNumber nblocks;
 679         BlockNumber segno = 0;
 680
 681         /*
 682          * Skip through any segments that aren't the last one, to avoid redundant
 683          * seeks on them.  We have previously verified that these segments are
 684          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
 685          *
 686          * NOTE: this assumption could only be wrong if another backend has
 687          * truncated the relation.      We rely on higher code levels to handle that
 688          * scenario by closing and re-opening the md fd, which is handled via
 689          * relcache flush.      (Since the bgwriter doesn't participate in relcache
 690          * flush, it could have segment chain entries for inactive segments;
 691          * that's OK because the bgwriter never needs to compute relation size.)
 692          */
 693         while (v->mdfd_chain != NULL)
 694         {
 695                 segno++;
 696                 v = v->mdfd_chain;
 697         }
 698
 699         for (;;)
 700         {
 701                 nblocks = _mdnblocks(reln, forknum, v);
 702                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
 703                         elog(FATAL, "segment too big");
 704                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
 705                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
 706
 707                 /*
 708                  * If segment is exactly RELSEG_SIZE, advance to next one.
 709                  */
 710                 segno++;
 711
 712                 if (v->mdfd_chain == NULL)
 713                 {
 714                         /*
 715                          * Because we pass O_CREAT, we will create the next segment (with
 716                          * zero length) immediately, if the last segment is of length
 717                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
 718                          * the logic simple.
 719                          */
 720                         v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
 721                         if (v->mdfd_chain == NULL)
 722                                 ereport(ERROR,
 723                                                 (errcode_for_file_access(),
 724                                  errmsg("could not open segment %u of relation %s: %m",
 725                                                 segno,
 726                                                 relpath(reln->smgr_rnode, forknum))));
 727                 }
 728
 729                 v = v->mdfd_chain;
 730         }
 731 }
 732
 733 /*
 734  *      mdtruncate() -- Truncate relation to specified number of blocks.
 735  */
 736 void
 737 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks,
 738                    bool isTemp)
 739 {
 740         MdfdVec    *v;
 741         BlockNumber curnblk;
 742         BlockNumber priorblocks;
 743
 744         /*
 745          * NOTE: mdnblocks makes sure we have opened all active segments, so that
 746          * truncation loop will get them all!
 747          */
 748         curnblk = mdnblocks(reln, forknum);
 749         if (nblocks > curnblk)
 750         {
 751                 /* Bogus request ... but no complaint if InRecovery */
 752                 if (InRecovery)
 753                         return;
 754                 ereport(ERROR,
 755                                 (errmsg("could not truncate relation %s to %u blocks: it's only %u blocks now",
 756                                                 relpath(reln->smgr_rnode, forknum),
 757                                                 nblocks, curnblk)));
 758         }
 759         if (nblocks == curnblk)
 760                 return;                                 /* no work */
 761
 762         v = mdopen(reln, forknum, EXTENSION_FAIL);
 763
 764         priorblocks = 0;
 765         while (v != NULL)
 766         {
 767                 MdfdVec    *ov = v;
 768
 769                 if (priorblocks > nblocks)
 770                 {
 771                         /*
 772                          * This segment is no longer active (and has already been unlinked
 773                          * from the mdfd_chain). We truncate the file, but do not delete
 774                          * it, for reasons explained in the header comments.
 775                          */
 776                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
 777                                 ereport(ERROR,
 778                                                 (errcode_for_file_access(),
 779                                                  errmsg("could not truncate relation %s to %u blocks: %m",
 780                                                                 relpath(reln->smgr_rnode, forknum),
 781                                                                 nblocks)));
 782                         if (!isTemp)
 783                                 register_dirty_segment(reln, forknum, v);
 784                         v = v->mdfd_chain;
 785                         Assert(ov != reln->md_fd[forknum]);     /* we never drop the 1st segment */
 786                         pfree(ov);
 787                 }
 788                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 789                 {
 790                         /*
 791                          * This is the last segment we want to keep. Truncate the file to
 792                          * the right length, and clear chain link that points to any
 793                          * remaining segments (which we shall zap). NOTE: if nblocks is
 794                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
 795                          * segment to 0 length but keep it. This adheres to the invariant
 796                          * given in the header comments.
 797                          */
 798                         BlockNumber lastsegblocks = nblocks - priorblocks;
 799
 800                         if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
 801                                 ereport(ERROR,
 802                                                 (errcode_for_file_access(),
 803                                                  errmsg("could not truncate relation %s to %u blocks: %m",
 804                                                                 relpath(reln->smgr_rnode, forknum),
 805                                                                 nblocks)));
 806                         if (!isTemp)
 807                                 register_dirty_segment(reln, forknum, v);
 808                         v = v->mdfd_chain;
 809                         ov->mdfd_chain = NULL;
 810                 }
 811                 else
 812                 {
 813                         /*
 814                          * We still need this segment and 0 or more blocks beyond it, so
 815                          * nothing to do here.
 816                          */
 817                         v = v->mdfd_chain;
 818                 }
 819                 priorblocks += RELSEG_SIZE;
 820         }
 821 }
 822
 823 /*
 824  *      mdimmedsync() -- Immediately sync a relation to stable storage.
 825  *
 826  * Note that only writes already issued are synced; this routine knows
 827  * nothing of dirty buffers that may exist inside the buffer manager.
 828  */
 829 void
 830 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 831 {
 832         MdfdVec    *v;
 833         BlockNumber curnblk;
 834
 835         /*
 836          * NOTE: mdnblocks makes sure we have opened all active segments, so that
 837          * fsync loop will get them all!
 838          */
 839         curnblk = mdnblocks(reln, forknum);
 840
 841         v = mdopen(reln, forknum, EXTENSION_FAIL);
 842
 843         while (v != NULL)
 844         {
 845                 if (FileSync(v->mdfd_vfd) < 0)
 846                         ereport(ERROR,
 847                                         (errcode_for_file_access(),
 848                                          errmsg("could not fsync segment %u of relation %s: %m",
 849                                                         v->mdfd_segno,
 850                                                         relpath(reln->smgr_rnode, forknum))));
 851                 v = v->mdfd_chain;
 852         }
 853 }
 854
 855 /*
 856  *      mdsync() -- Sync previous writes to stable storage.
 857  */
 858 void
 859 mdsync(void)
 860 {
 861         static bool mdsync_in_progress = false;
 862
 863         HASH_SEQ_STATUS hstat;
 864         PendingOperationEntry *entry;
 865         int                     absorb_counter;
 866
 867         /*
 868          * This is only called during checkpoints, and checkpoints should only
 869          * occur in processes that have created a pendingOpsTable.
 870          */
 871         if (!pendingOpsTable)
 872                 elog(ERROR, "cannot sync without a pendingOpsTable");
 873
 874         /*
 875          * If we are in the bgwriter, the sync had better include all fsync
 876          * requests that were queued by backends up to this point.      The tightest
 877          * race condition that could occur is that a buffer that must be written
 878          * and fsync'd for the checkpoint could have been dumped by a backend just
 879          * before it was visited by BufferSync().  We know the backend will have
 880          * queued an fsync request before clearing the buffer's dirtybit, so we
 881          * are safe as long as we do an Absorb after completing BufferSync().
 882          */
 883         AbsorbFsyncRequests();
 884
 885         /*
 886          * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
 887          * checkpoint), we want to ignore fsync requests that are entered into the
 888          * hashtable after this point --- they should be processed next time,
 889          * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
 890          * ones: new ones will have cycle_ctr equal to the incremented value of
 891          * mdsync_cycle_ctr.
 892          *
 893          * In normal circumstances, all entries present in the table at this point
 894          * will have cycle_ctr exactly equal to the current (about to be old)
 895          * value of mdsync_cycle_ctr.  However, if we fail partway through the
 896          * fsync'ing loop, then older values of cycle_ctr might remain when we
 897          * come back here to try again.  Repeated checkpoint failures would
 898          * eventually wrap the counter around to the point where an old entry
 899          * might appear new, causing us to skip it, possibly allowing a checkpoint
 900          * to succeed that should not have.  To forestall wraparound, any time the
 901          * previous mdsync() failed to complete, run through the table and
 902          * forcibly set cycle_ctr = mdsync_cycle_ctr.
 903          *
 904          * Think not to merge this loop with the main loop, as the problem is
 905          * exactly that that loop may fail before having visited all the entries.
 906          * From a performance point of view it doesn't matter anyway, as this path
 907          * will never be taken in a system that's functioning normally.
 908          */
 909         if (mdsync_in_progress)
 910         {
 911                 /* prior try failed, so update any stale cycle_ctr values */
 912                 hash_seq_init(&hstat, pendingOpsTable);
 913                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 914                 {
 915                         entry->cycle_ctr = mdsync_cycle_ctr;
 916                 }
 917         }
 918
 919         /* Advance counter so that new hashtable entries are distinguishable */
 920         mdsync_cycle_ctr++;
 921
 922         /* Set flag to detect failure if we don't reach the end of the loop */
 923         mdsync_in_progress = true;
 924
 925         /* Now scan the hashtable for fsync requests to process */
 926         absorb_counter = FSYNCS_PER_ABSORB;
 927         hash_seq_init(&hstat, pendingOpsTable);
 928         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 929         {
 930                 /*
 931                  * If the entry is new then don't process it this time.  Note that
 932                  * "continue" bypasses the hash-remove call at the bottom of the loop.
 933                  */
 934                 if (entry->cycle_ctr == mdsync_cycle_ctr)
 935                         continue;
 936
 937                 /* Else assert we haven't missed it */
 938                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
 939
 940                 /*
 941                  * If fsync is off then we don't have to bother opening the file at
 942                  * all.  (We delay checking until this point so that changing fsync on
 943                  * the fly behaves sensibly.)  Also, if the entry is marked canceled,
 944                  * fall through to delete it.
 945                  */
 946                 if (enableFsync && !entry->canceled)
 947                 {
 948                         int                     failures;
 949
 950                         /*
 951                          * If in bgwriter, we want to absorb pending requests every so
 952                          * often to prevent overflow of the fsync request queue.  It is
 953                          * unspecified whether newly-added entries will be visited by
 954                          * hash_seq_search, but we don't care since we don't need to
 955                          * process them anyway.
 956                          */
 957                         if (--absorb_counter <= 0)
 958                         {
 959                                 AbsorbFsyncRequests();
 960                                 absorb_counter = FSYNCS_PER_ABSORB;
 961                         }
 962
 963                         /*
 964                          * The fsync table could contain requests to fsync segments that
 965                          * have been deleted (unlinked) by the time we get to them. Rather
 966                          * than just hoping an ENOENT (or EACCES on Windows) error can be
 967                          * ignored, what we do on error is absorb pending requests and
 968                          * then retry.  Since mdunlink() queues a "revoke" message before
 969                          * actually unlinking, the fsync request is guaranteed to be
 970                          * marked canceled after the absorb if it really was this case.
 971                          * DROP DATABASE likewise has to tell us to forget fsync requests
 972                          * before it starts deletions.
 973                          */
 974                         for (failures = 0;; failures++)         /* loop exits at "break" */
 975                         {
 976                                 SMgrRelation reln;
 977                                 MdfdVec    *seg;
 978                                 char       *path;
 979
 980                                 /*
 981                                  * Find or create an smgr hash entry for this relation. This
 982                                  * may seem a bit unclean -- md calling smgr?  But it's really
 983                                  * the best solution.  It ensures that the open file reference
 984                                  * isn't permanently leaked if we get an error here. (You may
 985                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
 986                                  * really, because the only case in which a checkpoint is done
 987                                  * by a process that isn't about to shut down is in the
 988                                  * bgwriter, and it will periodically do smgrcloseall(). This
 989                                  * fact justifies our not closing the reln in the success path
 990                                  * either, which is a good thing since in non-bgwriter cases
 991                                  * we couldn't safely do that.)  Furthermore, in many cases
 992                                  * the relation will have been dirtied through this same smgr
 993                                  * relation, and so we can save a file open/close cycle.
 994                                  */
 995                                 reln = smgropen(entry->tag.rnode);
 996
 997                                 /*
 998                                  * It is possible that the relation has been dropped or
 999                                  * truncated since the fsync request was entered.  Therefore,
1000                                  * allow ENOENT, but only if we didn't fail already on this
1001                                  * file.  This applies both during _mdfd_getseg() and during
1002                                  * FileSync, since fd.c might have closed the file behind our
1003                                  * back.
1004                                  */
1005                                 seg = _mdfd_getseg(reln, entry->tag.forknum,
1006                                                           entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
1007                                                                    false, EXTENSION_RETURN_NULL);
1008                                 if (seg != NULL &&
1009                                         FileSync(seg->mdfd_vfd) >= 0)
1010                                         break;          /* success; break out of retry loop */
1011
1012                                 /*
1013                                  * XXX is there any point in allowing more than one retry?
1014                                  * Don't see one at the moment, but easy to change the test
1015                                  * here if so.
1016                                  */
1017                                 path = relpath(entry->tag.rnode, entry->tag.forknum);
1018                                 if (!FILE_POSSIBLY_DELETED(errno) ||
1019                                         failures > 0)
1020                                         ereport(ERROR,
1021                                                         (errcode_for_file_access(),
1022                                                          errmsg("could not fsync segment %u of relation %s: %m",
1023                                                                         entry->tag.segno, path)));
1024                                 else
1025                                         ereport(DEBUG1,
1026                                                         (errcode_for_file_access(),
1027                                                          errmsg("could not fsync segment %u of relation %s but retrying: %m",
1028                                                                         entry->tag.segno, path)));
1029                                 pfree(path);
1030
1031                                 /*
1032                                  * Absorb incoming requests and check to see if canceled.
1033                                  */
1034                                 AbsorbFsyncRequests();
1035                                 absorb_counter = FSYNCS_PER_ABSORB;             /* might as well... */
1036
1037                                 if (entry->canceled)
1038                                         break;
1039                         }                                       /* end retry loop */
1040                 }
1041
1042                 /*
1043                  * If we get here, either we fsync'd successfully, or we don't have to
1044                  * because enableFsync is off, or the entry is (now) marked canceled.
1045                  * Okay to delete it.
1046                  */
1047                 if (hash_search(pendingOpsTable, &entry->tag,
1048                                                 HASH_REMOVE, NULL) == NULL)
1049                         elog(ERROR, "pendingOpsTable corrupted");
1050         }                                                       /* end loop over hashtable entries */
1051
1052         /* Flag successful completion of mdsync */
1053         mdsync_in_progress = false;
1054 }
1055
1056 /*
1057  * mdpreckpt() -- Do pre-checkpoint work
1058  *
1059  * To distinguish unlink requests that arrived before this checkpoint
1060  * started from those that arrived during the checkpoint, we use a cycle
1061  * counter similar to the one we use for fsync requests. That cycle
1062  * counter is incremented here.
1063  *
1064  * This must be called *before* the checkpoint REDO point is determined.
1065  * That ensures that we won't delete files too soon.
1066  *
1067  * Note that we can't do anything here that depends on the assumption
1068  * that the checkpoint will be completed.
1069  */
1070 void
1071 mdpreckpt(void)
1072 {
1073         ListCell   *cell;
1074
1075         /*
1076          * In case the prior checkpoint wasn't completed, stamp all entries in the
1077          * list with the current cycle counter.  Anything that's in the list at
1078          * the start of checkpoint can surely be deleted after the checkpoint is
1079          * finished, regardless of when the request was made.
1080          */
1081         foreach(cell, pendingUnlinks)
1082         {
1083                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1084
1085                 entry->cycle_ctr = mdckpt_cycle_ctr;
1086         }
1087
1088         /*
1089          * Any unlink requests arriving after this point will be assigned the next
1090          * cycle counter, and won't be unlinked until next checkpoint.
1091          */
1092         mdckpt_cycle_ctr++;
1093 }
1094
1095 /*
1096  * mdpostckpt() -- Do post-checkpoint work
1097  *
1098  * Remove any lingering files that can now be safely removed.
1099  */
1100 void
1101 mdpostckpt(void)
1102 {
1103         while (pendingUnlinks != NIL)
1104         {
1105                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1106                 char       *path;
1107
1108                 /*
1109                  * New entries are appended to the end, so if the entry is new we've
1110                  * reached the end of old entries.
1111                  */
1112                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1113                         break;
1114
1115                 /* Else assert we haven't missed it */
1116                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1117
1118                 /* Unlink the file */
1119                 path = relpath(entry->rnode, MAIN_FORKNUM);
1120                 if (unlink(path) < 0)
1121                 {
1122                         /*
1123                          * There's a race condition, when the database is dropped at the
1124                          * same time that we process the pending unlink requests. If the
1125                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1126                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1127                          * the possibility that we delete the file first.
1128                          */
1129                         if (errno != ENOENT)
1130                                 ereport(WARNING,
1131                                                 (errcode_for_file_access(),
1132                                                  errmsg("could not remove relation %s: %m", path)));
1133                 }
1134                 pfree(path);
1135
1136                 pendingUnlinks = list_delete_first(pendingUnlinks);
1137                 pfree(entry);
1138         }
1139 }
1140
1141 /*
1142  * register_dirty_segment() -- Mark a relation segment as needing fsync
1143  *
1144  * If there is a local pending-ops table, just make an entry in it for
1145  * mdsync to process later.  Otherwise, try to pass off the fsync request
1146  * to the background writer process.  If that fails, just do the fsync
1147  * locally before returning (we expect this will not happen often enough
1148  * to be a performance problem).
1149  */
1150 static void
1151 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1152 {
1153         if (pendingOpsTable)
1154         {
1155                 /* push it into local pending-ops table */
1156                 RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
1157         }
1158         else
1159         {
1160                 if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
1161                         return;                         /* passed it off successfully */
1162
1163                 if (FileSync(seg->mdfd_vfd) < 0)
1164                         ereport(ERROR,
1165                                         (errcode_for_file_access(),
1166                                          errmsg("could not fsync segment %u of relation %s: %m",
1167                                                         seg->mdfd_segno,
1168                                                         relpath(reln->smgr_rnode, forknum))));
1169         }
1170 }
1171
1172 /*
1173  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1174  *
1175  * As with register_dirty_segment, this could involve either a local or
1176  * a remote pending-ops table.
1177  */
1178 static void
1179 register_unlink(RelFileNode rnode)
1180 {
1181         if (pendingOpsTable)
1182         {
1183                 /* push it into local pending-ops table */
1184                 RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
1185         }
1186         else
1187         {
1188                 /*
1189                  * Notify the bgwriter about it.  If we fail to queue the request
1190                  * message, we have to sleep and try again, because we can't simply
1191                  * delete the file now.  Ugly, but hopefully won't happen often.
1192                  *
1193                  * XXX should we just leave the file orphaned instead?
1194                  */
1195                 Assert(IsUnderPostmaster);
1196                 while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
1197                                                                         UNLINK_RELATION_REQUEST))
1198                         pg_usleep(10000L);      /* 10 msec seems a good number */
1199         }
1200 }
1201
1202 /*
1203  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1204  *
1205  * We stuff most fsync requests into the local hash table for execution
1206  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1207  * separate linked list, however, because they get processed separately.
1208  *
1209  * The range of possible segment numbers is way less than the range of
1210  * BlockNumber, so we can reserve high values of segno for special purposes.
1211  * We define three:
1212  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1213  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1214  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1215  *       checkpoint.
1216  *
1217  * (Handling the FORGET_* requests is a tad slow because the hash table has
1218  * to be searched linearly, but it doesn't seem worth rethinking the table
1219  * structure for them.)
1220  */
1221 void
1222 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
1223 {
1224         Assert(pendingOpsTable);
1225
1226         if (segno == FORGET_RELATION_FSYNC)
1227         {
1228                 /* Remove any pending requests for the entire relation */
1229                 HASH_SEQ_STATUS hstat;
1230                 PendingOperationEntry *entry;
1231
1232                 hash_seq_init(&hstat, pendingOpsTable);
1233                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1234                 {
1235                         if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
1236                                 entry->tag.forknum == forknum)
1237                         {
1238                                 /* Okay, cancel this entry */
1239                                 entry->canceled = true;
1240                         }
1241                 }
1242         }
1243         else if (segno == FORGET_DATABASE_FSYNC)
1244         {
1245                 /* Remove any pending requests for the entire database */
1246                 HASH_SEQ_STATUS hstat;
1247                 PendingOperationEntry *entry;
1248                 ListCell   *cell,
1249                                    *prev,
1250                                    *next;
1251
1252                 /* Remove fsync requests */
1253                 hash_seq_init(&hstat, pendingOpsTable);
1254                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1255                 {
1256                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1257                         {
1258                                 /* Okay, cancel this entry */
1259                                 entry->canceled = true;
1260                         }
1261                 }
1262
1263                 /* Remove unlink requests */
1264                 prev = NULL;
1265                 for (cell = list_head(pendingUnlinks); cell; cell = next)
1266                 {
1267                         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1268
1269                         next = lnext(cell);
1270                         if (entry->rnode.dbNode == rnode.dbNode)
1271                         {
1272                                 pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
1273                                 pfree(entry);
1274                         }
1275                         else
1276                                 prev = cell;
1277                 }
1278         }
1279         else if (segno == UNLINK_RELATION_REQUEST)
1280         {
1281                 /* Unlink request: put it in the linked list */
1282                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1283                 PendingUnlinkEntry *entry;
1284
1285                 entry = palloc(sizeof(PendingUnlinkEntry));
1286                 entry->rnode = rnode;
1287                 entry->cycle_ctr = mdckpt_cycle_ctr;
1288
1289                 pendingUnlinks = lappend(pendingUnlinks, entry);
1290
1291                 MemoryContextSwitchTo(oldcxt);
1292         }
1293         else
1294         {
1295                 /* Normal case: enter a request to fsync this segment */
1296                 PendingOperationTag key;
1297                 PendingOperationEntry *entry;
1298                 bool            found;
1299
1300                 /* ensure any pad bytes in the hash key are zeroed */
1301                 MemSet(&key, 0, sizeof(key));
1302                 key.rnode = rnode;
1303                 key.forknum = forknum;
1304                 key.segno = segno;
1305
1306                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1307                                                                                                           &key,
1308                                                                                                           HASH_ENTER,
1309                                                                                                           &found);
1310                 /* if new or previously canceled entry, initialize it */
1311                 if (!found || entry->canceled)
1312                 {
1313                         entry->canceled = false;
1314                         entry->cycle_ctr = mdsync_cycle_ctr;
1315                 }
1316
1317                 /*
1318                  * NB: it's intentional that we don't change cycle_ctr if the entry
1319                  * already exists.      The fsync request must be treated as old, even
1320                  * though the new request will be satisfied too by any subsequent
1321                  * fsync.
1322                  *
1323                  * However, if the entry is present but is marked canceled, we should
1324                  * act just as though it wasn't there.  The only case where this could
1325                  * happen would be if a file had been deleted, we received but did not
1326                  * yet act on the cancel request, and the same relfilenode was then
1327                  * assigned to a new file.      We mustn't lose the new request, but it
1328                  * should be considered new not old.
1329                  */
1330         }
1331 }
1332
1333 /*
1334  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1335  */
1336 void
1337 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
1338 {
1339         if (pendingOpsTable)
1340         {
1341                 /* standalone backend or startup process: fsync state is local */
1342                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1343         }
1344         else if (IsUnderPostmaster)
1345         {
1346                 /*
1347                  * Notify the bgwriter about it.  If we fail to queue the revoke
1348                  * message, we have to sleep and try again ... ugly, but hopefully
1349                  * won't happen often.
1350                  *
1351                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1352                  * error would leave the no-longer-used file still present on disk,
1353                  * which would be bad, so I'm inclined to assume that the bgwriter
1354                  * will always empty the queue soon.
1355                  */
1356                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1357                         pg_usleep(10000L);      /* 10 msec seems a good number */
1358
1359                 /*
1360                  * Note we don't wait for the bgwriter to actually absorb the revoke
1361                  * message; see mdsync() for the implications.
1362                  */
1363         }
1364 }
1365
1366 /*
1367  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1368  */
1369 void
1370 ForgetDatabaseFsyncRequests(Oid dbid)
1371 {
1372         RelFileNode rnode;
1373
1374         rnode.dbNode = dbid;
1375         rnode.spcNode = 0;
1376         rnode.relNode = 0;
1377
1378         if (pendingOpsTable)
1379         {
1380                 /* standalone backend or startup process: fsync state is local */
1381                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1382         }
1383         else if (IsUnderPostmaster)
1384         {
1385                 /* see notes in ForgetRelationFsyncRequests */
1386                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1387                                                                         FORGET_DATABASE_FSYNC))
1388                         pg_usleep(10000L);      /* 10 msec seems a good number */
1389         }
1390 }
1391
1392
1393 /*
1394  *      _fdvec_alloc() -- Make a MdfdVec object.
1395  */
1396 static MdfdVec *
1397 _fdvec_alloc(void)
1398 {
1399         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1400 }
1401
1402 /*
1403  * Open the specified segment of the relation,
1404  * and make a MdfdVec object for it.  Returns NULL on failure.
1405  */
1406 static MdfdVec *
1407 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1408                           int oflags)
1409 {
1410         MdfdVec    *v;
1411         int                     fd;
1412         char       *path,
1413                            *fullpath;
1414
1415         path = relpath(reln->smgr_rnode, forknum);
1416
1417         if (segno > 0)
1418         {
1419                 /* be sure we have enough space for the '.segno' */
1420                 fullpath = (char *) palloc(strlen(path) + 12);
1421                 sprintf(fullpath, "%s.%u", path, segno);
1422                 pfree(path);
1423         }
1424         else
1425                 fullpath = path;
1426
1427         /* open the file */
1428         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1429
1430         pfree(fullpath);
1431
1432         if (fd < 0)
1433                 return NULL;
1434
1435         /* allocate an mdfdvec entry for it */
1436         v = _fdvec_alloc();
1437
1438         /* fill the entry */
1439         v->mdfd_vfd = fd;
1440         v->mdfd_segno = segno;
1441         v->mdfd_chain = NULL;
1442         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1443
1444         /* all done */
1445         return v;
1446 }
1447
1448 /*
1449  *      _mdfd_getseg() -- Find the segment of the relation holding the
1450  *              specified block.
1451  *
1452  * If the segment doesn't exist, we ereport, return NULL, or create the
1453  * segment, according to "behavior".  Note: isTemp need only be correct
1454  * in the EXTENSION_CREATE case.
1455  */
1456 static MdfdVec *
1457 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1458                          bool isTemp, ExtensionBehavior behavior)
1459 {
1460         MdfdVec    *v = mdopen(reln, forknum, behavior);
1461         BlockNumber targetseg;
1462         BlockNumber nextsegno;
1463
1464         if (!v)
1465                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1466
1467         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1468         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1469         {
1470                 Assert(nextsegno == v->mdfd_segno + 1);
1471
1472                 if (v->mdfd_chain == NULL)
1473                 {
1474                         /*
1475                          * Normally we will create new segments only if authorized by the
1476                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1477                          * recovery, create segments anyway; this allows cases such as
1478                          * replaying WAL data that has a write into a high-numbered
1479                          * segment of a relation that was later deleted.  We want to go
1480                          * ahead and create the segments so we can finish out the replay.
1481                          *
1482                          * We have to maintain the invariant that segments before the last
1483                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1484                          * with zeroes if needed.  (This only matters if caller is
1485                          * extending the relation discontiguously, but that can happen in
1486                          * hash indexes.)
1487                          */
1488                         if (behavior == EXTENSION_CREATE || InRecovery)
1489                         {
1490                                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
1491                                 {
1492                                         char       *zerobuf = palloc0(BLCKSZ);
1493
1494                                         mdextend(reln, forknum,
1495                                                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1496                                                          zerobuf, isTemp);
1497                                         pfree(zerobuf);
1498                                 }
1499                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
1500                         }
1501                         else
1502                         {
1503                                 /* We won't create segment if not existent */
1504                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
1505                         }
1506                         if (v->mdfd_chain == NULL)
1507                         {
1508                                 if (behavior == EXTENSION_RETURN_NULL &&
1509                                         FILE_POSSIBLY_DELETED(errno))
1510                                         return NULL;
1511                                 ereport(ERROR,
1512                                                 (errcode_for_file_access(),
1513                                                  errmsg("could not open segment %u of relation %s (target block %u): %m",
1514                                                                 nextsegno,
1515                                                                 relpath(reln->smgr_rnode, forknum),
1516                                                                 blkno)));
1517                         }
1518                 }
1519                 v = v->mdfd_chain;
1520         }
1521         return v;
1522 }
1523
1524 /*
1525  * Get number of blocks present in a single disk file
1526  */
1527 static BlockNumber
1528 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1529 {
1530         off_t           len;
1531
1532         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1533         if (len < 0)
1534                 ereport(ERROR,
1535                                 (errcode_for_file_access(),
1536                                  errmsg("could not seek to end of segment %u of relation %s: %m",
1537                                                 seg->mdfd_segno, relpath(reln->smgr_rnode, forknum))));
1538         /* note that this calculation will ignore any partial block at EOF */
1539         return (BlockNumber) (len / BLCKSZ);
1540 }