granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * md.c
   4  *        This code manages relations that reside on magnetic disk.
   5  *
   6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.132 2007/11/15 21:49:47 tgl Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <unistd.h>
  18 #include <fcntl.h>
  19 #include <sys/file.h>
  20
  21 #include "catalog/catalog.h"
  22 #include "miscadmin.h"
  23 #include "postmaster/bgwriter.h"
  24 #include "storage/fd.h"
  25 #include "storage/bufmgr.h"
  26 #include "storage/smgr.h"
  27 #include "utils/hsearch.h"
  28 #include "utils/memutils.h"
  29
  30
  31 /* interval for calling AbsorbFsyncRequests in mdsync */
  32 #define FSYNCS_PER_ABSORB               10
  33
  34 /* special values for the segno arg to RememberFsyncRequest */
  35 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
  36 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
  37 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
  38
  39 /*
  40  * On Windows, we have to interpret EACCES as possibly meaning the same as
  41  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  42  * that's what you get.  Ugh.  This code is designed so that we don't
  43  * actually believe these cases are okay without further evidence (namely,
  44  * a pending fsync request getting revoked ... see mdsync).
  45  */
  46 #ifndef WIN32
  47 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
  48 #else
  49 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
  50 #endif
  51
  52 /*
  53  *      The magnetic disk storage manager keeps track of open file
  54  *      descriptors in its own descriptor pool.  This is done to make it
  55  *      easier to support relations that are larger than the operating
  56  *      system's file size limit (often 2GBytes).  In order to do that,
  57  *      we break relations up into "segment" files that are each shorter than
  58  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
  59  *      configuration constant in pg_config_manual.h.
  60  *
  61  *      On disk, a relation must consist of consecutively numbered segment
  62  *      files in the pattern
  63  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
  64  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
  65  *              -- Optionally, any number of inactive segments of size 0 blocks.
  66  *      The full and partial segments are collectively the "active" segments.
  67  *      Inactive segments are those that once contained data but are currently
  68  *      not needed because of an mdtruncate() operation.  The reason for leaving
  69  *      them present at size zero, rather than unlinking them, is that other
  70  *      backends and/or the bgwriter might be holding open file references to
  71  *      such segments.  If the relation expands again after mdtruncate(), such
  72  *      that a deactivated segment becomes active again, it is important that
  73  *      such file references still be valid --- else data might get written
  74  *      out to an unlinked old copy of a segment file that will eventually
  75  *      disappear.
  76  *
  77  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
  78  *      cache is, therefore, just the head of a list of MdfdVec objects, one
  79  *      per segment.  But note the md_fd pointer can be NULL, indicating
  80  *      relation not open.
  81  *
  82  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
  83  *      doesn't have another segment after this one; we may just not have
  84  *      opened the next segment yet.  (We could not have "all segments are
  85  *      in the chain" as an invariant anyway, since another backend could
  86  *      extend the relation when we weren't looking.)  We do not make chain
  87  *      entries for inactive segments, however; as soon as we find a partial
  88  *      segment, we assume that any subsequent segments are inactive.
  89  *
  90  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
  91  *
  92  *      Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
  93  *      for use on machines that support large files.  Beware that that
  94  *      code has not been tested in a long time and is probably bit-rotted.
  95  */
  96
  97 typedef struct _MdfdVec
  98 {
  99         File            mdfd_vfd;               /* fd number in fd.c's pool */
 100         BlockNumber mdfd_segno;         /* segment number, from 0 */
 101 #ifndef LET_OS_MANAGE_FILESIZE  /* for large relations */
 102         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
 103 #endif
 104 } MdfdVec;
 105
 106 static MemoryContext MdCxt;             /* context for all md.c allocations */
 107
 108
 109 /*
 110  * In some contexts (currently, standalone backends and the bgwriter process)
 111  * we keep track of pending fsync operations: we need to remember all relation
 112  * segments that have been written since the last checkpoint, so that we can
 113  * fsync them down to disk before completing the next checkpoint.  This hash
 114  * table remembers the pending operations.      We use a hash table mostly as
 115  * a convenient way of eliminating duplicate requests.
 116  *
 117  * We use a similar mechanism to remember no-longer-needed files that can
 118  * be deleted after the next checkpoint, but we use a linked list instead of
 119  * a hash table, because we don't expect there to be any duplicate requests.
 120  *
 121  * (Regular backends do not track pending operations locally, but forward
 122  * them to the bgwriter.)
 123  */
 124 typedef struct
 125 {
 126         RelFileNode rnode;                      /* the targeted relation */
 127         BlockNumber segno;                      /* which segment */
 128 }       PendingOperationTag;
 129
 130 typedef uint16 CycleCtr;                /* can be any convenient integer size */
 131
 132 typedef struct
 133 {
 134         PendingOperationTag tag;        /* hash table key (must be first!) */
 135         bool            canceled;               /* T => request canceled, not yet removed */
 136         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
 137 } PendingOperationEntry;
 138
 139 typedef struct
 140 {
 141         RelFileNode rnode;                      /* the dead relation to delete */
 142         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
 143 }       PendingUnlinkEntry;
 144
 145 static HTAB *pendingOpsTable = NULL;
 146 static List *pendingUnlinks = NIL;
 147
 148 static CycleCtr mdsync_cycle_ctr = 0;
 149 static CycleCtr mdckpt_cycle_ctr = 0;
 150
 151
 152 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
 153 {
 154         EXTENSION_FAIL,                         /* ereport if segment not present */
 155         EXTENSION_RETURN_NULL,          /* return NULL if not present */
 156         EXTENSION_CREATE                        /* create new segments as needed */
 157 }       ExtensionBehavior;
 158
 159 /* local routines */
 160 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
 161 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 162 static void register_unlink(RelFileNode rnode);
 163 static MdfdVec *_fdvec_alloc(void);
 164
 165 #ifndef LET_OS_MANAGE_FILESIZE
 166 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
 167                           int oflags);
 168 #endif
 169 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
 170                          bool isTemp, ExtensionBehavior behavior);
 171 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
 172
 173
 174 /*
 175  *      mdinit() -- Initialize private state for magnetic disk storage manager.
 176  */
 177 void
 178 mdinit(void)
 179 {
 180         MdCxt = AllocSetContextCreate(TopMemoryContext,
 181                                                                   "MdSmgr",
 182                                                                   ALLOCSET_DEFAULT_MINSIZE,
 183                                                                   ALLOCSET_DEFAULT_INITSIZE,
 184                                                                   ALLOCSET_DEFAULT_MAXSIZE);
 185
 186         /*
 187          * Create pending-operations hashtable if we need it.  Currently, we need
 188          * it if we are standalone (not under a postmaster) OR if we are a
 189          * bootstrap-mode subprocess of a postmaster (that is, a startup or
 190          * bgwriter process).
 191          */
 192         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
 193         {
 194                 HASHCTL         hash_ctl;
 195
 196                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 197                 hash_ctl.keysize = sizeof(PendingOperationTag);
 198                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
 199                 hash_ctl.hash = tag_hash;
 200                 hash_ctl.hcxt = MdCxt;
 201                 pendingOpsTable = hash_create("Pending Ops Table",
 202                                                                           100L,
 203                                                                           &hash_ctl,
 204                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 205                 pendingUnlinks = NIL;
 206         }
 207 }
 208
 209 /*
 210  *      mdcreate() -- Create a new relation on magnetic disk.
 211  *
 212  * If isRedo is true, it's okay for the relation to exist already.
 213  */
 214 void
 215 mdcreate(SMgrRelation reln, bool isRedo)
 216 {
 217         char       *path;
 218         File            fd;
 219
 220         if (isRedo && reln->md_fd != NULL)
 221                 return;                                 /* created and opened already... */
 222
 223         Assert(reln->md_fd == NULL);
 224
 225         path = relpath(reln->smgr_rnode);
 226
 227         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 228
 229         if (fd < 0)
 230         {
 231                 int                     save_errno = errno;
 232
 233                 /*
 234                  * During bootstrap, there are cases where a system relation will be
 235                  * accessed (by internal backend processes) before the bootstrap
 236                  * script nominally creates it.  Therefore, allow the file to exist
 237                  * already, even if isRedo is not set.  (See also mdopen)
 238                  */
 239                 if (isRedo || IsBootstrapProcessingMode())
 240                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 241                 if (fd < 0)
 242                 {
 243                         pfree(path);
 244                         /* be sure to report the error reported by create, not open */
 245                         errno = save_errno;
 246                         ereport(ERROR,
 247                                         (errcode_for_file_access(),
 248                                          errmsg("could not create relation %u/%u/%u: %m",
 249                                                         reln->smgr_rnode.spcNode,
 250                                                         reln->smgr_rnode.dbNode,
 251                                                         reln->smgr_rnode.relNode)));
 252                 }
 253         }
 254
 255         pfree(path);
 256
 257         reln->md_fd = _fdvec_alloc();
 258
 259         reln->md_fd->mdfd_vfd = fd;
 260         reln->md_fd->mdfd_segno = 0;
 261 #ifndef LET_OS_MANAGE_FILESIZE
 262         reln->md_fd->mdfd_chain = NULL;
 263 #endif
 264 }
 265
 266 /*
 267  *      mdunlink() -- Unlink a relation.
 268  *
 269  * Note that we're passed a RelFileNode --- by the time this is called,
 270  * there won't be an SMgrRelation hashtable entry anymore.
 271  *
 272  * Actually, we don't unlink the first segment file of the relation, but
 273  * just truncate it to zero length, and record a request to unlink it after
 274  * the next checkpoint.  Additional segments can be unlinked immediately,
 275  * however.  Leaving the empty file in place prevents that relfilenode
 276  * number from being reused.  The scenario this protects us from is:
 277  * 1. We delete a relation (and commit, and actually remove its file).
 278  * 2. We create a new relation, which by chance gets the same relfilenode as
 279  *        the just-deleted one (OIDs must've wrapped around for that to happen).
 280  * 3. We crash before another checkpoint occurs.
 281  * During replay, we would delete the file and then recreate it, which is fine
 282  * if the contents of the file were repopulated by subsequent WAL entries.
 283  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 284  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 285  * the contents of the file would be lost forever.      By leaving the empty file
 286  * until after the next checkpoint, we prevent reassignment of the relfilenode
 287  * number until it's safe, because relfilenode assignment skips over any
 288  * existing file.
 289  *
 290  * If isRedo is true, it's okay for the relation to be already gone.
 291  * Also, we should remove the file immediately instead of queuing a request
 292  * for later, since during redo there's no possibility of creating a
 293  * conflicting relation.
 294  *
 295  * Note: any failure should be reported as WARNING not ERROR, because
 296  * we are usually not in a transaction anymore when this is called.
 297  */
 298 void
 299 mdunlink(RelFileNode rnode, bool isRedo)
 300 {
 301         char       *path;
 302         int                     ret;
 303
 304         /*
 305          * We have to clean out any pending fsync requests for the doomed
 306          * relation, else the next mdsync() will fail.
 307          */
 308         ForgetRelationFsyncRequests(rnode);
 309
 310         path = relpath(rnode);
 311
 312         /*
 313          * Delete or truncate the first segment, or only segment if not doing
 314          * segmenting
 315          */
 316         if (isRedo)
 317                 ret = unlink(path);
 318         else
 319         {
 320                 /* truncate(2) would be easier here, but Windows hasn't got it */
 321                 int             fd;
 322
 323                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
 324                 if (fd >= 0)
 325                 {
 326                         int             save_errno;
 327
 328                         ret = ftruncate(fd, 0);
 329                         save_errno = errno;
 330                         close(fd);
 331                         errno = save_errno;
 332                 }
 333                 else
 334                         ret = -1;
 335         }
 336         if (ret < 0)
 337         {
 338                 if (!isRedo || errno != ENOENT)
 339                         ereport(WARNING,
 340                                         (errcode_for_file_access(),
 341                                          errmsg("could not remove relation %u/%u/%u: %m",
 342                                                         rnode.spcNode,
 343                                                         rnode.dbNode,
 344                                                         rnode.relNode)));
 345         }
 346
 347 #ifndef LET_OS_MANAGE_FILESIZE
 348         /* Delete the additional segments, if any */
 349         else
 350         {
 351                 char       *segpath = (char *) palloc(strlen(path) + 12);
 352                 BlockNumber segno;
 353
 354                 /*
 355                  * Note that because we loop until getting ENOENT, we will correctly
 356                  * remove all inactive segments as well as active ones.
 357                  */
 358                 for (segno = 1;; segno++)
 359                 {
 360                         sprintf(segpath, "%s.%u", path, segno);
 361                         if (unlink(segpath) < 0)
 362                         {
 363                                 /* ENOENT is expected after the last segment... */
 364                                 if (errno != ENOENT)
 365                                         ereport(WARNING,
 366                                                         (errcode_for_file_access(),
 367                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
 368                                                                         segno,
 369                                                                         rnode.spcNode,
 370                                                                         rnode.dbNode,
 371                                                                         rnode.relNode)));
 372                                 break;
 373                         }
 374                 }
 375                 pfree(segpath);
 376         }
 377 #endif
 378
 379         pfree(path);
 380
 381         /* Register request to unlink first segment later */
 382         if (!isRedo)
 383                 register_unlink(rnode);
 384 }
 385
 386 /*
 387  *      mdextend() -- Add a block to the specified relation.
 388  *
 389  *              The semantics are nearly the same as mdwrite(): write at the
 390  *              specified position.  However, this is to be used for the case of
 391  *              extending a relation (i.e., blocknum is at or beyond the current
 392  *              EOF).  Note that we assume writing a block beyond current EOF
 393  *              causes intervening file space to become filled with zeroes.
 394  */
 395 void
 396 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 397 {
 398         long            seekpos;
 399         int                     nbytes;
 400         MdfdVec    *v;
 401
 402         /* This assert is too expensive to have on normally ... */
 403 #ifdef CHECK_WRITE_VS_EXTEND
 404         Assert(blocknum >= mdnblocks(reln));
 405 #endif
 406
 407         /*
 408          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
 409          * more --- we mustn't create a block whose number actually is
 410          * InvalidBlockNumber.
 411          */
 412         if (blocknum == InvalidBlockNumber)
 413                 ereport(ERROR,
 414                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 415                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
 416                                                 reln->smgr_rnode.spcNode,
 417                                                 reln->smgr_rnode.dbNode,
 418                                                 reln->smgr_rnode.relNode,
 419                                                 InvalidBlockNumber)));
 420
 421         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
 422
 423 #ifndef LET_OS_MANAGE_FILESIZE
 424         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 425         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 426 #else
 427         seekpos = (long) (BLCKSZ * (blocknum));
 428 #endif
 429
 430         /*
 431          * Note: because caller usually obtained blocknum by calling mdnblocks,
 432          * which did a seek(SEEK_END), this seek is often redundant and will be
 433          * optimized away by fd.c.      It's not redundant, however, if there is a
 434          * partial page at the end of the file. In that case we want to try to
 435          * overwrite the partial page with a full page.  It's also not redundant
 436          * if bufmgr.c had to dump another buffer of the same file to make room
 437          * for the new page's buffer.
 438          */
 439         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 440                 ereport(ERROR,
 441                                 (errcode_for_file_access(),
 442                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 443                                                 blocknum,
 444                                                 reln->smgr_rnode.spcNode,
 445                                                 reln->smgr_rnode.dbNode,
 446                                                 reln->smgr_rnode.relNode)));
 447
 448         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 449         {
 450                 if (nbytes < 0)
 451                         ereport(ERROR,
 452                                         (errcode_for_file_access(),
 453                                          errmsg("could not extend relation %u/%u/%u: %m",
 454                                                         reln->smgr_rnode.spcNode,
 455                                                         reln->smgr_rnode.dbNode,
 456                                                         reln->smgr_rnode.relNode),
 457                                          errhint("Check free disk space.")));
 458                 /* short write: complain appropriately */
 459                 ereport(ERROR,
 460                                 (errcode(ERRCODE_DISK_FULL),
 461                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
 462                                                 reln->smgr_rnode.spcNode,
 463                                                 reln->smgr_rnode.dbNode,
 464                                                 reln->smgr_rnode.relNode,
 465                                                 nbytes, BLCKSZ, blocknum),
 466                                  errhint("Check free disk space.")));
 467         }
 468
 469         if (!isTemp)
 470                 register_dirty_segment(reln, v);
 471
 472 #ifndef LET_OS_MANAGE_FILESIZE
 473         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
 474 #endif
 475 }
 476
 477 /*
 478  *      mdopen() -- Open the specified relation.
 479  *
 480  * Note we only open the first segment, when there are multiple segments.
 481  *
 482  * If first segment is not present, either ereport or return NULL according
 483  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 484  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 485  * invent one out of whole cloth.
 486  */
 487 static MdfdVec *
 488 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
 489 {
 490         MdfdVec    *mdfd;
 491         char       *path;
 492         File            fd;
 493
 494         /* No work if already open */
 495         if (reln->md_fd)
 496                 return reln->md_fd;
 497
 498         path = relpath(reln->smgr_rnode);
 499
 500         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 501
 502         if (fd < 0)
 503         {
 504                 /*
 505                  * During bootstrap, there are cases where a system relation will be
 506                  * accessed (by internal backend processes) before the bootstrap
 507                  * script nominally creates it.  Therefore, accept mdopen() as a
 508                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
 509                  */
 510                 if (IsBootstrapProcessingMode())
 511                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 512                 if (fd < 0)
 513                 {
 514                         pfree(path);
 515                         if (behavior == EXTENSION_RETURN_NULL &&
 516                                 FILE_POSSIBLY_DELETED(errno))
 517                                 return NULL;
 518                         ereport(ERROR,
 519                                         (errcode_for_file_access(),
 520                                          errmsg("could not open relation %u/%u/%u: %m",
 521                                                         reln->smgr_rnode.spcNode,
 522                                                         reln->smgr_rnode.dbNode,
 523                                                         reln->smgr_rnode.relNode)));
 524                 }
 525         }
 526
 527         pfree(path);
 528
 529         reln->md_fd = mdfd = _fdvec_alloc();
 530
 531         mdfd->mdfd_vfd = fd;
 532         mdfd->mdfd_segno = 0;
 533 #ifndef LET_OS_MANAGE_FILESIZE
 534         mdfd->mdfd_chain = NULL;
 535         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 536 #endif
 537
 538         return mdfd;
 539 }
 540
 541 /*
 542  *      mdclose() -- Close the specified relation, if it isn't closed already.
 543  */
 544 void
 545 mdclose(SMgrRelation reln)
 546 {
 547         MdfdVec    *v = reln->md_fd;
 548
 549         /* No work if already closed */
 550         if (v == NULL)
 551                 return;
 552
 553         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
 554
 555 #ifndef LET_OS_MANAGE_FILESIZE
 556         while (v != NULL)
 557         {
 558                 MdfdVec    *ov = v;
 559
 560                 /* if not closed already */
 561                 if (v->mdfd_vfd >= 0)
 562                         FileClose(v->mdfd_vfd);
 563                 /* Now free vector */
 564                 v = v->mdfd_chain;
 565                 pfree(ov);
 566         }
 567 #else
 568         if (v->mdfd_vfd >= 0)
 569                 FileClose(v->mdfd_vfd);
 570         pfree(v);
 571 #endif
 572 }
 573
 574 /*
 575  *      mdread() -- Read the specified block from a relation.
 576  */
 577 void
 578 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 579 {
 580         long            seekpos;
 581         int                     nbytes;
 582         MdfdVec    *v;
 583
 584         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
 585
 586 #ifndef LET_OS_MANAGE_FILESIZE
 587         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 588         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 589 #else
 590         seekpos = (long) (BLCKSZ * (blocknum));
 591 #endif
 592
 593         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 594                 ereport(ERROR,
 595                                 (errcode_for_file_access(),
 596                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 597                                                 blocknum,
 598                                                 reln->smgr_rnode.spcNode,
 599                                                 reln->smgr_rnode.dbNode,
 600                                                 reln->smgr_rnode.relNode)));
 601
 602         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 603         {
 604                 if (nbytes < 0)
 605                         ereport(ERROR,
 606                                         (errcode_for_file_access(),
 607                                    errmsg("could not read block %u of relation %u/%u/%u: %m",
 608                                                   blocknum,
 609                                                   reln->smgr_rnode.spcNode,
 610                                                   reln->smgr_rnode.dbNode,
 611                                                   reln->smgr_rnode.relNode)));
 612
 613                 /*
 614                  * Short read: we are at or past EOF, or we read a partial block at
 615                  * EOF.  Normally this is an error; upper levels should never try to
 616                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
 617                  * we are InRecovery, we should instead return zeroes without
 618                  * complaining.  This allows, for example, the case of trying to
 619                  * update a block that was later truncated away.
 620                  */
 621                 if (zero_damaged_pages || InRecovery)
 622                         MemSet(buffer, 0, BLCKSZ);
 623                 else
 624                         ereport(ERROR,
 625                                         (errcode(ERRCODE_DATA_CORRUPTED),
 626                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
 627                                                         blocknum,
 628                                                         reln->smgr_rnode.spcNode,
 629                                                         reln->smgr_rnode.dbNode,
 630                                                         reln->smgr_rnode.relNode,
 631                                                         nbytes, BLCKSZ)));
 632         }
 633 }
 634
 635 /*
 636  *      mdwrite() -- Write the supplied block at the appropriate location.
 637  *
 638  *              This is to be used only for updating already-existing blocks of a
 639  *              relation (ie, those before the current EOF).  To extend a relation,
 640  *              use mdextend().
 641  */
 642 void
 643 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 644 {
 645         long            seekpos;
 646         int                     nbytes;
 647         MdfdVec    *v;
 648
 649         /* This assert is too expensive to have on normally ... */
 650 #ifdef CHECK_WRITE_VS_EXTEND
 651         Assert(blocknum < mdnblocks(reln));
 652 #endif
 653
 654         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
 655
 656 #ifndef LET_OS_MANAGE_FILESIZE
 657         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 658         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 659 #else
 660         seekpos = (long) (BLCKSZ * (blocknum));
 661 #endif
 662
 663         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 664                 ereport(ERROR,
 665                                 (errcode_for_file_access(),
 666                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 667                                                 blocknum,
 668                                                 reln->smgr_rnode.spcNode,
 669                                                 reln->smgr_rnode.dbNode,
 670                                                 reln->smgr_rnode.relNode)));
 671
 672         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 673         {
 674                 if (nbytes < 0)
 675                         ereport(ERROR,
 676                                         (errcode_for_file_access(),
 677                                   errmsg("could not write block %u of relation %u/%u/%u: %m",
 678                                                  blocknum,
 679                                                  reln->smgr_rnode.spcNode,
 680                                                  reln->smgr_rnode.dbNode,
 681                                                  reln->smgr_rnode.relNode)));
 682                 /* short write: complain appropriately */
 683                 ereport(ERROR,
 684                                 (errcode(ERRCODE_DISK_FULL),
 685                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
 686                                                 blocknum,
 687                                                 reln->smgr_rnode.spcNode,
 688                                                 reln->smgr_rnode.dbNode,
 689                                                 reln->smgr_rnode.relNode,
 690                                                 nbytes, BLCKSZ),
 691                                  errhint("Check free disk space.")));
 692         }
 693
 694         if (!isTemp)
 695                 register_dirty_segment(reln, v);
 696 }
 697
 698 /*
 699  *      mdnblocks() -- Get the number of blocks stored in a relation.
 700  *
 701  *              Important side effect: all active segments of the relation are opened
 702  *              and added to the mdfd_chain list.  If this routine has not been
 703  *              called, then only segments up to the last one actually touched
 704  *              are present in the chain.
 705  */
 706 BlockNumber
 707 mdnblocks(SMgrRelation reln)
 708 {
 709         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
 710
 711 #ifndef LET_OS_MANAGE_FILESIZE
 712         BlockNumber nblocks;
 713         BlockNumber segno = 0;
 714
 715         /*
 716          * Skip through any segments that aren't the last one, to avoid redundant
 717          * seeks on them.  We have previously verified that these segments are
 718          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
 719          *
 720          * NOTE: this assumption could only be wrong if another backend has
 721          * truncated the relation.      We rely on higher code levels to handle that
 722          * scenario by closing and re-opening the md fd, which is handled via
 723          * relcache flush.      (Since the bgwriter doesn't participate in relcache
 724          * flush, it could have segment chain entries for inactive segments;
 725          * that's OK because the bgwriter never needs to compute relation size.)
 726          */
 727         while (v->mdfd_chain != NULL)
 728         {
 729                 segno++;
 730                 v = v->mdfd_chain;
 731         }
 732
 733         for (;;)
 734         {
 735                 nblocks = _mdnblocks(reln, v);
 736                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
 737                         elog(FATAL, "segment too big");
 738                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
 739                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
 740
 741                 /*
 742                  * If segment is exactly RELSEG_SIZE, advance to next one.
 743                  */
 744                 segno++;
 745
 746                 if (v->mdfd_chain == NULL)
 747                 {
 748                         /*
 749                          * Because we pass O_CREAT, we will create the next segment (with
 750                          * zero length) immediately, if the last segment is of length
 751                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
 752                          * the logic simple.
 753                          */
 754                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
 755                         if (v->mdfd_chain == NULL)
 756                                 ereport(ERROR,
 757                                                 (errcode_for_file_access(),
 758                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
 759                                                 segno,
 760                                                 reln->smgr_rnode.spcNode,
 761                                                 reln->smgr_rnode.dbNode,
 762                                                 reln->smgr_rnode.relNode)));
 763                 }
 764
 765                 v = v->mdfd_chain;
 766         }
 767 #else
 768         return _mdnblocks(reln, v);
 769 #endif
 770 }
 771
 772 /*
 773  *      mdtruncate() -- Truncate relation to specified number of blocks.
 774  */
 775 void
 776 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 777 {
 778         MdfdVec    *v;
 779         BlockNumber curnblk;
 780
 781 #ifndef LET_OS_MANAGE_FILESIZE
 782         BlockNumber priorblocks;
 783 #endif
 784
 785         /*
 786          * NOTE: mdnblocks makes sure we have opened all active segments, so that
 787          * truncation loop will get them all!
 788          */
 789         curnblk = mdnblocks(reln);
 790         if (nblocks > curnblk)
 791         {
 792                 /* Bogus request ... but no complaint if InRecovery */
 793                 if (InRecovery)
 794                         return;
 795                 ereport(ERROR,
 796                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
 797                                                 reln->smgr_rnode.spcNode,
 798                                                 reln->smgr_rnode.dbNode,
 799                                                 reln->smgr_rnode.relNode,
 800                                                 nblocks, curnblk)));
 801         }
 802         if (nblocks == curnblk)
 803                 return;                                 /* no work */
 804
 805         v = mdopen(reln, EXTENSION_FAIL);
 806
 807 #ifndef LET_OS_MANAGE_FILESIZE
 808         priorblocks = 0;
 809         while (v != NULL)
 810         {
 811                 MdfdVec    *ov = v;
 812
 813                 if (priorblocks > nblocks)
 814                 {
 815                         /*
 816                          * This segment is no longer active (and has already been unlinked
 817                          * from the mdfd_chain). We truncate the file, but do not delete
 818                          * it, for reasons explained in the header comments.
 819                          */
 820                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
 821                                 ereport(ERROR,
 822                                                 (errcode_for_file_access(),
 823                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 824                                                                 reln->smgr_rnode.spcNode,
 825                                                                 reln->smgr_rnode.dbNode,
 826                                                                 reln->smgr_rnode.relNode,
 827                                                                 nblocks)));
 828                         if (!isTemp)
 829                                 register_dirty_segment(reln, v);
 830                         v = v->mdfd_chain;
 831                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
 832                         pfree(ov);
 833                 }
 834                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 835                 {
 836                         /*
 837                          * This is the last segment we want to keep. Truncate the file to
 838                          * the right length, and clear chain link that points to any
 839                          * remaining segments (which we shall zap). NOTE: if nblocks is
 840                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
 841                          * segment to 0 length but keep it. This adheres to the invariant
 842                          * given in the header comments.
 843                          */
 844                         BlockNumber lastsegblocks = nblocks - priorblocks;
 845
 846                         if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
 847                                 ereport(ERROR,
 848                                                 (errcode_for_file_access(),
 849                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 850                                                                 reln->smgr_rnode.spcNode,
 851                                                                 reln->smgr_rnode.dbNode,
 852                                                                 reln->smgr_rnode.relNode,
 853                                                                 nblocks)));
 854                         if (!isTemp)
 855                                 register_dirty_segment(reln, v);
 856                         v = v->mdfd_chain;
 857                         ov->mdfd_chain = NULL;
 858                 }
 859                 else
 860                 {
 861                         /*
 862                          * We still need this segment and 0 or more blocks beyond it, so
 863                          * nothing to do here.
 864                          */
 865                         v = v->mdfd_chain;
 866                 }
 867                 priorblocks += RELSEG_SIZE;
 868         }
 869 #else
 870         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
 871                 ereport(ERROR,
 872                                 (errcode_for_file_access(),
 873                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 874                                          reln->smgr_rnode.spcNode,
 875                                          reln->smgr_rnode.dbNode,
 876                                          reln->smgr_rnode.relNode,
 877                                          nblocks)));
 878         if (!isTemp)
 879                 register_dirty_segment(reln, v);
 880 #endif
 881 }
 882
 883 /*
 884  *      mdimmedsync() -- Immediately sync a relation to stable storage.
 885  *
 886  * Note that only writes already issued are synced; this routine knows
 887  * nothing of dirty buffers that may exist inside the buffer manager.
 888  */
 889 void
 890 mdimmedsync(SMgrRelation reln)
 891 {
 892         MdfdVec    *v;
 893         BlockNumber curnblk;
 894
 895         /*
 896          * NOTE: mdnblocks makes sure we have opened all active segments, so that
 897          * fsync loop will get them all!
 898          */
 899         curnblk = mdnblocks(reln);
 900
 901         v = mdopen(reln, EXTENSION_FAIL);
 902
 903 #ifndef LET_OS_MANAGE_FILESIZE
 904         while (v != NULL)
 905         {
 906                 if (FileSync(v->mdfd_vfd) < 0)
 907                         ereport(ERROR,
 908                                         (errcode_for_file_access(),
 909                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 910                                            v->mdfd_segno,
 911                                            reln->smgr_rnode.spcNode,
 912                                            reln->smgr_rnode.dbNode,
 913                                            reln->smgr_rnode.relNode)));
 914                 v = v->mdfd_chain;
 915         }
 916 #else
 917         if (FileSync(v->mdfd_vfd) < 0)
 918                 ereport(ERROR,
 919                                 (errcode_for_file_access(),
 920                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 921                                                 v->mdfd_segno,
 922                                                 reln->smgr_rnode.spcNode,
 923                                                 reln->smgr_rnode.dbNode,
 924                                                 reln->smgr_rnode.relNode)));
 925 #endif
 926 }
 927
 928 /*
 929  *      mdsync() -- Sync previous writes to stable storage.
 930  */
 931 void
 932 mdsync(void)
 933 {
 934         static bool mdsync_in_progress = false;
 935
 936         HASH_SEQ_STATUS hstat;
 937         PendingOperationEntry *entry;
 938         int                     absorb_counter;
 939
 940         /*
 941          * This is only called during checkpoints, and checkpoints should only
 942          * occur in processes that have created a pendingOpsTable.
 943          */
 944         if (!pendingOpsTable)
 945                 elog(ERROR, "cannot sync without a pendingOpsTable");
 946
 947         /*
 948          * If we are in the bgwriter, the sync had better include all fsync
 949          * requests that were queued by backends up to this point.      The tightest
 950          * race condition that could occur is that a buffer that must be written
 951          * and fsync'd for the checkpoint could have been dumped by a backend just
 952          * before it was visited by BufferSync().  We know the backend will have
 953          * queued an fsync request before clearing the buffer's dirtybit, so we
 954          * are safe as long as we do an Absorb after completing BufferSync().
 955          */
 956         AbsorbFsyncRequests();
 957
 958         /*
 959          * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
 960          * checkpoint), we want to ignore fsync requests that are entered into the
 961          * hashtable after this point --- they should be processed next time,
 962          * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
 963          * ones: new ones will have cycle_ctr equal to the incremented value of
 964          * mdsync_cycle_ctr.
 965          *
 966          * In normal circumstances, all entries present in the table at this point
 967          * will have cycle_ctr exactly equal to the current (about to be old)
 968          * value of mdsync_cycle_ctr.  However, if we fail partway through the
 969          * fsync'ing loop, then older values of cycle_ctr might remain when we
 970          * come back here to try again.  Repeated checkpoint failures would
 971          * eventually wrap the counter around to the point where an old entry
 972          * might appear new, causing us to skip it, possibly allowing a checkpoint
 973          * to succeed that should not have.  To forestall wraparound, any time the
 974          * previous mdsync() failed to complete, run through the table and
 975          * forcibly set cycle_ctr = mdsync_cycle_ctr.
 976          *
 977          * Think not to merge this loop with the main loop, as the problem is
 978          * exactly that that loop may fail before having visited all the entries.
 979          * From a performance point of view it doesn't matter anyway, as this path
 980          * will never be taken in a system that's functioning normally.
 981          */
 982         if (mdsync_in_progress)
 983         {
 984                 /* prior try failed, so update any stale cycle_ctr values */
 985                 hash_seq_init(&hstat, pendingOpsTable);
 986                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 987                 {
 988                         entry->cycle_ctr = mdsync_cycle_ctr;
 989                 }
 990         }
 991
 992         /* Advance counter so that new hashtable entries are distinguishable */
 993         mdsync_cycle_ctr++;
 994
 995         /* Set flag to detect failure if we don't reach the end of the loop */
 996         mdsync_in_progress = true;
 997
 998         /* Now scan the hashtable for fsync requests to process */
 999         absorb_counter = FSYNCS_PER_ABSORB;
1000         hash_seq_init(&hstat, pendingOpsTable);
1001         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1002         {
1003                 /*
1004                  * If the entry is new then don't process it this time.  Note that
1005                  * "continue" bypasses the hash-remove call at the bottom of the loop.
1006                  */
1007                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1008                         continue;
1009
1010                 /* Else assert we haven't missed it */
1011                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
1012
1013                 /*
1014                  * If fsync is off then we don't have to bother opening the file at
1015                  * all.  (We delay checking until this point so that changing fsync on
1016                  * the fly behaves sensibly.)  Also, if the entry is marked canceled,
1017                  * fall through to delete it.
1018                  */
1019                 if (enableFsync && !entry->canceled)
1020                 {
1021                         int                     failures;
1022
1023                         /*
1024                          * If in bgwriter, we want to absorb pending requests every so
1025                          * often to prevent overflow of the fsync request queue.  It is
1026                          * unspecified whether newly-added entries will be visited by
1027                          * hash_seq_search, but we don't care since we don't need to
1028                          * process them anyway.
1029                          */
1030                         if (--absorb_counter <= 0)
1031                         {
1032                                 AbsorbFsyncRequests();
1033                                 absorb_counter = FSYNCS_PER_ABSORB;
1034                         }
1035
1036                         /*
1037                          * The fsync table could contain requests to fsync segments that
1038                          * have been deleted (unlinked) by the time we get to them. Rather
1039                          * than just hoping an ENOENT (or EACCES on Windows) error can be
1040                          * ignored, what we do on error is absorb pending requests and
1041                          * then retry.  Since mdunlink() queues a "revoke" message before
1042                          * actually unlinking, the fsync request is guaranteed to be
1043                          * marked canceled after the absorb if it really was this case.
1044                          * DROP DATABASE likewise has to tell us to forget fsync requests
1045                          * before it starts deletions.
1046                          */
1047                         for (failures = 0;; failures++)         /* loop exits at "break" */
1048                         {
1049                                 SMgrRelation reln;
1050                                 MdfdVec    *seg;
1051
1052                                 /*
1053                                  * Find or create an smgr hash entry for this relation. This
1054                                  * may seem a bit unclean -- md calling smgr?  But it's really
1055                                  * the best solution.  It ensures that the open file reference
1056                                  * isn't permanently leaked if we get an error here. (You may
1057                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
1058                                  * really, because the only case in which a checkpoint is done
1059                                  * by a process that isn't about to shut down is in the
1060                                  * bgwriter, and it will periodically do smgrcloseall(). This
1061                                  * fact justifies our not closing the reln in the success path
1062                                  * either, which is a good thing since in non-bgwriter cases
1063                                  * we couldn't safely do that.)  Furthermore, in many cases
1064                                  * the relation will have been dirtied through this same smgr
1065                                  * relation, and so we can save a file open/close cycle.
1066                                  */
1067                                 reln = smgropen(entry->tag.rnode);
1068
1069                                 /*
1070                                  * It is possible that the relation has been dropped or
1071                                  * truncated since the fsync request was entered.  Therefore,
1072                                  * allow ENOENT, but only if we didn't fail already on this
1073                                  * file.  This applies both during _mdfd_getseg() and during
1074                                  * FileSync, since fd.c might have closed the file behind our
1075                                  * back.
1076                                  */
1077                                 seg = _mdfd_getseg(reln,
1078                                                           entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
1079                                                                    false, EXTENSION_RETURN_NULL);
1080                                 if (seg != NULL &&
1081                                         FileSync(seg->mdfd_vfd) >= 0)
1082                                         break;          /* success; break out of retry loop */
1083
1084                                 /*
1085                                  * XXX is there any point in allowing more than one retry?
1086                                  * Don't see one at the moment, but easy to change the test
1087                                  * here if so.
1088                                  */
1089                                 if (!FILE_POSSIBLY_DELETED(errno) ||
1090                                         failures > 0)
1091                                         ereport(ERROR,
1092                                                         (errcode_for_file_access(),
1093                                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1094                                                                         entry->tag.segno,
1095                                                                         entry->tag.rnode.spcNode,
1096                                                                         entry->tag.rnode.dbNode,
1097                                                                         entry->tag.rnode.relNode)));
1098                                 else
1099                                         ereport(DEBUG1,
1100                                                         (errcode_for_file_access(),
1101                                                          errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
1102                                                                         entry->tag.segno,
1103                                                                         entry->tag.rnode.spcNode,
1104                                                                         entry->tag.rnode.dbNode,
1105                                                                         entry->tag.rnode.relNode)));
1106
1107                                 /*
1108                                  * Absorb incoming requests and check to see if canceled.
1109                                  */
1110                                 AbsorbFsyncRequests();
1111                                 absorb_counter = FSYNCS_PER_ABSORB;             /* might as well... */
1112
1113                                 if (entry->canceled)
1114                                         break;
1115                         }                                       /* end retry loop */
1116                 }
1117
1118                 /*
1119                  * If we get here, either we fsync'd successfully, or we don't have to
1120                  * because enableFsync is off, or the entry is (now) marked canceled.
1121                  * Okay to delete it.
1122                  */
1123                 if (hash_search(pendingOpsTable, &entry->tag,
1124                                                 HASH_REMOVE, NULL) == NULL)
1125                         elog(ERROR, "pendingOpsTable corrupted");
1126         }                                                       /* end loop over hashtable entries */
1127
1128         /* Flag successful completion of mdsync */
1129         mdsync_in_progress = false;
1130 }
1131
1132 /*
1133  * mdpreckpt() -- Do pre-checkpoint work
1134  *
1135  * To distinguish unlink requests that arrived before this checkpoint
1136  * started from those that arrived during the checkpoint, we use a cycle
1137  * counter similar to the one we use for fsync requests. That cycle
1138  * counter is incremented here.
1139  *
1140  * This must be called *before* the checkpoint REDO point is determined.
1141  * That ensures that we won't delete files too soon.
1142  *
1143  * Note that we can't do anything here that depends on the assumption
1144  * that the checkpoint will be completed.
1145  */
1146 void
1147 mdpreckpt(void)
1148 {
1149         ListCell   *cell;
1150
1151         /*
1152          * In case the prior checkpoint wasn't completed, stamp all entries in the
1153          * list with the current cycle counter.  Anything that's in the list at
1154          * the start of checkpoint can surely be deleted after the checkpoint is
1155          * finished, regardless of when the request was made.
1156          */
1157         foreach(cell, pendingUnlinks)
1158         {
1159                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1160
1161                 entry->cycle_ctr = mdckpt_cycle_ctr;
1162         }
1163
1164         /*
1165          * Any unlink requests arriving after this point will be assigned the next
1166          * cycle counter, and won't be unlinked until next checkpoint.
1167          */
1168         mdckpt_cycle_ctr++;
1169 }
1170
1171 /*
1172  * mdpostckpt() -- Do post-checkpoint work
1173  *
1174  * Remove any lingering files that can now be safely removed.
1175  */
1176 void
1177 mdpostckpt(void)
1178 {
1179         while (pendingUnlinks != NIL)
1180         {
1181                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1182                 char       *path;
1183
1184                 /*
1185                  * New entries are appended to the end, so if the entry is new we've
1186                  * reached the end of old entries.
1187                  */
1188                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1189                         break;
1190
1191                 /* Else assert we haven't missed it */
1192                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1193
1194                 /* Unlink the file */
1195                 path = relpath(entry->rnode);
1196                 if (unlink(path) < 0)
1197                 {
1198                         /*
1199                          * ENOENT shouldn't happen either, but it doesn't really matter
1200                          * because we would've deleted it now anyway.
1201                          */
1202                         if (errno != ENOENT)
1203                                 ereport(WARNING,
1204                                                 (errcode_for_file_access(),
1205                                                  errmsg("could not remove relation %u/%u/%u: %m",
1206                                                                 entry->rnode.spcNode,
1207                                                                 entry->rnode.dbNode,
1208                                                                 entry->rnode.relNode)));
1209                 }
1210                 pfree(path);
1211
1212                 pendingUnlinks = list_delete_first(pendingUnlinks);
1213                 pfree(entry);
1214         }
1215 }
1216
1217 /*
1218  * register_dirty_segment() -- Mark a relation segment as needing fsync
1219  *
1220  * If there is a local pending-ops table, just make an entry in it for
1221  * mdsync to process later.  Otherwise, try to pass off the fsync request
1222  * to the background writer process.  If that fails, just do the fsync
1223  * locally before returning (we expect this will not happen often enough
1224  * to be a performance problem).
1225  */
1226 static void
1227 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1228 {
1229         if (pendingOpsTable)
1230         {
1231                 /* push it into local pending-ops table */
1232                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1233         }
1234         else
1235         {
1236                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1237                         return;                         /* passed it off successfully */
1238
1239                 if (FileSync(seg->mdfd_vfd) < 0)
1240                         ereport(ERROR,
1241                                         (errcode_for_file_access(),
1242                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1243                                            seg->mdfd_segno,
1244                                            reln->smgr_rnode.spcNode,
1245                                            reln->smgr_rnode.dbNode,
1246                                            reln->smgr_rnode.relNode)));
1247         }
1248 }
1249
1250 /*
1251  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1252  *
1253  * As with register_dirty_segment, this could involve either a local or
1254  * a remote pending-ops table.
1255  */
1256 static void
1257 register_unlink(RelFileNode rnode)
1258 {
1259         if (pendingOpsTable)
1260         {
1261                 /* push it into local pending-ops table */
1262                 RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
1263         }
1264         else
1265         {
1266                 /*
1267                  * Notify the bgwriter about it.  If we fail to queue the request
1268                  * message, we have to sleep and try again, because we can't simply
1269                  * delete the file now.  Ugly, but hopefully won't happen often.
1270                  *
1271                  * XXX should we just leave the file orphaned instead?
1272                  */
1273                 Assert(IsUnderPostmaster);
1274                 while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
1275                         pg_usleep(10000L);      /* 10 msec seems a good number */
1276         }
1277 }
1278
1279 /*
1280  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1281  *
1282  * We stuff most fsync requests into the local hash table for execution
1283  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1284  * separate linked list, however, because they get processed separately.
1285  *
1286  * The range of possible segment numbers is way less than the range of
1287  * BlockNumber, so we can reserve high values of segno for special purposes.
1288  * We define three:
1289  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1290  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1291  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1292  *       checkpoint.
1293  *
1294  * (Handling the FORGET_* requests is a tad slow because the hash table has
1295  * to be searched linearly, but it doesn't seem worth rethinking the table
1296  * structure for them.)
1297  */
1298 void
1299 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1300 {
1301         Assert(pendingOpsTable);
1302
1303         if (segno == FORGET_RELATION_FSYNC)
1304         {
1305                 /* Remove any pending requests for the entire relation */
1306                 HASH_SEQ_STATUS hstat;
1307                 PendingOperationEntry *entry;
1308
1309                 hash_seq_init(&hstat, pendingOpsTable);
1310                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1311                 {
1312                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1313                         {
1314                                 /* Okay, cancel this entry */
1315                                 entry->canceled = true;
1316                         }
1317                 }
1318         }
1319         else if (segno == FORGET_DATABASE_FSYNC)
1320         {
1321                 /* Remove any pending requests for the entire database */
1322                 HASH_SEQ_STATUS hstat;
1323                 PendingOperationEntry *entry;
1324
1325                 hash_seq_init(&hstat, pendingOpsTable);
1326                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1327                 {
1328                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1329                         {
1330                                 /* Okay, cancel this entry */
1331                                 entry->canceled = true;
1332                         }
1333                 }
1334         }
1335         else if (segno == UNLINK_RELATION_REQUEST)
1336         {
1337                 /* Unlink request: put it in the linked list */
1338                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1339                 PendingUnlinkEntry *entry;
1340
1341                 entry = palloc(sizeof(PendingUnlinkEntry));
1342                 entry->rnode = rnode;
1343                 entry->cycle_ctr = mdckpt_cycle_ctr;
1344
1345                 pendingUnlinks = lappend(pendingUnlinks, entry);
1346
1347                 MemoryContextSwitchTo(oldcxt);
1348         }
1349         else
1350         {
1351                 /* Normal case: enter a request to fsync this segment */
1352                 PendingOperationTag key;
1353                 PendingOperationEntry *entry;
1354                 bool            found;
1355
1356                 /* ensure any pad bytes in the hash key are zeroed */
1357                 MemSet(&key, 0, sizeof(key));
1358                 key.rnode = rnode;
1359                 key.segno = segno;
1360
1361                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1362                                                                                                           &key,
1363                                                                                                           HASH_ENTER,
1364                                                                                                           &found);
1365                 /* if new or previously canceled entry, initialize it */
1366                 if (!found || entry->canceled)
1367                 {
1368                         entry->canceled = false;
1369                         entry->cycle_ctr = mdsync_cycle_ctr;
1370                 }
1371
1372                 /*
1373                  * NB: it's intentional that we don't change cycle_ctr if the entry
1374                  * already exists.      The fsync request must be treated as old, even
1375                  * though the new request will be satisfied too by any subsequent
1376                  * fsync.
1377                  *
1378                  * However, if the entry is present but is marked canceled, we should
1379                  * act just as though it wasn't there.  The only case where this could
1380                  * happen would be if a file had been deleted, we received but did not
1381                  * yet act on the cancel request, and the same relfilenode was then
1382                  * assigned to a new file.      We mustn't lose the new request, but it
1383                  * should be considered new not old.
1384                  */
1385         }
1386 }
1387
1388 /*
1389  * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
1390  */
1391 void
1392 ForgetRelationFsyncRequests(RelFileNode rnode)
1393 {
1394         if (pendingOpsTable)
1395         {
1396                 /* standalone backend or startup process: fsync state is local */
1397                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1398         }
1399         else if (IsUnderPostmaster)
1400         {
1401                 /*
1402                  * Notify the bgwriter about it.  If we fail to queue the revoke
1403                  * message, we have to sleep and try again ... ugly, but hopefully
1404                  * won't happen often.
1405                  *
1406                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1407                  * error would leave the no-longer-used file still present on disk,
1408                  * which would be bad, so I'm inclined to assume that the bgwriter
1409                  * will always empty the queue soon.
1410                  */
1411                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1412                         pg_usleep(10000L);      /* 10 msec seems a good number */
1413
1414                 /*
1415                  * Note we don't wait for the bgwriter to actually absorb the revoke
1416                  * message; see mdsync() for the implications.
1417                  */
1418         }
1419 }
1420
1421 /*
1422  * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
1423  */
1424 void
1425 ForgetDatabaseFsyncRequests(Oid dbid)
1426 {
1427         RelFileNode rnode;
1428
1429         rnode.dbNode = dbid;
1430         rnode.spcNode = 0;
1431         rnode.relNode = 0;
1432
1433         if (pendingOpsTable)
1434         {
1435                 /* standalone backend or startup process: fsync state is local */
1436                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1437         }
1438         else if (IsUnderPostmaster)
1439         {
1440                 /* see notes in ForgetRelationFsyncRequests */
1441                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1442                         pg_usleep(10000L);      /* 10 msec seems a good number */
1443         }
1444 }
1445
1446
1447 /*
1448  *      _fdvec_alloc() -- Make a MdfdVec object.
1449  */
1450 static MdfdVec *
1451 _fdvec_alloc(void)
1452 {
1453         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1454 }
1455
1456 #ifndef LET_OS_MANAGE_FILESIZE
1457
1458 /*
1459  * Open the specified segment of the relation,
1460  * and make a MdfdVec object for it.  Returns NULL on failure.
1461  */
1462 static MdfdVec *
1463 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1464 {
1465         MdfdVec    *v;
1466         int                     fd;
1467         char       *path,
1468                            *fullpath;
1469
1470         path = relpath(reln->smgr_rnode);
1471
1472         if (segno > 0)
1473         {
1474                 /* be sure we have enough space for the '.segno' */
1475                 fullpath = (char *) palloc(strlen(path) + 12);
1476                 sprintf(fullpath, "%s.%u", path, segno);
1477                 pfree(path);
1478         }
1479         else
1480                 fullpath = path;
1481
1482         /* open the file */
1483         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1484
1485         pfree(fullpath);
1486
1487         if (fd < 0)
1488                 return NULL;
1489
1490         /* allocate an mdfdvec entry for it */
1491         v = _fdvec_alloc();
1492
1493         /* fill the entry */
1494         v->mdfd_vfd = fd;
1495         v->mdfd_segno = segno;
1496         v->mdfd_chain = NULL;
1497         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1498
1499         /* all done */
1500         return v;
1501 }
1502 #endif   /* LET_OS_MANAGE_FILESIZE */
1503
1504 /*
1505  *      _mdfd_getseg() -- Find the segment of the relation holding the
1506  *              specified block.
1507  *
1508  * If the segment doesn't exist, we ereport, return NULL, or create the
1509  * segment, according to "behavior".  Note: isTemp need only be correct
1510  * in the EXTENSION_CREATE case.
1511  */
1512 static MdfdVec *
1513 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1514                          ExtensionBehavior behavior)
1515 {
1516         MdfdVec    *v = mdopen(reln, behavior);
1517
1518 #ifndef LET_OS_MANAGE_FILESIZE
1519         BlockNumber targetseg;
1520         BlockNumber nextsegno;
1521
1522         if (!v)
1523                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1524
1525         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1526         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1527         {
1528                 Assert(nextsegno == v->mdfd_segno + 1);
1529
1530                 if (v->mdfd_chain == NULL)
1531                 {
1532                         /*
1533                          * Normally we will create new segments only if authorized by the
1534                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1535                          * recovery, create segments anyway; this allows cases such as
1536                          * replaying WAL data that has a write into a high-numbered
1537                          * segment of a relation that was later deleted.  We want to go
1538                          * ahead and create the segments so we can finish out the replay.
1539                          *
1540                          * We have to maintain the invariant that segments before the last
1541                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1542                          * with zeroes if needed.  (This only matters if caller is
1543                          * extending the relation discontiguously, but that can happen in
1544                          * hash indexes.)
1545                          */
1546                         if (behavior == EXTENSION_CREATE || InRecovery)
1547                         {
1548                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1549                                 {
1550                                         char       *zerobuf = palloc0(BLCKSZ);
1551
1552                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1553                                                          zerobuf, isTemp);
1554                                         pfree(zerobuf);
1555                                 }
1556                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1557                         }
1558                         else
1559                         {
1560                                 /* We won't create segment if not existent */
1561                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1562                         }
1563                         if (v->mdfd_chain == NULL)
1564                         {
1565                                 if (behavior == EXTENSION_RETURN_NULL &&
1566                                         FILE_POSSIBLY_DELETED(errno))
1567                                         return NULL;
1568                                 ereport(ERROR,
1569                                                 (errcode_for_file_access(),
1570                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1571                                                                 nextsegno,
1572                                                                 reln->smgr_rnode.spcNode,
1573                                                                 reln->smgr_rnode.dbNode,
1574                                                                 reln->smgr_rnode.relNode,
1575                                                                 blkno)));
1576                         }
1577                 }
1578                 v = v->mdfd_chain;
1579         }
1580 #endif
1581
1582         return v;
1583 }
1584
1585 /*
1586  * Get number of blocks present in a single disk file
1587  */
1588 static BlockNumber
1589 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1590 {
1591         long            len;
1592
1593         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1594         if (len < 0)
1595                 ereport(ERROR,
1596                                 (errcode_for_file_access(),
1597                 errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1598                            seg->mdfd_segno,
1599                            reln->smgr_rnode.spcNode,
1600                            reln->smgr_rnode.dbNode,
1601                            reln->smgr_rnode.relNode)));
1602         /* note that this calculation will ignore any partial block at EOF */
1603         return (BlockNumber) (len / BLCKSZ);
1604 }