granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * md.c
   4  *        This code manages relations that reside on magnetic disk.
   5  *
   6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <unistd.h>
  18 #include <fcntl.h>
  19 #include <sys/file.h>
  20
  21 #include "catalog/catalog.h"
  22 #include "miscadmin.h"
  23 #include "postmaster/bgwriter.h"
  24 #include "storage/fd.h"
  25 #include "storage/bufmgr.h"
  26 #include "storage/smgr.h"
  27 #include "utils/hsearch.h"
  28 #include "utils/memutils.h"
  29
  30
/*
 * interval for calling AbsorbFsyncRequests in mdsync
 *
 * NOTE(review): the use site is not visible in this part of the file;
 * presumably the unit is "fsyncs performed" as the name suggests -- confirm
 * against mdsync().
 */
#define FSYNCS_PER_ABSORB               10

/*
 * special values for the segno arg to RememberFsyncRequest
 *
 * These are InvalidBlockNumber and the two values directly below it, i.e.
 * the very top of the BlockNumber range (presumably chosen so that they
 * cannot be confused with a real segment number).
 */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
  38
/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting revoked ... see mdsync).
 *
 * FILE_POSSIBLY_DELETED(err) takes an errno value and yields true when that
 * value is one that can mean "the file no longer exists" on this platform.
 * Note that the Windows variant evaluates its argument twice.
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
#endif
  51
  52 /*
  53  *      The magnetic disk storage manager keeps track of open file
  54  *      descriptors in its own descriptor pool.  This is done to make it
  55  *      easier to support relations that are larger than the operating
  56  *      system's file size limit (often 2GBytes).  In order to do that,
  57  *      we break relations up into "segment" files that are each shorter than
  58  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
  59  *      configuration constant in pg_config_manual.h.
  60  *
  61  *      On disk, a relation must consist of consecutively numbered segment
  62  *      files in the pattern
  63  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
  64  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
  65  *              -- Optionally, any number of inactive segments of size 0 blocks.
  66  *      The full and partial segments are collectively the "active" segments.
  67  *      Inactive segments are those that once contained data but are currently
  68  *      not needed because of an mdtruncate() operation.  The reason for leaving
  69  *      them present at size zero, rather than unlinking them, is that other
  70  *      backends and/or the bgwriter might be holding open file references to
  71  *      such segments.  If the relation expands again after mdtruncate(), such
  72  *      that a deactivated segment becomes active again, it is important that
  73  *      such file references still be valid --- else data might get written
  74  *      out to an unlinked old copy of a segment file that will eventually
  75  *      disappear.
  76  *
  77  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
  78  *      cache is, therefore, just the head of a list of MdfdVec objects, one
  79  *      per segment.  But note the md_fd pointer can be NULL, indicating
  80  *      relation not open.
  81  *
  82  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
  83  *      doesn't have another segment after this one; we may just not have
  84  *      opened the next segment yet.  (We could not have "all segments are
  85  *      in the chain" as an invariant anyway, since another backend could
  86  *      extend the relation when we weren't looking.)  We do not make chain
  87  *      entries for inactive segments, however; as soon as we find a partial
  88  *      segment, we assume that any subsequent segments are inactive.
  89  *
  90  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
  91  *
  92  *      Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
  93  *      for use on machines that support large files.  Beware that that
  94  *      code has not been tested in a long time and is probably bit-rotted.
  95  */
  96
/*
 * MdfdVec -- descriptor for one open segment file of a relation.
 *
 * The md_fd field of an SMgrRelation points at the entry for segment 0;
 * entries for further segments hang off it through mdfd_chain.  When
 * LET_OS_MANAGE_FILESIZE is defined there is no segmentation and therefore
 * no chain field at all.
 */
typedef struct _MdfdVec
{
        File            mdfd_vfd;               /* fd number in fd.c's pool */
        BlockNumber mdfd_segno;         /* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE  /* for large relations */
        struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
#endif
} MdfdVec;

/* Created by mdinit() as a child of TopMemoryContext. */
static MemoryContext MdCxt;             /* context for all md.c allocations */
 107
 108
 109 /*
 110  * In some contexts (currently, standalone backends and the bgwriter process)
 111  * we keep track of pending fsync operations: we need to remember all relation
 112  * segments that have been written since the last checkpoint, so that we can
 113  * fsync them down to disk before completing the next checkpoint.  This hash
 114  * table remembers the pending operations.      We use a hash table mostly as
 115  * a convenient way of eliminating duplicate requests.
 116  *
 117  * We use a similar mechanism to remember no-longer-needed files that can
 118  * be deleted after the next checkpoint, but we use a linked list instead of
 119  * a hash table, because we don't expect there to be any duplicate requests.
 120  *
 121  * (Regular backends do not track pending operations locally, but forward
 122  * them to the bgwriter.)
 123  */
/*
 * Key of the pending-operations hash table: one relation segment.
 * mdinit() registers sizeof(PendingOperationTag) as the key size and
 * tag_hash as the hash function.
 * NOTE(review): tag_hash presumably hashes the raw bytes of the key, in which
 * case any padding in this struct must be zeroed by callers -- confirm.
 */
typedef struct
{
        RelFileNode rnode;                      /* the targeted relation */
        BlockNumber segno;                      /* which segment */
} PendingOperationTag;

typedef uint16 CycleCtr;                /* can be any convenient integer size */

/* One entry of the pending-operations hash table. */
typedef struct
{
        PendingOperationTag tag;        /* hash table key (must be first!) */
        bool            canceled;               /* T => request canceled, not yet removed */
        CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;

/* One queued request to delete a dead relation's file after a checkpoint. */
typedef struct
{
        RelFileNode rnode;                      /* the dead relation to delete */
        CycleCtr cycle_ctr;                     /* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

/*
 * pendingOpsTable stays NULL unless mdinit() decides this process tracks
 * pending operations itself; mdinit() also resets pendingUnlinks to NIL in
 * that case.
 */
static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;      /* presumably a list of PendingUnlinkEntry */

/* Cycle counters stamped into the cycle_ctr fields of the entries above. */
static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
 150
 151
/*
 * What to do when a segment that was asked for does not exist on disk.
 * mdopen() treats EXTENSION_CREATE like EXTENSION_FAIL for the first
 * segment: it never creates a relation that does not exist yet (outside
 * bootstrap mode).
 */
typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
{
        EXTENSION_FAIL,                         /* ereport if segment not present */
        EXTENSION_RETURN_NULL,          /* return NULL if not present */
        EXTENSION_CREATE                        /* create new segments as needed */
} ExtensionBehavior;
 158
 159 /* local routines */
 160 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
 161 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 162 static void register_unlink(RelFileNode rnode);
 163 static MdfdVec *_fdvec_alloc(void);
 164
 165 #ifndef LET_OS_MANAGE_FILESIZE
 166 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
 167                           int oflags);
 168 #endif
 169 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
 170                                                          bool isTemp, ExtensionBehavior behavior);
 171 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
 172
 173
 174 /*
 175  *      mdinit() -- Initialize private state for magnetic disk storage manager.
 176  */
 177 void
 178 mdinit(void)
 179 {
 180         MdCxt = AllocSetContextCreate(TopMemoryContext,
 181                                                                   "MdSmgr",
 182                                                                   ALLOCSET_DEFAULT_MINSIZE,
 183                                                                   ALLOCSET_DEFAULT_INITSIZE,
 184                                                                   ALLOCSET_DEFAULT_MAXSIZE);
 185
 186         /*
 187          * Create pending-operations hashtable if we need it.  Currently, we need
 188          * it if we are standalone (not under a postmaster) OR if we are a
 189          * bootstrap-mode subprocess of a postmaster (that is, a startup or
 190          * bgwriter process).
 191          */
 192         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
 193         {
 194                 HASHCTL         hash_ctl;
 195
 196                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 197                 hash_ctl.keysize = sizeof(PendingOperationTag);
 198                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
 199                 hash_ctl.hash = tag_hash;
 200                 hash_ctl.hcxt = MdCxt;
 201                 pendingOpsTable = hash_create("Pending Ops Table",
 202                                                                           100L,
 203                                                                           &hash_ctl,
 204                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 205                 pendingUnlinks = NIL;
 206         }
 207 }
 208
 209 /*
 210  *      mdcreate() -- Create a new relation on magnetic disk.
 211  *
 212  * If isRedo is true, it's okay for the relation to exist already.
 213  */
 214 void
 215 mdcreate(SMgrRelation reln, bool isRedo)
 216 {
 217         char       *path;
 218         File            fd;
 219
 220         if (isRedo && reln->md_fd != NULL)
 221                 return;                                 /* created and opened already... */
 222
 223         Assert(reln->md_fd == NULL);
 224
 225         path = relpath(reln->smgr_rnode);
 226
 227         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 228
 229         if (fd < 0)
 230         {
 231                 int                     save_errno = errno;
 232
 233                 /*
 234                  * During bootstrap, there are cases where a system relation will be
 235                  * accessed (by internal backend processes) before the bootstrap
 236                  * script nominally creates it.  Therefore, allow the file to exist
 237                  * already, even if isRedo is not set.  (See also mdopen)
 238                  */
 239                 if (isRedo || IsBootstrapProcessingMode())
 240                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 241                 if (fd < 0)
 242                 {
 243                         pfree(path);
 244                         /* be sure to report the error reported by create, not open */
 245                         errno = save_errno;
 246                         ereport(ERROR,
 247                                         (errcode_for_file_access(),
 248                                          errmsg("could not create relation %u/%u/%u: %m",
 249                                                         reln->smgr_rnode.spcNode,
 250                                                         reln->smgr_rnode.dbNode,
 251                                                         reln->smgr_rnode.relNode)));
 252                 }
 253         }
 254
 255         pfree(path);
 256
 257         reln->md_fd = _fdvec_alloc();
 258
 259         reln->md_fd->mdfd_vfd = fd;
 260         reln->md_fd->mdfd_segno = 0;
 261 #ifndef LET_OS_MANAGE_FILESIZE
 262         reln->md_fd->mdfd_chain = NULL;
 263 #endif
 264 }
 265
 266 /*
 267  *      mdunlink() -- Unlink a relation.
 268  *
 269  * Note that we're passed a RelFileNode --- by the time this is called,
 270  * there won't be an SMgrRelation hashtable entry anymore.
 271  *
 272  * Actually, we don't unlink the first segment file of the relation, but
 273  * just truncate it to zero length, and record a request to unlink it after
 274  * the next checkpoint.  Additional segments can be unlinked immediately,
 275  * however.  Leaving the empty file in place prevents that relfilenode
 276  * number from being reused.  The scenario this protects us from is:
 277  * 1. We delete a relation (and commit, and actually remove its file).
 278  * 2. We create a new relation, which by chance gets the same relfilenode as
 279  *    the just-deleted one (OIDs must've wrapped around for that to happen).
 280  * 3. We crash before another checkpoint occurs.
 281  * During replay, we would delete the file and then recreate it, which is fine
 282  * if the contents of the file were repopulated by subsequent WAL entries.
 283  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 284  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 285  * the contents of the file would be lost forever.  By leaving the empty file
 286  * until after the next checkpoint, we prevent reassignment of the relfilenode
 287  * number until it's safe, because relfilenode assignment skips over any
 288  * existing file.
 289  *
 290  * If isRedo is true, it's okay for the relation to be already gone.
 291  * Also, we should remove the file immediately instead of queuing a request
 292  * for later, since during redo there's no possibility of creating a
 293  * conflicting relation.
 294  *
 295  * Note: any failure should be reported as WARNING not ERROR, because
 296  * we are usually not in a transaction anymore when this is called.
 297  */
 298 void
 299 mdunlink(RelFileNode rnode, bool isRedo)
 300 {
 301         char       *path;
 302         int ret;
 303
 304         /*
 305          * We have to clean out any pending fsync requests for the doomed relation,
 306          * else the next mdsync() will fail.
 307          */
 308         ForgetRelationFsyncRequests(rnode);
 309
 310         path = relpath(rnode);
 311
 312         /*
 313          * Delete or truncate the first segment, or only segment if not doing
 314          * segmenting
 315          */
 316         if (isRedo)
 317                 ret = unlink(path);
 318         else
 319                 ret = truncate(path, 0);
 320         if (ret < 0)
 321         {
 322                 if (!isRedo || errno != ENOENT)
 323                         ereport(WARNING,
 324                                         (errcode_for_file_access(),
 325                                          errmsg("could not remove relation %u/%u/%u: %m",
 326                                                         rnode.spcNode,
 327                                                         rnode.dbNode,
 328                                                         rnode.relNode)));
 329         }
 330
 331 #ifndef LET_OS_MANAGE_FILESIZE
 332         /* Delete the additional segments, if any */
 333         else
 334         {
 335                 char       *segpath = (char *) palloc(strlen(path) + 12);
 336                 BlockNumber segno;
 337
 338                 /*
 339                  * Note that because we loop until getting ENOENT, we will
 340                  * correctly remove all inactive segments as well as active ones.
 341                  */
 342                 for (segno = 1;; segno++)
 343                 {
 344                         sprintf(segpath, "%s.%u", path, segno);
 345                         if (unlink(segpath) < 0)
 346                         {
 347                                 /* ENOENT is expected after the last segment... */
 348                                 if (errno != ENOENT)
 349                                         ereport(WARNING,
 350                                                         (errcode_for_file_access(),
 351                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
 352                                                                         segno,
 353                                                                         rnode.spcNode,
 354                                                                         rnode.dbNode,
 355                                                                         rnode.relNode)));
 356                                 break;
 357                         }
 358                 }
 359                 pfree(segpath);
 360         }
 361 #endif
 362
 363         pfree(path);
 364
 365         /* Register request to unlink first segment later */
 366         if (!isRedo)
 367                 register_unlink(rnode);
 368 }
 369
 370 /*
 371  *      mdextend() -- Add a block to the specified relation.
 372  *
 373  *              The semantics are nearly the same as mdwrite(): write at the
 374  *              specified position.  However, this is to be used for the case of
 375  *              extending a relation (i.e., blocknum is at or beyond the current
 376  *              EOF).  Note that we assume writing a block beyond current EOF
 377  *              causes intervening file space to become filled with zeroes.
 378  */
 379 void
 380 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 381 {
 382         long            seekpos;
 383         int                     nbytes;
 384         MdfdVec    *v;
 385
 386         /* This assert is too expensive to have on normally ... */
 387 #ifdef CHECK_WRITE_VS_EXTEND
 388         Assert(blocknum >= mdnblocks(reln));
 389 #endif
 390
 391         /*
 392          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
 393          * any more --- we mustn't create a block whose number
 394          * actually is InvalidBlockNumber.
 395          */
 396         if (blocknum == InvalidBlockNumber)
 397                 ereport(ERROR,
 398                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 399                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
 400                                                 reln->smgr_rnode.spcNode,
 401                                                 reln->smgr_rnode.dbNode,
 402                                                 reln->smgr_rnode.relNode,
 403                                                 InvalidBlockNumber)));
 404
 405         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
 406
 407 #ifndef LET_OS_MANAGE_FILESIZE
 408         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 409         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 410 #else
 411         seekpos = (long) (BLCKSZ * (blocknum));
 412 #endif
 413
 414         /*
 415          * Note: because caller usually obtained blocknum by calling mdnblocks,
 416          * which did a seek(SEEK_END), this seek is often redundant and will be
 417          * optimized away by fd.c.  It's not redundant, however, if there is a
 418          * partial page at the end of the file. In that case we want to try to
 419          * overwrite the partial page with a full page.  It's also not redundant
 420          * if bufmgr.c had to dump another buffer of the same file to make room
 421          * for the new page's buffer.
 422          */
 423         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 424                 ereport(ERROR,
 425                                 (errcode_for_file_access(),
 426                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 427                                                 blocknum,
 428                                                 reln->smgr_rnode.spcNode,
 429                                                 reln->smgr_rnode.dbNode,
 430                                                 reln->smgr_rnode.relNode)));
 431
 432         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 433         {
 434                 if (nbytes < 0)
 435                         ereport(ERROR,
 436                                         (errcode_for_file_access(),
 437                                          errmsg("could not extend relation %u/%u/%u: %m",
 438                                                         reln->smgr_rnode.spcNode,
 439                                                         reln->smgr_rnode.dbNode,
 440                                                         reln->smgr_rnode.relNode),
 441                                          errhint("Check free disk space.")));
 442                 /* short write: complain appropriately */
 443                 ereport(ERROR,
 444                                 (errcode(ERRCODE_DISK_FULL),
 445                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
 446                                                 reln->smgr_rnode.spcNode,
 447                                                 reln->smgr_rnode.dbNode,
 448                                                 reln->smgr_rnode.relNode,
 449                                                 nbytes, BLCKSZ, blocknum),
 450                                  errhint("Check free disk space.")));
 451         }
 452
 453         if (!isTemp)
 454                 register_dirty_segment(reln, v);
 455
 456 #ifndef LET_OS_MANAGE_FILESIZE
 457         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
 458 #endif
 459 }
 460
 461 /*
 462  *      mdopen() -- Open the specified relation.
 463  *
 464  * Note we only open the first segment, when there are multiple segments.
 465  *
 466  * If first segment is not present, either ereport or return NULL according
 467  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 468  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 469  * invent one out of whole cloth.
 470  */
 471 static MdfdVec *
 472 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
 473 {
 474         MdfdVec    *mdfd;
 475         char       *path;
 476         File            fd;
 477
 478         /* No work if already open */
 479         if (reln->md_fd)
 480                 return reln->md_fd;
 481
 482         path = relpath(reln->smgr_rnode);
 483
 484         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 485
 486         if (fd < 0)
 487         {
 488                 /*
 489                  * During bootstrap, there are cases where a system relation will be
 490                  * accessed (by internal backend processes) before the bootstrap
 491                  * script nominally creates it.  Therefore, accept mdopen() as a
 492                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
 493                  */
 494                 if (IsBootstrapProcessingMode())
 495                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 496                 if (fd < 0)
 497                 {
 498                         pfree(path);
 499                         if (behavior == EXTENSION_RETURN_NULL &&
 500                                 FILE_POSSIBLY_DELETED(errno))
 501                                 return NULL;
 502                         ereport(ERROR,
 503                                         (errcode_for_file_access(),
 504                                          errmsg("could not open relation %u/%u/%u: %m",
 505                                                         reln->smgr_rnode.spcNode,
 506                                                         reln->smgr_rnode.dbNode,
 507                                                         reln->smgr_rnode.relNode)));
 508                 }
 509         }
 510
 511         pfree(path);
 512
 513         reln->md_fd = mdfd = _fdvec_alloc();
 514
 515         mdfd->mdfd_vfd = fd;
 516         mdfd->mdfd_segno = 0;
 517 #ifndef LET_OS_MANAGE_FILESIZE
 518         mdfd->mdfd_chain = NULL;
 519         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 520 #endif
 521
 522         return mdfd;
 523 }
 524
 525 /*
 526  *      mdclose() -- Close the specified relation, if it isn't closed already.
 527  */
 528 void
 529 mdclose(SMgrRelation reln)
 530 {
 531         MdfdVec    *v = reln->md_fd;
 532
 533         /* No work if already closed */
 534         if (v == NULL)
 535                 return;
 536
 537         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
 538
 539 #ifndef LET_OS_MANAGE_FILESIZE
 540         while (v != NULL)
 541         {
 542                 MdfdVec    *ov = v;
 543
 544                 /* if not closed already */
 545                 if (v->mdfd_vfd >= 0)
 546                         FileClose(v->mdfd_vfd);
 547                 /* Now free vector */
 548                 v = v->mdfd_chain;
 549                 pfree(ov);
 550         }
 551 #else
 552         if (v->mdfd_vfd >= 0)
 553                 FileClose(v->mdfd_vfd);
 554         pfree(v);
 555 #endif
 556 }
 557
 558 /*
 559  *      mdread() -- Read the specified block from a relation.
 560  */
 561 void
 562 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 563 {
 564         long            seekpos;
 565         int                     nbytes;
 566         MdfdVec    *v;
 567
 568         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
 569
 570 #ifndef LET_OS_MANAGE_FILESIZE
 571         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 572         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 573 #else
 574         seekpos = (long) (BLCKSZ * (blocknum));
 575 #endif
 576
 577         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 578                 ereport(ERROR,
 579                                 (errcode_for_file_access(),
 580                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 581                                                 blocknum,
 582                                                 reln->smgr_rnode.spcNode,
 583                                                 reln->smgr_rnode.dbNode,
 584                                                 reln->smgr_rnode.relNode)));
 585
 586         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 587         {
 588                 if (nbytes < 0)
 589                         ereport(ERROR,
 590                                         (errcode_for_file_access(),
 591                                          errmsg("could not read block %u of relation %u/%u/%u: %m",
 592                                                         blocknum,
 593                                                         reln->smgr_rnode.spcNode,
 594                                                         reln->smgr_rnode.dbNode,
 595                                                         reln->smgr_rnode.relNode)));
 596                 /*
 597                  * Short read: we are at or past EOF, or we read a partial block at
 598                  * EOF.  Normally this is an error; upper levels should never try to
 599                  * read a nonexistent block.  However, if zero_damaged_pages is ON
 600                  * or we are InRecovery, we should instead return zeroes without
 601                  * complaining.  This allows, for example, the case of trying to
 602                  * update a block that was later truncated away.
 603                  */
 604                 if (zero_damaged_pages || InRecovery)
 605                         MemSet(buffer, 0, BLCKSZ);
 606                 else
 607                         ereport(ERROR,
 608                                         (errcode(ERRCODE_DATA_CORRUPTED),
 609                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
 610                                                         blocknum,
 611                                                         reln->smgr_rnode.spcNode,
 612                                                         reln->smgr_rnode.dbNode,
 613                                                         reln->smgr_rnode.relNode,
 614                                                         nbytes, BLCKSZ)));
 615         }
 616 }
 617
 618 /*
 619  *      mdwrite() -- Write the supplied block at the appropriate location.
 620  *
 621  *              This is to be used only for updating already-existing blocks of a
 622  *              relation (ie, those before the current EOF).  To extend a relation,
 623  *              use mdextend().
 624  */
 625 void
 626 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 627 {
 628         long            seekpos;
 629         int                     nbytes;
 630         MdfdVec    *v;
 631
 632         /* This assert is too expensive to have on normally ... */
 633 #ifdef CHECK_WRITE_VS_EXTEND
 634         Assert(blocknum < mdnblocks(reln));
 635 #endif
 636
 637         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
 638
 639 #ifndef LET_OS_MANAGE_FILESIZE
 640         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 641         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 642 #else
 643         seekpos = (long) (BLCKSZ * (blocknum));
 644 #endif
 645
 646         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 647                 ereport(ERROR,
 648                                 (errcode_for_file_access(),
 649                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 650                                                 blocknum,
 651                                                 reln->smgr_rnode.spcNode,
 652                                                 reln->smgr_rnode.dbNode,
 653                                                 reln->smgr_rnode.relNode)));
 654
 655         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 656         {
 657                 if (nbytes < 0)
 658                         ereport(ERROR,
 659                                         (errcode_for_file_access(),
 660                                          errmsg("could not write block %u of relation %u/%u/%u: %m",
 661                                                         blocknum,
 662                                                         reln->smgr_rnode.spcNode,
 663                                                         reln->smgr_rnode.dbNode,
 664                                                         reln->smgr_rnode.relNode)));
 665                 /* short write: complain appropriately */
 666                 ereport(ERROR,
 667                                 (errcode(ERRCODE_DISK_FULL),
 668                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
 669                                                 blocknum,
 670                                                 reln->smgr_rnode.spcNode,
 671                                                 reln->smgr_rnode.dbNode,
 672                                                 reln->smgr_rnode.relNode,
 673                                                 nbytes, BLCKSZ),
 674                                  errhint("Check free disk space.")));
 675         }
 676
 677         if (!isTemp)
 678                 register_dirty_segment(reln, v);
 679 }
 680
 681 /*
 682  *      mdnblocks() -- Get the number of blocks stored in a relation.
 683  *
 684  *              Important side effect: all active segments of the relation are opened
 685  *              and added to the mdfd_chain list.  If this routine has not been
 686  *              called, then only segments up to the last one actually touched
 687  *              are present in the chain.
 688  */
 689 BlockNumber
 690 mdnblocks(SMgrRelation reln)
 691 {
 692         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
 693
 694 #ifndef LET_OS_MANAGE_FILESIZE
 695         BlockNumber nblocks;
 696         BlockNumber segno = 0;
 697
 698         /*
 699          * Skip through any segments that aren't the last one, to avoid redundant
 700          * seeks on them.  We have previously verified that these segments are
 701          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
 702          *
 703          * NOTE: this assumption could only be wrong if another backend has
 704          * truncated the relation.      We rely on higher code levels to handle that
 705          * scenario by closing and re-opening the md fd, which is handled via
 706          * relcache flush.  (Since the bgwriter doesn't participate in relcache
 707          * flush, it could have segment chain entries for inactive segments;
 708          * that's OK because the bgwriter never needs to compute relation size.)
 709          */
 710         while (v->mdfd_chain != NULL)
 711         {
 712                 segno++;
 713                 v = v->mdfd_chain;
 714         }
 715
 716         for (;;)
 717         {
 718                 nblocks = _mdnblocks(reln, v);
 719                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
 720                         elog(FATAL, "segment too big");
 721                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
 722                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
 723
 724                 /*
 725                  * If segment is exactly RELSEG_SIZE, advance to next one.
 726                  */
 727                 segno++;
 728
 729                 if (v->mdfd_chain == NULL)
 730                 {
 731                         /*
 732                          * Because we pass O_CREAT, we will create the next segment (with
 733                          * zero length) immediately, if the last segment is of length
 734                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
 735                          * the logic simple.
 736                          */
 737                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
 738                         if (v->mdfd_chain == NULL)
 739                                 ereport(ERROR,
 740                                                 (errcode_for_file_access(),
 741                                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
 742                                                                 segno,
 743                                                                 reln->smgr_rnode.spcNode,
 744                                                                 reln->smgr_rnode.dbNode,
 745                                                                 reln->smgr_rnode.relNode)));
 746                 }
 747
 748                 v = v->mdfd_chain;
 749         }
 750 #else
 751         return _mdnblocks(reln, v);
 752 #endif
 753 }
 754
 755 /*
 756  *      mdtruncate() -- Truncate relation to specified number of blocks.
 757  */
 758 void
 759 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 760 {
 761         MdfdVec    *v;
 762         BlockNumber curnblk;
 763
 764 #ifndef LET_OS_MANAGE_FILESIZE
 765         BlockNumber priorblocks;
 766 #endif
 767
 768         /*
 769          * NOTE: mdnblocks makes sure we have opened all active segments, so
 770          * that truncation loop will get them all!
 771          */
 772         curnblk = mdnblocks(reln);
 773         if (nblocks > curnblk)
 774         {
 775                 /* Bogus request ... but no complaint if InRecovery */
 776                 if (InRecovery)
 777                         return;
 778                 ereport(ERROR,
 779                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
 780                                                 reln->smgr_rnode.spcNode,
 781                                                 reln->smgr_rnode.dbNode,
 782                                                 reln->smgr_rnode.relNode,
 783                                                 nblocks, curnblk)));
 784         }
 785         if (nblocks == curnblk)
 786                 return;                                 /* no work */
 787
 788         v = mdopen(reln, EXTENSION_FAIL);
 789
 790 #ifndef LET_OS_MANAGE_FILESIZE
 791         priorblocks = 0;
 792         while (v != NULL)
 793         {
 794                 MdfdVec    *ov = v;
 795
 796                 if (priorblocks > nblocks)
 797                 {
 798                         /*
 799                          * This segment is no longer active (and has already been
 800                          * unlinked from the mdfd_chain). We truncate the file, but do
 801                          * not delete it, for reasons explained in the header comments.
 802                          */
 803                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
 804                                 ereport(ERROR,
 805                                                 (errcode_for_file_access(),
 806                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 807                                                                 reln->smgr_rnode.spcNode,
 808                                                                 reln->smgr_rnode.dbNode,
 809                                                                 reln->smgr_rnode.relNode,
 810                                                                 nblocks)));
 811                         if (!isTemp)
 812                                 register_dirty_segment(reln, v);
 813                         v = v->mdfd_chain;
 814                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
 815                         pfree(ov);
 816                 }
 817                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 818                 {
 819                         /*
 820                          * This is the last segment we want to keep. Truncate the file to
 821                          * the right length, and clear chain link that points to any
 822                          * remaining segments (which we shall zap). NOTE: if nblocks is
 823                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
 824                          * segment to 0 length but keep it. This adheres to the invariant
 825                          * given in the header comments.
 826                          */
 827                         BlockNumber lastsegblocks = nblocks - priorblocks;
 828
 829                         if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
 830                                 ereport(ERROR,
 831                                                 (errcode_for_file_access(),
 832                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 833                                                                 reln->smgr_rnode.spcNode,
 834                                                                 reln->smgr_rnode.dbNode,
 835                                                                 reln->smgr_rnode.relNode,
 836                                                                 nblocks)));
 837                         if (!isTemp)
 838                                 register_dirty_segment(reln, v);
 839                         v = v->mdfd_chain;
 840                         ov->mdfd_chain = NULL;
 841                 }
 842                 else
 843                 {
 844                         /*
 845                          * We still need this segment and 0 or more blocks beyond it, so
 846                          * nothing to do here.
 847                          */
 848                         v = v->mdfd_chain;
 849                 }
 850                 priorblocks += RELSEG_SIZE;
 851         }
 852 #else
 853         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
 854                 ereport(ERROR,
 855                                 (errcode_for_file_access(),
 856                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 857                                          reln->smgr_rnode.spcNode,
 858                                          reln->smgr_rnode.dbNode,
 859                                          reln->smgr_rnode.relNode,
 860                                          nblocks)));
 861         if (!isTemp)
 862                 register_dirty_segment(reln, v);
 863 #endif
 864 }
 865
 866 /*
 867  *      mdimmedsync() -- Immediately sync a relation to stable storage.
 868  *
 869  * Note that only writes already issued are synced; this routine knows
 870  * nothing of dirty buffers that may exist inside the buffer manager.
 871  */
 872 void
 873 mdimmedsync(SMgrRelation reln)
 874 {
 875         MdfdVec    *v;
 876         BlockNumber curnblk;
 877
 878         /*
 879          * NOTE: mdnblocks makes sure we have opened all active segments, so
 880          * that fsync loop will get them all!
 881          */
 882         curnblk = mdnblocks(reln);
 883
 884         v = mdopen(reln, EXTENSION_FAIL);
 885
 886 #ifndef LET_OS_MANAGE_FILESIZE
 887         while (v != NULL)
 888         {
 889                 if (FileSync(v->mdfd_vfd) < 0)
 890                         ereport(ERROR,
 891                                         (errcode_for_file_access(),
 892                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 893                                                         v->mdfd_segno,
 894                                                         reln->smgr_rnode.spcNode,
 895                                                         reln->smgr_rnode.dbNode,
 896                                                         reln->smgr_rnode.relNode)));
 897                 v = v->mdfd_chain;
 898         }
 899 #else
 900         if (FileSync(v->mdfd_vfd) < 0)
 901                 ereport(ERROR,
 902                                 (errcode_for_file_access(),
 903                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 904                                                 v->mdfd_segno,
 905                                                 reln->smgr_rnode.spcNode,
 906                                                 reln->smgr_rnode.dbNode,
 907                                                 reln->smgr_rnode.relNode)));
 908 #endif
 909 }
 910
/*
 *      mdsync() -- Sync previous writes to stable storage.
 *
 * Walks pendingOpsTable and, for every fsync request that was entered
 * before this call started, fsyncs the named relation segment (unless
 * enableFsync is off or the request has been canceled) and then removes
 * the entry from the table.  Requests entered while the scan is running
 * are left in place for the next call.
 *
 * Raises ERROR if there is no pendingOpsTable, if an fsync fails for a
 * reason other than "file possibly deleted", if it fails a second time on
 * the same entry without the entry having been canceled in between, or if
 * removing a processed entry from the hash table fails.
 */
void
mdsync(void)
{
        /*
         * Persists across calls: set just before the scan starts and cleared
         * only when the scan runs to completion, so finding it still set on
         * entry means the previous call did not reach the end of the loop.
         */
        static bool mdsync_in_progress = false;

        HASH_SEQ_STATUS hstat;
        PendingOperationEntry *entry;
        /* counts down the fsyncs remaining before the next periodic absorb */
        int                     absorb_counter;

        /*
         * This is only called during checkpoints, and checkpoints should only
         * occur in processes that have created a pendingOpsTable.
         */
        if (!pendingOpsTable)
                elog(ERROR, "cannot sync without a pendingOpsTable");

        /*
         * If we are in the bgwriter, the sync had better include all fsync
         * requests that were queued by backends up to this point.  The tightest
         * race condition that could occur is that a buffer that must be written
         * and fsync'd for the checkpoint could have been dumped by a backend
         * just before it was visited by BufferSync().  We know the backend will
         * have queued an fsync request before clearing the buffer's dirtybit,
         * so we are safe as long as we do an Absorb after completing BufferSync().
         */
        AbsorbFsyncRequests();

        /*
         * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
         * checkpoint), we want to ignore fsync requests that are entered into the
         * hashtable after this point --- they should be processed next time,
         * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
         * ones: new ones will have cycle_ctr equal to the incremented value of
         * mdsync_cycle_ctr.
         *
         * In normal circumstances, all entries present in the table at this
         * point will have cycle_ctr exactly equal to the current (about to be old)
         * value of mdsync_cycle_ctr.  However, if we fail partway through the
         * fsync'ing loop, then older values of cycle_ctr might remain when we
         * come back here to try again.  Repeated checkpoint failures would
         * eventually wrap the counter around to the point where an old entry
         * might appear new, causing us to skip it, possibly allowing a checkpoint
         * to succeed that should not have.  To forestall wraparound, any time
         * the previous mdsync() failed to complete, run through the table and
         * forcibly set cycle_ctr = mdsync_cycle_ctr.
         *
         * Think not to merge this loop with the main loop, as the problem is
         * exactly that that loop may fail before having visited all the entries.
         * From a performance point of view it doesn't matter anyway, as this
         * path will never be taken in a system that's functioning normally.
         */
        if (mdsync_in_progress)
        {
                /* prior try failed, so update any stale cycle_ctr values */
                hash_seq_init(&hstat, pendingOpsTable);
                while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
                {
                        entry->cycle_ctr = mdsync_cycle_ctr;
                }
        }

        /* Advance counter so that new hashtable entries are distinguishable */
        mdsync_cycle_ctr++;

        /* Set flag to detect failure if we don't reach the end of the loop */
        mdsync_in_progress = true;

        /* Now scan the hashtable for fsync requests to process */
        absorb_counter = FSYNCS_PER_ABSORB;
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
                /*
                 * If the entry is new then don't process it this time.  Note that
                 * "continue" bypasses the hash-remove call at the bottom of the loop.
                 */
                if (entry->cycle_ctr == mdsync_cycle_ctr)
                        continue;

                /*
                 * Else assert we haven't missed it.  The cast makes the "+ 1"
                 * comparison happen at CycleCtr width.
                 */
                Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

                /*
                 * If fsync is off then we don't have to bother opening the file
                 * at all.  (We delay checking until this point so that changing
                 * fsync on the fly behaves sensibly.)  Also, if the entry is
                 * marked canceled, fall through to delete it.
                 */
                if (enableFsync && !entry->canceled)
                {
                        /* number of failed fsync attempts so far for this entry */
                        int                     failures;

                        /*
                         * If in bgwriter, we want to absorb pending requests every so
                         * often to prevent overflow of the fsync request queue.  It is
                         * unspecified whether newly-added entries will be visited by
                         * hash_seq_search, but we don't care since we don't need to
                         * process them anyway.
                         */
                        if (--absorb_counter <= 0)
                        {
                                AbsorbFsyncRequests();
                                absorb_counter = FSYNCS_PER_ABSORB;
                        }

                        /*
                         * The fsync table could contain requests to fsync segments that
                         * have been deleted (unlinked) by the time we get to them.
                         * Rather than just hoping an ENOENT (or EACCES on Windows) error
                         * can be ignored, what we do on error is absorb pending requests
                         * and then retry.  Since mdunlink() queues a "revoke" message
                         * before actually unlinking, the fsync request is guaranteed to
                         * be marked canceled after the absorb if it really was this case.
                         * DROP DATABASE likewise has to tell us to forget fsync requests
                         * before it starts deletions.
                         */
                        for (failures = 0; ; failures++)        /* loop exits at "break" */
                        {
                                SMgrRelation reln;
                                MdfdVec    *seg;

                                /*
                                 * Find or create an smgr hash entry for this relation. This
                                 * may seem a bit unclean -- md calling smgr?  But it's really
                                 * the best solution.  It ensures that the open file reference
                                 * isn't permanently leaked if we get an error here. (You may
                                 * say "but an unreferenced SMgrRelation is still a leak!" Not
                                 * really, because the only case in which a checkpoint is done
                                 * by a process that isn't about to shut down is in the
                                 * bgwriter, and it will periodically do smgrcloseall(). This
                                 * fact justifies our not closing the reln in the success path
                                 * either, which is a good thing since in non-bgwriter cases
                                 * we couldn't safely do that.)  Furthermore, in many cases
                                 * the relation will have been dirtied through this same smgr
                                 * relation, and so we can save a file open/close cycle.
                                 */
                                reln = smgropen(entry->tag.rnode);

                                /*
                                 * It is possible that the relation has been dropped or
                                 * truncated since the fsync request was entered.  Therefore,
                                 * allow ENOENT, but only if we didn't fail already on
                                 * this file.  This applies both during _mdfd_getseg() and
                                 * during FileSync, since fd.c might have closed the file
                                 * behind our back.
                                 *
                                 * The segment is looked up by the block number of its first
                                 * block (segno * RELSEG_SIZE).
                                 */
                                seg = _mdfd_getseg(reln,
                                                                   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
                                                                   false, EXTENSION_RETURN_NULL);
                                if (seg != NULL &&
                                        FileSync(seg->mdfd_vfd) >= 0)
                                        break;          /* success; break out of retry loop */

                                /*
                                 * XXX is there any point in allowing more than one retry?
                                 * Don't see one at the moment, but easy to change the
                                 * test here if so.
                                 *
                                 * errno here is whatever the failed lookup or FileSync
                                 * above left behind; nothing in between touches it.
                                 */
                                if (!FILE_POSSIBLY_DELETED(errno) ||
                                        failures > 0)
                                        ereport(ERROR,
                                                        (errcode_for_file_access(),
                                                         errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
                                                                        entry->tag.segno,
                                                                        entry->tag.rnode.spcNode,
                                                                        entry->tag.rnode.dbNode,
                                                                        entry->tag.rnode.relNode)));
                                else
                                        ereport(DEBUG1,
                                                        (errcode_for_file_access(),
                                                         errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
                                                                        entry->tag.segno,
                                                                        entry->tag.rnode.spcNode,
                                                                        entry->tag.rnode.dbNode,
                                                                        entry->tag.rnode.relNode)));

                                /*
                                 * Absorb incoming requests and check to see if canceled.
                                 */
                                AbsorbFsyncRequests();
                                absorb_counter = FSYNCS_PER_ABSORB;     /* might as well... */

                                /* canceled means the file really was deleted: stop retrying */
                                if (entry->canceled)
                                        break;
                        }       /* end retry loop */
                }

                /*
                 * If we get here, either we fsync'd successfully, or we don't have
                 * to because enableFsync is off, or the entry is (now) marked
                 * canceled.  Okay to delete it.
                 */
                if (hash_search(pendingOpsTable, &entry->tag,
                                                HASH_REMOVE, NULL) == NULL)
                        elog(ERROR, "pendingOpsTable corrupted");
        }       /* end loop over hashtable entries */

        /* Flag successful completion of mdsync */
        mdsync_in_progress = false;
}
1114
1115 /*
1116  * mdpreckpt() -- Do pre-checkpoint work
1117  *
1118  * To distinguish unlink requests that arrived before this checkpoint
1119  * started from those that arrived during the checkpoint, we use a cycle
1120  * counter similar to the one we use for fsync requests. That cycle
1121  * counter is incremented here.
1122  *
1123  * This must be called *before* the checkpoint REDO point is determined.
1124  * That ensures that we won't delete files too soon.
1125  *
1126  * Note that we can't do anything here that depends on the assumption
1127  * that the checkpoint will be completed.
1128  */
1129 void
1130 mdpreckpt(void)
1131 {
1132         ListCell *cell;
1133
1134         /*
1135          * In case the prior checkpoint wasn't completed, stamp all entries in
1136          * the list with the current cycle counter.  Anything that's in the
1137          * list at the start of checkpoint can surely be deleted after the
1138          * checkpoint is finished, regardless of when the request was made.
1139          */
1140         foreach(cell, pendingUnlinks)
1141         {
1142                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1143
1144                 entry->cycle_ctr = mdckpt_cycle_ctr;
1145         }
1146
1147         /*
1148          * Any unlink requests arriving after this point will be assigned the
1149          * next cycle counter, and won't be unlinked until next checkpoint.
1150          */
1151         mdckpt_cycle_ctr++;
1152 }
1153
1154 /*
1155  * mdpostckpt() -- Do post-checkpoint work
1156  *
1157  * Remove any lingering files that can now be safely removed.
1158  */
1159 void
1160 mdpostckpt(void)
1161 {
1162         while (pendingUnlinks != NIL)
1163         {
1164                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1165                 char *path;
1166
1167                 /*
1168                  * New entries are appended to the end, so if the entry is new
1169                  * we've reached the end of old entries.
1170                  */
1171                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1172                         break;
1173
1174                 /* Else assert we haven't missed it */
1175                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1176
1177                 /* Unlink the file */
1178                 path = relpath(entry->rnode);
1179                 if (unlink(path) < 0)
1180                 {
1181                         /*
1182                          * ENOENT shouldn't happen either, but it doesn't really matter
1183                          * because we would've deleted it now anyway.
1184                          */
1185                         if (errno != ENOENT)
1186                                 ereport(WARNING,
1187                                                 (errcode_for_file_access(),
1188                                                  errmsg("could not remove relation %u/%u/%u: %m",
1189                                                                 entry->rnode.spcNode,
1190                                                                 entry->rnode.dbNode,
1191                                                                 entry->rnode.relNode)));
1192                 }
1193                 pfree(path);
1194
1195                 pendingUnlinks = list_delete_first(pendingUnlinks);
1196                 pfree(entry);
1197         }
1198 }
1199
1200 /*
1201  * register_dirty_segment() -- Mark a relation segment as needing fsync
1202  *
1203  * If there is a local pending-ops table, just make an entry in it for
1204  * mdsync to process later.  Otherwise, try to pass off the fsync request
1205  * to the background writer process.  If that fails, just do the fsync
1206  * locally before returning (we expect this will not happen often enough
1207  * to be a performance problem).
1208  */
1209 static void
1210 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1211 {
1212         if (pendingOpsTable)
1213         {
1214                 /* push it into local pending-ops table */
1215                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1216         }
1217         else
1218         {
1219                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1220                         return;                         /* passed it off successfully */
1221
1222                 if (FileSync(seg->mdfd_vfd) < 0)
1223                         ereport(ERROR,
1224                                         (errcode_for_file_access(),
1225                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1226                                                         seg->mdfd_segno,
1227                                                         reln->smgr_rnode.spcNode,
1228                                                         reln->smgr_rnode.dbNode,
1229                                                         reln->smgr_rnode.relNode)));
1230         }
1231 }
1232
1233 /*
1234  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1235  *
1236  * As with register_dirty_segment, this could involve either a local or
1237  * a remote pending-ops table.
1238  */
1239 static void
1240 register_unlink(RelFileNode rnode)
1241 {
1242         if (pendingOpsTable)
1243         {
1244                 /* push it into local pending-ops table */
1245                 RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
1246         }
1247         else
1248         {
1249                 /*
1250                  * Notify the bgwriter about it.  If we fail to queue the request
1251                  * message, we have to sleep and try again, because we can't simply
1252                  * delete the file now.  Ugly, but hopefully won't happen often.
1253                  *
1254                  * XXX should we just leave the file orphaned instead?
1255                  */
1256                 Assert(IsUnderPostmaster);
1257                 while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
1258                         pg_usleep(10000L);      /* 10 msec seems a good number */
1259         }
1260 }
1261
1262 /*
1263  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1264  *
1265  * We stuff most fsync requests into the local hash table for execution
1266  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1267  * separate linked list, however, because they get processed separately.
1268  *
1269  * The range of possible segment numbers is way less than the range of
1270  * BlockNumber, so we can reserve high values of segno for special purposes.
1271  * We define three:
1272  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1273  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1274  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1275  *   checkpoint.
1276  *
1277  * (Handling the FORGET_* requests is a tad slow because the hash table has
1278  * to be searched linearly, but it doesn't seem worth rethinking the table
1279  * structure for them.)
1280  */
1281 void
1282 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1283 {
1284         Assert(pendingOpsTable);
1285
1286         if (segno == FORGET_RELATION_FSYNC)
1287         {
1288                 /* Remove any pending requests for the entire relation */
1289                 HASH_SEQ_STATUS hstat;
1290                 PendingOperationEntry *entry;
1291
1292                 hash_seq_init(&hstat, pendingOpsTable);
1293                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1294                 {
1295                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1296                         {
1297                                 /* Okay, cancel this entry */
1298                                 entry->canceled = true;
1299                         }
1300                 }
1301         }
1302         else if (segno == FORGET_DATABASE_FSYNC)
1303         {
1304                 /* Remove any pending requests for the entire database */
1305                 HASH_SEQ_STATUS hstat;
1306                 PendingOperationEntry *entry;
1307
1308                 hash_seq_init(&hstat, pendingOpsTable);
1309                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1310                 {
1311                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1312                         {
1313                                 /* Okay, cancel this entry */
1314                                 entry->canceled = true;
1315                         }
1316                 }
1317         }
1318         else if (segno == UNLINK_RELATION_REQUEST)
1319         {
1320                 /* Unlink request: put it in the linked list */
1321                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1322                 PendingUnlinkEntry *entry;
1323
1324                 entry = palloc(sizeof(PendingUnlinkEntry));
1325                 entry->rnode = rnode;
1326                 entry->cycle_ctr = mdckpt_cycle_ctr;
1327
1328                 pendingUnlinks = lappend(pendingUnlinks, entry);
1329
1330                 MemoryContextSwitchTo(oldcxt);
1331         }
1332         else
1333         {
1334                 /* Normal case: enter a request to fsync this segment */
1335                 PendingOperationTag key;
1336                 PendingOperationEntry *entry;
1337                 bool            found;
1338
1339                 /* ensure any pad bytes in the hash key are zeroed */
1340                 MemSet(&key, 0, sizeof(key));
1341                 key.rnode = rnode;
1342                 key.segno = segno;
1343
1344                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1345                                                                                                           &key,
1346                                                                                                           HASH_ENTER,
1347                                                                                                           &found);
1348                 /* if new or previously canceled entry, initialize it */
1349                 if (!found || entry->canceled)
1350                 {
1351                         entry->canceled = false;
1352                         entry->cycle_ctr = mdsync_cycle_ctr;
1353                 }
1354                 /*
1355                  * NB: it's intentional that we don't change cycle_ctr if the entry
1356                  * already exists.  The fsync request must be treated as old, even
1357                  * though the new request will be satisfied too by any subsequent
1358                  * fsync.
1359                  *
1360                  * However, if the entry is present but is marked canceled, we should
1361                  * act just as though it wasn't there.  The only case where this could
1362                  * happen would be if a file had been deleted, we received but did not
1363                  * yet act on the cancel request, and the same relfilenode was then
1364                  * assigned to a new file.  We mustn't lose the new request, but
1365                  * it should be considered new not old.
1366                  */
1367         }
1368 }
1369
1370 /*
1371  * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
1372  */
1373 void
1374 ForgetRelationFsyncRequests(RelFileNode rnode)
1375 {
1376         if (pendingOpsTable)
1377         {
1378                 /* standalone backend or startup process: fsync state is local */
1379                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1380         }
1381         else if (IsUnderPostmaster)
1382         {
1383                 /*
1384                  * Notify the bgwriter about it.  If we fail to queue the revoke
1385                  * message, we have to sleep and try again ... ugly, but hopefully
1386                  * won't happen often.
1387                  *
1388                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
1389                  * an error would leave the no-longer-used file still present on
1390                  * disk, which would be bad, so I'm inclined to assume that the
1391                  * bgwriter will always empty the queue soon.
1392                  */
1393                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1394                         pg_usleep(10000L);      /* 10 msec seems a good number */
1395                 /*
1396                  * Note we don't wait for the bgwriter to actually absorb the
1397                  * revoke message; see mdsync() for the implications.
1398                  */
1399         }
1400 }
1401
1402 /*
1403  * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
1404  */
1405 void
1406 ForgetDatabaseFsyncRequests(Oid dbid)
1407 {
1408         RelFileNode rnode;
1409
1410         rnode.dbNode = dbid;
1411         rnode.spcNode = 0;
1412         rnode.relNode = 0;
1413
1414         if (pendingOpsTable)
1415         {
1416                 /* standalone backend or startup process: fsync state is local */
1417                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1418         }
1419         else if (IsUnderPostmaster)
1420         {
1421                 /* see notes in ForgetRelationFsyncRequests */
1422                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1423                         pg_usleep(10000L);      /* 10 msec seems a good number */
1424         }
1425 }
1426
1427
1428 /*
1429  *      _fdvec_alloc() -- Make a MdfdVec object.
1430  */
1431 static MdfdVec *
1432 _fdvec_alloc(void)
1433 {
1434         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1435 }
1436
1437 #ifndef LET_OS_MANAGE_FILESIZE
1438
1439 /*
1440  * Open the specified segment of the relation,
1441  * and make a MdfdVec object for it.  Returns NULL on failure.
1442  */
1443 static MdfdVec *
1444 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1445 {
1446         MdfdVec    *v;
1447         int                     fd;
1448         char       *path,
1449                            *fullpath;
1450
1451         path = relpath(reln->smgr_rnode);
1452
1453         if (segno > 0)
1454         {
1455                 /* be sure we have enough space for the '.segno' */
1456                 fullpath = (char *) palloc(strlen(path) + 12);
1457                 sprintf(fullpath, "%s.%u", path, segno);
1458                 pfree(path);
1459         }
1460         else
1461                 fullpath = path;
1462
1463         /* open the file */
1464         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1465
1466         pfree(fullpath);
1467
1468         if (fd < 0)
1469                 return NULL;
1470
1471         /* allocate an mdfdvec entry for it */
1472         v = _fdvec_alloc();
1473
1474         /* fill the entry */
1475         v->mdfd_vfd = fd;
1476         v->mdfd_segno = segno;
1477         v->mdfd_chain = NULL;
1478         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1479
1480         /* all done */
1481         return v;
1482 }
1483 #endif   /* LET_OS_MANAGE_FILESIZE */
1484
1485 /*
1486  *      _mdfd_getseg() -- Find the segment of the relation holding the
1487  *              specified block.
1488  *
1489  * If the segment doesn't exist, we ereport, return NULL, or create the
1490  * segment, according to "behavior".  Note: isTemp need only be correct
1491  * in the EXTENSION_CREATE case.
1492  */
1493 static MdfdVec *
1494 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1495                          ExtensionBehavior behavior)
1496 {
1497         MdfdVec    *v = mdopen(reln, behavior);
1498
1499 #ifndef LET_OS_MANAGE_FILESIZE
1500         BlockNumber targetseg;
1501         BlockNumber nextsegno;
1502
1503         if (!v)
1504                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1505
1506         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1507         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1508         {
1509                 Assert(nextsegno == v->mdfd_segno + 1);
1510
1511                 if (v->mdfd_chain == NULL)
1512                 {
1513                         /*
1514                          * Normally we will create new segments only if authorized by
1515                          * the caller (i.e., we are doing mdextend()).  But when doing
1516                          * WAL recovery, create segments anyway; this allows cases such as
1517                          * replaying WAL data that has a write into a high-numbered
1518                          * segment of a relation that was later deleted.  We want to go
1519                          * ahead and create the segments so we can finish out the replay.
1520                          *
1521                          * We have to maintain the invariant that segments before the
1522                          * last active segment are of size RELSEG_SIZE; therefore, pad
1523                          * them out with zeroes if needed.  (This only matters if caller
1524                          * is extending the relation discontiguously, but that can happen
1525                          * in hash indexes.)
1526                          */
1527                         if (behavior == EXTENSION_CREATE || InRecovery)
1528                         {
1529                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1530                                 {
1531                                         char   *zerobuf = palloc0(BLCKSZ);
1532
1533                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1534                                                          zerobuf, isTemp);
1535                                         pfree(zerobuf);
1536                                 }
1537                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1538                         }
1539                         else
1540                         {
1541                                 /* We won't create segment if not existent */
1542                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1543                         }
1544                         if (v->mdfd_chain == NULL)
1545                         {
1546                                 if (behavior == EXTENSION_RETURN_NULL &&
1547                                         FILE_POSSIBLY_DELETED(errno))
1548                                         return NULL;
1549                                 ereport(ERROR,
1550                                                 (errcode_for_file_access(),
1551                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1552                                                                 nextsegno,
1553                                                                 reln->smgr_rnode.spcNode,
1554                                                                 reln->smgr_rnode.dbNode,
1555                                                                 reln->smgr_rnode.relNode,
1556                                                                 blkno)));
1557                         }
1558                 }
1559                 v = v->mdfd_chain;
1560         }
1561 #endif
1562
1563         return v;
1564 }
1565
1566 /*
1567  * Get number of blocks present in a single disk file
1568  */
1569 static BlockNumber
1570 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1571 {
1572         long            len;
1573
1574         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1575         if (len < 0)
1576                 ereport(ERROR,
1577                                 (errcode_for_file_access(),
1578                                  errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1579                                                 seg->mdfd_segno,
1580                                                 reln->smgr_rnode.spcNode,
1581                                                 reln->smgr_rnode.dbNode,
1582                                                 reln->smgr_rnode.relNode)));
1583         /* note that this calculation will ignore any partial block at EOF */
1584         return (BlockNumber) (len / BLCKSZ);
1585 }