granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * md.c
   4  *        This code manages relations that reside on magnetic disk.
   5  *
   6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <unistd.h>
  18 #include <fcntl.h>
  19 #include <sys/file.h>
  20
  21 #include "catalog/catalog.h"
  22 #include "miscadmin.h"
  23 #include "postmaster/bgwriter.h"
  24 #include "storage/fd.h"
  25 #include "storage/bufmgr.h"
  26 #include "storage/smgr.h"
  27 #include "utils/hsearch.h"
  28 #include "utils/memutils.h"
  29
  30
  31 /* interval for calling AbsorbFsyncRequests in mdsync */
  32 #define FSYNCS_PER_ABSORB               10
  33
  34 /* special values for the segno arg to RememberFsyncRequest */
  35 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
  36 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
  37
  38 /*
  39  * On Windows, we have to interpret EACCES as possibly meaning the same as
  40  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  41  * that's what you get.  Ugh.  This code is designed so that we don't
  42  * actually believe these cases are okay without further evidence (namely,
  43  * a pending fsync request getting revoked ... see mdsync).
  44  */
  45 #ifndef WIN32
  46 #define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
  47 #else
  48 #define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
  49 #endif
  50
  51 /*
  52  *      The magnetic disk storage manager keeps track of open file
  53  *      descriptors in its own descriptor pool.  This is done to make it
  54  *      easier to support relations that are larger than the operating
  55  *      system's file size limit (often 2GBytes).  In order to do that,
  56  *      we break relations up into "segment" files that are each shorter than
  57  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
  58  *      configuration constant in pg_config_manual.h.
  59  *
  60  *      On disk, a relation must consist of consecutively numbered segment
  61  *      files in the pattern
  62  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
  63  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
  64  *              -- Optionally, any number of inactive segments of size 0 blocks.
  65  *      The full and partial segments are collectively the "active" segments.
  66  *      Inactive segments are those that once contained data but are currently
  67  *      not needed because of an mdtruncate() operation.  The reason for leaving
  68  *      them present at size zero, rather than unlinking them, is that other
  69  *      backends and/or the bgwriter might be holding open file references to
  70  *      such segments.  If the relation expands again after mdtruncate(), such
  71  *      that a deactivated segment becomes active again, it is important that
  72  *      such file references still be valid --- else data might get written
  73  *      out to an unlinked old copy of a segment file that will eventually
  74  *      disappear.
  75  *
  76  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
  77  *      cache is, therefore, just the head of a list of MdfdVec objects, one
  78  *      per segment.  But note the md_fd pointer can be NULL, indicating
  79  *      relation not open.
  80  *
  81  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
  82  *      doesn't have another segment after this one; we may just not have
  83  *      opened the next segment yet.  (We could not have "all segments are
  84  *      in the chain" as an invariant anyway, since another backend could
  85  *      extend the relation when we weren't looking.)  We do not make chain
  86  *      entries for inactive segments, however; as soon as we find a partial
  87  *      segment, we assume that any subsequent segments are inactive.
  88  *
  89  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
  90  *
  91  *      Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
  92  *      for use on machines that support large files.  Beware that that
  93  *      code has not been tested in a long time and is probably bit-rotted.
  94  */
  95
  96 typedef struct _MdfdVec
  97 {
  98         File            mdfd_vfd;               /* fd number in fd.c's pool */
  99         BlockNumber mdfd_segno;         /* segment number, from 0 */
 100 #ifndef LET_OS_MANAGE_FILESIZE  /* for large relations */
 101         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
 102 #endif
 103 } MdfdVec;
 104
 105 static MemoryContext MdCxt;             /* context for all md.c allocations */
 106
 107
 108 /*
 109  * In some contexts (currently, standalone backends and the bgwriter process)
 110  * we keep track of pending fsync operations: we need to remember all relation
 111  * segments that have been written since the last checkpoint, so that we can
 112  * fsync them down to disk before completing the next checkpoint.  This hash
 113  * table remembers the pending operations.      We use a hash table mostly as
 114  * a convenient way of eliminating duplicate requests.
 115  *
 116  * (Regular backends do not track pending operations locally, but forward
 117  * them to the bgwriter.)
 118  */
 119 typedef struct
 120 {
 121         RelFileNode rnode;                      /* the targeted relation */
 122         BlockNumber segno;                      /* which segment */
 123 } PendingOperationTag;
 124
 125 typedef struct
 126 {
 127         PendingOperationTag tag;        /* hash table key (must be first!) */
 128         int                     failures;               /* number of failed attempts to fsync */
 129 } PendingOperationEntry;
 130
 131 static HTAB *pendingOpsTable = NULL;
 132
 133
 134 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
 135 {
 136         EXTENSION_FAIL,                         /* ereport if segment not present */
 137         EXTENSION_RETURN_NULL,          /* return NULL if not present */
 138         EXTENSION_CREATE                        /* create new segments as needed */
 139 } ExtensionBehavior;
 140
 141 /* local routines */
 142 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
 143 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 144 static MdfdVec *_fdvec_alloc(void);
 145
 146 #ifndef LET_OS_MANAGE_FILESIZE
 147 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
 148                           int oflags);
 149 #endif
 150 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
 151                                                          bool isTemp, ExtensionBehavior behavior);
 152 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
 153
 154
 155 /*
 156  *      mdinit() -- Initialize private state for magnetic disk storage manager.
 157  */
 158 void
 159 mdinit(void)
 160 {
 161         MdCxt = AllocSetContextCreate(TopMemoryContext,
 162                                                                   "MdSmgr",
 163                                                                   ALLOCSET_DEFAULT_MINSIZE,
 164                                                                   ALLOCSET_DEFAULT_INITSIZE,
 165                                                                   ALLOCSET_DEFAULT_MAXSIZE);
 166
 167         /*
 168          * Create pending-operations hashtable if we need it.  Currently, we need
 169          * it if we are standalone (not under a postmaster) OR if we are a
 170          * bootstrap-mode subprocess of a postmaster (that is, a startup or
 171          * bgwriter process).
 172          */
 173         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
 174         {
 175                 HASHCTL         hash_ctl;
 176
 177                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 178                 hash_ctl.keysize = sizeof(PendingOperationTag);
 179                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
 180                 hash_ctl.hash = tag_hash;
 181                 hash_ctl.hcxt = MdCxt;
 182                 pendingOpsTable = hash_create("Pending Ops Table",
 183                                                                           100L,
 184                                                                           &hash_ctl,
 185                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 186         }
 187 }
 188
 189 /*
 190  *      mdcreate() -- Create a new relation on magnetic disk.
 191  *
 192  * If isRedo is true, it's okay for the relation to exist already.
 193  */
 194 void
 195 mdcreate(SMgrRelation reln, bool isRedo)
 196 {
 197         char       *path;
 198         File            fd;
 199
 200         if (isRedo && reln->md_fd != NULL)
 201                 return;                                 /* created and opened already... */
 202
 203         Assert(reln->md_fd == NULL);
 204
 205         path = relpath(reln->smgr_rnode);
 206
 207         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 208
 209         if (fd < 0)
 210         {
 211                 int                     save_errno = errno;
 212
 213                 /*
 214                  * During bootstrap, there are cases where a system relation will be
 215                  * accessed (by internal backend processes) before the bootstrap
 216                  * script nominally creates it.  Therefore, allow the file to exist
 217                  * already, even if isRedo is not set.  (See also mdopen)
 218                  */
 219                 if (isRedo || IsBootstrapProcessingMode())
 220                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 221                 if (fd < 0)
 222                 {
 223                         pfree(path);
 224                         /* be sure to report the error reported by create, not open */
 225                         errno = save_errno;
 226                         ereport(ERROR,
 227                                         (errcode_for_file_access(),
 228                                          errmsg("could not create relation %u/%u/%u: %m",
 229                                                         reln->smgr_rnode.spcNode,
 230                                                         reln->smgr_rnode.dbNode,
 231                                                         reln->smgr_rnode.relNode)));
 232                 }
 233         }
 234
 235         pfree(path);
 236
 237         reln->md_fd = _fdvec_alloc();
 238
 239         reln->md_fd->mdfd_vfd = fd;
 240         reln->md_fd->mdfd_segno = 0;
 241 #ifndef LET_OS_MANAGE_FILESIZE
 242         reln->md_fd->mdfd_chain = NULL;
 243 #endif
 244 }
 245
 246 /*
 247  *      mdunlink() -- Unlink a relation.
 248  *
 249  * Note that we're passed a RelFileNode --- by the time this is called,
 250  * there won't be an SMgrRelation hashtable entry anymore.
 251  *
 252  * If isRedo is true, it's okay for the relation to be already gone.
 253  * Also, any failure should be reported as WARNING not ERROR, because
 254  * we are usually not in a transaction anymore when this is called.
 255  */
 256 void
 257 mdunlink(RelFileNode rnode, bool isRedo)
 258 {
 259         char       *path;
 260
 261         /*
 262          * We have to clean out any pending fsync requests for the doomed relation,
 263          * else the next mdsync() will fail.
 264          */
 265         ForgetRelationFsyncRequests(rnode);
 266
 267         path = relpath(rnode);
 268
 269         /* Delete the first segment, or only segment if not doing segmenting */
 270         if (unlink(path) < 0)
 271         {
 272                 if (!isRedo || errno != ENOENT)
 273                         ereport(WARNING,
 274                                         (errcode_for_file_access(),
 275                                          errmsg("could not remove relation %u/%u/%u: %m",
 276                                                         rnode.spcNode,
 277                                                         rnode.dbNode,
 278                                                         rnode.relNode)));
 279         }
 280
 281 #ifndef LET_OS_MANAGE_FILESIZE
 282         /* Delete the additional segments, if any */
 283         else
 284         {
 285                 char       *segpath = (char *) palloc(strlen(path) + 12);
 286                 BlockNumber segno;
 287
 288                 /*
 289                  * Note that because we loop until getting ENOENT, we will
 290                  * correctly remove all inactive segments as well as active ones.
 291                  */
 292                 for (segno = 1;; segno++)
 293                 {
 294                         sprintf(segpath, "%s.%u", path, segno);
 295                         if (unlink(segpath) < 0)
 296                         {
 297                                 /* ENOENT is expected after the last segment... */
 298                                 if (errno != ENOENT)
 299                                         ereport(WARNING,
 300                                                         (errcode_for_file_access(),
 301                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
 302                                                                         segno,
 303                                                                         rnode.spcNode,
 304                                                                         rnode.dbNode,
 305                                                                         rnode.relNode)));
 306                                 break;
 307                         }
 308                 }
 309                 pfree(segpath);
 310         }
 311 #endif
 312
 313         pfree(path);
 314 }
 315
 316 /*
 317  *      mdextend() -- Add a block to the specified relation.
 318  *
 319  *              The semantics are nearly the same as mdwrite(): write at the
 320  *              specified position.  However, this is to be used for the case of
 321  *              extending a relation (i.e., blocknum is at or beyond the current
 322  *              EOF).  Note that we assume writing a block beyond current EOF
 323  *              causes intervening file space to become filled with zeroes.
 324  */
 325 void
 326 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 327 {
 328         long            seekpos;
 329         int                     nbytes;
 330         MdfdVec    *v;
 331
 332         /* This assert is too expensive to have on normally ... */
 333 #ifdef CHECK_WRITE_VS_EXTEND
 334         Assert(blocknum >= mdnblocks(reln));
 335 #endif
 336
 337         /*
 338          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
 339          * any more --- we mustn't create a block whose number
 340          * actually is InvalidBlockNumber.
 341          */
 342         if (blocknum == InvalidBlockNumber)
 343                 ereport(ERROR,
 344                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 345                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
 346                                                 reln->smgr_rnode.spcNode,
 347                                                 reln->smgr_rnode.dbNode,
 348                                                 reln->smgr_rnode.relNode,
 349                                                 InvalidBlockNumber)));
 350
 351         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
 352
 353 #ifndef LET_OS_MANAGE_FILESIZE
 354         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 355         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 356 #else
 357         seekpos = (long) (BLCKSZ * (blocknum));
 358 #endif
 359
 360         /*
 361          * Note: because caller usually obtained blocknum by calling mdnblocks,
 362          * which did a seek(SEEK_END), this seek is often redundant and will be
 363          * optimized away by fd.c.  It's not redundant, however, if there is a
 364          * partial page at the end of the file. In that case we want to try to
 365          * overwrite the partial page with a full page.  It's also not redundant
 366          * if bufmgr.c had to dump another buffer of the same file to make room
 367          * for the new page's buffer.
 368          */
 369         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 370                 ereport(ERROR,
 371                                 (errcode_for_file_access(),
 372                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 373                                                 blocknum,
 374                                                 reln->smgr_rnode.spcNode,
 375                                                 reln->smgr_rnode.dbNode,
 376                                                 reln->smgr_rnode.relNode)));
 377
 378         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 379         {
 380                 if (nbytes < 0)
 381                         ereport(ERROR,
 382                                         (errcode_for_file_access(),
 383                                          errmsg("could not extend relation %u/%u/%u: %m",
 384                                                         reln->smgr_rnode.spcNode,
 385                                                         reln->smgr_rnode.dbNode,
 386                                                         reln->smgr_rnode.relNode),
 387                                          errhint("Check free disk space.")));
 388                 /* short write: complain appropriately */
 389                 ereport(ERROR,
 390                                 (errcode(ERRCODE_DISK_FULL),
 391                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
 392                                                 reln->smgr_rnode.spcNode,
 393                                                 reln->smgr_rnode.dbNode,
 394                                                 reln->smgr_rnode.relNode,
 395                                                 nbytes, BLCKSZ, blocknum),
 396                                  errhint("Check free disk space.")));
 397         }
 398
 399         if (!isTemp)
 400                 register_dirty_segment(reln, v);
 401
 402 #ifndef LET_OS_MANAGE_FILESIZE
 403         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
 404 #endif
 405 }
 406
 407 /*
 408  *      mdopen() -- Open the specified relation.
 409  *
 410  * Note we only open the first segment, when there are multiple segments.
 411  *
 412  * If first segment is not present, either ereport or return NULL according
 413  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 414  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 415  * invent one out of whole cloth.
 416  */
 417 static MdfdVec *
 418 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
 419 {
 420         MdfdVec    *mdfd;
 421         char       *path;
 422         File            fd;
 423
 424         /* No work if already open */
 425         if (reln->md_fd)
 426                 return reln->md_fd;
 427
 428         path = relpath(reln->smgr_rnode);
 429
 430         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 431
 432         if (fd < 0)
 433         {
 434                 /*
 435                  * During bootstrap, there are cases where a system relation will be
 436                  * accessed (by internal backend processes) before the bootstrap
 437                  * script nominally creates it.  Therefore, accept mdopen() as a
 438                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
 439                  */
 440                 if (IsBootstrapProcessingMode())
 441                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 442                 if (fd < 0)
 443                 {
 444                         pfree(path);
 445                         if (behavior == EXTENSION_RETURN_NULL &&
 446                                 FILE_POSSIBLY_DELETED(errno))
 447                                 return NULL;
 448                         ereport(ERROR,
 449                                         (errcode_for_file_access(),
 450                                          errmsg("could not open relation %u/%u/%u: %m",
 451                                                         reln->smgr_rnode.spcNode,
 452                                                         reln->smgr_rnode.dbNode,
 453                                                         reln->smgr_rnode.relNode)));
 454                 }
 455         }
 456
 457         pfree(path);
 458
 459         reln->md_fd = mdfd = _fdvec_alloc();
 460
 461         mdfd->mdfd_vfd = fd;
 462         mdfd->mdfd_segno = 0;
 463 #ifndef LET_OS_MANAGE_FILESIZE
 464         mdfd->mdfd_chain = NULL;
 465         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 466 #endif
 467
 468         return mdfd;
 469 }
 470
 471 /*
 472  *      mdclose() -- Close the specified relation, if it isn't closed already.
 473  */
 474 void
 475 mdclose(SMgrRelation reln)
 476 {
 477         MdfdVec    *v = reln->md_fd;
 478
 479         /* No work if already closed */
 480         if (v == NULL)
 481                 return;
 482
 483         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
 484
 485 #ifndef LET_OS_MANAGE_FILESIZE
 486         while (v != NULL)
 487         {
 488                 MdfdVec    *ov = v;
 489
 490                 /* if not closed already */
 491                 if (v->mdfd_vfd >= 0)
 492                         FileClose(v->mdfd_vfd);
 493                 /* Now free vector */
 494                 v = v->mdfd_chain;
 495                 pfree(ov);
 496         }
 497 #else
 498         if (v->mdfd_vfd >= 0)
 499                 FileClose(v->mdfd_vfd);
 500         pfree(v);
 501 #endif
 502 }
 503
 504 /*
 505  *      mdread() -- Read the specified block from a relation.
 506  */
 507 void
 508 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 509 {
 510         long            seekpos;
 511         int                     nbytes;
 512         MdfdVec    *v;
 513
 514         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
 515
 516 #ifndef LET_OS_MANAGE_FILESIZE
 517         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 518         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 519 #else
 520         seekpos = (long) (BLCKSZ * (blocknum));
 521 #endif
 522
 523         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 524                 ereport(ERROR,
 525                                 (errcode_for_file_access(),
 526                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 527                                                 blocknum,
 528                                                 reln->smgr_rnode.spcNode,
 529                                                 reln->smgr_rnode.dbNode,
 530                                                 reln->smgr_rnode.relNode)));
 531
 532         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 533         {
 534                 if (nbytes < 0)
 535                         ereport(ERROR,
 536                                         (errcode_for_file_access(),
 537                                          errmsg("could not read block %u of relation %u/%u/%u: %m",
 538                                                         blocknum,
 539                                                         reln->smgr_rnode.spcNode,
 540                                                         reln->smgr_rnode.dbNode,
 541                                                         reln->smgr_rnode.relNode)));
 542                 /*
 543                  * Short read: we are at or past EOF, or we read a partial block at
 544                  * EOF.  Normally this is an error; upper levels should never try to
 545                  * read a nonexistent block.  However, if zero_damaged_pages is ON
 546                  * or we are InRecovery, we should instead return zeroes without
 547                  * complaining.  This allows, for example, the case of trying to
 548                  * update a block that was later truncated away.
 549                  */
 550                 if (zero_damaged_pages || InRecovery)
 551                         MemSet(buffer, 0, BLCKSZ);
 552                 else
 553                         ereport(ERROR,
 554                                         (errcode(ERRCODE_DATA_CORRUPTED),
 555                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
 556                                                         blocknum,
 557                                                         reln->smgr_rnode.spcNode,
 558                                                         reln->smgr_rnode.dbNode,
 559                                                         reln->smgr_rnode.relNode,
 560                                                         nbytes, BLCKSZ)));
 561         }
 562 }
 563
 564 /*
 565  *      mdwrite() -- Write the supplied block at the appropriate location.
 566  *
 567  *              This is to be used only for updating already-existing blocks of a
 568  *              relation (ie, those before the current EOF).  To extend a relation,
 569  *              use mdextend().
 570  */
 571 void
 572 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 573 {
 574         long            seekpos;
 575         int                     nbytes;
 576         MdfdVec    *v;
 577
 578         /* This assert is too expensive to have on normally ... */
 579 #ifdef CHECK_WRITE_VS_EXTEND
 580         Assert(blocknum < mdnblocks(reln));
 581 #endif
 582
 583         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
 584
 585 #ifndef LET_OS_MANAGE_FILESIZE
 586         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 587         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 588 #else
 589         seekpos = (long) (BLCKSZ * (blocknum));
 590 #endif
 591
 592         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 593                 ereport(ERROR,
 594                                 (errcode_for_file_access(),
 595                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 596                                                 blocknum,
 597                                                 reln->smgr_rnode.spcNode,
 598                                                 reln->smgr_rnode.dbNode,
 599                                                 reln->smgr_rnode.relNode)));
 600
 601         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 602         {
 603                 if (nbytes < 0)
 604                         ereport(ERROR,
 605                                         (errcode_for_file_access(),
 606                                          errmsg("could not write block %u of relation %u/%u/%u: %m",
 607                                                         blocknum,
 608                                                         reln->smgr_rnode.spcNode,
 609                                                         reln->smgr_rnode.dbNode,
 610                                                         reln->smgr_rnode.relNode)));
 611                 /* short write: complain appropriately */
 612                 ereport(ERROR,
 613                                 (errcode(ERRCODE_DISK_FULL),
 614                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
 615                                                 blocknum,
 616                                                 reln->smgr_rnode.spcNode,
 617                                                 reln->smgr_rnode.dbNode,
 618                                                 reln->smgr_rnode.relNode,
 619                                                 nbytes, BLCKSZ),
 620                                  errhint("Check free disk space.")));
 621         }
 622
 623         if (!isTemp)
 624                 register_dirty_segment(reln, v);
 625 }
 626
 627 /*
 628  *      mdnblocks() -- Get the number of blocks stored in a relation.
 629  *
 630  *              Important side effect: all active segments of the relation are opened
 631  *              and added to the mdfd_chain list.  If this routine has not been
 632  *              called, then only segments up to the last one actually touched
 633  *              are present in the chain.
 634  */
 635 BlockNumber
 636 mdnblocks(SMgrRelation reln)
 637 {
 638         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
 639
 640 #ifndef LET_OS_MANAGE_FILESIZE
 641         BlockNumber nblocks;
 642         BlockNumber segno = 0;
 643
 644         /*
 645          * Skip through any segments that aren't the last one, to avoid redundant
 646          * seeks on them.  We have previously verified that these segments are
 647          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
 648          *
 649          * NOTE: this assumption could only be wrong if another backend has
 650          * truncated the relation.      We rely on higher code levels to handle that
 651          * scenario by closing and re-opening the md fd, which is handled via
 652          * relcache flush.  (Since the bgwriter doesn't participate in relcache
 653          * flush, it could have segment chain entries for inactive segments;
 654          * that's OK because the bgwriter never needs to compute relation size.)
 655          */
 656         while (v->mdfd_chain != NULL)
 657         {
 658                 segno++;
 659                 v = v->mdfd_chain;
 660         }
 661
 662         for (;;)
 663         {
 664                 nblocks = _mdnblocks(reln, v);
 665                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
 666                         elog(FATAL, "segment too big");
 667                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
 668                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
 669
 670                 /*
 671                  * If segment is exactly RELSEG_SIZE, advance to next one.
 672                  */
 673                 segno++;
 674
 675                 if (v->mdfd_chain == NULL)
 676                 {
 677                         /*
 678                          * Because we pass O_CREAT, we will create the next segment (with
 679                          * zero length) immediately, if the last segment is of length
 680                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
 681                          * the logic simple.
 682                          */
 683                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
 684                         if (v->mdfd_chain == NULL)
 685                                 ereport(ERROR,
 686                                                 (errcode_for_file_access(),
 687                                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
 688                                                                 segno,
 689                                                                 reln->smgr_rnode.spcNode,
 690                                                                 reln->smgr_rnode.dbNode,
 691                                                                 reln->smgr_rnode.relNode)));
 692                 }
 693
 694                 v = v->mdfd_chain;
 695         }
 696 #else
 697         return _mdnblocks(reln, v);
 698 #endif
 699 }
 700
 701 /*
 702  *      mdtruncate() -- Truncate relation to specified number of blocks.
 703  */
 704 void
 705 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 706 {
 707         MdfdVec    *v;
 708         BlockNumber curnblk;
 709
 710 #ifndef LET_OS_MANAGE_FILESIZE
 711         BlockNumber priorblocks;
 712 #endif
 713
 714         /*
 715          * NOTE: mdnblocks makes sure we have opened all active segments, so
 716          * that truncation loop will get them all!
 717          */
 718         curnblk = mdnblocks(reln);
 719         if (nblocks > curnblk)
 720         {
 721                 /* Bogus request ... but no complaint if InRecovery */
 722                 if (InRecovery)
 723                         return;
 724                 ereport(ERROR,
 725                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
 726                                                 reln->smgr_rnode.spcNode,
 727                                                 reln->smgr_rnode.dbNode,
 728                                                 reln->smgr_rnode.relNode,
 729                                                 nblocks, curnblk)));
 730         }
 731         if (nblocks == curnblk)
 732                 return;                                 /* no work */
 733
 734         v = mdopen(reln, EXTENSION_FAIL);
 735
 736 #ifndef LET_OS_MANAGE_FILESIZE
 737         priorblocks = 0;
 738         while (v != NULL)
 739         {
 740                 MdfdVec    *ov = v;
 741
 742                 if (priorblocks > nblocks)
 743                 {
 744                         /*
 745                          * This segment is no longer active (and has already been
 746                          * unlinked from the mdfd_chain). We truncate the file, but do
 747                          * not delete it, for reasons explained in the header comments.
 748                          */
 749                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
 750                                 ereport(ERROR,
 751                                                 (errcode_for_file_access(),
 752                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 753                                                                 reln->smgr_rnode.spcNode,
 754                                                                 reln->smgr_rnode.dbNode,
 755                                                                 reln->smgr_rnode.relNode,
 756                                                                 nblocks)));
 757                         if (!isTemp)
 758                                 register_dirty_segment(reln, v);
 759                         v = v->mdfd_chain;
 760                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
 761                         pfree(ov);
 762                 }
 763                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 764                 {
 765                         /*
 766                          * This is the last segment we want to keep. Truncate the file to
 767                          * the right length, and clear chain link that points to any
 768                          * remaining segments (which we shall zap). NOTE: if nblocks is
 769                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
 770                          * segment to 0 length but keep it. This adheres to the invariant
 771                          * given in the header comments.
 772                          */
 773                         BlockNumber lastsegblocks = nblocks - priorblocks;
 774
 775                         if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
 776                                 ereport(ERROR,
 777                                                 (errcode_for_file_access(),
 778                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 779                                                                 reln->smgr_rnode.spcNode,
 780                                                                 reln->smgr_rnode.dbNode,
 781                                                                 reln->smgr_rnode.relNode,
 782                                                                 nblocks)));
 783                         if (!isTemp)
 784                                 register_dirty_segment(reln, v);
 785                         v = v->mdfd_chain;
 786                         ov->mdfd_chain = NULL;
 787                 }
 788                 else
 789                 {
 790                         /*
 791                          * We still need this segment and 0 or more blocks beyond it, so
 792                          * nothing to do here.
 793                          */
 794                         v = v->mdfd_chain;
 795                 }
 796                 priorblocks += RELSEG_SIZE;
 797         }
 798 #else
 799         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
 800                 ereport(ERROR,
 801                                 (errcode_for_file_access(),
 802                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 803                                          reln->smgr_rnode.spcNode,
 804                                          reln->smgr_rnode.dbNode,
 805                                          reln->smgr_rnode.relNode,
 806                                          nblocks)));
 807         if (!isTemp)
 808                 register_dirty_segment(reln, v);
 809 #endif
 810 }
 811
 812 /*
 813  *      mdimmedsync() -- Immediately sync a relation to stable storage.
 814  *
 815  * Note that only writes already issued are synced; this routine knows
 816  * nothing of dirty buffers that may exist inside the buffer manager.
 817  */
 818 void
 819 mdimmedsync(SMgrRelation reln)
 820 {
 821         MdfdVec    *v;
 822         BlockNumber curnblk;
 823
 824         /*
 825          * NOTE: mdnblocks makes sure we have opened all active segments, so
 826          * that fsync loop will get them all!
 827          */
 828         curnblk = mdnblocks(reln);
 829
 830         v = mdopen(reln, EXTENSION_FAIL);
 831
 832 #ifndef LET_OS_MANAGE_FILESIZE
 833         while (v != NULL)
 834         {
 835                 if (FileSync(v->mdfd_vfd) < 0)
 836                         ereport(ERROR,
 837                                         (errcode_for_file_access(),
 838                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 839                                                         v->mdfd_segno,
 840                                                         reln->smgr_rnode.spcNode,
 841                                                         reln->smgr_rnode.dbNode,
 842                                                         reln->smgr_rnode.relNode)));
 843                 v = v->mdfd_chain;
 844         }
 845 #else
 846         if (FileSync(v->mdfd_vfd) < 0)
 847                 ereport(ERROR,
 848                                 (errcode_for_file_access(),
 849                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 850                                                 v->mdfd_segno,
 851                                                 reln->smgr_rnode.spcNode,
 852                                                 reln->smgr_rnode.dbNode,
 853                                                 reln->smgr_rnode.relNode)));
 854 #endif
 855 }
 856
 857 /*
 858  *      mdsync() -- Sync previous writes to stable storage.
 859  *
 860  * This is only called during checkpoints, and checkpoints should only
 861  * occur in processes that have created a pendingOpsTable.
 862  */
 863 void
 864 mdsync(void)
 865 {
 866         bool            need_retry;
 867
 868         if (!pendingOpsTable)
 869                 elog(ERROR, "cannot sync without a pendingOpsTable");
 870
 871         /*
 872          * The fsync table could contain requests to fsync relations that have
 873          * been deleted (unlinked) by the time we get to them.  Rather than
 874          * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
 875          * what we will do is retry the whole process after absorbing fsync
 876          * request messages again.  Since mdunlink() queues a "revoke" message
 877          * before actually unlinking, the fsync request is guaranteed to be gone
 878          * the second time if it really was this case.  DROP DATABASE likewise
 879          * has to tell us to forget fsync requests before it starts deletions.
 880          */
 881         do {
 882                 HASH_SEQ_STATUS hstat;
 883                 PendingOperationEntry *entry;
 884                 int                     absorb_counter;
 885
 886                 need_retry = false;
 887
 888                 /*
 889                  * If we are in the bgwriter, the sync had better include all fsync
 890                  * requests that were queued by backends before the checkpoint REDO
 891                  * point was determined. We go that a little better by accepting all
 892                  * requests queued up to the point where we start fsync'ing.
 893                  */
 894                 AbsorbFsyncRequests();
 895
 896                 absorb_counter = FSYNCS_PER_ABSORB;
 897                 hash_seq_init(&hstat, pendingOpsTable);
 898                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 899                 {
 900                         /*
 901                          * If fsync is off then we don't have to bother opening the file
 902                          * at all.  (We delay checking until this point so that changing
 903                          * fsync on the fly behaves sensibly.)
 904                          */
 905                         if (enableFsync)
 906                         {
 907                                 SMgrRelation reln;
 908                                 MdfdVec    *seg;
 909
 910                                 /*
 911                                  * If in bgwriter, we want to absorb pending requests every so
 912                                  * often to prevent overflow of the fsync request queue.  This
 913                                  * could result in deleting the current entry out from under
 914                                  * our hashtable scan, so the procedure is to fall out of the
 915                                  * scan and start over from the top of the function.
 916                                  */
 917                                 if (--absorb_counter <= 0)
 918                                 {
 919                                         need_retry = true;
 920                                         break;
 921                                 }
 922
 923                                 /*
 924                                  * Find or create an smgr hash entry for this relation. This
 925                                  * may seem a bit unclean -- md calling smgr?  But it's really
 926                                  * the best solution.  It ensures that the open file reference
 927                                  * isn't permanently leaked if we get an error here. (You may
 928                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
 929                                  * really, because the only case in which a checkpoint is done
 930                                  * by a process that isn't about to shut down is in the
 931                                  * bgwriter, and it will periodically do smgrcloseall(). This
 932                                  * fact justifies our not closing the reln in the success path
 933                                  * either, which is a good thing since in non-bgwriter cases
 934                                  * we couldn't safely do that.)  Furthermore, in many cases
 935                                  * the relation will have been dirtied through this same smgr
 936                                  * relation, and so we can save a file open/close cycle.
 937                                  */
 938                                 reln = smgropen(entry->tag.rnode);
 939
 940                                 /*
 941                                  * It is possible that the relation has been dropped or
 942                                  * truncated since the fsync request was entered.  Therefore,
 943                                  * allow ENOENT, but only if we didn't fail once already on
 944                                  * this file.  This applies both during _mdfd_getseg() and
 945                                  * during FileSync, since fd.c might have closed the file
 946                                  * behind our back.
 947                                  */
 948                                 seg = _mdfd_getseg(reln,
 949                                                                    entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
 950                                                                    false, EXTENSION_RETURN_NULL);
 951                                 if (seg == NULL ||
 952                                         FileSync(seg->mdfd_vfd) < 0)
 953                                 {
 954                                         /*
 955                                          * XXX is there any point in allowing more than one try?
 956                                          * Don't see one at the moment, but easy to change the
 957                                          * test here if so.
 958                                          */
 959                                         if (!FILE_POSSIBLY_DELETED(errno) ||
 960                                                 ++(entry->failures) > 1)
 961                                                 ereport(ERROR,
 962                                                                 (errcode_for_file_access(),
 963                                                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 964                                                                                 entry->tag.segno,
 965                                                                                 entry->tag.rnode.spcNode,
 966                                                                                 entry->tag.rnode.dbNode,
 967                                                                                 entry->tag.rnode.relNode)));
 968                                         else
 969                                                 ereport(DEBUG1,
 970                                                                 (errcode_for_file_access(),
 971                                                                  errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
 972                                                                                 entry->tag.segno,
 973                                                                                 entry->tag.rnode.spcNode,
 974                                                                                 entry->tag.rnode.dbNode,
 975                                                                                 entry->tag.rnode.relNode)));
 976                                         need_retry = true;
 977                                         continue;       /* don't delete the hashtable entry */
 978                                 }
 979                         }
 980
 981                         /* Okay, delete this entry */
 982                         if (hash_search(pendingOpsTable, &entry->tag,
 983                                                         HASH_REMOVE, NULL) == NULL)
 984                                 elog(ERROR, "pendingOpsTable corrupted");
 985                 }
 986         } while (need_retry);
 987 }
 988
 989 /*
 990  * register_dirty_segment() -- Mark a relation segment as needing fsync
 991  *
 992  * If there is a local pending-ops table, just make an entry in it for
 993  * mdsync to process later.  Otherwise, try to pass off the fsync request
 994  * to the background writer process.  If that fails, just do the fsync
 995  * locally before returning (we expect this will not happen often enough
 996  * to be a performance problem).
 997  */
 998 static void
 999 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1000 {
1001         if (pendingOpsTable)
1002         {
1003                 /* push it into local pending-ops table */
1004                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1005         }
1006         else
1007         {
1008                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1009                         return;                         /* passed it off successfully */
1010
1011                 if (FileSync(seg->mdfd_vfd) < 0)
1012                         ereport(ERROR,
1013                                         (errcode_for_file_access(),
1014                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1015                                                         seg->mdfd_segno,
1016                                                         reln->smgr_rnode.spcNode,
1017                                                         reln->smgr_rnode.dbNode,
1018                                                         reln->smgr_rnode.relNode)));
1019         }
1020 }
1021
1022 /*
1023  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1024  *
1025  * We stuff the fsync request into the local hash table for execution
1026  * during the bgwriter's next checkpoint.
1027  *
1028  * The range of possible segment numbers is way less than the range of
1029  * BlockNumber, so we can reserve high values of segno for special purposes.
1030  * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
1031  * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
1032  * a whole database.  (These are a tad slow because the hash table has to be
1033  * searched linearly, but it doesn't seem worth rethinking the table structure
1034  * for them.)
1035  */
1036 void
1037 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1038 {
1039         Assert(pendingOpsTable);
1040
1041         if (segno == FORGET_RELATION_FSYNC)
1042         {
1043                 /* Remove any pending requests for the entire relation */
1044                 HASH_SEQ_STATUS hstat;
1045                 PendingOperationEntry *entry;
1046
1047                 hash_seq_init(&hstat, pendingOpsTable);
1048                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1049                 {
1050                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1051                         {
1052                                 /* Okay, delete this entry */
1053                                 if (hash_search(pendingOpsTable, &entry->tag,
1054                                                                 HASH_REMOVE, NULL) == NULL)
1055                                         elog(ERROR, "pendingOpsTable corrupted");
1056                         }
1057                 }
1058         }
1059         else if (segno == FORGET_DATABASE_FSYNC)
1060         {
1061                 /* Remove any pending requests for the entire database */
1062                 HASH_SEQ_STATUS hstat;
1063                 PendingOperationEntry *entry;
1064
1065                 hash_seq_init(&hstat, pendingOpsTable);
1066                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1067                 {
1068                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1069                         {
1070                                 /* Okay, delete this entry */
1071                                 if (hash_search(pendingOpsTable, &entry->tag,
1072                                                                 HASH_REMOVE, NULL) == NULL)
1073                                         elog(ERROR, "pendingOpsTable corrupted");
1074                         }
1075                 }
1076         }
1077         else
1078         {
1079                 /* Normal case: enter a request to fsync this segment */
1080                 PendingOperationTag key;
1081                 PendingOperationEntry *entry;
1082                 bool            found;
1083
1084                 /* ensure any pad bytes in the hash key are zeroed */
1085                 MemSet(&key, 0, sizeof(key));
1086                 key.rnode = rnode;
1087                 key.segno = segno;
1088
1089                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1090                                                                                                           &key,
1091                                                                                                           HASH_ENTER,
1092                                                                                                           &found);
1093                 if (!found)                             /* new entry, so initialize it */
1094                         entry->failures = 0;
1095         }
1096 }
1097
1098 /*
1099  * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
1100  */
1101 void
1102 ForgetRelationFsyncRequests(RelFileNode rnode)
1103 {
1104         if (pendingOpsTable)
1105         {
1106                 /* standalone backend or startup process: fsync state is local */
1107                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1108         }
1109         else if (IsUnderPostmaster)
1110         {
1111                 /*
1112                  * Notify the bgwriter about it.  If we fail to queue the revoke
1113                  * message, we have to sleep and try again ... ugly, but hopefully
1114                  * won't happen often.
1115                  *
1116                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
1117                  * an error would leave the no-longer-used file still present on
1118                  * disk, which would be bad, so I'm inclined to assume that the
1119                  * bgwriter will always empty the queue soon.
1120                  */
1121                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1122                         pg_usleep(10000L);      /* 10 msec seems a good number */
1123                 /*
1124                  * Note we don't wait for the bgwriter to actually absorb the
1125                  * revoke message; see mdsync() for the implications.
1126                  */
1127         }
1128 }
1129
1130 /*
1131  * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
1132  */
1133 void
1134 ForgetDatabaseFsyncRequests(Oid dbid)
1135 {
1136         RelFileNode rnode;
1137
1138         rnode.dbNode = dbid;
1139         rnode.spcNode = 0;
1140         rnode.relNode = 0;
1141
1142         if (pendingOpsTable)
1143         {
1144                 /* standalone backend or startup process: fsync state is local */
1145                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1146         }
1147         else if (IsUnderPostmaster)
1148         {
1149                 /* see notes in ForgetRelationFsyncRequests */
1150                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1151                         pg_usleep(10000L);      /* 10 msec seems a good number */
1152         }
1153 }
1154
1155
1156 /*
1157  *      _fdvec_alloc() -- Make a MdfdVec object.
1158  */
1159 static MdfdVec *
1160 _fdvec_alloc(void)
1161 {
1162         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1163 }
1164
1165 #ifndef LET_OS_MANAGE_FILESIZE
1166
1167 /*
1168  * Open the specified segment of the relation,
1169  * and make a MdfdVec object for it.  Returns NULL on failure.
1170  */
1171 static MdfdVec *
1172 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1173 {
1174         MdfdVec    *v;
1175         int                     fd;
1176         char       *path,
1177                            *fullpath;
1178
1179         path = relpath(reln->smgr_rnode);
1180
1181         if (segno > 0)
1182         {
1183                 /* be sure we have enough space for the '.segno' */
1184                 fullpath = (char *) palloc(strlen(path) + 12);
1185                 sprintf(fullpath, "%s.%u", path, segno);
1186                 pfree(path);
1187         }
1188         else
1189                 fullpath = path;
1190
1191         /* open the file */
1192         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1193
1194         pfree(fullpath);
1195
1196         if (fd < 0)
1197                 return NULL;
1198
1199         /* allocate an mdfdvec entry for it */
1200         v = _fdvec_alloc();
1201
1202         /* fill the entry */
1203         v->mdfd_vfd = fd;
1204         v->mdfd_segno = segno;
1205         v->mdfd_chain = NULL;
1206         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1207
1208         /* all done */
1209         return v;
1210 }
1211 #endif   /* LET_OS_MANAGE_FILESIZE */
1212
1213 /*
1214  *      _mdfd_getseg() -- Find the segment of the relation holding the
1215  *              specified block.
1216  *
1217  * If the segment doesn't exist, we ereport, return NULL, or create the
1218  * segment, according to "behavior".  Note: isTemp need only be correct
1219  * in the EXTENSION_CREATE case.
1220  */
1221 static MdfdVec *
1222 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1223                          ExtensionBehavior behavior)
1224 {
1225         MdfdVec    *v = mdopen(reln, behavior);
1226
1227 #ifndef LET_OS_MANAGE_FILESIZE
1228         BlockNumber targetseg;
1229         BlockNumber nextsegno;
1230
1231         if (!v)
1232                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1233
1234         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1235         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1236         {
1237                 Assert(nextsegno == v->mdfd_segno + 1);
1238
1239                 if (v->mdfd_chain == NULL)
1240                 {
1241                         /*
1242                          * Normally we will create new segments only if authorized by
1243                          * the caller (i.e., we are doing mdextend()).  But when doing
1244                          * WAL recovery, create segments anyway; this allows cases such as
1245                          * replaying WAL data that has a write into a high-numbered
1246                          * segment of a relation that was later deleted.  We want to go
1247                          * ahead and create the segments so we can finish out the replay.
1248                          *
1249                          * We have to maintain the invariant that segments before the
1250                          * last active segment are of size RELSEG_SIZE; therefore, pad
1251                          * them out with zeroes if needed.  (This only matters if caller
1252                          * is extending the relation discontiguously, but that can happen
1253                          * in hash indexes.)
1254                          */
1255                         if (behavior == EXTENSION_CREATE || InRecovery)
1256                         {
1257                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1258                                 {
1259                                         char   *zerobuf = palloc0(BLCKSZ);
1260
1261                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1262                                                          zerobuf, isTemp);
1263                                         pfree(zerobuf);
1264                                 }
1265                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1266                         }
1267                         else
1268                         {
1269                                 /* We won't create segment if not existent */
1270                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1271                         }
1272                         if (v->mdfd_chain == NULL)
1273                         {
1274                                 if (behavior == EXTENSION_RETURN_NULL &&
1275                                         FILE_POSSIBLY_DELETED(errno))
1276                                         return NULL;
1277                                 ereport(ERROR,
1278                                                 (errcode_for_file_access(),
1279                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1280                                                                 nextsegno,
1281                                                                 reln->smgr_rnode.spcNode,
1282                                                                 reln->smgr_rnode.dbNode,
1283                                                                 reln->smgr_rnode.relNode,
1284                                                                 blkno)));
1285                         }
1286                 }
1287                 v = v->mdfd_chain;
1288         }
1289 #endif
1290
1291         return v;
1292 }
1293
1294 /*
1295  * Get number of blocks present in a single disk file
1296  */
1297 static BlockNumber
1298 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1299 {
1300         long            len;
1301
1302         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1303         if (len < 0)
1304                 ereport(ERROR,
1305                                 (errcode_for_file_access(),
1306                                  errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1307                                                 seg->mdfd_segno,
1308                                                 reln->smgr_rnode.spcNode,
1309                                                 reln->smgr_rnode.dbNode,
1310                                                 reln->smgr_rnode.relNode)));
1311         /* note that this calculation will ignore any partial block at EOF */
1312         return (BlockNumber) (len / BLCKSZ);
1313 }