granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * md.c
   4  *        This code manages relations that reside on magnetic disk.
   5  *
   6  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114 2004/12/31 22:01:13 pgsql Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <errno.h>
  18 #include <unistd.h>
  19 #include <fcntl.h>
  20 #include <sys/file.h>
  21
  22 #include "catalog/catalog.h"
  23 #include "miscadmin.h"
  24 #include "postmaster/bgwriter.h"
  25 #include "storage/fd.h"
  26 #include "storage/smgr.h"
  27 #include "utils/hsearch.h"
  28 #include "utils/memutils.h"
  29
  30
  31 /*
  32  *      The magnetic disk storage manager keeps track of open file
  33  *      descriptors in its own descriptor pool.  This is done to make it
  34  *      easier to support relations that are larger than the operating
  35  *      system's file size limit (often 2GBytes).  In order to do that,
  36  *      we break relations up into chunks of < 2GBytes and store one chunk
  37  *      in each of several files that represent the relation.  See the
  38  *      BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
  39  *      All chunks except the last MUST have size exactly equal to RELSEG_SIZE
  40  *      blocks --- see mdnblocks() and mdtruncate().
  41  *
  42  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
  43  *      cache is, therefore, just the head of a list of MdfdVec objects.
  44  *      But note the md_fd pointer can be NULL, indicating relation not open.
  45  *
  46  *      Note that mdfd_chain == NULL does not necessarily mean the relation
  47  *      doesn't have another segment after this one; we may just not have
  48  *      opened the next segment yet.  (We could not have "all segments are
  49  *      in the chain" as an invariant anyway, since another backend could
  50  *      extend the relation when we weren't looking.)
  51  *
  52  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
  53  */
  54
/*
 * MdfdVec describes one open segment file of a relation.  The md_fd field
 * of an SMgrRelation points at the entry for segment 0; when segmenting is
 * compiled in (LET_OS_MANAGE_FILESIZE not defined), further segments that
 * have been opened so far hang off it through mdfd_chain.
 */
typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE	/* for large relations */
	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
#endif
} MdfdVec;

static MemoryContext MdCxt;		/* context for all md.c allocations */
  65
  66
  67 /*
  68  * In some contexts (currently, standalone backends and the bgwriter process)
  69  * we keep track of pending fsync operations: we need to remember all relation
  70  * segments that have been written since the last checkpoint, so that we can
  71  * fsync them down to disk before completing the next checkpoint.  This hash
  72  * table remembers the pending operations.      We use a hash table not because
  73  * we want to look up individual operations, but simply as a convenient way
  74  * of eliminating duplicate requests.
  75  *
  76  * (Regular backends do not track pending operations locally, but forward
  77  * them to the bgwriter.)
  78  *
  79  * XXX for WIN32, may want to expand this to track pending deletes, too.
  80  */
/*
 * One pending fsync request.  The whole struct serves as the hash key
 * (mdinit sets keysize == entrysize == sizeof(PendingOperationEntry)), so
 * callers zero it before filling it in to keep any pad bytes comparable.
 */
typedef struct
{
	RelFileNode rnode;			/* the targeted relation */
	BlockNumber segno;			/* which segment */
} PendingOperationEntry;

/* NULL unless mdinit() decided this process tracks pending fsyncs locally */
static HTAB *pendingOpsTable = NULL;
  88
  89
/* local routines */

/* open segment 0 of a relation; can return NULL for ENOENT if allowed */
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
/* arrange for a written segment to be fsync'd (queued, forwarded or done now) */
static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
/* allocates an MdfdVec (definition is further down in this file) */
static MdfdVec *_fdvec_alloc(void);

#ifndef LET_OS_MANAGE_FILESIZE
/* opens one specific segment file; used by mdnblocks to walk past segment 0 */
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
			  int oflags);
#endif
/* finds the open segment holding blkno; may return NULL when allowNotFound */
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
			 bool allowNotFound);
/* number of blocks currently in one segment file */
static BlockNumber _mdnblocks(File file, Size blcksz);
 102
 103
 104 /*
 105  *      mdinit() -- Initialize private state for magnetic disk storage manager.
 106  */
 107 bool
 108 mdinit(void)
 109 {
 110         MdCxt = AllocSetContextCreate(TopMemoryContext,
 111                                                                   "MdSmgr",
 112                                                                   ALLOCSET_DEFAULT_MINSIZE,
 113                                                                   ALLOCSET_DEFAULT_INITSIZE,
 114                                                                   ALLOCSET_DEFAULT_MAXSIZE);
 115
 116         /*
 117          * Create pending-operations hashtable if we need it.  Currently, we
 118          * need it if we are standalone (not under a postmaster) OR if we are
 119          * a bootstrap-mode subprocess of a postmaster (that is, a startup or
 120          * bgwriter process).
 121          */
 122         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
 123         {
 124                 HASHCTL         hash_ctl;
 125
 126                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 127                 hash_ctl.keysize = sizeof(PendingOperationEntry);
 128                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
 129                 hash_ctl.hash = tag_hash;
 130                 hash_ctl.hcxt = MdCxt;
 131                 pendingOpsTable = hash_create("Pending Ops Table",
 132                                                                           100L,
 133                                                                           &hash_ctl,
 134                                                            HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 135         }
 136
 137         return true;
 138 }
 139
 140 /*
 141  *      mdcreate() -- Create a new relation on magnetic disk.
 142  *
 143  * If isRedo is true, it's okay for the relation to exist already.
 144  */
 145 bool
 146 mdcreate(SMgrRelation reln, bool isRedo)
 147 {
 148         char       *path;
 149         File            fd;
 150
 151         if (isRedo && reln->md_fd != NULL)
 152                 return true;                    /* created and opened already... */
 153
 154         Assert(reln->md_fd == NULL);
 155
 156         path = relpath(reln->smgr_rnode);
 157
 158         fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 159
 160         if (fd < 0)
 161         {
 162                 int                     save_errno = errno;
 163
 164                 /*
 165                  * During bootstrap, there are cases where a system relation will
 166                  * be accessed (by internal backend processes) before the
 167                  * bootstrap script nominally creates it.  Therefore, allow the
 168                  * file to exist already, even if isRedo is not set.  (See also
 169                  * mdopen)
 170                  */
 171                 if (isRedo || IsBootstrapProcessingMode())
 172                         fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 173                 if (fd < 0)
 174                 {
 175                         pfree(path);
 176                         /* be sure to return the error reported by create, not open */
 177                         errno = save_errno;
 178                         return false;
 179                 }
 180                 errno = 0;
 181         }
 182
 183         pfree(path);
 184
 185         reln->md_fd = _fdvec_alloc();
 186
 187         reln->md_fd->mdfd_vfd = fd;
 188         reln->md_fd->mdfd_segno = 0;
 189 #ifndef LET_OS_MANAGE_FILESIZE
 190         reln->md_fd->mdfd_chain = NULL;
 191 #endif
 192
 193         return true;
 194 }
 195
 196 /*
 197  *      mdunlink() -- Unlink a relation.
 198  *
 199  * Note that we're passed a RelFileNode --- by the time this is called,
 200  * there won't be an SMgrRelation hashtable entry anymore.
 201  *
 202  * If isRedo is true, it's okay for the relation to be already gone.
 203  */
 204 bool
 205 mdunlink(RelFileNode rnode, bool isRedo)
 206 {
 207         bool            status = true;
 208         int                     save_errno = 0;
 209         char       *path;
 210
 211         path = relpath(rnode);
 212
 213         /* Delete the first segment, or only segment if not doing segmenting */
 214         if (unlink(path) < 0)
 215         {
 216                 if (!isRedo || errno != ENOENT)
 217                 {
 218                         status = false;
 219                         save_errno = errno;
 220                 }
 221         }
 222
 223 #ifndef LET_OS_MANAGE_FILESIZE
 224         /* Get the additional segments, if any */
 225         if (status)
 226         {
 227                 char       *segpath = (char *) palloc(strlen(path) + 12);
 228                 BlockNumber segno;
 229
 230                 for (segno = 1;; segno++)
 231                 {
 232                         sprintf(segpath, "%s.%u", path, segno);
 233                         if (unlink(segpath) < 0)
 234                         {
 235                                 /* ENOENT is expected after the last segment... */
 236                                 if (errno != ENOENT)
 237                                 {
 238                                         status = false;
 239                                         save_errno = errno;
 240                                 }
 241                                 break;
 242                         }
 243                 }
 244                 pfree(segpath);
 245         }
 246 #endif
 247
 248         pfree(path);
 249
 250         errno = save_errno;
 251         return status;
 252 }
 253
 254 /*
 255  *      mdextend() -- Add a block to the specified relation.
 256  *
 257  *              The semantics are basically the same as mdwrite(): write at the
 258  *              specified position.  However, we are expecting to extend the
 259  *              relation (ie, blocknum is the current EOF), and so in case of
 260  *              failure we clean up by truncating.
 261  *
 262  *              This routine returns true or false, with errno set as appropriate.
 263  *
 264  * Note: this routine used to call mdnblocks() to get the block position
 265  * to write at, but that's pretty silly since the caller needs to know where
 266  * the block will be written, and accordingly must have done mdnblocks()
 267  * already.  Might as well pass in the position and save a seek.
 268  */
 269 bool
 270 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 271 {
 272         long            seekpos;
 273         int                     nbytes;
 274         MdfdVec    *v;
 275
 276         v = _mdfd_getseg(reln, blocknum, false);
 277
 278 #ifndef LET_OS_MANAGE_FILESIZE
 279         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 280         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 281 #else
 282         seekpos = (long) (BLCKSZ * (blocknum));
 283 #endif
 284
 285         /*
 286          * Note: because caller obtained blocknum by calling _mdnblocks, which
 287          * did a seek(SEEK_END), this seek is often redundant and will be
 288          * optimized away by fd.c.      It's not redundant, however, if there is a
 289          * partial page at the end of the file.  In that case we want to try
 290          * to overwrite the partial page with a full page.      It's also not
 291          * redundant if bufmgr.c had to dump another buffer of the same file
 292          * to make room for the new page's buffer.
 293          */
 294         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 295                 return false;
 296
 297         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 298         {
 299                 if (nbytes > 0)
 300                 {
 301                         int                     save_errno = errno;
 302
 303                         /* Remove the partially-written page */
 304                         FileTruncate(v->mdfd_vfd, seekpos);
 305                         FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
 306                         errno = save_errno;
 307                 }
 308                 return false;
 309         }
 310
 311         if (!isTemp)
 312         {
 313                 if (!register_dirty_segment(reln, v))
 314                         return false;
 315         }
 316
 317 #ifndef LET_OS_MANAGE_FILESIZE
 318         Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 319 #endif
 320
 321         return true;
 322 }
 323
 324 /*
 325  *      mdopen() -- Open the specified relation.  ereport's on failure.
 326  *              (Optionally, can return NULL instead of ereport for ENOENT.)
 327  *
 328  * Note we only open the first segment, when there are multiple segments.
 329  */
 330 static MdfdVec *
 331 mdopen(SMgrRelation reln, bool allowNotFound)
 332 {
 333         MdfdVec    *mdfd;
 334         char       *path;
 335         File            fd;
 336
 337         /* No work if already open */
 338         if (reln->md_fd)
 339                 return reln->md_fd;
 340
 341         path = relpath(reln->smgr_rnode);
 342
 343         fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 344
 345         if (fd < 0)
 346         {
 347                 /*
 348                  * During bootstrap, there are cases where a system relation will
 349                  * be accessed (by internal backend processes) before the
 350                  * bootstrap script nominally creates it.  Therefore, accept
 351                  * mdopen() as a substitute for mdcreate() in bootstrap mode only.
 352                  * (See mdcreate)
 353                  */
 354                 if (IsBootstrapProcessingMode())
 355                         fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 356                 if (fd < 0)
 357                 {
 358                         pfree(path);
 359                         if (allowNotFound && errno == ENOENT)
 360                                 return NULL;
 361                         ereport(ERROR,
 362                                         (errcode_for_file_access(),
 363                                          errmsg("could not open relation %u/%u/%u: %m",
 364                                                         reln->smgr_rnode.spcNode,
 365                                                         reln->smgr_rnode.dbNode,
 366                                                         reln->smgr_rnode.relNode)));
 367                 }
 368         }
 369
 370         pfree(path);
 371
 372         reln->md_fd = mdfd = _fdvec_alloc();
 373
 374         mdfd->mdfd_vfd = fd;
 375         mdfd->mdfd_segno = 0;
 376 #ifndef LET_OS_MANAGE_FILESIZE
 377         mdfd->mdfd_chain = NULL;
 378         Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 379 #endif
 380
 381         return mdfd;
 382 }
 383
 384 /*
 385  *      mdclose() -- Close the specified relation, if it isn't closed already.
 386  *
 387  *              Returns true or false with errno set as appropriate.
 388  */
 389 bool
 390 mdclose(SMgrRelation reln)
 391 {
 392         MdfdVec    *v = reln->md_fd;
 393
 394         /* No work if already closed */
 395         if (v == NULL)
 396                 return true;
 397
 398         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
 399
 400 #ifndef LET_OS_MANAGE_FILESIZE
 401         while (v != NULL)
 402         {
 403                 MdfdVec    *ov = v;
 404
 405                 /* if not closed already */
 406                 if (v->mdfd_vfd >= 0)
 407                         FileClose(v->mdfd_vfd);
 408                 /* Now free vector */
 409                 v = v->mdfd_chain;
 410                 pfree(ov);
 411         }
 412 #else
 413         if (v->mdfd_vfd >= 0)
 414                 FileClose(v->mdfd_vfd);
 415         pfree(v);
 416 #endif
 417
 418         return true;
 419 }
 420
 421 /*
 422  *      mdread() -- Read the specified block from a relation.
 423  */
 424 bool
 425 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 426 {
 427         bool            status;
 428         long            seekpos;
 429         int                     nbytes;
 430         MdfdVec    *v;
 431
 432         v = _mdfd_getseg(reln, blocknum, false);
 433
 434 #ifndef LET_OS_MANAGE_FILESIZE
 435         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 436         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 437 #else
 438         seekpos = (long) (BLCKSZ * (blocknum));
 439 #endif
 440
 441         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 442                 return false;
 443
 444         status = true;
 445         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 446         {
 447                 /*
 448                  * If we are at or past EOF, return zeroes without complaining.
 449                  * Also substitute zeroes if we found a partial block at EOF.
 450                  *
 451                  * XXX this is really ugly, bad design.  However the current
 452                  * implementation of hash indexes requires it, because hash index
 453                  * pages are initialized out-of-order.
 454                  */
 455                 if (nbytes == 0 ||
 456                         (nbytes > 0 && mdnblocks(reln) == blocknum))
 457                         MemSet(buffer, 0, BLCKSZ);
 458                 else
 459                         status = false;
 460         }
 461
 462         return status;
 463 }
 464
 465 /*
 466  *      mdwrite() -- Write the supplied block at the appropriate location.
 467  */
 468 bool
 469 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 470 {
 471         long            seekpos;
 472         MdfdVec    *v;
 473
 474         v = _mdfd_getseg(reln, blocknum, false);
 475
 476 #ifndef LET_OS_MANAGE_FILESIZE
 477         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
 478         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
 479 #else
 480         seekpos = (long) (BLCKSZ * (blocknum));
 481 #endif
 482
 483         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 484                 return false;
 485
 486         if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
 487                 return false;
 488
 489         if (!isTemp)
 490         {
 491                 if (!register_dirty_segment(reln, v))
 492                         return false;
 493         }
 494
 495         return true;
 496 }
 497
 498 /*
 499  *      mdnblocks() -- Get the number of blocks stored in a relation.
 500  *
 501  *              Important side effect: all segments of the relation are opened
 502  *              and added to the mdfd_chain list.  If this routine has not been
 503  *              called, then only segments up to the last one actually touched
 504  *              are present in the chain...
 505  *
 506  *              Returns # of blocks, or InvalidBlockNumber on error.
 507  */
 508 BlockNumber
 509 mdnblocks(SMgrRelation reln)
 510 {
 511         MdfdVec    *v = mdopen(reln, false);
 512
 513 #ifndef LET_OS_MANAGE_FILESIZE
 514         BlockNumber nblocks;
 515         BlockNumber segno = 0;
 516
 517         /*
 518          * Skip through any segments that aren't the last one, to avoid
 519          * redundant seeks on them.  We have previously verified that these
 520          * segments are exactly RELSEG_SIZE long, and it's useless to recheck
 521          * that each time. (NOTE: this assumption could only be wrong if
 522          * another backend has truncated the relation.  We rely on higher code
 523          * levels to handle that scenario by closing and re-opening the md
 524          * fd.)
 525          */
 526         while (v->mdfd_chain != NULL)
 527         {
 528                 segno++;
 529                 v = v->mdfd_chain;
 530         }
 531
 532         for (;;)
 533         {
 534                 nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
 535                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
 536                         elog(FATAL, "segment too big");
 537                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
 538                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
 539
 540                 /*
 541                  * If segment is exactly RELSEG_SIZE, advance to next one.
 542                  */
 543                 segno++;
 544
 545                 if (v->mdfd_chain == NULL)
 546                 {
 547                         /*
 548                          * Because we pass O_CREAT, we will create the next segment
 549                          * (with zero length) immediately, if the last segment is of
 550                          * length REL_SEGSIZE.  This is unnecessary but harmless, and
 551                          * testing for the case would take more cycles than it seems
 552                          * worth.
 553                          */
 554                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
 555                         if (v->mdfd_chain == NULL)
 556                                 return InvalidBlockNumber;              /* failed? */
 557                 }
 558
 559                 v = v->mdfd_chain;
 560         }
 561 #else
 562         return _mdnblocks(v->mdfd_vfd, BLCKSZ);
 563 #endif
 564 }
 565
 566 /*
 567  *      mdtruncate() -- Truncate relation to specified number of blocks.
 568  *
 569  *              Returns # of blocks or InvalidBlockNumber on error.
 570  */
 571 BlockNumber
 572 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 573 {
 574         MdfdVec    *v;
 575         BlockNumber curnblk;
 576
 577 #ifndef LET_OS_MANAGE_FILESIZE
 578         BlockNumber priorblocks;
 579 #endif
 580
 581         /*
 582          * NOTE: mdnblocks makes sure we have opened all existing segments, so
 583          * that truncate/delete loop will get them all!
 584          */
 585         curnblk = mdnblocks(reln);
 586         if (curnblk == InvalidBlockNumber)
 587                 return InvalidBlockNumber;              /* mdnblocks failed */
 588         if (nblocks > curnblk)
 589                 return InvalidBlockNumber;              /* bogus request */
 590         if (nblocks == curnblk)
 591                 return nblocks;                 /* no work */
 592
 593         v = mdopen(reln, false);
 594
 595 #ifndef LET_OS_MANAGE_FILESIZE
 596         priorblocks = 0;
 597         while (v != NULL)
 598         {
 599                 MdfdVec    *ov = v;
 600
 601                 if (priorblocks > nblocks)
 602                 {
 603                         /*
 604                          * This segment is no longer wanted at all (and has already
 605                          * been unlinked from the mdfd_chain). We truncate the file
 606                          * before deleting it because if other backends are holding
 607                          * the file open, the unlink will fail on some platforms.
 608                          * Better a zero-size file gets left around than a big file...
 609                          */
 610                         FileTruncate(v->mdfd_vfd, 0);
 611                         FileUnlink(v->mdfd_vfd);
 612                         v = v->mdfd_chain;
 613                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
 614                         pfree(ov);
 615                 }
 616                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 617                 {
 618                         /*
 619                          * This is the last segment we want to keep. Truncate the file
 620                          * to the right length, and clear chain link that points to
 621                          * any remaining segments (which we shall zap). NOTE: if
 622                          * nblocks is exactly a multiple K of RELSEG_SIZE, we will
 623                          * truncate the K+1st segment to 0 length but keep it. This is
 624                          * mainly so that the right thing happens if nblocks==0.
 625                          */
 626                         BlockNumber lastsegblocks = nblocks - priorblocks;
 627
 628                         if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
 629                                 return InvalidBlockNumber;
 630                         if (!isTemp)
 631                         {
 632                                 if (!register_dirty_segment(reln, v))
 633                                         return InvalidBlockNumber;
 634                         }
 635                         v = v->mdfd_chain;
 636                         ov->mdfd_chain = NULL;
 637                 }
 638                 else
 639                 {
 640                         /*
 641                          * We still need this segment and 0 or more blocks beyond it,
 642                          * so nothing to do here.
 643                          */
 644                         v = v->mdfd_chain;
 645                 }
 646                 priorblocks += RELSEG_SIZE;
 647         }
 648 #else
 649         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
 650                 return InvalidBlockNumber;
 651         if (!isTemp)
 652         {
 653                 if (!register_dirty_segment(reln, v))
 654                         return InvalidBlockNumber;
 655         }
 656 #endif
 657
 658         return nblocks;
 659 }
 660
 661 /*
 662  *      mdimmedsync() -- Immediately sync a relation to stable storage.
 663  */
 664 bool
 665 mdimmedsync(SMgrRelation reln)
 666 {
 667         MdfdVec    *v;
 668         BlockNumber curnblk;
 669
 670         /*
 671          * NOTE: mdnblocks makes sure we have opened all existing segments, so
 672          * that fsync loop will get them all!
 673          */
 674         curnblk = mdnblocks(reln);
 675         if (curnblk == InvalidBlockNumber)
 676                 return false;                   /* mdnblocks failed */
 677
 678         v = mdopen(reln, false);
 679
 680 #ifndef LET_OS_MANAGE_FILESIZE
 681         while (v != NULL)
 682         {
 683                 if (FileSync(v->mdfd_vfd) < 0)
 684                         return false;
 685                 v = v->mdfd_chain;
 686         }
 687 #else
 688         if (FileSync(v->mdfd_vfd) < 0)
 689                 return false;
 690 #endif
 691
 692         return true;
 693 }
 694
 695 /*
 696  *      mdsync() -- Sync previous writes to stable storage.
 697  *
 698  * This is only called during checkpoints, and checkpoints should only
 699  * occur in processes that have created a pendingOpsTable.
 700  */
 701 bool
 702 mdsync(void)
 703 {
 704         HASH_SEQ_STATUS hstat;
 705         PendingOperationEntry *entry;
 706
 707         if (!pendingOpsTable)
 708                 return false;
 709
 710         /*
 711          * If we are in the bgwriter, the sync had better include all fsync
 712          * requests that were queued by backends before the checkpoint REDO
 713          * point was determined.  We go that a little better by accepting all
 714          * requests queued up to the point where we start fsync'ing.
 715          */
 716         AbsorbFsyncRequests();
 717
 718         hash_seq_init(&hstat, pendingOpsTable);
 719         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 720         {
 721                 /*
 722                  * If fsync is off then we don't have to bother opening the file
 723                  * at all.      (We delay checking until this point so that changing
 724                  * fsync on the fly behaves sensibly.)
 725                  */
 726                 if (enableFsync)
 727                 {
 728                         SMgrRelation reln;
 729                         MdfdVec    *seg;
 730
 731                         /*
 732                          * Find or create an smgr hash entry for this relation. This
 733                          * may seem a bit unclean -- md calling smgr?  But it's really
 734                          * the best solution.  It ensures that the open file reference
 735                          * isn't permanently leaked if we get an error here. (You may
 736                          * say "but an unreferenced SMgrRelation is still a leak!" Not
 737                          * really, because the only case in which a checkpoint is done
 738                          * by a process that isn't about to shut down is in the
 739                          * bgwriter, and it will periodically do smgrcloseall().  This
 740                          * fact justifies our not closing the reln in the success path
 741                          * either, which is a good thing since in non-bgwriter cases
 742                          * we couldn't safely do that.)  Furthermore, in many cases
 743                          * the relation will have been dirtied through this same smgr
 744                          * relation, and so we can save a file open/close cycle.
 745                          */
 746                         reln = smgropen(entry->rnode);
 747
 748                         /*
 749                          * It is possible that the relation has been dropped or
 750                          * truncated since the fsync request was entered.  Therefore,
 751                          * we have to allow file-not-found errors.      This applies both
 752                          * during _mdfd_getseg() and during FileSync, since fd.c might
 753                          * have closed the file behind our back.
 754                          */
 755                         seg = _mdfd_getseg(reln,
 756                                                            entry->segno * ((BlockNumber) RELSEG_SIZE),
 757                                                            true);
 758                         if (seg)
 759                         {
 760                                 if (FileSync(seg->mdfd_vfd) < 0 &&
 761                                         errno != ENOENT)
 762                                 {
 763                                         ereport(LOG,
 764                                                         (errcode_for_file_access(),
 765                                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 766                                                                         entry->segno,
 767                                                                         entry->rnode.spcNode,
 768                                                                         entry->rnode.dbNode,
 769                                                                         entry->rnode.relNode)));
 770                                         return false;
 771                                 }
 772                         }
 773                 }
 774
 775                 /* Okay, delete this entry */
 776                 if (hash_search(pendingOpsTable, entry,
 777                                                 HASH_REMOVE, NULL) == NULL)
 778                         elog(ERROR, "pendingOpsTable corrupted");
 779         }
 780
 781         return true;
 782 }
 783
 784 /*
 785  * register_dirty_segment() -- Mark a relation segment as needing fsync
 786  *
 787  * If there is a local pending-ops table, just make an entry in it for
 788  * mdsync to process later.  Otherwise, try to pass off the fsync request
 789  * to the background writer process.  If that fails, just do the fsync
 790  * locally before returning (we expect this will not happen often enough
 791  * to be a performance problem).
 792  *
 793  * A false result implies I/O failure during local fsync.  errno will be
 794  * valid for error reporting.
 795  */
 796 static bool
 797 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 798 {
 799         if (pendingOpsTable)
 800         {
 801                 PendingOperationEntry entry;
 802
 803                 /* ensure any pad bytes in the struct are zeroed */
 804                 MemSet(&entry, 0, sizeof(entry));
 805                 entry.rnode = reln->smgr_rnode;
 806                 entry.segno = seg->mdfd_segno;
 807
 808                 if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL)
 809                         return true;
 810                 /* out of memory: fall through to do it locally */
 811         }
 812         else
 813         {
 814                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
 815                         return true;
 816         }
 817
 818         if (FileSync(seg->mdfd_vfd) < 0)
 819                 return false;
 820         return true;
 821 }
 822
 823 /*
 824  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
 825  *
 826  * We stuff the fsync request into the local hash table for execution
 827  * during the bgwriter's next checkpoint.
 828  */
 829 void
 830 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 831 {
 832         PendingOperationEntry entry;
 833
 834         Assert(pendingOpsTable);
 835
 836         /* ensure any pad bytes in the struct are zeroed */
 837         MemSet(&entry, 0, sizeof(entry));
 838         entry.rnode = rnode;
 839         entry.segno = segno;
 840
 841         if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL)
 842                 ereport(FATAL,
 843                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 844                                  errmsg("out of memory")));
 845 }
 846
 847 /*
 848  *      _fdvec_alloc() -- Make a MdfdVec object.
 849  */
 850 static MdfdVec *
 851 _fdvec_alloc(void)
 852 {
 853         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
 854 }
 855
 856 #ifndef LET_OS_MANAGE_FILESIZE
 857
 858 /*
 859  * Open the specified segment of the relation,
 860  * and make a MdfdVec object for it.  Returns NULL on failure.
 861  */
 862 static MdfdVec *
 863 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
 864 {
 865         MdfdVec    *v;
 866         int                     fd;
 867         char       *path,
 868                            *fullpath;
 869
 870         path = relpath(reln->smgr_rnode);
 871
 872         if (segno > 0)
 873         {
 874                 /* be sure we have enough space for the '.segno' */
 875                 fullpath = (char *) palloc(strlen(path) + 12);
 876                 sprintf(fullpath, "%s.%u", path, segno);
 877                 pfree(path);
 878         }
 879         else
 880                 fullpath = path;
 881
 882         /* open the file */
 883         fd = FileNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
 884
 885         pfree(fullpath);
 886
 887         if (fd < 0)
 888                 return NULL;
 889
 890         /* allocate an mdfdvec entry for it */
 891         v = _fdvec_alloc();
 892
 893         /* fill the entry */
 894         v->mdfd_vfd = fd;
 895         v->mdfd_segno = segno;
 896         v->mdfd_chain = NULL;
 897         Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 898
 899         /* all done */
 900         return v;
 901 }
 902 #endif   /* LET_OS_MANAGE_FILESIZE */
 903
 904 /*
 905  *      _mdfd_getseg() -- Find the segment of the relation holding the
 906  *              specified block.  ereport's on failure.
 907  *              (Optionally, can return NULL instead of ereport for ENOENT.)
 908  */
 909 static MdfdVec *
 910 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
 911 {
 912         MdfdVec    *v = mdopen(reln, allowNotFound);
 913
 914 #ifndef LET_OS_MANAGE_FILESIZE
 915         BlockNumber segstogo;
 916         BlockNumber nextsegno;
 917
 918         if (!v)
 919                 return NULL;                    /* only possible if allowNotFound */
 920
 921         for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
 922                  segstogo > 0;
 923                  nextsegno++, segstogo--)
 924         {
 925                 if (v->mdfd_chain == NULL)
 926                 {
 927                         /*
 928                          * We will create the next segment only if the target block is
 929                          * within it.  This prevents Sorcerer's Apprentice syndrome if
 930                          * a bug at higher levels causes us to be handed a
 931                          * ridiculously large blkno --- otherwise we could create many
 932                          * thousands of empty segment files before reaching the
 933                          * "target" block.      We should never need to create more than
 934                          * one new segment per call, so this restriction seems
 935                          * reasonable.
 936                          *
 937                          * BUT: when doing WAL recovery, disable this logic and create
 938                          * segments unconditionally.  In this case it seems better
 939                          * to assume the given blkno is good (it presumably came from
 940                          * a CRC-checked WAL record); furthermore this lets us cope
 941                          * in the case where we are replaying WAL data that has a write
 942                          * into a high-numbered segment of a relation that was later
 943                          * deleted.  We want to go ahead and create the segments so
 944                          * we can finish out the replay.
 945                          */
 946                         v->mdfd_chain = _mdfd_openseg(reln,
 947                                                                                   nextsegno,
 948                                                                   (segstogo == 1 || InRecovery) ? O_CREAT : 0);
 949                         if (v->mdfd_chain == NULL)
 950                         {
 951                                 if (allowNotFound && errno == ENOENT)
 952                                         return NULL;
 953                                 ereport(ERROR,
 954                                                 (errcode_for_file_access(),
 955                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
 956                                                                 nextsegno,
 957                                                                 reln->smgr_rnode.spcNode,
 958                                                                 reln->smgr_rnode.dbNode,
 959                                                                 reln->smgr_rnode.relNode,
 960                                                                 blkno)));
 961                         }
 962                 }
 963                 v = v->mdfd_chain;
 964         }
 965 #endif
 966
 967         return v;
 968 }
 969
 970 /*
 971  * Get number of blocks present in a single disk file
 972  */
 973 static BlockNumber
 974 _mdnblocks(File file, Size blcksz)
 975 {
 976         long            len;
 977
 978         len = FileSeek(file, 0L, SEEK_END);
 979         if (len < 0)
 980                 return 0;                               /* on failure, assume file is empty */
 981         return (BlockNumber) (len / blcksz);
 982 }