1 /*-------------------------------------------------------------------------
4 * This code manages relations that reside on magnetic disk.
6 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
13 *-------------------------------------------------------------------------
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "postmaster/bgwriter.h"
24 #include "storage/fd.h"
25 #include "storage/bufmgr.h"
26 #include "storage/smgr.h"
27 #include "utils/hsearch.h"
28 #include "utils/memutils.h"
31 /* interval for calling AbsorbFsyncRequests in mdsync */
/*
 * Tuning constant and sentinel segment numbers used by the fsync-request
 * machinery in this file.
 *
 * FSYNCS_PER_ABSORB is the value mdsync() loads into its absorb_counter
 * before scanning the pending-ops table; the counter is pre-decremented
 * and tested against zero inside that scan.
 *
 * The FORGET_* values are passed in place of a real segment number to
 * RememberFsyncRequest() and ForwardFsyncRequest() to ask for cancellation
 * of pending fsyncs for one relation or for a whole database.
 */
32 #define FSYNCS_PER_ABSORB 10
34 /* special values for the segno arg to RememberFsyncRequest */
35 #define FORGET_RELATION_FSYNC (InvalidBlockNumber)
36 #define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
39 * On Windows, we have to interpret EACCES as possibly meaning the same as
40 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
41 * that's what you get. Ugh. This code is designed so that we don't
42 * actually believe these cases are okay without further evidence (namely,
43 * a pending fsync request getting revoked ... see mdsync).
/*
 * FILE_POSSIBLY_DELETED(err): true when the errno value "err" could mean
 * that the file has been removed.
 *
 * Two definitions appear here: one accepting only ENOENT and one that also
 * accepts EACCES.  mdopen(), mdsync() and _mdfd_getseg() use the macro to
 * decide whether a failed open/fsync may be tolerated rather than reported.
 *
 * NOTE(review): the preprocessor conditional that selects between the two
 * definitions is not visible in this listing; going by the commentary just
 * above, the EACCES variant is presumably the Windows one -- confirm.
 */
46 #define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
48 #define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
52 * The magnetic disk storage manager keeps track of open file
53 * descriptors in its own descriptor pool. This is done to make it
54 * easier to support relations that are larger than the operating
55 * system's file size limit (often 2GBytes). In order to do that,
56 * we break relations up into "segment" files that are each shorter than
57 * the OS file size limit. The segment size is set by the RELSEG_SIZE
58 * configuration constant in pg_config_manual.h.
60 * On disk, a relation must consist of consecutively numbered segment
61 * files in the pattern
62 * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
63 * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
64 * -- Optionally, any number of inactive segments of size 0 blocks.
65 * The full and partial segments are collectively the "active" segments.
66 * Inactive segments are those that once contained data but are currently
67 * not needed because of an mdtruncate() operation. The reason for leaving
68 * them present at size zero, rather than unlinking them, is that other
69 * backends and/or the bgwriter might be holding open file references to
70 * such segments. If the relation expands again after mdtruncate(), such
71 * that a deactivated segment becomes active again, it is important that
72 * such file references still be valid --- else data might get written
73 * out to an unlinked old copy of a segment file that will eventually disappear.
76 * The file descriptor pointer (md_fd field) stored in the SMgrRelation
77 * cache is, therefore, just the head of a list of MdfdVec objects, one
78 * per segment. But note the md_fd pointer can be NULL, indicating that the relation is not currently open.
81 * Also note that mdfd_chain == NULL does not necessarily mean the relation
82 * doesn't have another segment after this one; we may just not have
83 * opened the next segment yet. (We could not have "all segments are
84 * in the chain" as an invariant anyway, since another backend could
85 * extend the relation when we weren't looking.) We do not make chain
86 * entries for inactive segments, however; as soon as we find a partial
87 * segment, we assume that any subsequent segments are inactive.
89 * All MdfdVec objects are palloc'd in the MdCxt memory context.
91 * Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
92 * for use on machines that support large files. Beware that that
93 * code has not been tested in a long time and is probably bit-rotted.
/*
 * MdfdVec: per-segment bookkeeping for an open relation.
 *
 * Each entry carries the fd.c virtual file descriptor for one segment file
 * and that segment's number.  Unless LET_OS_MANAGE_FILESIZE is defined,
 * entries are also linked through mdfd_chain into a singly linked list
 * whose head is kept in the SMgrRelation's md_fd field.
 *
 * MdCxt is the memory context from which _fdvec_alloc() allocates these
 * entries; mdinit() also hands it to the pending-ops hash table.
 *
 * NOTE(review): the struct's braces, the closing #endif and the typedef
 * name line are not visible in this listing.
 */
96 typedef struct _MdfdVec
98 File mdfd_vfd; /* fd number in fd.c's pool */
99 BlockNumber mdfd_segno; /* segment number, from 0 */
100 #ifndef LET_OS_MANAGE_FILESIZE /* for large relations */
101 struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
105 static MemoryContext MdCxt; /* context for all md.c allocations */
109 * In some contexts (currently, standalone backends and the bgwriter process)
110 * we keep track of pending fsync operations: we need to remember all relation
111 * segments that have been written since the last checkpoint, so that we can
112 * fsync them down to disk before completing the next checkpoint. This hash
113 * table remembers the pending operations. We use a hash table mostly as
114 * a convenient way of eliminating duplicate requests.
116 * (Regular backends do not track pending operations locally, but forward
117 * them to the bgwriter.)
/*
 * Data structures for remembering fsync requests until the next mdsync().
 *
 * PendingOperationTag identifies one segment of one relation and is the
 * hash key.  PendingOperationEntry is the hash table entry: the tag must be
 * its first member, followed by a count of fsync attempts that have failed
 * so far (mdsync() increments it and reports an error once it exceeds 1).
 *
 * pendingOpsTable stays NULL unless mdinit() decides this process needs a
 * table; several routines test it for NULL to choose between local
 * bookkeeping and forwarding the request elsewhere.
 *
 * NOTE(review): the "typedef struct" and opening-brace lines of both
 * structs are not visible in this listing.
 */
121 RelFileNode rnode; /* the targeted relation */
122 BlockNumber segno; /* which segment */
123 } PendingOperationTag;
127 PendingOperationTag tag; /* hash table key (must be first!) */
128 int failures; /* number of failed attempts to fsync */
129 } PendingOperationEntry;
131 static HTAB *pendingOpsTable = NULL;
/*
 * ExtensionBehavior: tells mdopen() and _mdfd_getseg() what to do when the
 * file or segment they need does not exist.
 *
 *   EXTENSION_FAIL        - report the failure via ereport
 *   EXTENSION_RETURN_NULL - hand NULL back to the caller instead
 *   EXTENSION_CREATE      - create missing segments (passed by mdextend)
 *
 * NOTE(review): the enum's braces and its typedef name line are not
 * visible in this listing; the name ExtensionBehavior is taken from the
 * prototypes that use it.
 */
134 typedef enum /* behavior for mdopen & _mdfd_getseg */
136 EXTENSION_FAIL, /* ereport if segment not present */
137 EXTENSION_RETURN_NULL, /* return NULL if not present */
138 EXTENSION_CREATE /* create new segments as needed */
/*
 * Forward declarations of the file-private helpers defined further down.
 *
 * _mdfd_openseg is declared only when LET_OS_MANAGE_FILESIZE is not
 * defined, matching its definition, which sits inside the same kind of
 * conditional.
 *
 * NOTE(review): the continuation line of the _mdfd_openseg prototype and
 * the closing #endif are not visible in this listing.
 */
142 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
143 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
144 static MdfdVec *_fdvec_alloc(void);
146 #ifndef LET_OS_MANAGE_FILESIZE
147 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
150 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
151 bool isTemp, ExtensionBehavior behavior);
152 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
156 * mdinit() -- Initialize private state for magnetic disk storage manager.
/*
 * Body of mdinit(): one-time setup of this module's private state.
 *
 * Step 1 creates MdCxt as a child of TopMemoryContext using the default
 * allocation-set size parameters.
 *
 * Step 2 builds pendingOpsTable, but only when the process is not running
 * under a postmaster or is in bootstrap processing mode.  The table is
 * keyed by PendingOperationTag, stores PendingOperationEntry values, uses
 * tag_hash as its hash function and allocates from MdCxt; the flag word
 * passed to hash_create announces exactly those three settings.  In every
 * other process the table pointer keeps its initial NULL value.
 *
 * NOTE(review): the function's signature line, the context-name argument
 * to AllocSetContextCreate, the hash_ctl declaration and the middle
 * arguments of hash_create are not visible in this listing.
 */
161 MdCxt = AllocSetContextCreate(TopMemoryContext,
163 ALLOCSET_DEFAULT_MINSIZE,
164 ALLOCSET_DEFAULT_INITSIZE,
165 ALLOCSET_DEFAULT_MAXSIZE);
168 * Create pending-operations hashtable if we need it. Currently, we need
169 * it if we are standalone (not under a postmaster) OR if we are a
170 * bootstrap-mode subprocess of a postmaster (that is, a startup or
173 if (!IsUnderPostmaster || IsBootstrapProcessingMode())
177 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
178 hash_ctl.keysize = sizeof(PendingOperationTag);
179 hash_ctl.entrysize = sizeof(PendingOperationEntry);
180 hash_ctl.hash = tag_hash;
181 hash_ctl.hcxt = MdCxt;
182 pendingOpsTable = hash_create("Pending Ops Table",
185 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
190 * mdcreate() -- Create a new relation on magnetic disk.
192 * If isRedo is true, it's okay for the relation to exist already.
/*
 * mdcreate: create the first segment file of a relation and record it as
 * open in reln->md_fd.
 *
 * reln   - relation to create; its md_fd is expected to be NULL unless
 *          isRedo is set.
 * isRedo - when true, an already-open relation is accepted (immediate
 *          return) and an already-existing file is opened rather than
 *          treated as an error.
 *
 * The file is first opened with O_CREAT | O_EXCL.  If that attempt fails,
 * errno is saved, and when isRedo is set or the process is in bootstrap
 * mode a plain open of the existing file is tried instead.  Should the
 * error be reported, the saved errno from the create attempt is the one
 * meant to be shown (see the "be sure to report" remark below).
 *
 * On success a fresh MdfdVec is allocated and filled in as segment 0 with
 * no successor in the chain.
 *
 * NOTE(review): the tests on fd after each open, the restore of
 * save_errno and the ereport level are on lines not visible here.
 */
195 mdcreate(SMgrRelation reln, bool isRedo)
200 if (isRedo && reln->md_fd != NULL)
201 return; /* created and opened already... */
203 Assert(reln->md_fd == NULL);
205 path = relpath(reln->smgr_rnode);
207 fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
211 int save_errno = errno;
214 * During bootstrap, there are cases where a system relation will be
215 * accessed (by internal backend processes) before the bootstrap
216 * script nominally creates it. Therefore, allow the file to exist
217 * already, even if isRedo is not set. (See also mdopen)
219 if (isRedo || IsBootstrapProcessingMode())
220 fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
224 /* be sure to report the error reported by create, not open */
227 (errcode_for_file_access(),
228 errmsg("could not create relation %u/%u/%u: %m",
229 reln->smgr_rnode.spcNode,
230 reln->smgr_rnode.dbNode,
231 reln->smgr_rnode.relNode)));
237 reln->md_fd = _fdvec_alloc();
239 reln->md_fd->mdfd_vfd = fd;
240 reln->md_fd->mdfd_segno = 0;
241 #ifndef LET_OS_MANAGE_FILESIZE
242 reln->md_fd->mdfd_chain = NULL;
247 * mdunlink() -- Unlink a relation.
249 * Note that we're passed a RelFileNode --- by the time this is called,
250 * there won't be an SMgrRelation hashtable entry anymore.
252 * If isRedo is true, it's okay for the relation to be already gone.
253 * Also, any failure should be reported as WARNING not ERROR, because
254 * we are usually not in a transaction anymore when this is called.
/*
 * mdunlink: remove every segment file belonging to a relation.
 *
 * rnode  - physical identifier of the relation; the path is derived from
 *          it with relpath().
 * isRedo - when true, a missing first segment (ENOENT) is not reported.
 *
 * Order matters: pending fsync requests for the relation are cancelled
 * via ForgetRelationFsyncRequests() before any file is unlinked.
 *
 * After the first file, and only when LET_OS_MANAGE_FILESIZE is not
 * defined, files named "<path>.<segno>" are unlinked for segno = 1, 2, ...
 * The loop has no upper bound of its own; it relies on unlink() eventually
 * failing.  The name buffer is strlen(path) + 12 bytes, which presumably
 * leaves room for the dot, the decimal digits of a 32-bit segno and the
 * terminating NUL -- confirm the width of BlockNumber.
 *
 * NOTE(review): the ereport severity, the errno test inside the loop and
 * the loop exit are on lines not visible in this listing.
 */
257 mdunlink(RelFileNode rnode, bool isRedo)
262 * We have to clean out any pending fsync requests for the doomed relation,
263 * else the next mdsync() will fail.
265 ForgetRelationFsyncRequests(rnode);
267 path = relpath(rnode);
269 /* Delete the first segment, or only segment if not doing segmenting */
270 if (unlink(path) < 0)
272 if (!isRedo || errno != ENOENT)
274 (errcode_for_file_access(),
275 errmsg("could not remove relation %u/%u/%u: %m",
281 #ifndef LET_OS_MANAGE_FILESIZE
282 /* Delete the additional segments, if any */
285 char *segpath = (char *) palloc(strlen(path) + 12);
289 * Note that because we loop until getting ENOENT, we will
290 * correctly remove all inactive segments as well as active ones.
292 for (segno = 1;; segno++)
294 sprintf(segpath, "%s.%u", path, segno);
295 if (unlink(segpath) < 0)
297 /* ENOENT is expected after the last segment... */
300 (errcode_for_file_access(),
301 errmsg("could not remove segment %u of relation %u/%u/%u: %m",
317 * mdextend() -- Add a block to the specified relation.
319 * The semantics are nearly the same as mdwrite(): write at the
320 * specified position. However, this is to be used for the case of
321 * extending a relation (i.e., blocknum is at or beyond the current
322 * EOF). Note that we assume writing a block beyond current EOF
323 * causes intervening file space to become filled with zeroes.
/*
 * mdextend: write one block at position blocknum, creating segment files
 * as needed.
 *
 * reln     - relation being extended.
 * blocknum - block number to write; a value equal to InvalidBlockNumber
 *            is rejected with ERRCODE_PROGRAM_LIMIT_EXCEEDED.
 * buffer   - source of the BLCKSZ bytes to write.
 * isTemp   - forwarded to _mdfd_getseg().
 *
 * The target segment is obtained with EXTENSION_CREATE.  The seek offset
 * is the block's position within its segment (blocknum modulo RELSEG_SIZE,
 * times BLCKSZ) in segmented builds, or blocknum times BLCKSZ otherwise.
 * A failed seek, a failed write and a short write each produce their own
 * error message; the latter two carry a "Check free disk space." hint.
 * After a successful write the segment is passed to
 * register_dirty_segment().
 *
 * NOTE(review): the local declarations, the test distinguishing a failed
 * write from a short one, and any guard around the
 * register_dirty_segment() call (presumably on isTemp) are on lines not
 * visible in this listing.
 */
326 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
332 /* This assert is too expensive to have on normally ... */
333 #ifdef CHECK_WRITE_VS_EXTEND
334 Assert(blocknum >= mdnblocks(reln));
338 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
339 * any more --- we mustn't create a block whose number
340 * actually is InvalidBlockNumber.
342 if (blocknum == InvalidBlockNumber)
344 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
345 errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
346 reln->smgr_rnode.spcNode,
347 reln->smgr_rnode.dbNode,
348 reln->smgr_rnode.relNode,
349 InvalidBlockNumber)));
351 v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
353 #ifndef LET_OS_MANAGE_FILESIZE
354 seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
355 Assert(seekpos < BLCKSZ * RELSEG_SIZE);
357 seekpos = (long) (BLCKSZ * (blocknum));
361 * Note: because caller usually obtained blocknum by calling mdnblocks,
362 * which did a seek(SEEK_END), this seek is often redundant and will be
363 * optimized away by fd.c. It's not redundant, however, if there is a
364 * partial page at the end of the file. In that case we want to try to
365 * overwrite the partial page with a full page. It's also not redundant
366 * if bufmgr.c had to dump another buffer of the same file to make room
367 * for the new page's buffer.
369 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
371 (errcode_for_file_access(),
372 errmsg("could not seek to block %u of relation %u/%u/%u: %m",
374 reln->smgr_rnode.spcNode,
375 reln->smgr_rnode.dbNode,
376 reln->smgr_rnode.relNode)));
378 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
382 (errcode_for_file_access(),
383 errmsg("could not extend relation %u/%u/%u: %m",
384 reln->smgr_rnode.spcNode,
385 reln->smgr_rnode.dbNode,
386 reln->smgr_rnode.relNode),
387 errhint("Check free disk space.")));
388 /* short write: complain appropriately */
390 (errcode(ERRCODE_DISK_FULL),
391 errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
392 reln->smgr_rnode.spcNode,
393 reln->smgr_rnode.dbNode,
394 reln->smgr_rnode.relNode,
395 nbytes, BLCKSZ, blocknum),
396 errhint("Check free disk space.")));
400 register_dirty_segment(reln, v);
402 #ifndef LET_OS_MANAGE_FILESIZE
403 Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
408 * mdopen() -- Open the specified relation.
410 * Note we only open the first segment, when there are multiple segments.
412 * If first segment is not present, either ereport or return NULL according
413 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
414 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
415 * invent one out of whole cloth.
/*
 * mdopen: make sure the first segment of a relation is open and return
 * its MdfdVec.
 *
 * reln     - relation to open; on success reln->md_fd points at the
 *            newly allocated entry for segment 0.
 * behavior - EXTENSION_RETURN_NULL allows a "file possibly deleted" open
 *            failure to be tolerated; otherwise the failure is reported
 *            with "could not open relation".
 *
 * The file is opened without O_CREAT.  Only in bootstrap processing mode
 * is a second attempt made with O_CREAT | O_EXCL, so that mdopen() can
 * stand in for mdcreate() there.
 *
 * In segmented builds the new entry's chain link is cleared and an
 * assertion checks that the segment holds no more than RELSEG_SIZE blocks.
 *
 * NOTE(review): the early return for an already-open relation, the tests
 * on fd, the "return NULL" for the tolerated case, the store of fd into
 * the entry and the final return are on lines not visible in this listing.
 */
418 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
424 /* No work if already open */
428 path = relpath(reln->smgr_rnode);
430 fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
435 * During bootstrap, there are cases where a system relation will be
436 * accessed (by internal backend processes) before the bootstrap
437 * script nominally creates it. Therefore, accept mdopen() as a
438 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
440 if (IsBootstrapProcessingMode())
441 fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
445 if (behavior == EXTENSION_RETURN_NULL &&
446 FILE_POSSIBLY_DELETED(errno))
449 (errcode_for_file_access(),
450 errmsg("could not open relation %u/%u/%u: %m",
451 reln->smgr_rnode.spcNode,
452 reln->smgr_rnode.dbNode,
453 reln->smgr_rnode.relNode)));
459 reln->md_fd = mdfd = _fdvec_alloc();
462 mdfd->mdfd_segno = 0;
463 #ifndef LET_OS_MANAGE_FILESIZE
464 mdfd->mdfd_chain = NULL;
465 Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
472 * mdclose() -- Close the specified relation, if it isn't closed already.
/*
 * mdclose: close every open segment file of a relation.
 *
 * reln - relation to close; its md_fd is captured in a local and then
 *        reset to NULL before any file is closed, so that an error raised
 *        part-way through cannot leave md_fd pointing at released entries.
 *
 * Two variants follow.  In segmented builds each entry's file is closed if
 * its descriptor is non-negative and the entry itself is then released;
 * otherwise there is a single descriptor to close.
 *
 * NOTE(review): the NULL test for "already closed", the loop that walks
 * the chain, the calls that free each entry and the #else/#endif lines are
 * not visible in this listing.
 */
475 mdclose(SMgrRelation reln)
477 MdfdVec *v = reln->md_fd;
479 /* No work if already closed */
483 reln->md_fd = NULL; /* prevent dangling pointer after error */
485 #ifndef LET_OS_MANAGE_FILESIZE
490 /* if not closed already */
491 if (v->mdfd_vfd >= 0)
492 FileClose(v->mdfd_vfd);
493 /* Now free vector */
498 if (v->mdfd_vfd >= 0)
499 FileClose(v->mdfd_vfd);
505 * mdread() -- Read the specified block from a relation.
/*
 * mdread: read block blocknum of a relation into buffer.
 *
 * reln     - relation to read from.
 * blocknum - block to read; its segment is located with EXTENSION_FAIL, so
 *            a missing segment is reported rather than created.
 * buffer   - destination for BLCKSZ bytes.
 *
 * The seek offset is computed the same way as in mdextend()/mdwrite().
 * Three failure cases are distinguished: a failed seek, a failed read
 * (reported with the OS error), and a short read.  For a short read the
 * buffer is zero-filled when zero_damaged_pages is set or the process is
 * InRecovery; otherwise ERRCODE_DATA_CORRUPTED is raised.
 *
 * NOTE(review): the local declarations, the test that separates a failed
 * read from a short one, and the tail of the final error message are on
 * lines not visible in this listing.
 */
508 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
514 v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
516 #ifndef LET_OS_MANAGE_FILESIZE
517 seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
518 Assert(seekpos < BLCKSZ * RELSEG_SIZE);
520 seekpos = (long) (BLCKSZ * (blocknum));
523 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
525 (errcode_for_file_access(),
526 errmsg("could not seek to block %u of relation %u/%u/%u: %m",
528 reln->smgr_rnode.spcNode,
529 reln->smgr_rnode.dbNode,
530 reln->smgr_rnode.relNode)));
532 if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
536 (errcode_for_file_access(),
537 errmsg("could not read block %u of relation %u/%u/%u: %m",
539 reln->smgr_rnode.spcNode,
540 reln->smgr_rnode.dbNode,
541 reln->smgr_rnode.relNode)));
543 * Short read: we are at or past EOF, or we read a partial block at
544 * EOF. Normally this is an error; upper levels should never try to
545 * read a nonexistent block. However, if zero_damaged_pages is ON
546 * or we are InRecovery, we should instead return zeroes without
547 * complaining. This allows, for example, the case of trying to
548 * update a block that was later truncated away.
550 if (zero_damaged_pages || InRecovery)
551 MemSet(buffer, 0, BLCKSZ);
554 (errcode(ERRCODE_DATA_CORRUPTED),
555 errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
557 reln->smgr_rnode.spcNode,
558 reln->smgr_rnode.dbNode,
559 reln->smgr_rnode.relNode,
565 * mdwrite() -- Write the supplied block at the appropriate location.
567 * This is to be used only for updating already-existing blocks of a
568 * relation (ie, those before the current EOF). To extend a relation,
/*
 * mdwrite: overwrite an existing block of a relation.
 *
 * reln     - relation to write to.
 * blocknum - block to overwrite; its segment is located with
 *            EXTENSION_FAIL, so this routine never creates segments.
 * buffer   - source of the BLCKSZ bytes to write.
 * isTemp   - forwarded to _mdfd_getseg().
 *
 * Mirrors mdextend() apart from the lookup mode and the message texts:
 * seek to the block's offset, write BLCKSZ bytes, report a failed seek, a
 * failed write or a short write (the last with ERRCODE_DISK_FULL and a
 * disk-space hint), then pass the segment to register_dirty_segment().
 *
 * NOTE(review): the local declarations, the failed-vs-short write test and
 * any guard around the register_dirty_segment() call (presumably on
 * isTemp) are on lines not visible in this listing.
 */
572 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
578 /* This assert is too expensive to have on normally ... */
579 #ifdef CHECK_WRITE_VS_EXTEND
580 Assert(blocknum < mdnblocks(reln));
583 v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
585 #ifndef LET_OS_MANAGE_FILESIZE
586 seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
587 Assert(seekpos < BLCKSZ * RELSEG_SIZE);
589 seekpos = (long) (BLCKSZ * (blocknum));
592 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
594 (errcode_for_file_access(),
595 errmsg("could not seek to block %u of relation %u/%u/%u: %m",
597 reln->smgr_rnode.spcNode,
598 reln->smgr_rnode.dbNode,
599 reln->smgr_rnode.relNode)));
601 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
605 (errcode_for_file_access(),
606 errmsg("could not write block %u of relation %u/%u/%u: %m",
608 reln->smgr_rnode.spcNode,
609 reln->smgr_rnode.dbNode,
610 reln->smgr_rnode.relNode)));
611 /* short write: complain appropriately */
613 (errcode(ERRCODE_DISK_FULL),
614 errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
616 reln->smgr_rnode.spcNode,
617 reln->smgr_rnode.dbNode,
618 reln->smgr_rnode.relNode,
620 errhint("Check free disk space.")));
624 register_dirty_segment(reln, v);
628 * mdnblocks() -- Get the number of blocks stored in a relation.
630 * Important side effect: all active segments of the relation are opened
631 * and added to the mdfd_chain list. If this routine has not been
632 * called, then only segments up to the last one actually touched
633 * are present in the chain.
/*
 * mdnblocks: return the relation's length in blocks.
 *
 * reln - relation to measure; it is opened with EXTENSION_FAIL first.
 *
 * Segmented builds: segments already linked into the chain are walked
 * without measuring them, then each remaining segment is measured with
 * _mdnblocks().  A segment larger than RELSEG_SIZE is a FATAL error.  The
 * first segment shorter than RELSEG_SIZE ends the search, and the result
 * is segno * RELSEG_SIZE plus that segment's block count.  When a segment
 * is exactly full and has no successor in the chain, the next segment is
 * opened with O_CREAT, so this function can create an empty file as a
 * side effect.
 *
 * The final "return _mdnblocks(reln, v)" is presumably the
 * LET_OS_MANAGE_FILESIZE variant -- the #else line is not visible.
 *
 * NOTE(review): the bodies of the chain-skipping loop and the measuring
 * loop (advancing v and segno) are on lines not visible in this listing.
 */
636 mdnblocks(SMgrRelation reln)
638 MdfdVec *v = mdopen(reln, EXTENSION_FAIL);
640 #ifndef LET_OS_MANAGE_FILESIZE
642 BlockNumber segno = 0;
645 * Skip through any segments that aren't the last one, to avoid redundant
646 * seeks on them. We have previously verified that these segments are
647 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
649 * NOTE: this assumption could only be wrong if another backend has
650 * truncated the relation. We rely on higher code levels to handle that
651 * scenario by closing and re-opening the md fd, which is handled via
652 * relcache flush. (Since the bgwriter doesn't participate in relcache
653 * flush, it could have segment chain entries for inactive segments;
654 * that's OK because the bgwriter never needs to compute relation size.)
656 while (v->mdfd_chain != NULL)
664 nblocks = _mdnblocks(reln, v);
665 if (nblocks > ((BlockNumber) RELSEG_SIZE))
666 elog(FATAL, "segment too big");
667 if (nblocks < ((BlockNumber) RELSEG_SIZE))
668 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
671 * If segment is exactly RELSEG_SIZE, advance to next one.
675 if (v->mdfd_chain == NULL)
678 * Because we pass O_CREAT, we will create the next segment (with
679 * zero length) immediately, if the last segment is of length
680 * RELSEG_SIZE. While perhaps not strictly necessary, this keeps
683 v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
684 if (v->mdfd_chain == NULL)
686 (errcode_for_file_access(),
687 errmsg("could not open segment %u of relation %u/%u/%u: %m",
689 reln->smgr_rnode.spcNode,
690 reln->smgr_rnode.dbNode,
691 reln->smgr_rnode.relNode)));
697 return _mdnblocks(reln, v);
702 * mdtruncate() -- Truncate relation to specified number of blocks.
/*
 * mdtruncate: shorten a relation to nblocks blocks.
 *
 * reln    - relation to truncate.
 * nblocks - desired length in blocks.
 * isTemp  - temp-relation flag supplied by the caller; no use of it
 *           survives in the visible lines.
 *
 * The current length comes from mdnblocks(), which also ensures every
 * active segment is in the chain.  A request larger than the current
 * length is treated as bogus; an equal one returns immediately.
 *
 * Segmented builds then visit each segment with priorblocks holding the
 * number of blocks in the segments before it:
 *   - priorblocks > nblocks: the segment lies wholly past the new end; its
 *     file is truncated to zero length but not deleted.
 *   - priorblocks + RELSEG_SIZE > nblocks: this is the last segment kept;
 *     it is cut to (nblocks - priorblocks) blocks and its chain link is
 *     cleared.
 *   - otherwise the segment is kept unchanged.
 * Non-segmented builds truncate the single file to nblocks * BLCKSZ.
 * Each truncated segment is passed to register_dirty_segment().
 *
 * NOTE(review): the loop header, the handling of v/ov, the frees, the
 * InRecovery test and any guard around register_dirty_segment()
 * (presumably on isTemp) are on lines not visible in this listing.
 */
705 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
710 #ifndef LET_OS_MANAGE_FILESIZE
711 BlockNumber priorblocks;
715 * NOTE: mdnblocks makes sure we have opened all active segments, so
716 * that truncation loop will get them all!
718 curnblk = mdnblocks(reln);
719 if (nblocks > curnblk)
721 /* Bogus request ... but no complaint if InRecovery */
725 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
726 reln->smgr_rnode.spcNode,
727 reln->smgr_rnode.dbNode,
728 reln->smgr_rnode.relNode,
731 if (nblocks == curnblk)
732 return; /* no work */
734 v = mdopen(reln, EXTENSION_FAIL);
736 #ifndef LET_OS_MANAGE_FILESIZE
742 if (priorblocks > nblocks)
745 * This segment is no longer active (and has already been
746 * unlinked from the mdfd_chain). We truncate the file, but do
747 * not delete it, for reasons explained in the header comments.
749 if (FileTruncate(v->mdfd_vfd, 0) < 0)
751 (errcode_for_file_access(),
752 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
753 reln->smgr_rnode.spcNode,
754 reln->smgr_rnode.dbNode,
755 reln->smgr_rnode.relNode,
758 register_dirty_segment(reln, v);
760 Assert(ov != reln->md_fd); /* we never drop the 1st segment */
763 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
766 * This is the last segment we want to keep. Truncate the file to
767 * the right length, and clear chain link that points to any
768 * remaining segments (which we shall zap). NOTE: if nblocks is
769 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
770 * segment to 0 length but keep it. This adheres to the invariant
771 * given in the header comments.
773 BlockNumber lastsegblocks = nblocks - priorblocks;
775 if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
777 (errcode_for_file_access(),
778 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
779 reln->smgr_rnode.spcNode,
780 reln->smgr_rnode.dbNode,
781 reln->smgr_rnode.relNode,
784 register_dirty_segment(reln, v);
786 ov->mdfd_chain = NULL;
791 * We still need this segment and 0 or more blocks beyond it, so
792 * nothing to do here.
796 priorblocks += RELSEG_SIZE;
799 if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
801 (errcode_for_file_access(),
802 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
803 reln->smgr_rnode.spcNode,
804 reln->smgr_rnode.dbNode,
805 reln->smgr_rnode.relNode,
808 register_dirty_segment(reln, v);
813 * mdimmedsync() -- Immediately sync a relation to stable storage.
815 * Note that only writes already issued are synced; this routine knows
816 * nothing of dirty buffers that may exist inside the buffer manager.
/*
 * mdimmedsync: fsync a relation's segment files right now.
 *
 * reln - relation to sync.
 *
 * mdnblocks() is called first for its side effect of opening every active
 * segment; the block count it returns is stored in curnblk, and no later
 * use of that value appears in the visible lines.  The relation is then
 * opened with EXTENSION_FAIL and FileSync() is applied to the segment
 * file(s); a failure is reported with "could not fsync segment".
 *
 * Two FileSync call sites follow: the first belongs to the segmented build
 * (presumably inside a loop over the chain), the second to the
 * LET_OS_MANAGE_FILESIZE build.
 *
 * NOTE(review): the loop over the chain, the #else/#endif lines and the
 * segment-number argument of each message are not visible in this listing.
 */
819 mdimmedsync(SMgrRelation reln)
825 * NOTE: mdnblocks makes sure we have opened all active segments, so
826 * that fsync loop will get them all!
828 curnblk = mdnblocks(reln);
830 v = mdopen(reln, EXTENSION_FAIL);
832 #ifndef LET_OS_MANAGE_FILESIZE
835 if (FileSync(v->mdfd_vfd) < 0)
837 (errcode_for_file_access(),
838 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
840 reln->smgr_rnode.spcNode,
841 reln->smgr_rnode.dbNode,
842 reln->smgr_rnode.relNode)));
846 if (FileSync(v->mdfd_vfd) < 0)
848 (errcode_for_file_access(),
849 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
851 reln->smgr_rnode.spcNode,
852 reln->smgr_rnode.dbNode,
853 reln->smgr_rnode.relNode)));
858 * mdsync() -- Sync previous writes to stable storage.
860 * This is only called during checkpoints, and checkpoints should only
861 * occur in processes that have created a pendingOpsTable.
/*
 * Body of mdsync(): carry out every fsync recorded in pendingOpsTable.
 *
 * Calling this without a pendingOpsTable is an ERROR.
 *
 * The work is wrapped in a do { ... } while (need_retry) loop.  Each pass:
 *   1. calls AbsorbFsyncRequests() to pull in queued requests;
 *   2. resets absorb_counter to FSYNCS_PER_ABSORB;
 *   3. scans the hash table sequentially.  For each entry the relation is
 *      opened through smgropen(), the segment is looked up with
 *      _mdfd_getseg() in EXTENSION_RETURN_NULL mode (using the first block
 *      of the segment, segno * RELSEG_SIZE, as the probe), and FileSync()
 *      is applied to it.
 *   4. On failure, an errno that does not satisfy FILE_POSSIBLY_DELETED,
 *      or a second failure on the same entry, is reported as "could not
 *      fsync segment"; otherwise a "but retrying" message is emitted and
 *      the entry is left in the table for the next pass.
 *   5. On success the entry is removed; a failed removal is reported as
 *      "pendingOpsTable corrupted".
 *
 * NOTE(review): the function's signature line, the local declarations,
 * the check of the fsync setting, the action taken when absorb_counter
 * reaches zero, the NULL test on seg, the assignments to need_retry and
 * the ereport levels are on lines not visible in this listing.
 */
868 if (!pendingOpsTable)
869 elog(ERROR, "cannot sync without a pendingOpsTable");
872 * The fsync table could contain requests to fsync relations that have
873 * been deleted (unlinked) by the time we get to them. Rather than
874 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
875 * what we will do is retry the whole process after absorbing fsync
876 * request messages again. Since mdunlink() queues a "revoke" message
877 * before actually unlinking, the fsync request is guaranteed to be gone
878 * the second time if it really was this case. DROP DATABASE likewise
879 * has to tell us to forget fsync requests before it starts deletions.
882 HASH_SEQ_STATUS hstat;
883 PendingOperationEntry *entry;
889 * If we are in the bgwriter, the sync had better include all fsync
890 * requests that were queued by backends before the checkpoint REDO
891 * point was determined. We go that a little better by accepting all
892 * requests queued up to the point where we start fsync'ing.
894 AbsorbFsyncRequests();
896 absorb_counter = FSYNCS_PER_ABSORB;
897 hash_seq_init(&hstat, pendingOpsTable);
898 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
901 * If fsync is off then we don't have to bother opening the file
902 * at all. (We delay checking until this point so that changing
903 * fsync on the fly behaves sensibly.)
911 * If in bgwriter, we want to absorb pending requests every so
912 * often to prevent overflow of the fsync request queue. This
913 * could result in deleting the current entry out from under
914 * our hashtable scan, so the procedure is to fall out of the
915 * scan and start over from the top of the function.
917 if (--absorb_counter <= 0)
924 * Find or create an smgr hash entry for this relation. This
925 * may seem a bit unclean -- md calling smgr? But it's really
926 * the best solution. It ensures that the open file reference
927 * isn't permanently leaked if we get an error here. (You may
928 * say "but an unreferenced SMgrRelation is still a leak!" Not
929 * really, because the only case in which a checkpoint is done
930 * by a process that isn't about to shut down is in the
931 * bgwriter, and it will periodically do smgrcloseall(). This
932 * fact justifies our not closing the reln in the success path
933 * either, which is a good thing since in non-bgwriter cases
934 * we couldn't safely do that.) Furthermore, in many cases
935 * the relation will have been dirtied through this same smgr
936 * relation, and so we can save a file open/close cycle.
938 reln = smgropen(entry->tag.rnode);
941 * It is possible that the relation has been dropped or
942 * truncated since the fsync request was entered. Therefore,
943 * allow ENOENT, but only if we didn't fail once already on
944 * this file. This applies both during _mdfd_getseg() and
945 * during FileSync, since fd.c might have closed the file
948 seg = _mdfd_getseg(reln,
949 entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
950 false, EXTENSION_RETURN_NULL);
952 FileSync(seg->mdfd_vfd) < 0)
955 * XXX is there any point in allowing more than one try?
956 * Don't see one at the moment, but easy to change the
959 if (!FILE_POSSIBLY_DELETED(errno) ||
960 ++(entry->failures) > 1)
962 (errcode_for_file_access(),
963 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
965 entry->tag.rnode.spcNode,
966 entry->tag.rnode.dbNode,
967 entry->tag.rnode.relNode)));
970 (errcode_for_file_access(),
971 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
973 entry->tag.rnode.spcNode,
974 entry->tag.rnode.dbNode,
975 entry->tag.rnode.relNode)));
977 continue; /* don't delete the hashtable entry */
981 /* Okay, delete this entry */
982 if (hash_search(pendingOpsTable, &entry->tag,
983 HASH_REMOVE, NULL) == NULL)
984 elog(ERROR, "pendingOpsTable corrupted");
986 } while (need_retry);
990 * register_dirty_segment() -- Mark a relation segment as needing fsync
992 * If there is a local pending-ops table, just make an entry in it for
993 * mdsync to process later. Otherwise, try to pass off the fsync request
994 * to the background writer process. If that fails, just do the fsync
995 * locally before returning (we expect this will not happen often enough
996 * to be a performance problem).
/*
 * register_dirty_segment: make sure a written segment will get fsync'd.
 *
 * reln - relation the segment belongs to.
 * seg  - the segment's MdfdVec; its mdfd_segno identifies the request and
 *        its mdfd_vfd is used if the fsync has to be done here.
 *
 * Three outcomes, tried in order:
 *   1. a local pendingOpsTable exists: record the request there through
 *      RememberFsyncRequest();
 *   2. ForwardFsyncRequest() accepts the request: nothing more to do;
 *   3. otherwise FileSync() the segment immediately and report a failure
 *      with "could not fsync segment".
 *
 * NOTE(review): the else/brace lines separating cases 1 and 2 are not
 * visible in this listing.
 */
999 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1001 if (pendingOpsTable)
1003 /* push it into local pending-ops table */
1004 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1008 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1009 return; /* passed it off successfully */
1011 if (FileSync(seg->mdfd_vfd) < 0)
1013 (errcode_for_file_access(),
1014 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1016 reln->smgr_rnode.spcNode,
1017 reln->smgr_rnode.dbNode,
1018 reln->smgr_rnode.relNode)));
1023 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1025 * We stuff the fsync request into the local hash table for execution
1026 * during the bgwriter's next checkpoint.
1028 * The range of possible segment numbers is way less than the range of
1029 * BlockNumber, so we can reserve high values of segno for special purposes.
1030 * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
1031 * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
1032 * a whole database. (These are a tad slow because the hash table has to be
1033 * searched linearly, but it doesn't seem worth rethinking the table structure
/*
 * RememberFsyncRequest: apply one fsync request, or one cancellation, to
 * the local pendingOpsTable.
 *
 * rnode - relation the request is about (only its dbNode is compared for
 *         a database-wide cancellation).
 * segno - a real segment number, or one of the FORGET_* sentinels.
 *
 * The table must exist (asserted).  Behavior by segno:
 *   - FORGET_RELATION_FSYNC: scan the whole table and remove every entry
 *     whose rnode equals the given one.
 *   - FORGET_DATABASE_FSYNC: scan the whole table and remove every entry
 *     whose rnode.dbNode equals rnode.dbNode.
 *   - anything else: look up the (rnode, segno) key, and when the entry is
 *     new, start its failure count at zero.  The key is zeroed with MemSet
 *     before it is filled in so that padding bytes cannot make two equal
 *     keys hash or compare differently.
 * A failed removal during either scan raises "pendingOpsTable corrupted".
 *
 * NOTE(review): the assignments into key, the remaining hash_search
 * arguments (presumably an enter-style action plus &found) and the final
 * else/brace lines are not visible in this listing.
 */
1037 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1039 Assert(pendingOpsTable);
1041 if (segno == FORGET_RELATION_FSYNC)
1043 /* Remove any pending requests for the entire relation */
1044 HASH_SEQ_STATUS hstat;
1045 PendingOperationEntry *entry;
1047 hash_seq_init(&hstat, pendingOpsTable);
1048 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1050 if (RelFileNodeEquals(entry->tag.rnode, rnode))
1052 /* Okay, delete this entry */
1053 if (hash_search(pendingOpsTable, &entry->tag,
1054 HASH_REMOVE, NULL) == NULL)
1055 elog(ERROR, "pendingOpsTable corrupted");
1059 else if (segno == FORGET_DATABASE_FSYNC)
1061 /* Remove any pending requests for the entire database */
1062 HASH_SEQ_STATUS hstat;
1063 PendingOperationEntry *entry;
1065 hash_seq_init(&hstat, pendingOpsTable);
1066 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1068 if (entry->tag.rnode.dbNode == rnode.dbNode)
1070 /* Okay, delete this entry */
1071 if (hash_search(pendingOpsTable, &entry->tag,
1072 HASH_REMOVE, NULL) == NULL)
1073 elog(ERROR, "pendingOpsTable corrupted");
1079 /* Normal case: enter a request to fsync this segment */
1080 PendingOperationTag key;
1081 PendingOperationEntry *entry;
1084 /* ensure any pad bytes in the hash key are zeroed */
1085 MemSet(&key, 0, sizeof(key));
1089 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1093 if (!found) /* new entry, so initialize it */
1094 entry->failures = 0;
1099 * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
/*
 * ForgetRelationFsyncRequests: cancel pending fsyncs for one relation.
 *
 * rnode - relation whose pending fsync requests should be dropped.
 *
 * With a local pendingOpsTable the cancellation is applied directly via
 * RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC).  Without one, and
 * when running under a postmaster, the cancellation is forwarded with
 * ForwardFsyncRequest(); the call is repeated, sleeping 10000 microseconds
 * between attempts, until it reports success.  The loop has no retry limit
 * and no interrupt check.  In the remaining case (no table, not under a
 * postmaster) nothing is done.
 */
1102 ForgetRelationFsyncRequests(RelFileNode rnode)
1104 if (pendingOpsTable)
1106 /* standalone backend or startup process: fsync state is local */
1107 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1109 else if (IsUnderPostmaster)
1112 * Notify the bgwriter about it. If we fail to queue the revoke
1113 * message, we have to sleep and try again ... ugly, but hopefully
1114 * won't happen often.
1116 * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with
1117 * an error would leave the no-longer-used file still present on
1118 * disk, which would be bad, so I'm inclined to assume that the
1119 * bgwriter will always empty the queue soon.
1121 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1122 pg_usleep(10000L); /* 10 msec seems a good number */
1124 * Note we don't wait for the bgwriter to actually absorb the
1125 * revoke message; see mdsync() for the implications.
1131 * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
/*
 * ForgetDatabaseFsyncRequests: cancel pending fsyncs for a whole database.
 *
 * dbid - OID of the database; it is placed in the dbNode field of a local
 *        RelFileNode that serves only as a carrier for the request.
 *
 * Dispatch is the same as in ForgetRelationFsyncRequests(), but with the
 * FORGET_DATABASE_FSYNC sentinel: apply locally when pendingOpsTable
 * exists, otherwise (under a postmaster) keep calling
 * ForwardFsyncRequest() with a 10000-microsecond sleep between attempts
 * until it succeeds.
 *
 * NOTE(review): the declaration of rnode and the assignments to its other
 * fields are on lines not visible in this listing.
 */
1134 ForgetDatabaseFsyncRequests(Oid dbid)
1138 rnode.dbNode = dbid;
1142 if (pendingOpsTable)
1144 /* standalone backend or startup process: fsync state is local */
1145 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1147 else if (IsUnderPostmaster)
1149 /* see notes in ForgetRelationFsyncRequests */
1150 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1151 pg_usleep(10000L); /* 10 msec seems a good number */
1157 * _fdvec_alloc() -- Make a MdfdVec object.
/*
 * Body of _fdvec_alloc(): allocate room for one MdfdVec in MdCxt and
 * return it.  Nothing is stored into the new object here; the callers in
 * this file (mdcreate, mdopen, _mdfd_openseg) fill in its fields.
 *
 * NOTE(review): the function's signature line is not visible in this
 * listing.
 */
1162 return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1165 #ifndef LET_OS_MANAGE_FILESIZE
1168 * Open the specified segment of the relation,
1169 * and make a MdfdVec object for it. Returns NULL on failure.
/*
 * _mdfd_openseg: open one segment file and wrap it in a new MdfdVec.
 *
 * reln   - relation the segment belongs to.
 * segno  - segment number; it is appended to the relation path as
 *          ".<segno>" and stored in the new entry.
 * oflags - extra open(2)-style flags OR'ed into O_RDWR | PG_BINARY;
 *          callers in this file pass either O_CREAT or 0.
 *
 * The path buffer is strlen(path) + 12 bytes, sized for the ".segno"
 * suffix.  The new entry starts with no successor, and an assertion checks
 * that the file holds at most RELSEG_SIZE blocks.
 *
 * NOTE(review): the handling of segment 0 (if any), the frees of the path
 * strings, the test on fd, the allocation and fd store for the entry, and
 * the return statements are on lines not visible in this listing.
 */
1172 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1179 path = relpath(reln->smgr_rnode);
1183 /* be sure we have enough space for the '.segno' */
1184 fullpath = (char *) palloc(strlen(path) + 12);
1185 sprintf(fullpath, "%s.%u", path, segno);
1192 fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1199 /* allocate an mdfdvec entry for it */
1202 /* fill the entry */
1204 v->mdfd_segno = segno;
1205 v->mdfd_chain = NULL;
1206 Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1211 #endif /* LET_OS_MANAGE_FILESIZE */
1214 * _mdfd_getseg() -- Find the segment of the relation holding the
1217 * If the segment doesn't exist, we ereport, return NULL, or create the
1218 * segment, according to "behavior". Note: isTemp need only be correct
1219 * in the EXTENSION_CREATE case.
/*
 * _mdfd_getseg: return the MdfdVec of the segment that contains blkno,
 * opening (and possibly creating) intermediate segments on the way.
 *
 * reln     - relation to search; it is opened through mdopen() first.
 * blkno    - block number whose segment is wanted.
 * isTemp   - temp-relation flag; no use of it survives in the visible
 *            lines (presumably it is forwarded to mdextend()).
 * behavior - what to do about a missing segment (see ExtensionBehavior).
 *
 * The target segment number is blkno / RELSEG_SIZE.  The chain is walked
 * from segment 1 up to the target; wherever the chain ends early:
 *   - with EXTENSION_CREATE, or whenever InRecovery is set, the current
 *     segment is first padded to full size if it is short, by writing a
 *     zero-filled block at the segment's last block position through
 *     mdextend(), and the next segment is then opened with O_CREAT;
 *   - otherwise the next segment is opened without O_CREAT.
 * If the open still fails, EXTENSION_RETURN_NULL combined with a
 * "possibly deleted" errno is tolerated; any other failure is reported
 * with "could not open segment".
 *
 * NOTE(review): the NULL test after mdopen(), the remaining mdextend()
 * arguments, the release of zerobuf, the step that advances v, the
 * "return NULL"/"return v" statements and the #endif are on lines not
 * visible in this listing.
 */
1222 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1223 ExtensionBehavior behavior)
1225 MdfdVec *v = mdopen(reln, behavior);
1227 #ifndef LET_OS_MANAGE_FILESIZE
1228 BlockNumber targetseg;
1229 BlockNumber nextsegno;
1232 return NULL; /* only possible if EXTENSION_RETURN_NULL */
1234 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1235 for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1237 Assert(nextsegno == v->mdfd_segno + 1);
1239 if (v->mdfd_chain == NULL)
1242 * Normally we will create new segments only if authorized by
1243 * the caller (i.e., we are doing mdextend()). But when doing
1244 * WAL recovery, create segments anyway; this allows cases such as
1245 * replaying WAL data that has a write into a high-numbered
1246 * segment of a relation that was later deleted. We want to go
1247 * ahead and create the segments so we can finish out the replay.
1249 * We have to maintain the invariant that segments before the
1250 * last active segment are of size RELSEG_SIZE; therefore, pad
1251 * them out with zeroes if needed. (This only matters if caller
1252 * is extending the relation discontiguously, but that can happen
1255 if (behavior == EXTENSION_CREATE || InRecovery)
1257 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1259 char *zerobuf = palloc0(BLCKSZ);
1261 mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1265 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1269 /* We won't create segment if not existent */
1270 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1272 if (v->mdfd_chain == NULL)
1274 if (behavior == EXTENSION_RETURN_NULL &&
1275 FILE_POSSIBLY_DELETED(errno))
1278 (errcode_for_file_access(),
1279 errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1281 reln->smgr_rnode.spcNode,
1282 reln->smgr_rnode.dbNode,
1283 reln->smgr_rnode.relNode,
1295 * Get number of blocks present in a single disk file
/*
 * _mdnblocks: number of whole blocks in one segment file.
 *
 * reln - relation the segment belongs to (used only for error text).
 * seg  - the segment to measure.
 *
 * The length is found by seeking the segment's file to its end, which
 * moves that file's seek position as a side effect.  The result is the
 * length divided by BLCKSZ using integer division, so a trailing partial
 * block is not counted.  A failed seek is reported with "could not seek
 * to end of segment".
 *
 * NOTE(review): the declaration of len and the test that detects a failed
 * seek are on lines not visible in this listing.
 */
1298 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1302 len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1305 (errcode_for_file_access(),
1306 errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1308 reln->smgr_rnode.spcNode,
1309 reln->smgr_rnode.dbNode,
1310 reln->smgr_rnode.relNode)));
1311 /* note that this calculation will ignore any partial block at EOF */
1312 return (BlockNumber) (len / BLCKSZ);