/*-------------------------------------------------------------------------
 *
 * smgr.c
 *    public interface routines to storage manager switch.
 *
 *    All file system operations in POSTGRES dispatch through these
 *    routines.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/storage/smgr/smgr.c
 *
 *-------------------------------------------------------------------------
 */
20 #include "commands/tablespace.h"
21 #include "storage/bufmgr.h"
22 #include "storage/ipc.h"
23 #include "storage/smgr.h"
24 #include "utils/hsearch.h"
25 #include "utils/inval.h"
29 * This struct of function pointers defines the API between smgr.c and
30 * any individual storage manager module. Note that smgr subfunctions are
31 * generally expected to report problems via elog(ERROR). An exception is
32 * that smgr_unlink should use elog(WARNING), rather than erroring out,
33 * because we normally unlink relations during post-commit/abort cleanup,
34 * and so it's too late to raise an error. Also, various conditions that
35 * would normally be errors should be allowed during bootstrap and/or WAL
36 * recovery --- see comments in md.c for details.
40 void (*smgr_init) (void); /* may be NULL */
41 void (*smgr_shutdown) (void); /* may be NULL */
42 void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
43 void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
45 bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
46 void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum,
48 void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
49 BlockNumber blocknum, char *buffer, bool skipFsync);
50 void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
51 BlockNumber blocknum);
52 void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
53 BlockNumber blocknum, char *buffer);
54 void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
55 BlockNumber blocknum, char *buffer, bool skipFsync);
56 void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
57 BlockNumber blocknum, BlockNumber nblocks);
58 BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
59 void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
61 void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
62 void (*smgr_pre_ckpt) (void); /* may be NULL */
63 void (*smgr_sync) (void); /* may be NULL */
64 void (*smgr_post_ckpt) (void); /* may be NULL */
68 static const f_smgr smgrsw[] = {
72 .smgr_shutdown = NULL,
73 .smgr_close = mdclose,
74 .smgr_create = mdcreate,
75 .smgr_exists = mdexists,
76 .smgr_unlink = mdunlink,
77 .smgr_extend = mdextend,
78 .smgr_prefetch = mdprefetch,
80 .smgr_write = mdwrite,
81 .smgr_writeback = mdwriteback,
82 .smgr_nblocks = mdnblocks,
83 .smgr_truncate = mdtruncate,
84 .smgr_immedsync = mdimmedsync,
85 .smgr_pre_ckpt = mdpreckpt,
87 .smgr_post_ckpt = mdpostckpt
91 static const int NSmgr = lengthof(smgrsw);
95 * Each backend has a hashtable that stores all extant SMgrRelation objects.
96 * In addition, "unowned" SMgrRelation objects are chained together in a list.
98 static HTAB *SMgrRelationHash = NULL;
100 static SMgrRelation first_unowned_reln = NULL;
102 /* local function prototypes */
103 static void smgrshutdown(int code, Datum arg);
104 static void add_to_unowned_list(SMgrRelation reln);
105 static void remove_from_unowned_list(SMgrRelation reln);
109 * smgrinit(), smgrshutdown() -- Initialize or shut down storage
112 * Note: smgrinit is called during backend startup (normal or standalone
113 * case), *not* during postmaster start. Therefore, any resources created
114 * here or destroyed in smgrshutdown are backend-local.
121 for (i = 0; i < NSmgr; i++)
123 if (smgrsw[i].smgr_init)
124 smgrsw[i].smgr_init();
127 /* register the shutdown proc */
128 on_proc_exit(smgrshutdown, 0);
132 * on_proc_exit hook for smgr cleanup during backend shutdown
135 smgrshutdown(int code, Datum arg)
139 for (i = 0; i < NSmgr; i++)
141 if (smgrsw[i].smgr_shutdown)
142 smgrsw[i].smgr_shutdown();
147 * smgropen() -- Return an SMgrRelation object, creating it if need be.
149 * This does not attempt to actually open the underlying file.
152 smgropen(RelFileNode rnode, BackendId backend)
154 RelFileNodeBackend brnode;
158 if (SMgrRelationHash == NULL)
160 /* First time through: initialize the hash table */
163 MemSet(&ctl, 0, sizeof(ctl));
164 ctl.keysize = sizeof(RelFileNodeBackend);
165 ctl.entrysize = sizeof(SMgrRelationData);
166 SMgrRelationHash = hash_create("smgr relation table", 400,
167 &ctl, HASH_ELEM | HASH_BLOBS);
168 first_unowned_reln = NULL;
171 /* Look up or create an entry */
173 brnode.backend = backend;
174 reln = (SMgrRelation) hash_search(SMgrRelationHash,
178 /* Initialize it if not present before */
183 /* hash_search already filled in the lookup key */
184 reln->smgr_owner = NULL;
185 reln->smgr_targblock = InvalidBlockNumber;
186 reln->smgr_fsm_nblocks = InvalidBlockNumber;
187 reln->smgr_vm_nblocks = InvalidBlockNumber;
188 reln->smgr_which = 0; /* we only have md.c at present */
190 /* mark it not open */
191 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
192 reln->md_num_open_segs[forknum] = 0;
194 /* it has no owner yet */
195 add_to_unowned_list(reln);
202 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
204 * There can be only one owner at a time; this is sufficient since currently
205 * the only such owners exist in the relcache.
208 smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
210 /* We don't support "disowning" an SMgrRelation here, use smgrclearowner */
211 Assert(owner != NULL);
214 * First, unhook any old owner. (Normally there shouldn't be any, but it
215 * seems possible that this can happen during swap_relation_files()
216 * depending on the order of processing. It's ok to close the old
217 * relcache entry early in that case.)
219 * If there isn't an old owner, then the reln should be in the unowned
220 * list, and we need to remove it.
222 if (reln->smgr_owner)
223 *(reln->smgr_owner) = NULL;
225 remove_from_unowned_list(reln);
227 /* Now establish the ownership relationship. */
228 reln->smgr_owner = owner;
233 * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object
237 smgrclearowner(SMgrRelation *owner, SMgrRelation reln)
239 /* Do nothing if the SMgrRelation object is not owned by the owner */
240 if (reln->smgr_owner != owner)
243 /* unset the owner's reference */
246 /* unset our reference to the owner */
247 reln->smgr_owner = NULL;
249 add_to_unowned_list(reln);
253 * add_to_unowned_list -- link an SMgrRelation onto the unowned list
255 * Check remove_from_unowned_list()'s comments for performance
259 add_to_unowned_list(SMgrRelation reln)
261 /* place it at head of the list (to make smgrsetowner cheap) */
262 reln->next_unowned_reln = first_unowned_reln;
263 first_unowned_reln = reln;
267 * remove_from_unowned_list -- unlink an SMgrRelation from the unowned list
269 * If the reln is not present in the list, nothing happens. Typically this
270 * would be caller error, but there seems no reason to throw an error.
272 * In the worst case this could be rather slow; but in all the cases that seem
273 * likely to be performance-critical, the reln being sought will actually be
274 * first in the list. Furthermore, the number of unowned relns touched in any
275 * one transaction shouldn't be all that high typically. So it doesn't seem
276 * worth expending the additional space and management logic needed for a
277 * doubly-linked list.
280 remove_from_unowned_list(SMgrRelation reln)
285 for (link = &first_unowned_reln, cur = *link;
287 link = &cur->next_unowned_reln, cur = *link)
291 *link = cur->next_unowned_reln;
292 cur->next_unowned_reln = NULL;
299 * smgrexists() -- Does the underlying file for a fork exist?
302 smgrexists(SMgrRelation reln, ForkNumber forknum)
304 return smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
308 * smgrclose() -- Close and delete an SMgrRelation object.
311 smgrclose(SMgrRelation reln)
316 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
317 smgrsw[reln->smgr_which].smgr_close(reln, forknum);
319 owner = reln->smgr_owner;
322 remove_from_unowned_list(reln);
324 if (hash_search(SMgrRelationHash,
325 (void *) &(reln->smgr_rnode),
326 HASH_REMOVE, NULL) == NULL)
327 elog(ERROR, "SMgrRelation hashtable corrupted");
330 * Unhook the owner pointer, if any. We do this last since in the remote
331 * possibility of failure above, the SMgrRelation object will still exist.
338 * smgrcloseall() -- Close all existing SMgrRelation objects.
343 HASH_SEQ_STATUS status;
346 /* Nothing to do if hashtable not set up */
347 if (SMgrRelationHash == NULL)
350 hash_seq_init(&status, SMgrRelationHash);
352 while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
357 * smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
360 * This has the same effects as smgrclose(smgropen(rnode)), but it avoids
361 * uselessly creating a hashtable entry only to drop it again when no
362 * such entry exists already.
365 smgrclosenode(RelFileNodeBackend rnode)
369 /* Nothing to do if hashtable not set up */
370 if (SMgrRelationHash == NULL)
373 reln = (SMgrRelation) hash_search(SMgrRelationHash,
381 * smgrcreate() -- Create a new relation.
383 * Given an already-created (but presumably unused) SMgrRelation,
384 * cause the underlying disk file or other storage for the fork
387 * If isRedo is true, it is okay for the underlying file to exist
388 * already because we are in a WAL replay sequence.
391 smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
394 * Exit quickly in WAL replay mode if we've already opened the file. If
395 * it's open, it surely must exist.
397 if (isRedo && reln->md_num_open_segs[forknum] > 0)
401 * We may be using the target table space for the first time in this
402 * database, so create a per-database subdirectory if needed.
404 * XXX this is a fairly ugly violation of module layering, but this seems
405 * to be the best place to put the check. Maybe TablespaceCreateDbspace
406 * should be here and not in commands/tablespace.c? But that would imply
407 * importing a lot of stuff that smgr.c oughtn't know, either.
409 TablespaceCreateDbspace(reln->smgr_rnode.node.spcNode,
410 reln->smgr_rnode.node.dbNode,
413 smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
417 * smgrdounlink() -- Immediately unlink all forks of a relation.
419 * All forks of the relation are removed from the store. This should
420 * not be used during transactional operations, since it can't be undone.
422 * If isRedo is true, it is okay for the underlying file(s) to be gone
425 * This is equivalent to calling smgrdounlinkfork for each fork, but
426 * it's significantly quicker so should be preferred when possible.
429 smgrdounlink(SMgrRelation reln, bool isRedo)
431 RelFileNodeBackend rnode = reln->smgr_rnode;
432 int which = reln->smgr_which;
435 /* Close the forks at smgr level */
436 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
437 smgrsw[which].smgr_close(reln, forknum);
440 * Get rid of any remaining buffers for the relation. bufmgr will just
441 * drop them without bothering to write the contents.
443 DropRelFileNodesAllBuffers(&rnode, 1);
446 * It'd be nice to tell the stats collector to forget it immediately, too.
447 * But we can't because we don't know the OID (and in cases involving
448 * relfilenode swaps, it's not always clear which table OID to forget,
453 * Send a shared-inval message to force other backends to close any
454 * dangling smgr references they may have for this rel. We should do this
455 * before starting the actual unlinking, in case we fail partway through
456 * that step. Note that the sinval message will eventually come back to
457 * this backend, too, and thereby provide a backstop that we closed our
460 CacheInvalidateSmgr(rnode);
463 * Delete the physical file(s).
465 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
466 * ERROR, because we've already decided to commit or abort the current
469 smgrsw[which].smgr_unlink(rnode, InvalidForkNumber, isRedo);
473 * smgrdounlinkall() -- Immediately unlink all forks of all given relations
475 * All forks of all given relations are removed from the store. This
476 * should not be used during transactional operations, since it can't be
479 * If isRedo is true, it is okay for the underlying file(s) to be gone
482 * This is equivalent to calling smgrdounlink for each relation, but it's
483 * significantly quicker so should be preferred when possible.
486 smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
489 RelFileNodeBackend *rnodes;
496 * create an array which contains all relations to be dropped, and close
497 * each relation's forks at the smgr level while at it
499 rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
500 for (i = 0; i < nrels; i++)
502 RelFileNodeBackend rnode = rels[i]->smgr_rnode;
503 int which = rels[i]->smgr_which;
507 /* Close the forks at smgr level */
508 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
509 smgrsw[which].smgr_close(rels[i], forknum);
513 * Get rid of any remaining buffers for the relations. bufmgr will just
514 * drop them without bothering to write the contents.
516 DropRelFileNodesAllBuffers(rnodes, nrels);
519 * It'd be nice to tell the stats collector to forget them immediately,
520 * too. But we can't because we don't know the OIDs.
524 * Send a shared-inval message to force other backends to close any
525 * dangling smgr references they may have for these rels. We should do
526 * this before starting the actual unlinking, in case we fail partway
527 * through that step. Note that the sinval messages will eventually come
528 * back to this backend, too, and thereby provide a backstop that we
529 * closed our own smgr rel.
531 for (i = 0; i < nrels; i++)
532 CacheInvalidateSmgr(rnodes[i]);
535 * Delete the physical file(s).
537 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
538 * ERROR, because we've already decided to commit or abort the current
542 for (i = 0; i < nrels; i++)
544 int which = rels[i]->smgr_which;
546 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
547 smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo);
554 * smgrdounlinkfork() -- Immediately unlink one fork of a relation.
556 * The specified fork of the relation is removed from the store. This
557 * should not be used during transactional operations, since it can't be
560 * If isRedo is true, it is okay for the underlying file to be gone
564 smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo)
566 RelFileNodeBackend rnode = reln->smgr_rnode;
567 int which = reln->smgr_which;
569 /* Close the fork at smgr level */
570 smgrsw[which].smgr_close(reln, forknum);
573 * Get rid of any remaining buffers for the fork. bufmgr will just drop
574 * them without bothering to write the contents.
576 DropRelFileNodeBuffers(rnode, forknum, 0);
579 * It'd be nice to tell the stats collector to forget it immediately, too.
580 * But we can't because we don't know the OID (and in cases involving
581 * relfilenode swaps, it's not always clear which table OID to forget,
586 * Send a shared-inval message to force other backends to close any
587 * dangling smgr references they may have for this rel. We should do this
588 * before starting the actual unlinking, in case we fail partway through
589 * that step. Note that the sinval message will eventually come back to
590 * this backend, too, and thereby provide a backstop that we closed our
593 CacheInvalidateSmgr(rnode);
596 * Delete the physical file(s).
598 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
599 * ERROR, because we've already decided to commit or abort the current
602 smgrsw[which].smgr_unlink(rnode, forknum, isRedo);
606 * smgrextend() -- Add a new block to a file.
608 * The semantics are nearly the same as smgrwrite(): write at the
609 * specified position. However, this is to be used for the case of
610 * extending a relation (i.e., blocknum is at or beyond the current
611 * EOF). Note that we assume writing a block beyond current EOF
612 * causes intervening file space to become filled with zeroes.
615 smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
616 char *buffer, bool skipFsync)
618 smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
623 * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
626 smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
628 smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
632 * smgrread() -- read a particular block from a relation into the supplied
635 * This routine is called from the buffer manager in order to
636 * instantiate pages in the shared buffer cache. All storage managers
637 * return pages in the format that POSTGRES expects.
640 smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
643 smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
647 * smgrwrite() -- Write the supplied buffer out.
649 * This is to be used only for updating already-existing blocks of a
650 * relation (ie, those before the current EOF). To extend a relation,
653 * This is not a synchronous write -- the block is not necessarily
654 * on disk at return, only dumped out to the kernel. However,
655 * provisions will be made to fsync the write before the next checkpoint.
657 * skipFsync indicates that the caller will make other provisions to
658 * fsync the relation, so we needn't bother. Temporary relations also
659 * do not require fsync.
662 smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
663 char *buffer, bool skipFsync)
665 smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
671 * smgrwriteback() -- Trigger kernel writeback for the supplied range of
675 smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
678 smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
683 * smgrnblocks() -- Calculate the number of blocks in the
687 smgrnblocks(SMgrRelation reln, ForkNumber forknum)
689 return smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
693 * smgrtruncate() -- Truncate supplied relation to the specified number
696 * The truncation is done immediately, so this can't be rolled back.
699 smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
702 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
703 * just drop them without bothering to write the contents.
705 DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nblocks);
708 * Send a shared-inval message to force other backends to close any smgr
709 * references they may have for this rel. This is useful because they
710 * might have open file pointers to segments that got removed, and/or
711 * smgr_targblock variables pointing past the new rel end. (The inval
712 * message will come back to our backend, too, causing a
713 * probably-unnecessary local smgr flush. But we don't expect that this
714 * is a performance-critical path.) As in the unlink code, we want to be
715 * sure the message is sent before we start changing things on-disk.
717 CacheInvalidateSmgr(reln->smgr_rnode);
722 smgrsw[reln->smgr_which].smgr_truncate(reln, forknum, nblocks);
726 * smgrimmedsync() -- Force the specified relation to stable storage.
728 * Synchronously force all previous writes to the specified relation
731 * This is useful for building completely new relations (eg, new
732 * indexes). Instead of incrementally WAL-logging the index build
733 * steps, we can just write completed index pages to disk with smgrwrite
734 * or smgrextend, and then fsync the completed index file before
735 * committing the transaction. (This is sufficient for purposes of
736 * crash recovery, since it effectively duplicates forcing a checkpoint
737 * for the completed index. But it is *not* sufficient if one wishes
738 * to use the WAL log for PITR or replication purposes: in that case
739 * we have to make WAL entries as well.)
741 * The preceding writes should specify skipFsync = true to avoid
742 * duplicative fsyncs.
744 * Note that you need to do FlushRelationBuffers() first if there is
745 * any possibility that there are dirty buffers for the relation;
746 * otherwise the sync is not very meaningful.
749 smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
751 smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
756 * smgrpreckpt() -- Prepare for checkpoint.
763 for (i = 0; i < NSmgr; i++)
765 if (smgrsw[i].smgr_pre_ckpt)
766 smgrsw[i].smgr_pre_ckpt();
771 * smgrsync() -- Sync files to disk during checkpoint.
778 for (i = 0; i < NSmgr; i++)
780 if (smgrsw[i].smgr_sync)
781 smgrsw[i].smgr_sync();
786 * smgrpostckpt() -- Post-checkpoint cleanup.
793 for (i = 0; i < NSmgr; i++)
795 if (smgrsw[i].smgr_post_ckpt)
796 smgrsw[i].smgr_post_ckpt();
803 * This routine is called during transaction commit or abort (it doesn't
804 * particularly care which). All transient SMgrRelation objects are closed.
806 * We do this as a compromise between wanting transient SMgrRelations to
807 * live awhile (to amortize the costs of blind writes of multiple blocks)
808 * and needing them to not live forever (since we're probably holding open
809 * a kernel file descriptor for the underlying file, and we need to ensure
810 * that gets closed reasonably soon if the file gets deleted).
816 * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each
819 while (first_unowned_reln != NULL)
821 Assert(first_unowned_reln->smgr_owner == NULL);
822 smgrclose(first_unowned_reln);