1 /*-------------------------------------------------------------------------
4 * code to create and destroy physical storage for relations
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/catalog/storage.c
14 * Some of this code used to be in storage/smgr/smgr.c, and the
15 * function names still reflect that.
17 *-------------------------------------------------------------------------
22 #include "miscadmin.h"
24 #include "access/visibilitymap.h"
25 #include "access/xact.h"
26 #include "access/xlog.h"
27 #include "access/xloginsert.h"
28 #include "access/xlogutils.h"
29 #include "catalog/storage.h"
30 #include "catalog/storage_xlog.h"
31 #include "storage/freespace.h"
32 #include "storage/smgr.h"
33 #include "utils/memutils.h"
34 #include "utils/rel.h"
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
 * transaction is aborted.  Conversely, a deletion request is NOT
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * prematurely.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
 */
56 typedef struct PendingRelDelete
58 RelFileNode relnode; /* relation that may need to be deleted */
59 BackendId backend; /* InvalidBackendId if not a temp rel */
60 bool atCommit; /* T=delete at commit; F=delete at abort */
61 int nestLevel; /* xact nesting level of request */
62 struct PendingRelDelete *next; /* linked-list link */
65 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
68 * RelationCreateStorage
69 * Create physical storage for a relation.
71 * Create the underlying disk file storage for the relation. This only
72 * creates the main fork; additional forks are created lazily by the
73 * modules that need them.
75 * This function is transactional. The creation is WAL-logged, and if the
76 * transaction aborts later on, the storage will be destroyed.
79 RelationCreateStorage(RelFileNode rnode, char relpersistence)
81 PendingRelDelete *pending;
86 switch (relpersistence)
88 case RELPERSISTENCE_TEMP:
89 backend = BackendIdForTempRelations();
92 case RELPERSISTENCE_UNLOGGED:
93 backend = InvalidBackendId;
96 case RELPERSISTENCE_PERMANENT:
97 backend = InvalidBackendId;
101 elog(ERROR, "invalid relpersistence: %c", relpersistence);
102 return NULL; /* placate compiler */
105 srel = smgropen(rnode, backend);
106 smgrcreate(srel, MAIN_FORKNUM, false);
109 log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
111 /* Add the relation to the list of stuff to delete at abort */
112 pending = (PendingRelDelete *)
113 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
114 pending->relnode = rnode;
115 pending->backend = backend;
116 pending->atCommit = false; /* delete if abort */
117 pending->nestLevel = GetCurrentTransactionNestLevel();
118 pending->next = pendingDeletes;
119 pendingDeletes = pending;
125 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
128 log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
130 xl_smgr_create xlrec;
133 * Make an XLOG entry reporting the file creation.
135 xlrec.rnode = *rnode;
136 xlrec.forkNum = forkNum;
139 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
140 XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
144 * RelationDropStorage
145 * Schedule unlinking of physical storage at transaction commit.
148 RelationDropStorage(Relation rel)
150 PendingRelDelete *pending;
152 /* Add the relation to the list of stuff to delete at commit */
153 pending = (PendingRelDelete *)
154 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
155 pending->relnode = rel->rd_node;
156 pending->backend = rel->rd_backend;
157 pending->atCommit = true; /* delete if commit */
158 pending->nestLevel = GetCurrentTransactionNestLevel();
159 pending->next = pendingDeletes;
160 pendingDeletes = pending;
163 * NOTE: if the relation was created in this transaction, it will now be
164 * present in the pending-delete list twice, once with atCommit true and
165 * once with atCommit false. Hence, it will be physically deleted at end
166 * of xact in either case (and the other entry will be ignored by
167 * smgrDoPendingDeletes, so no error will occur). We could instead remove
168 * the existing list entry and delete the physical file immediately, but
169 * for now I'll keep the logic simple.
172 RelationCloseSmgr(rel);
176 * RelationPreserveStorage
177 * Mark a relation as not to be deleted after all.
179 * We need this function because relation mapping changes are committed
180 * separately from commit of the whole transaction, so it's still possible
181 * for the transaction to abort after the mapping update is done.
182 * When a new physical relation is installed in the map, it would be
183 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
184 * The relation mapper fixes this by telling us to not delete such relations
185 * after all as part of its commit.
187 * We also use this to reuse an old build of an index during ALTER TABLE, this
188 * time removing the delete-at-commit entry.
190 * No-op if the relation is not among those scheduled for deletion.
193 RelationPreserveStorage(RelFileNode rnode, bool atCommit)
195 PendingRelDelete *pending;
196 PendingRelDelete *prev;
197 PendingRelDelete *next;
200 for (pending = pendingDeletes; pending != NULL; pending = next)
202 next = pending->next;
203 if (RelFileNodeEquals(rnode, pending->relnode)
204 && pending->atCommit == atCommit)
206 /* unlink and delete list entry */
210 pendingDeletes = next;
212 /* prev does not change */
216 /* unrelated entry, don't touch it */
224 * Physically truncate a relation to the specified number of blocks.
226 * This includes getting rid of any buffers for the blocks that are to be
230 RelationTruncate(Relation rel, BlockNumber nblocks)
234 bool need_fsm_vacuum = false;
235 ForkNumber forks[MAX_FORKNUM];
236 BlockNumber blocks[MAX_FORKNUM];
239 /* Open it at the smgr level if not already done */
240 RelationOpenSmgr(rel);
243 * Make sure smgr_targblock etc aren't pointing somewhere past new end
245 rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
246 rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
247 rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
249 /* Prepare for truncation of MAIN fork of the relation */
250 forks[nforks] = MAIN_FORKNUM;
251 blocks[nforks] = nblocks;
254 /* Prepare for truncation of the FSM if it exists */
255 fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
258 blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
259 if (BlockNumberIsValid(blocks[nforks]))
261 forks[nforks] = FSM_FORKNUM;
263 need_fsm_vacuum = true;
267 /* Prepare for truncation of the visibility map too if it exists */
268 vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
271 blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
272 if (BlockNumberIsValid(blocks[nforks]))
274 forks[nforks] = VISIBILITYMAP_FORKNUM;
280 * We WAL-log the truncation before actually truncating, which means
281 * trouble if the truncation fails. If we then crash, the WAL replay
282 * likely isn't going to succeed in the truncation either, and cause a
283 * PANIC. It's tempting to put a critical section here, but that cure
284 * would be worse than the disease. It would turn a usually harmless
285 * failure to truncate, that might spell trouble at WAL replay, into a
288 if (RelationNeedsWAL(rel))
291 * Make an XLOG entry reporting the file truncation.
294 xl_smgr_truncate xlrec;
296 xlrec.blkno = nblocks;
297 xlrec.rnode = rel->rd_node;
298 xlrec.flags = SMGR_TRUNCATE_ALL;
301 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
303 lsn = XLogInsert(RM_SMGR_ID,
304 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
307 * Flush, because otherwise the truncation of the main relation might
308 * hit the disk before the WAL record, and the truncation of the FSM
309 * or visibility map. If we crashed during that window, we'd be left
310 * with a truncated heap, but the FSM or visibility map would still
311 * contain entries for the non-existent heap pages.
317 /* Do the real work to truncate relation forks */
318 smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
321 * Update upper-level FSM pages to account for the truncation.
322 * This is important because the just-truncated pages were likely
323 * marked as all-free, and would be preferentially selected.
326 FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
330 * Copy a fork's data, block by block.
332 * Note that this requires that there is no dirty data in shared buffers. If
333 * it's possible that there are, callers need to flush those using
334 * e.g. FlushRelationBuffers(rel).
337 RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
338 ForkNumber forkNum, char relpersistence)
343 bool copying_initfork;
347 page = (Page) buf.data;
350 * The init fork for an unlogged relation in many respects has to be
351 * treated the same as normal relation, changes need to be WAL logged and
352 * it needs to be synced to disk.
354 copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
355 forkNum == INIT_FORKNUM;
358 * We need to log the copied data in WAL iff WAL archiving/streaming is
359 * enabled AND it's a permanent relation.
361 use_wal = XLogIsNeeded() &&
362 (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
364 nblocks = smgrnblocks(src, forkNum);
366 for (blkno = 0; blkno < nblocks; blkno++)
368 /* If we got a cancel signal during the copy of the data, quit */
369 CHECK_FOR_INTERRUPTS();
371 smgrread(src, forkNum, blkno, buf.data);
373 if (!PageIsVerified(page, blkno))
375 (errcode(ERRCODE_DATA_CORRUPTED),
376 errmsg("invalid page in block %u of relation %s",
378 relpathbackend(src->smgr_rnode.node,
379 src->smgr_rnode.backend,
383 * WAL-log the copied page. Unfortunately we don't know what kind of a
384 * page this is, so we have to log the full page including any unused
388 log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
390 PageSetChecksumInplace(page, blkno);
393 * Now write the page. We say skipFsync = true because there's no
394 * need for smgr to schedule an fsync for this write; we'll do it
397 smgrextend(dst, forkNum, blkno, buf.data, true);
401 * If the rel is WAL-logged, must fsync before commit. We use heap_sync
402 * to ensure that the toast table gets fsync'd too. (For a temp or
403 * unlogged rel we don't care since the data will be gone after a crash
406 * It's obvious that we must do this when not WAL-logging the copy. It's
407 * less obvious that we have to do it even if we did WAL-log the copied
408 * pages. The reason is that since we're copying outside shared buffers, a
409 * CHECKPOINT occurring during the copy has no way to flush the previously
410 * written data to disk (indeed it won't know the new rel even exists). A
411 * crash later on would replay WAL from the checkpoint, therefore it
412 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
413 * here, they might still not be on disk when the crash occurs.
415 if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
416 smgrimmedsync(dst, forkNum);
420 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
422 * This also runs when aborting a subxact; we want to clean up a failed
423 * subxact immediately.
425 * Note: It's possible that we're being asked to remove a relation that has
426 * no physical storage in any fork. In particular, it's possible that we're
427 * cleaning up an old temporary relation for which RemovePgTempFiles has
428 * already recovered the physical storage.
431 smgrDoPendingDeletes(bool isCommit)
433 int nestLevel = GetCurrentTransactionNestLevel();
434 PendingRelDelete *pending;
435 PendingRelDelete *prev;
436 PendingRelDelete *next;
440 SMgrRelation *srels = NULL;
443 for (pending = pendingDeletes; pending != NULL; pending = next)
445 next = pending->next;
446 if (pending->nestLevel < nestLevel)
448 /* outer-level entries should not be processed yet */
453 /* unlink list entry first, so we don't retry on failure */
457 pendingDeletes = next;
458 /* do deletion if called for */
459 if (pending->atCommit == isCommit)
463 srel = smgropen(pending->relnode, pending->backend);
465 /* allocate the initial array, or extend it, if needed */
469 srels = palloc(sizeof(SMgrRelation) * maxrels);
471 else if (maxrels <= nrels)
474 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
477 srels[nrels++] = srel;
479 /* must explicitly free the list entry */
481 /* prev does not change */
487 smgrdounlinkall(srels, nrels, false);
489 for (i = 0; i < nrels; i++)
497 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
499 * The return value is the number of relations scheduled for termination.
500 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
501 * If there are no relations to be deleted, *ptr is set to NULL.
503 * Only non-temporary relations are included in the returned list. This is OK
504 * because the list is used only in contexts where temporary relations don't
505 * matter: we're either writing to the two-phase state file (and transactions
506 * that have touched temp tables can't be prepared) or we're writing to xlog
507 * (and all temporary files will be zapped if we restart anyway, so no need
508 * for redo to do it also).
510 * Note that the list does not include anything scheduled for termination
511 * by upper-level transactions.
514 smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
516 int nestLevel = GetCurrentTransactionNestLevel();
519 PendingRelDelete *pending;
522 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
524 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
525 && pending->backend == InvalidBackendId)
533 rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
535 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
537 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
538 && pending->backend == InvalidBackendId)
540 *rptr = pending->relnode;
548 * PostPrepare_smgr -- Clean up after a successful PREPARE
550 * What we have to do here is throw away the in-memory state about pending
551 * relation deletes. It's all been recorded in the 2PC state file and
552 * it's no longer smgr's job to worry about it.
555 PostPrepare_smgr(void)
557 PendingRelDelete *pending;
558 PendingRelDelete *next;
560 for (pending = pendingDeletes; pending != NULL; pending = next)
562 next = pending->next;
563 pendingDeletes = next;
564 /* must explicitly free the list entry */
571 * AtSubCommit_smgr() --- Take care of subtransaction commit.
573 * Reassign all items in the pending-deletes list to the parent transaction.
576 AtSubCommit_smgr(void)
578 int nestLevel = GetCurrentTransactionNestLevel();
579 PendingRelDelete *pending;
581 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
583 if (pending->nestLevel >= nestLevel)
584 pending->nestLevel = nestLevel - 1;
/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Delete created relations and forget about deleted relations.
 * We can execute these operations immediately because we know this
 * subtransaction will not commit.
 */
void
AtSubAbort_smgr(void)
{
	/* isCommit = false: act on the delete-at-abort entries */
	smgrDoPendingDeletes(false);
}
602 smgr_redo(XLogReaderState *record)
604 XLogRecPtr lsn = record->EndRecPtr;
605 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
607 /* Backup blocks are not used in smgr records */
608 Assert(!XLogRecHasAnyBlockRefs(record));
610 if (info == XLOG_SMGR_CREATE)
612 xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
615 reln = smgropen(xlrec->rnode, InvalidBackendId);
616 smgrcreate(reln, xlrec->forkNum, true);
618 else if (info == XLOG_SMGR_TRUNCATE)
620 xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
623 ForkNumber forks[MAX_FORKNUM];
624 BlockNumber blocks[MAX_FORKNUM];
626 bool need_fsm_vacuum = false;
628 reln = smgropen(xlrec->rnode, InvalidBackendId);
631 * Forcibly create relation if it doesn't exist (which suggests that
632 * it was dropped somewhere later in the WAL sequence). As in
633 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
634 * log as best we can until the drop is seen.
636 smgrcreate(reln, MAIN_FORKNUM, true);
639 * Before we perform the truncation, update minimum recovery point to
640 * cover this WAL record. Once the relation is truncated, there's no
641 * going back. The buffer manager enforces the WAL-first rule for
642 * normal updates to relation files, so that the minimum recovery
643 * point is always updated before the corresponding change in the data
644 * file is flushed to disk. We have to do the same manually here.
646 * Doing this before the truncation means that if the truncation fails
647 * for some reason, you cannot start up the system even after restart,
648 * until you fix the underlying situation so that the truncation will
649 * succeed. Alternatively, we could update the minimum recovery point
650 * after truncation, but that would leave a small window where the
651 * WAL-first rule could be violated.
655 /* Prepare for truncation of MAIN fork */
656 if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
658 forks[nforks] = MAIN_FORKNUM;
659 blocks[nforks] = xlrec->blkno;
662 /* Also tell xlogutils.c about it */
663 XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
666 /* Prepare for truncation of FSM and VM too */
667 rel = CreateFakeRelcacheEntry(xlrec->rnode);
669 if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
670 smgrexists(reln, FSM_FORKNUM))
672 blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
673 if (BlockNumberIsValid(blocks[nforks]))
675 forks[nforks] = FSM_FORKNUM;
677 need_fsm_vacuum = true;
680 if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
681 smgrexists(reln, VISIBILITYMAP_FORKNUM))
683 blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
684 if (BlockNumberIsValid(blocks[nforks]))
686 forks[nforks] = VISIBILITYMAP_FORKNUM;
691 /* Do the real work to truncate relation forks */
693 smgrtruncate(reln, forks, nforks, blocks);
696 * Update upper-level FSM pages to account for the truncation.
697 * This is important because the just-truncated pages were likely
698 * marked as all-free, and would be preferentially selected.
701 FreeSpaceMapVacuumRange(rel, xlrec->blkno,
704 FreeFakeRelcacheEntry(rel);
707 elog(PANIC, "smgr_redo: unknown op code %u", info);