* None of this code is used during normal system operation.
*
*
- * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.52 2008/05/12 00:00:46 alvherre Exp $
+ * src/backend/access/transam/xlogutils.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
+#include "access/xlog.h"
#include "access/xlogutils.h"
-#include "storage/bufmgr.h"
-#include "storage/bufpage.h"
+#include "catalog/catalog.h"
#include "storage/smgr.h"
+#include "utils/guc.h"
#include "utils/hsearch.h"
+#include "utils/rel.h"
/*
typedef struct xl_invalid_page_key
{
RelFileNode node; /* the relation */
+ ForkNumber forkno; /* which fork of that relation */
BlockNumber blkno; /* the page */
} xl_invalid_page_key;
static HTAB *invalid_page_tab = NULL;
+/*
+ * report_invalid_page
+ *		Emit one message, at severity "elevel", about a WAL reference to a
+ *		page that cannot be used.
+ *
+ * "present" tells which problem was found: true means the page is there
+ * but uninitialized, false means it does not exist at all.  The relation
+ * is named by the path that relpathperm() builds for (node, forkno).
+ */
+static void
+report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno,
+					BlockNumber blkno, bool present)
+{
+	char	   *relpath = relpathperm(node, forkno);
+
+	if (!present)
+		elog(elevel, "page %u of relation %s does not exist",
+			 blkno, relpath);
+	else
+		elog(elevel, "page %u of relation %s is uninitialized",
+			 blkno, relpath);
+
+	/* the path string is no longer needed once the message is out */
+	pfree(relpath);
+}
+
/* Log a reference to an invalid page */
static void
-log_invalid_page(RelFileNode node, BlockNumber blkno, bool present)
+log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
+ bool present)
{
xl_invalid_page_key key;
xl_invalid_page *hentry;
bool found;
+ /*
+ * Once recovery has reached a consistent state, the invalid-page table
+ * should be empty and remain so. If a reference to an invalid page is
+ * found after consistency is reached, PANIC immediately. This might seem
+ * aggressive, but it's better than letting the invalid reference linger
+ * in the hash table until the end of recovery and PANIC there, which
+ * might come only much later if this is a standby server.
+ */
+ if (reachedConsistency)
+ {
+ report_invalid_page(WARNING, node, forkno, blkno, present);
+ elog(PANIC, "WAL contains references to invalid pages");
+ }
+
/*
* Log references to invalid pages at DEBUG1 level. This allows some
* tracing of the cause (note the elog context mechanism will tell us
* something about the XLOG record that generated the reference).
*/
- if (present)
- elog(DEBUG1, "page %u of relation %u/%u/%u is uninitialized",
- blkno, node.spcNode, node.dbNode, node.relNode);
- else
- elog(DEBUG1, "page %u of relation %u/%u/%u does not exist",
- blkno, node.spcNode, node.dbNode, node.relNode);
+ if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1)
+ report_invalid_page(DEBUG1, node, forkno, blkno, present);
if (invalid_page_tab == NULL)
{
/* we currently assume xl_invalid_page_key contains no padding */
key.node = node;
+ key.forkno = forkno;
key.blkno = blkno;
hentry = (xl_invalid_page *)
hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
/* Forget any invalid pages >= minblkno, because they've been dropped */
static void
-forget_invalid_pages(RelFileNode node, BlockNumber minblkno)
+forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
{
HASH_SEQ_STATUS status;
xl_invalid_page *hentry;
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
{
if (RelFileNodeEquals(hentry->key.node, node) &&
+ hentry->key.forkno == forkno &&
hentry->key.blkno >= minblkno)
{
- elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
- hentry->key.blkno, hentry->key.node.spcNode,
- hentry->key.node.dbNode, hentry->key.node.relNode);
+ if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
+ {
+ char *path = relpathperm(hentry->key.node, forkno);
+
+ elog(DEBUG2, "page %u of relation %s has been dropped",
+ hentry->key.blkno, path);
+ pfree(path);
+ }
if (hash_search(invalid_page_tab,
(void *) &hentry->key,
{
if (hentry->key.node.dbNode == dbid)
{
- elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
- hentry->key.blkno, hentry->key.node.spcNode,
- hentry->key.node.dbNode, hentry->key.node.relNode);
+ if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
+ {
+ char *path = relpathperm(hentry->key.node, hentry->key.forkno);
+
+ elog(DEBUG2, "page %u of relation %s has been dropped",
+ hentry->key.blkno, path);
+ pfree(path);
+ }
if (hash_search(invalid_page_tab,
(void *) &hentry->key,
}
}
+/*
+ * XLogHaveInvalidPages
+ *		Report whether any invalid-page references are still unresolved.
+ *
+ * Returns true only when the invalid-page table exists and currently
+ * holds at least one entry.
+ */
+bool
+XLogHaveInvalidPages(void)
+{
+	/* no table means there is nothing to report */
+	if (invalid_page_tab == NULL)
+		return false;
+
+	return hash_get_num_entries(invalid_page_tab) > 0;
+}
+
/* Complain about any remaining invalid-page entries */
void
XLogCheckInvalidPages(void)
*/
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
{
- if (hentry->present)
- elog(WARNING, "page %u of relation %u/%u/%u was uninitialized",
- hentry->key.blkno, hentry->key.node.spcNode,
- hentry->key.node.dbNode, hentry->key.node.relNode);
- else
- elog(WARNING, "page %u of relation %u/%u/%u did not exist",
- hentry->key.blkno, hentry->key.node.spcNode,
- hentry->key.node.dbNode, hentry->key.node.relNode);
+ report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno,
+ hentry->key.blkno, hentry->present);
foundone = true;
}
if (foundone)
elog(PANIC, "WAL contains references to invalid pages");
-}
+ hash_destroy(invalid_page_tab);
+ invalid_page_tab = NULL;
+}
/*
* XLogReadBuffer
+ * Read a page during XLOG replay.
+ *
+ * This is a shorthand for XLogReadBufferExtended() followed by
+ * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main
+ * fork.
+ *
+ * (Getting the buffer lock is not really necessary during single-process
+ * crash recovery, but some subroutines such as MarkBufferDirty will complain
+ * if we don't have the lock. In hot standby mode it's definitely necessary.)
+ *
+ * The returned buffer is exclusively-locked.
+ *
+ * For historical reasons, instead of a ReadBufferMode argument, this only
+ * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
+ */
+Buffer
+XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
+{
+	/* translate the legacy boolean into the mode the extended call takes */
+	ReadBufferMode mode = init ? RBM_ZERO : RBM_NORMAL;
+	Buffer		result;
+
+	/* this entry point always reads from the main fork */
+	result = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, mode);
+
+	/* nothing to lock when no valid buffer came back */
+	if (!BufferIsValid(result))
+		return result;
+
+	LockBuffer(result, BUFFER_LOCK_EXCLUSIVE);
+	return result;
+}
+
+/*
+ * XLogReadBufferExtended
* Read a page during XLOG replay
*
- * This is functionally comparable to ReadBuffer followed by
- * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
- * and locked buffer. (Getting the lock is not really necessary, since we
- * expect that this is only used during single-process XLOG replay, but
- * some subroutines such as MarkBufferDirty will complain if we don't.)
+ * This is functionally comparable to ReadBufferExtended. There are some
+ * differences in the behavior wrt. the "mode" argument:
+ *
+ * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
+ * return InvalidBuffer. In this case the caller should silently skip the
+ * update on this page. (In this situation, we expect that the page was later
+ * dropped or truncated. If we don't see evidence of that later in the WAL
+ * sequence, we'll complain at the end of WAL replay.)
*
- * If "init" is true then the caller intends to rewrite the page fully
- * using the info in the XLOG record. In this case we will extend the
- * relation if needed to make the page exist, and we will not complain about
- * the page being "new" (all zeroes); in fact, we usually will supply a
- * zeroed buffer without reading the page at all, so as to avoid unnecessary
- * failure if the page is present on disk but has corrupt headers.
+ * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
+ * relation is extended with all-zeroes pages up to the given block number.
*
- * If "init" is false then the caller needs the page to be valid already.
- * If the page doesn't exist or contains zeroes, we return InvalidBuffer.
- * In this case the caller should silently skip the update on this page.
- * (In this situation, we expect that the page was later dropped or truncated.
- * If we don't see evidence of that later in the WAL sequence, we'll complain
- * at the end of WAL replay.)
+ * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
+ * exist, and we don't check for all-zeroes. Thus, no log entry is made
+ * to imply that the page should be dropped or truncated later.
*/
Buffer
-XLogReadBuffer(Relation reln, BlockNumber blkno, bool init)
+XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
+ BlockNumber blkno, ReadBufferMode mode)
{
- BlockNumber lastblock = RelationGetNumberOfBlocks(reln);
+ BlockNumber lastblock;
Buffer buffer;
+ SMgrRelation smgr;
Assert(blkno != P_NEW);
+ /* Open the relation at smgr level */
+ smgr = smgropen(rnode, InvalidBackendId);
+
+ /*
+ * Create the target file if it doesn't already exist. This lets us cope
+ * if the replay sequence contains writes to a relation that is later
+ * deleted. (The original coding of this routine would instead suppress
+ * the writes, but that seems like it risks losing valuable data if the
+ * filesystem loses an inode during a crash. Better to write the data
+ * until we are actually told to delete the file.)
+ */
+ smgrcreate(smgr, forknum, true);
+
+ lastblock = smgrnblocks(smgr, forknum);
+
if (blkno < lastblock)
{
/* page exists in file */
- if (init)
- buffer = ReadOrZeroBuffer(reln, blkno);
- else
- buffer = ReadBuffer(reln, blkno);
+ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+ mode, NULL);
}
else
{
/* hm, page doesn't exist in file */
- if (!init)
+ if (mode == RBM_NORMAL)
{
- log_invalid_page(reln->rd_node, blkno, false);
+ log_invalid_page(rnode, forknum, blkno, false);
return InvalidBuffer;
}
+ if (mode == RBM_NORMAL_NO_LOG)
+ return InvalidBuffer;
/* OK to extend the file */
/* we do this in recovery only - no rel-extension lock needed */
Assert(InRecovery);
buffer = InvalidBuffer;
- while (blkno >= lastblock)
+ do
{
if (buffer != InvalidBuffer)
ReleaseBuffer(buffer);
- buffer = ReadBuffer(reln, P_NEW);
- lastblock++;
+ buffer = ReadBufferWithoutRelcache(rnode, forknum,
+ P_NEW, mode, NULL);
+ }
+ while (BufferGetBlockNumber(buffer) < blkno);
+ /* Handle the corner case that P_NEW returns non-consecutive pages */
+ if (BufferGetBlockNumber(buffer) != blkno)
+ {
+ ReleaseBuffer(buffer);
+ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+ mode, NULL);
}
- Assert(BufferGetBlockNumber(buffer) == blkno);
}
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
- if (!init)
+ if (mode == RBM_NORMAL)
{
/* check that page has been initialized */
Page page = (Page) BufferGetPage(buffer);
- if (PageIsNew((PageHeader) page))
+ /*
+ * We assume that PageIsNew is safe without a lock. During recovery,
+ * there should be no other backends that could modify the buffer at
+ * the same time.
+ */
+ if (PageIsNew(page))
{
- UnlockReleaseBuffer(buffer);
- log_invalid_page(reln->rd_node, blkno, true);
+ ReleaseBuffer(buffer);
+ log_invalid_page(rnode, forknum, blkno, true);
return InvalidBuffer;
}
}
/*
- * Lightweight "Relation" cache --- this substitutes for the normal relcache
- * during XLOG replay.
+ * Struct actually returned by CreateFakeRelcacheEntry, though the declared
+ * return type is Relation.
*/
-
-typedef struct XLogRelDesc
-{
- RelationData reldata;
- struct XLogRelDesc *lessRecently;
- struct XLogRelDesc *moreRecently;
-} XLogRelDesc;
-
-typedef struct XLogRelCacheEntry
+typedef struct
{
- RelFileNode rnode;
- XLogRelDesc *rdesc;
-} XLogRelCacheEntry;
+ RelationData reldata; /* Note: this must be first, the entry is cast to Relation */
+ FormData_pg_class pgc; /* storage that reldata.rd_rel is pointed at */
+} FakeRelCacheEntryData;
-static HTAB *_xlrelcache;
-static XLogRelDesc *_xlrelarr = NULL;
-static Form_pg_class _xlpgcarr = NULL;
-static int _xlast = 0;
-static int _xlcnt = 0;
-
-#define _XLOG_RELCACHESIZE 512
-
-static void
-_xl_init_rel_cache(void)
-{
- HASHCTL ctl;
-
- _xlcnt = _XLOG_RELCACHESIZE;
- _xlast = 0;
- _xlrelarr = (XLogRelDesc *) malloc(sizeof(XLogRelDesc) * _xlcnt);
- memset(_xlrelarr, 0, sizeof(XLogRelDesc) * _xlcnt);
- _xlpgcarr = (Form_pg_class) malloc(sizeof(FormData_pg_class) * _xlcnt);
- memset(_xlpgcarr, 0, sizeof(FormData_pg_class) * _xlcnt);
-
- _xlrelarr[0].moreRecently = &(_xlrelarr[0]);
- _xlrelarr[0].lessRecently = &(_xlrelarr[0]);
-
- memset(&ctl, 0, sizeof(ctl));
- ctl.keysize = sizeof(RelFileNode);
- ctl.entrysize = sizeof(XLogRelCacheEntry);
- ctl.hash = tag_hash;
-
- _xlrelcache = hash_create("XLOG relcache", _XLOG_RELCACHESIZE,
- &ctl, HASH_ELEM | HASH_FUNCTION);
-}
-
-static void
-_xl_remove_hash_entry(XLogRelDesc *rdesc)
-{
- Form_pg_class tpgc = rdesc->reldata.rd_rel;
- XLogRelCacheEntry *hentry;
-
- rdesc->lessRecently->moreRecently = rdesc->moreRecently;
- rdesc->moreRecently->lessRecently = rdesc->lessRecently;
-
- hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache,
- (void *) &(rdesc->reldata.rd_node), HASH_REMOVE, NULL);
- if (hentry == NULL)
- elog(PANIC, "_xl_remove_hash_entry: file was not found in cache");
-
- RelationCloseSmgr(&(rdesc->reldata));
-
- memset(rdesc, 0, sizeof(XLogRelDesc));
- memset(tpgc, 0, sizeof(FormData_pg_class));
- rdesc->reldata.rd_rel = tpgc;
-}
-
-static XLogRelDesc *
-_xl_new_reldesc(void)
-{
- XLogRelDesc *res;
-
- _xlast++;
- if (_xlast < _xlcnt)
- {
- _xlrelarr[_xlast].reldata.rd_rel = &(_xlpgcarr[_xlast]);
- return &(_xlrelarr[_xlast]);
- }
-
- /* reuse */
- res = _xlrelarr[0].moreRecently;
-
- _xl_remove_hash_entry(res);
-
- _xlast--;
- return res;
-}
-
-
-void
-XLogInitRelationCache(void)
-{
- _xl_init_rel_cache();
- invalid_page_tab = NULL;
-}
-
-void
-XLogCloseRelationCache(void)
-{
- HASH_SEQ_STATUS status;
- XLogRelCacheEntry *hentry;
-
- if (!_xlrelarr)
- return;
-
- hash_seq_init(&status, _xlrelcache);
-
- while ((hentry = (XLogRelCacheEntry *) hash_seq_search(&status)) != NULL)
- _xl_remove_hash_entry(hentry->rdesc);
-
- hash_destroy(_xlrelcache);
-
- free(_xlrelarr);
- free(_xlpgcarr);
-
- _xlrelarr = NULL;
-}
+typedef FakeRelCacheEntryData *FakeRelCacheEntry;
/*
- * Open a relation during XLOG replay
+ * Create a fake relation cache entry for a physical relation
*
- * Note: this once had an API that allowed NULL return on failure, but it
- * no longer does; any failure results in elog().
+ * It's often convenient to use the same functions in XLOG replay as in the
+ * main codepath, but those functions typically work with a relcache entry.
+ * We don't have a working relation cache during XLOG replay, but this
+ * function can be used to create a fake relcache entry instead. Only the
+ * fields related to physical storage, like rd_rel, are initialized, so the
+ * fake entry is only usable in low-level operations like ReadBuffer().
+ *
+ * Caller must free the returned entry with FreeFakeRelcacheEntry().
+ *
+ * This may only be called during recovery; the function asserts InRecovery.
*/
Relation
-XLogOpenRelation(RelFileNode rnode)
+CreateFakeRelcacheEntry(RelFileNode rnode)
{
- XLogRelDesc *res;
- XLogRelCacheEntry *hentry;
- bool found;
+ FakeRelCacheEntry fakeentry;
+ Relation rel;
- hentry = (XLogRelCacheEntry *)
- hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);
+ Assert(InRecovery);
- if (hentry)
- {
- res = hentry->rdesc;
+ /*
+ * Allocate the Relation struct and all related space in one block.
+ * FakeRelCacheEntryData embeds both the RelationData and the pg_class
+ * row it points to, so freeing the entry releases both together.
+ */
+ fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
+ rel = (Relation) fakeentry; /* valid because reldata is the first member */
- res->lessRecently->moreRecently = res->moreRecently;
- res->moreRecently->lessRecently = res->lessRecently;
- }
- else
- {
- res = _xl_new_reldesc();
+ rel->rd_rel = &fakeentry->pgc; /* pg_class row lives inside the entry */
+ rel->rd_node = rnode;
+ /* We will never be working with temp rels during recovery */
+ rel->rd_backend = InvalidBackendId;
- sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode);
+ /* It must be a permanent table if we're in recovery. */
+ rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
- res->reldata.rd_node = rnode;
+ /* We don't know the name of the relation; use relfilenode instead */
+ sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
- /*
- * We set up the lockRelId in case anything tries to lock the dummy
- * relation. Note that this is fairly bogus since relNode may be
- * different from the relation's OID. It shouldn't really matter
- * though, since we are presumably running by ourselves and can't have
- * any lock conflicts ...
- */
- res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode;
- res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;
-
- hentry = (XLogRelCacheEntry *)
- hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found);
-
- if (found)
- elog(PANIC, "xlog relation already present on insert into cache");
-
- hentry->rdesc = res;
-
- res->reldata.rd_targblock = InvalidBlockNumber;
- res->reldata.rd_smgr = NULL;
- RelationOpenSmgr(&(res->reldata));
+ /*
+ * We set up the lockRelId in case anything tries to lock the dummy
+ * relation. Note that this is fairly bogus since relNode may be
+ * different from the relation's OID. It shouldn't really matter though,
+ * since we are presumably running by ourselves and can't have any lock
+ * conflicts ...
+ */
+ rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
+ rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
- /*
- * Create the target file if it doesn't already exist. This lets us
- * cope if the replay sequence contains writes to a relation that is
- * later deleted. (The original coding of this routine would instead
- * return NULL, causing the writes to be suppressed. But that seems
- * like it risks losing valuable data if the filesystem loses an inode
- * during a crash. Better to write the data until we are actually
- * told to delete the file.)
- */
- smgrcreate(res->reldata.rd_smgr, res->reldata.rd_istemp, true);
- }
+ rel->rd_smgr = NULL; /* no smgr relation attached yet; FreeFakeRelcacheEntry() checks this */
- res->moreRecently = &(_xlrelarr[0]);
- res->lessRecently = _xlrelarr[0].lessRecently;
- _xlrelarr[0].lessRecently = res;
- res->lessRecently->moreRecently = res;
+ return rel;
+}
- return &(res->reldata);
+/*
+ * FreeFakeRelcacheEntry
+ *		Release an entry built by CreateFakeRelcacheEntry().
+ *
+ * If an smgr relation is attached to the entry, its owner link is cleared
+ * first so that it no longer refers to the memory being freed.
+ */
+void
+FreeFakeRelcacheEntry(Relation fakerel)
+{
+	SMgrRelation attached = fakerel->rd_smgr;
+
+	/* make sure the fakerel is not referenced by the SmgrRelation anymore */
+	if (attached != NULL)
+		smgrclearowner(&fakerel->rd_smgr, attached);
+
+	pfree(fakerel);
}
/*
* Drop a relation during XLOG replay
*
- * This is called when the relation is about to be deleted; we need to ensure
- * that there is no dangling smgr reference in the xlog relation cache.
- *
- * Currently, we don't bother to physically remove the relation from the
- * cache, we just let it age out normally.
- *
- * This also takes care of removing any open "invalid-page" records for
- * the relation.
+ * This is called when one fork of a relation is about to be deleted; we need
+ * to remove any open "invalid-page" records for that fork. Every recorded
+ * block of the fork is forgotten, since the minimum block number passed to
+ * forget_invalid_pages() is 0.
*/
void
-XLogDropRelation(RelFileNode rnode)
+XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
{
- XLogRelCacheEntry *hentry;
-
- hentry = (XLogRelCacheEntry *)
- hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);
-
- if (hentry)
- {
- XLogRelDesc *rdesc = hentry->rdesc;
-
- RelationCloseSmgr(&(rdesc->reldata));
- }
-
- forget_invalid_pages(rnode, 0);
+ forget_invalid_pages(rnode, forknum, 0);
}
/*
void
XLogDropDatabase(Oid dbid)
{
- HASH_SEQ_STATUS status;
- XLogRelCacheEntry *hentry;
-
- hash_seq_init(&status, _xlrelcache);
-
- while ((hentry = (XLogRelCacheEntry *) hash_seq_search(&status)) != NULL)
- {
- XLogRelDesc *rdesc = hentry->rdesc;
-
- if (hentry->rnode.dbNode == dbid)
- RelationCloseSmgr(&(rdesc->reldata));
- }
+ /*
+ * Close the open SMgrRelation objects before forgetting the invalid-page
+ * entries recorded for this database.
+ *
+ * This is unnecessarily heavy-handed, as it will close SMgrRelation
+ * objects for other databases as well. DROP DATABASE occurs seldom enough
+ * that it's not worth introducing a variant of smgrclose for just this
+ * purpose. XXX: Or should we rather leave the smgr entries dangling?
+ */
+ smgrcloseall();
forget_invalid_pages_db(dbid);
}
/*
* Truncate a relation during XLOG replay
*
- * We don't need to do anything to the fake relcache, but we do need to
- * clean up any open "invalid-page" records for the dropped pages.
+ * We need to clean up any open "invalid-page" records for the dropped pages,
+ * i.e. those of the given fork whose block number is >= nblocks.
*/
void
-XLogTruncateRelation(RelFileNode rnode, BlockNumber nblocks)
+XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber nblocks)
{
- forget_invalid_pages(rnode, nblocks);
+ forget_invalid_pages(rnode, forkNum, nblocks);
}