pgindent run for 9.4
index b6442787334e2e4499bbc8ac3e4d02b97d84e533..b7829ff4c6ddce25e2ffcbde9858acf8dbfc5377 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
  *
  * xlogutils.c
  *
+ * PostgreSQL transaction log manager utility routines
  *
- * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
+ * This file contains support routines that are used by XLOG replay functions.
+ * None of this code is used during normal system operation.
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
+ * src/backend/access/transam/xlogutils.c
+ *
  *-------------------------------------------------------------------------
  */
-
-#ifdef XLOG
-
 #include "postgres.h"
 
 #include "access/xlog.h"
-#include "access/transam.h"
-#include "access/xact.h"
-#include "storage/bufpage.h"
-#include "storage/bufmgr.h"
-#include "storage/smgr.h"
-#include "access/htup.h"
 #include "access/xlogutils.h"
-#include "catalog/pg_database.h"
-#include "lib/hasht.h"
+#include "catalog/catalog.h"
+#include "storage/smgr.h"
+#include "utils/guc.h"
+#include "utils/hsearch.h"
+#include "utils/rel.h"
 
-/*
- * ---------------------------------------------------------------
- *
- * Index support functions
- *
- *----------------------------------------------------------------
- */
 
 /*
- * Check if specified heap tuple was inserted by given
- * xaction/command and return
- *
- * - -1 if not
- * - 0  if there is no tuple at all
- * - 1  if yes
+ * During XLOG replay, we may see XLOG records for incremental updates of
+ * pages that no longer exist, because their relation was later dropped or
+ * truncated.  (Note: this is only possible when full_page_writes = OFF,
+ * since when it's ON, the first reference we see to a page should always
+ * be a full-page rewrite not an incremental update.)  Rather than simply
+ * ignoring such records, we make a note of the referenced page, and then
+ * complain if we don't actually see a drop or truncate covering the page
+ * later in replay.
  */
-int
-XLogIsOwnerOfTuple(RelFileNode hnode, ItemPointer iptr, 
-                                       TransactionId xid, CommandId cid)
+typedef struct xl_invalid_page_key
 {
-       Relation                reln;
-       Buffer                  buffer;
-       Page                    page;
-       ItemId                  lp;
-       HeapTupleHeader htup;
-
-       reln = XLogOpenRelation(false, RM_HEAP_ID, hnode);
-       if (!RelationIsValid(reln))
-               return(0);
-
-       buffer = ReadBuffer(reln, ItemPointerGetBlockNumber(iptr));
-       if (!BufferIsValid(buffer))
-               return(0);
-
-       LockBuffer(buffer, BUFFER_LOCK_SHARE);
-       page = (Page) BufferGetPage(buffer);
-       if (PageIsNew((PageHeader) page) ||
-               ItemPointerGetOffsetNumber(iptr) > PageGetMaxOffsetNumber(page))
-       {
-               UnlockAndReleaseBuffer(buffer);
-               return(0);
-       }
-       lp = PageGetItemId(page, ItemPointerGetOffsetNumber(iptr));
-       if (!ItemIdIsUsed(lp) || ItemIdDeleted(lp))
-       {
-               UnlockAndReleaseBuffer(buffer);
-               return(0);
-       }
+       RelFileNode node;                       /* the relation */
+       ForkNumber      forkno;                 /* the fork number */
+       BlockNumber blkno;                      /* the page */
+} xl_invalid_page_key;
 
-       htup = (HeapTupleHeader) PageGetItem(page, lp);
+typedef struct xl_invalid_page
+{
+       xl_invalid_page_key key;        /* hash key ... must be first */
+       bool            present;                /* page existed but contained zeroes */
+} xl_invalid_page;
+
+static HTAB *invalid_page_tab = NULL;
 
-       Assert(PageGetSUI(page) == ThisStartUpID);
-       if (htup->t_xmin != xid || htup->t_cmin != cid)
-       {
-               UnlockAndReleaseBuffer(buffer);
-               return(-1);
-       }
 
-       UnlockAndReleaseBuffer(buffer);
-       return(1);
+/* Report a reference to an invalid page */
+static void
+report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno,
+                                       BlockNumber blkno, bool present)
+{
+       char       *path = relpathperm(node, forkno);
+
+       if (present)
+               elog(elevel, "page %u of relation %s is uninitialized",
+                        blkno, path);
+       else
+               elog(elevel, "page %u of relation %s does not exist",
+                        blkno, path);
+       pfree(path);
 }
 
-/*
- * MUST BE CALLED ONLY ON RECOVERY.
- *
- * Check if exists valid (inserted by not aborted xaction) heap tuple
- * for given item pointer
- */
-bool
-XLogIsValidTuple(RelFileNode hnode, ItemPointer iptr)
+/* Log a reference to an invalid page */
+static void
+log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
+                                bool present)
 {
-       Relation                reln;
-       Buffer                  buffer;
-       Page                    page;
-       ItemId                  lp;
-       HeapTupleHeader htup;
-
-       reln = XLogOpenRelation(false, RM_HEAP_ID, hnode);
-       if (!RelationIsValid(reln))
-               return(false);
-
-       buffer = ReadBuffer(reln, ItemPointerGetBlockNumber(iptr));
-       if (!BufferIsValid(buffer))
-               return(false);
-
-       LockBuffer(buffer, BUFFER_LOCK_SHARE);
-       page = (Page) BufferGetPage(buffer);
-       if (PageIsNew((PageHeader) page) ||
-               ItemPointerGetOffsetNumber(iptr) > PageGetMaxOffsetNumber(page))
+       xl_invalid_page_key key;
+       xl_invalid_page *hentry;
+       bool            found;
+
+       /*
+        * Once recovery has reached a consistent state, the invalid-page table
+        * should be empty and remain so. If a reference to an invalid page is
+        * found after consistency is reached, PANIC immediately. This might seem
+        * aggressive, but it's better than letting the invalid reference linger
+        * in the hash table until the end of recovery and PANIC there, which
+        * might come only much later if this is a standby server.
+        */
+       if (reachedConsistency)
        {
-               UnlockAndReleaseBuffer(buffer);
-               return(false);
+               report_invalid_page(WARNING, node, forkno, blkno, present);
+               elog(PANIC, "WAL contains references to invalid pages");
        }
 
-       if (PageGetSUI(page) != ThisStartUpID)
+       /*
+        * Log references to invalid pages at DEBUG1 level.  This allows some
+        * tracing of the cause (note the elog context mechanism will tell us
+        * something about the XLOG record that generated the reference).
+        */
+       if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1)
+               report_invalid_page(DEBUG1, node, forkno, blkno, present);
+
+       if (invalid_page_tab == NULL)
        {
-               Assert(PageGetSUI(page) < ThisStartUpID);
-               UnlockAndReleaseBuffer(buffer);
-               return(true);
+               /* create hash table when first needed */
+               HASHCTL         ctl;
+
+               memset(&ctl, 0, sizeof(ctl));
+               ctl.keysize = sizeof(xl_invalid_page_key);
+               ctl.entrysize = sizeof(xl_invalid_page);
+               ctl.hash = tag_hash;
+
+               invalid_page_tab = hash_create("XLOG invalid-page table",
+                                                                          100,
+                                                                          &ctl,
+                                                                          HASH_ELEM | HASH_FUNCTION);
        }
 
-       lp = PageGetItemId(page, ItemPointerGetOffsetNumber(iptr));
-       if (!ItemIdIsUsed(lp) || ItemIdDeleted(lp))
+       /* we currently assume xl_invalid_page_key contains no padding */
+       key.node = node;
+       key.forkno = forkno;
+       key.blkno = blkno;
+       hentry = (xl_invalid_page *)
+               hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
+
+       if (!found)
+       {
+               /* hash_search already filled in the key */
+               hentry->present = present;
+       }
+       else
        {
-               UnlockAndReleaseBuffer(buffer);
-               return(false);
+               /* repeat reference ... leave "present" as it was */
        }
+}
+
+/* Forget any invalid pages >= minblkno, because they've been dropped */
+static void
+forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
+{
+       HASH_SEQ_STATUS status;
+       xl_invalid_page *hentry;
 
-       htup = (HeapTupleHeader) PageGetItem(page, lp);
+       if (invalid_page_tab == NULL)
+               return;                                 /* nothing to do */
 
-       /* MUST CHECK WASN'T TUPLE INSERTED IN PREV STARTUP */
+       hash_seq_init(&status, invalid_page_tab);
 
-       if (!(htup->t_infomask & HEAP_XMIN_COMMITTED))
+       while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
        {
-               if (htup->t_infomask & HEAP_XMIN_INVALID ||
-                       (htup->t_infomask & HEAP_MOVED_IN &&
-                       TransactionIdDidAbort((TransactionId)htup->t_cmin)) ||
-                       TransactionIdDidAbort(htup->t_xmin))
+               if (RelFileNodeEquals(hentry->key.node, node) &&
+                       hentry->key.forkno == forkno &&
+                       hentry->key.blkno >= minblkno)
                {
-                       UnlockAndReleaseBuffer(buffer);
-                       return(false);
+                       if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
+                       {
+                               char       *path = relpathperm(hentry->key.node, forkno);
+
+                               elog(DEBUG2, "page %u of relation %s has been dropped",
+                                        hentry->key.blkno, path);
+                               pfree(path);
+                       }
+
+                       if (hash_search(invalid_page_tab,
+                                                       (void *) &hentry->key,
+                                                       HASH_REMOVE, NULL) == NULL)
+                               elog(ERROR, "hash table corrupted");
                }
        }
-
-       UnlockAndReleaseBuffer(buffer);
-       return(true);
 }
 
-/*
- * Open pg_log in recovery
- */
-extern Relation        LogRelation;    /* pg_log relation */
-
-void
-XLogOpenLogRelation(void)
+/* Forget any invalid pages in a whole database */
+static void
+forget_invalid_pages_db(Oid dbid)
 {
-       Relation        logRelation;
-
-       Assert(!LogRelation);
-       logRelation = (Relation) malloc(sizeof(RelationData));
-       memset(logRelation, 0, sizeof(RelationData));
-       logRelation->rd_rel = (Form_pg_class) malloc(sizeof(FormData_pg_class));
-       memset(logRelation->rd_rel, 0, sizeof(FormData_pg_class));
-
-       sprintf(RelationGetPhysicalRelationName(logRelation), "pg_log");
-       logRelation->rd_node.tblNode = InvalidOid;
-       logRelation->rd_node.relNode = RelOid_pg_log;
-       logRelation->rd_fd = -1;
-       logRelation->rd_fd = smgropen(DEFAULT_SMGR, logRelation, false);
-       if (logRelation->rd_fd < 0)
-               elog(STOP, "XLogOpenLogRelation: failed to open pg_log");
-       LogRelation = logRelation;
-}
+       HASH_SEQ_STATUS status;
+       xl_invalid_page *hentry;
 
-/*
- * ---------------------------------------------------------------
- *
- * Storage related support functions
- *
- *----------------------------------------------------------------
- */
+       if (invalid_page_tab == NULL)
+               return;                                 /* nothing to do */
 
-Buffer
-XLogReadBuffer(bool extend, Relation reln, BlockNumber blkno)
-{
-       BlockNumber     lastblock = RelationGetNumberOfBlocks(reln);
-       Buffer          buffer;
+       hash_seq_init(&status, invalid_page_tab);
 
-       if (blkno >= lastblock)
+       while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
        {
-               buffer = InvalidBuffer;
-               if (extend)             /* we do this in recovery only - no locks */
+               if (hentry->key.node.dbNode == dbid)
                {
-                       Assert(InRecovery);
-                       while (lastblock <= blkno)
+                       if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
                        {
-                               buffer = ReadBuffer(reln, P_NEW);
-                               lastblock++;
+                               char       *path = relpathperm(hentry->key.node, hentry->key.forkno);
+
+                               elog(DEBUG2, "page %u of relation %s has been dropped",
+                                        hentry->key.blkno, path);
+                               pfree(path);
                        }
+
+                       if (hash_search(invalid_page_tab,
+                                                       (void *) &hentry->key,
+                                                       HASH_REMOVE, NULL) == NULL)
+                               elog(ERROR, "hash table corrupted");
                }
-               if (buffer != InvalidBuffer)
-                       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-               return(buffer);
        }
-
-       buffer = ReadBuffer(reln, blkno);
-       if (buffer != InvalidBuffer)
-               LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-       return(buffer);
 }
 
-/*
- * "Relation" cache
- */
-
-typedef struct XLogRelDesc
+/* Are there any unresolved references to invalid pages? */
+bool
+XLogHaveInvalidPages(void)
 {
-       RelationData                    reldata;
-       struct XLogRelDesc         *lessRecently;
-       struct XLogRelDesc         *moreRecently;
-} XLogRelDesc;
+       if (invalid_page_tab != NULL &&
+               hash_get_num_entries(invalid_page_tab) > 0)
+               return true;
+       return false;
+}
 
-typedef struct XLogRelCacheEntry
+/* Complain about any remaining invalid-page entries */
+void
+XLogCheckInvalidPages(void)
 {
-       RelFileNode             rnode;
-       XLogRelDesc        *rdesc;
-} XLogRelCacheEntry;
+       HASH_SEQ_STATUS status;
+       xl_invalid_page *hentry;
+       bool            foundone = false;
 
-static HTAB                               *_xlrelcache;
-static XLogRelDesc                *_xlrelarr = NULL;
-static Form_pg_class           _xlpgcarr = NULL;
-static int                                     _xlast = 0;
-static int                                     _xlcnt = 0;
-#define        _XLOG_RELCACHESIZE      512
+       if (invalid_page_tab == NULL)
+               return;                                 /* nothing to do */
 
-static void
-_xl_init_rel_cache(void)
-{
-       HASHCTL                 ctl;
+       hash_seq_init(&status, invalid_page_tab);
 
-       _xlcnt = _XLOG_RELCACHESIZE;
-       _xlast = 0;
-       _xlrelarr = (XLogRelDesc*) malloc(sizeof(XLogRelDesc) * _xlcnt);
-       memset(_xlrelarr, 0, sizeof(XLogRelDesc) * _xlcnt);
-       _xlpgcarr = (Form_pg_class) malloc(sizeof(FormData_pg_class) * _xlcnt);
-       memset(_xlpgcarr, 0, sizeof(FormData_pg_class) * _xlcnt);
-
-       _xlrelarr[0].moreRecently = &(_xlrelarr[0]);
-       _xlrelarr[0].lessRecently = &(_xlrelarr[0]);
+       /*
+        * Our strategy is to emit WARNING messages for all remaining entries and
+        * only PANIC after we've dumped all the available info.
+        */
+       while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
+       {
+               report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno,
+                                                       hentry->key.blkno, hentry->present);
+               foundone = true;
+       }
 
-       memset(&ctl, 0, (int) sizeof(ctl));
-       ctl.keysize = sizeof(RelFileNode);
-       ctl.datasize = sizeof(XLogRelDesc*);
-       ctl.hash = tag_hash;
+       if (foundone)
+               elog(PANIC, "WAL contains references to invalid pages");
 
-       _xlrelcache = hash_create(_XLOG_RELCACHESIZE, &ctl,
-                                                               HASH_ELEM | HASH_FUNCTION);
+       hash_destroy(invalid_page_tab);
+       invalid_page_tab = NULL;
 }
 
-static void
-_xl_remove_hash_entry(XLogRelDesc **edata, int dummy)
+/*
+ * XLogReadBuffer
+ *             Read a page during XLOG replay.
+ *
+ * This is a shorthand of XLogReadBufferExtended() followed by
+ * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main
+ * fork.
+ *
+ * (Getting the buffer lock is not really necessary during single-process
+ * crash recovery, but some subroutines such as MarkBufferDirty will complain
+ * if we don't have the lock.  In hot standby mode it's definitely necessary.)
+ *
+ * The returned buffer is exclusively-locked.
+ *
+ * For historical reasons, instead of a ReadBufferMode argument, this only
+ * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
+ */
+Buffer
+XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
 {
-       XLogRelCacheEntry          *hentry;
-       bool                                    found;
-       XLogRelDesc                        *rdesc = *edata;
-       Form_pg_class                   tpgc = rdesc->reldata.rd_rel;
+       Buffer          buf;
 
-       rdesc->lessRecently->moreRecently = rdesc->moreRecently;
-       rdesc->moreRecently->lessRecently = rdesc->lessRecently;
+       buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
+                                                                init ? RBM_ZERO : RBM_NORMAL);
+       if (BufferIsValid(buf))
+               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
-       hentry = (XLogRelCacheEntry*) hash_search(_xlrelcache, 
-               (char*)&(rdesc->reldata.rd_node), HASH_REMOVE, &found);
+       return buf;
+}
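
For illustration (editorial sketch, not part of this diff), a redo routine of this era typically calls XLogReadBuffer() with init = false (RBM_NORMAL) and silently skips the update when InvalidBuffer comes back, exactly as the comment above describes; the record type and its fields below are hypothetical.

    static void
    example_redo(XLogRecPtr lsn, XLogRecord *record)
    {
        /* hypothetical WAL record carrying a target RelFileNode and block */
        xl_example *xlrec = (xl_example *) XLogRecGetData(record);
        Buffer      buffer;
        Page        page;

        buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, false);
        if (!BufferIsValid(buffer))
            return;             /* page was later dropped or truncated; skip */

        page = (Page) BufferGetPage(buffer);
        if (lsn <= PageGetLSN(page))
        {
            /* change already applied, e.g. restored from a full-page image */
            UnlockReleaseBuffer(buffer);
            return;
        }

        /* ... apply the logged change to "page" here ... */

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
        UnlockReleaseBuffer(buffer);
    }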
 
-       if (hentry == NULL)
-               elog(STOP, "_xl_remove_hash_entry: can't delete from cache");
-       if (!found)
-               elog(STOP, "_xl_remove_hash_entry: file was not found in cache");
+/*
+ * XLogReadBufferExtended
+ *             Read a page during XLOG replay
+ *
+ * This is functionally comparable to ReadBufferExtended.  There are some
+ * differences in behavior with respect to the "mode" argument:
+ *
+ * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
+ * return InvalidBuffer. In this case the caller should silently skip the
+ * update on this page. (In this situation, we expect that the page was later
+ * dropped or truncated. If we don't see evidence of that later in the WAL
+ * sequence, we'll complain at the end of WAL replay.)
+ *
+ * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
+ * relation is extended with all-zeroes pages up to the given block number.
+ *
+ * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
+ * exist, and we don't check for all-zeroes.  Thus, no log entry is made
+ * to imply that the page should be dropped or truncated later.
+ */
+Buffer
+XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
+                                          BlockNumber blkno, ReadBufferMode mode)
+{
+       BlockNumber lastblock;
+       Buffer          buffer;
+       SMgrRelation smgr;
 
-       if (rdesc->reldata.rd_fd >= 0)
-               smgrclose(DEFAULT_SMGR, &(rdesc->reldata));
+       Assert(blkno != P_NEW);
 
-       memset(rdesc, 0, sizeof(XLogRelDesc));
-       memset(tpgc, 0, sizeof(FormData_pg_class));
-       rdesc->reldata.rd_rel = tpgc;
+       /* Open the relation at smgr level */
+       smgr = smgropen(rnode, InvalidBackendId);
 
-       return;
-}
+       /*
+        * Create the target file if it doesn't already exist.  This lets us cope
+        * if the replay sequence contains writes to a relation that is later
+        * deleted.  (The original coding of this routine would instead suppress
+        * the writes, but that seems like it risks losing valuable data if the
+        * filesystem loses an inode during a crash.  Better to write the data
+        * until we are actually told to delete the file.)
+        */
+       smgrcreate(smgr, forknum, true);
 
-static XLogRelDesc*
-_xl_new_reldesc(void)
-{
-       XLogRelDesc        *res;
+       lastblock = smgrnblocks(smgr, forknum);
 
-       _xlast++;
-       if (_xlast < _xlcnt)
+       if (blkno < lastblock)
        {
-               _xlrelarr[_xlast].reldata.rd_rel = &(_xlpgcarr[_xlast]);
-               return(&(_xlrelarr[_xlast]));
+               /* page exists in file */
+               buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+                                                                                  mode, NULL);
+       }
+       else
+       {
+               /* hm, page doesn't exist in file */
+               if (mode == RBM_NORMAL)
+               {
+                       log_invalid_page(rnode, forknum, blkno, false);
+                       return InvalidBuffer;
+               }
+               if (mode == RBM_NORMAL_NO_LOG)
+                       return InvalidBuffer;
+               /* OK to extend the file */
+               /* we do this in recovery only - no rel-extension lock needed */
+               Assert(InRecovery);
+               buffer = InvalidBuffer;
+               do
+               {
+                       if (buffer != InvalidBuffer)
+                               ReleaseBuffer(buffer);
+                       buffer = ReadBufferWithoutRelcache(rnode, forknum,
+                                                                                          P_NEW, mode, NULL);
+               }
+               while (BufferGetBlockNumber(buffer) < blkno);
+               /* Handle the corner case that P_NEW returns non-consecutive pages */
+               if (BufferGetBlockNumber(buffer) != blkno)
+               {
+                       ReleaseBuffer(buffer);
+                       buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+                                                                                          mode, NULL);
+               }
        }
 
-       /* reuse */
-       res = _xlrelarr[0].moreRecently;
-
-       _xl_remove_hash_entry(&res, 0);
+       if (mode == RBM_NORMAL)
+       {
+               /* check that page has been initialized */
+               Page            page = (Page) BufferGetPage(buffer);
+
+               /*
+                * We assume that PageIsNew is safe without a lock. During recovery,
+                * there should be no other backends that could modify the buffer at
+                * the same time.
+                */
+               if (PageIsNew(page))
+               {
+                       ReleaseBuffer(buffer);
+                       log_invalid_page(rnode, forknum, blkno, true);
+                       return InvalidBuffer;
+               }
+       }
 
-       _xlast--;
-       return(res);
+       return buffer;
 }
 
-extern void CreateDummyCaches(void);
-extern void DestroyDummyCaches(void);
 
-void
-XLogInitRelationCache(void)
-{
-       CreateDummyCaches();
-       _xl_init_rel_cache();
-}
-
-void
-XLogCloseRelationCache(void)
+/*
+ * Struct actually returned by XLogFakeRelcacheEntry, though the declared
+ * return type is Relation.
+ */
+typedef struct
 {
+       RelationData reldata;           /* Note: this must be first */
+       FormData_pg_class pgc;
+} FakeRelCacheEntryData;
 
-       DestroyDummyCaches();
-
-       if (!_xlrelarr)
-               return;
-
-       HashTableWalk(_xlrelcache, (HashtFunc)_xl_remove_hash_entry, 0);
-       hash_destroy(_xlrelcache);
-
-       free(_xlrelarr);
-       free(_xlpgcarr);
-
-       _xlrelarr = NULL;
-}
+typedef FakeRelCacheEntryData *FakeRelCacheEntry;
 
+/*
+ * Create a fake relation cache entry for a physical relation
+ *
+ * It's often convenient to use the same functions in XLOG replay as in the
+ * main codepath, but those functions typically work with a relcache entry.
+ * We don't have a working relation cache during XLOG replay, but this
+ * function can be used to create a fake relcache entry instead. Only the
+ * fields related to physical storage, like rd_rel, are initialized, so the
+ * fake entry is only usable in low-level operations like ReadBuffer().
+ *
+ * Caller must free the returned entry with FreeFakeRelcacheEntry().
+ */
 Relation
-XLogOpenRelation(bool redo, RmgrId rmid, RelFileNode rnode)
+CreateFakeRelcacheEntry(RelFileNode rnode)
 {
-       XLogRelDesc                        *res;
-       XLogRelCacheEntry          *hentry;
-       bool                                    found;
+       FakeRelCacheEntry fakeentry;
+       Relation        rel;
 
-       hentry = (XLogRelCacheEntry*) 
-                       hash_search(_xlrelcache, (char*)&rnode, HASH_FIND, &found);
+       Assert(InRecovery);
 
-       if (hentry == NULL)
-               elog(STOP, "XLogOpenRelation: error in cache");
+       /* Allocate the Relation struct and all related space in one block. */
+       fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
+       rel = (Relation) fakeentry;
 
-       if (found)
-       {
-               res = hentry->rdesc;
+       rel->rd_rel = &fakeentry->pgc;
+       rel->rd_node = rnode;
+       /* We will never be working with temp rels during recovery */
+       rel->rd_backend = InvalidBackendId;
 
-               res->lessRecently->moreRecently = res->moreRecently;
-               res->moreRecently->lessRecently = res->lessRecently;
-       }
-       else
-       {
-               res = _xl_new_reldesc();
-
-               sprintf(RelationGetPhysicalRelationName(&(res->reldata)), "%u", rnode.relNode);
-
-               /* unexisting DB id */
-               res->reldata.rd_lockInfo.lockRelId.dbId = RecoveryDb;
-               res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;
-               res->reldata.rd_node = rnode;
+       /* It must be a permanent table if we're in recovery. */
+       rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
 
-               hentry = (XLogRelCacheEntry*) 
-                       hash_search(_xlrelcache, (char*)&rnode, HASH_ENTER, &found);
+       /* We don't know the name of the relation; use relfilenode instead */
+       sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
 
-               if (hentry == NULL)
-                       elog(STOP, "XLogOpenRelation: can't insert into cache");
+       /*
+        * We set up the lockRelId in case anything tries to lock the dummy
+        * relation.  Note that this is fairly bogus since relNode may be
+        * different from the relation's OID.  It shouldn't really matter though,
+        * since we are presumably running by ourselves and can't have any lock
+        * conflicts ...
+        */
+       rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
+       rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
 
-               if (found)
-                       elog(STOP, "XLogOpenRelation: file found on insert into cache");
+       rel->rd_smgr = NULL;
 
-               hentry->rdesc = res;
-
-               res->reldata.rd_fd = -1;
-               res->reldata.rd_fd = smgropen(DEFAULT_SMGR, &(res->reldata),
-                                                                         true /* allow failure */);
-       }
+       return rel;
+}
 
-       res->moreRecently = &(_xlrelarr[0]);
-       res->lessRecently = _xlrelarr[0].lessRecently;
-       _xlrelarr[0].lessRecently = res;
-       res->lessRecently->moreRecently = res;
+/*
+ * Free a fake relation cache entry.
+ */
+void
+FreeFakeRelcacheEntry(Relation fakerel)
+{
+       /* make sure the fakerel is not referenced by the SmgrRelation anymore */
+       if (fakerel->rd_smgr != NULL)
+               smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr);
+       pfree(fakerel);
+}
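
A hedged usage sketch (editorial, not part of this diff): replay code that needs a storage-level helper taking a Relation, for example truncating the free space map, can wrap the RelFileNode from the WAL record in a fake relcache entry. Here rnode and nblocks stand in for fields of that record.

    Relation    rel = CreateFakeRelcacheEntry(rnode);

    FreeSpaceMapTruncateRel(rel, nblocks);  /* touches only physical storage */
    FreeFakeRelcacheEntry(rel);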
 
-       if (res->reldata.rd_fd < 0)             /* file doesn't exist */
-               return(NULL);
+/*
+ * Drop a relation during XLOG replay
+ *
+ * This is called when the relation is about to be deleted; we need to remove
+ * any open "invalid-page" records for the relation.
+ */
+void
+XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
+{
+       forget_invalid_pages(rnode, forknum, 0);
+}
 
-       return(&(res->reldata));
+/*
+ * Drop a whole database during XLOG replay
+ *
+ * As above, but for DROP DATABASE instead of dropping a single rel
+ */
+void
+XLogDropDatabase(Oid dbid)
+{
+       /*
+        * This is unnecessarily heavy-handed, as it will close SMgrRelation
+        * objects for other databases as well. DROP DATABASE occurs seldom enough
+        * that it's not worth introducing a variant of smgrclose for just this
+        * purpose. XXX: Or should we rather leave the smgr entries dangling?
+        */
+       smgrcloseall();
+
+       forget_invalid_pages_db(dbid);
 }
 
-#endif
+/*
+ * Truncate a relation during XLOG replay
+ *
+ * We need to clean up any open "invalid-page" records for the dropped pages.
+ */
+void
+XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
+                                        BlockNumber nblocks)
+{
+       forget_invalid_pages(rnode, forkNum, nblocks);
+}
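
Finally, to show how this ties back to the invalid-page table (editorial sketch, not part of this diff): a truncate-replay routine performs the smgr-level truncation and then calls XLogTruncateRelation() so that pending invalid-page entries at or beyond the new relation end are forgotten instead of causing a PANIC at the end of recovery. The real replay code lives elsewhere and also handles details such as flushing WAL before truncating; rnode and nblocks again stand in for WAL record fields.

    SMgrRelation reln = smgropen(rnode, InvalidBackendId);

    /* recreate the fork if it was already dropped later in the WAL sequence */
    smgrcreate(reln, MAIN_FORKNUM, true);
    smgrtruncate(reln, MAIN_FORKNUM, nblocks);

    /* forget any invalid-page entries for blocks >= nblocks */
    XLogTruncateRelation(rnode, MAIN_FORKNUM, nblocks);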