granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlogutils.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlogutils.c
   4  *
   5  * PostgreSQL transaction log manager utility routines
   6  *
   7  * This file contains support routines that are used by XLOG replay functions.
   8  * None of this code is used during normal system operation.
   9  *
  10  *
  11  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  12  * Portions Copyright (c) 1994, Regents of the University of California
  13  *
  14  * src/backend/access/transam/xlogutils.c
  15  *
  16  *-------------------------------------------------------------------------
  17  */
  18 #include "postgres.h"
  19
  20 #include "access/xlogutils.h"
  21 #include "catalog/catalog.h"
  22 #include "storage/bufmgr.h"
  23 #include "storage/smgr.h"
  24 #include "utils/guc.h"
  25 #include "utils/hsearch.h"
  26 #include "utils/rel.h"
  27
  28
  29 /*
  30  * During XLOG replay, we may see XLOG records for incremental updates of
  31  * pages that no longer exist, because their relation was later dropped or
  32  * truncated.  (Note: this is only possible when full_page_writes = OFF,
  33  * since when it's ON, the first reference we see to a page should always
  34  * be a full-page rewrite not an incremental update.)  Rather than simply
  35  * ignoring such records, we make a note of the referenced page, and then
  36  * complain if we don't actually see a drop or truncate covering the page
  37  * later in replay.
  38  */
  39 typedef struct xl_invalid_page_key
  40 {
  41         RelFileNode node;                       /* the relation */
  42         ForkNumber      forkno;                 /* the fork number */
  43         BlockNumber blkno;                      /* the page */
  44 } xl_invalid_page_key;
  45
  46 typedef struct xl_invalid_page
  47 {
  48         xl_invalid_page_key key;        /* hash key ... must be first */
  49         bool            present;                /* page existed but contained zeroes */
  50 } xl_invalid_page;
  51
  52 static HTAB *invalid_page_tab = NULL;
  53
  54
  55 /* Log a reference to an invalid page */
  56 static void
  57 log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
  58                                  bool present)
  59 {
  60         xl_invalid_page_key key;
  61         xl_invalid_page *hentry;
  62         bool            found;
  63
  64         /*
  65          * Log references to invalid pages at DEBUG1 level.  This allows some
  66          * tracing of the cause (note the elog context mechanism will tell us
  67          * something about the XLOG record that generated the reference).
  68          */
  69         if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1)
  70         {
  71                 char       *path = relpathperm(node, forkno);
  72
  73                 if (present)
  74                         elog(DEBUG1, "page %u of relation %s is uninitialized",
  75                                  blkno, path);
  76                 else
  77                         elog(DEBUG1, "page %u of relation %s does not exist",
  78                                  blkno, path);
  79                 pfree(path);
  80         }
  81
  82         if (invalid_page_tab == NULL)
  83         {
  84                 /* create hash table when first needed */
  85                 HASHCTL         ctl;
  86
  87                 memset(&ctl, 0, sizeof(ctl));
  88                 ctl.keysize = sizeof(xl_invalid_page_key);
  89                 ctl.entrysize = sizeof(xl_invalid_page);
  90                 ctl.hash = tag_hash;
  91
  92                 invalid_page_tab = hash_create("XLOG invalid-page table",
  93                                                                            100,
  94                                                                            &ctl,
  95                                                                            HASH_ELEM | HASH_FUNCTION);
  96         }
  97
  98         /* we currently assume xl_invalid_page_key contains no padding */
  99         key.node = node;
 100         key.forkno = forkno;
 101         key.blkno = blkno;
 102         hentry = (xl_invalid_page *)
 103                 hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
 104
 105         if (!found)
 106         {
 107                 /* hash_search already filled in the key */
 108                 hentry->present = present;
 109         }
 110         else
 111         {
 112                 /* repeat reference ... leave "present" as it was */
 113         }
 114 }
 115
 116 /* Forget any invalid pages >= minblkno, because they've been dropped */
 117 static void
 118 forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
 119 {
 120         HASH_SEQ_STATUS status;
 121         xl_invalid_page *hentry;
 122
 123         if (invalid_page_tab == NULL)
 124                 return;                                 /* nothing to do */
 125
 126         hash_seq_init(&status, invalid_page_tab);
 127
 128         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
 129         {
 130                 if (RelFileNodeEquals(hentry->key.node, node) &&
 131                         hentry->key.forkno == forkno &&
 132                         hentry->key.blkno >= minblkno)
 133                 {
 134                         if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
 135                         {
 136                                 char       *path = relpathperm(hentry->key.node, forkno);
 137
 138                                 elog(DEBUG2, "page %u of relation %s has been dropped",
 139                                          hentry->key.blkno, path);
 140                                 pfree(path);
 141                         }
 142
 143                         if (hash_search(invalid_page_tab,
 144                                                         (void *) &hentry->key,
 145                                                         HASH_REMOVE, NULL) == NULL)
 146                                 elog(ERROR, "hash table corrupted");
 147                 }
 148         }
 149 }
 150
 151 /* Forget any invalid pages in a whole database */
 152 static void
 153 forget_invalid_pages_db(Oid dbid)
 154 {
 155         HASH_SEQ_STATUS status;
 156         xl_invalid_page *hentry;
 157
 158         if (invalid_page_tab == NULL)
 159                 return;                                 /* nothing to do */
 160
 161         hash_seq_init(&status, invalid_page_tab);
 162
 163         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
 164         {
 165                 if (hentry->key.node.dbNode == dbid)
 166                 {
 167                         if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
 168                         {
 169                                 char       *path = relpathperm(hentry->key.node, hentry->key.forkno);
 170
 171                                 elog(DEBUG2, "page %u of relation %s has been dropped",
 172                                          hentry->key.blkno, path);
 173                                 pfree(path);
 174                         }
 175
 176                         if (hash_search(invalid_page_tab,
 177                                                         (void *) &hentry->key,
 178                                                         HASH_REMOVE, NULL) == NULL)
 179                                 elog(ERROR, "hash table corrupted");
 180                 }
 181         }
 182 }
 183
 184 /* Complain about any remaining invalid-page entries */
 185 void
 186 XLogCheckInvalidPages(void)
 187 {
 188         HASH_SEQ_STATUS status;
 189         xl_invalid_page *hentry;
 190         bool            foundone = false;
 191
 192         if (invalid_page_tab == NULL)
 193                 return;                                 /* nothing to do */
 194
 195         hash_seq_init(&status, invalid_page_tab);
 196
 197         /*
 198          * Our strategy is to emit WARNING messages for all remaining entries and
 199          * only PANIC after we've dumped all the available info.
 200          */
 201         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
 202         {
 203                 char       *path = relpathperm(hentry->key.node, hentry->key.forkno);
 204
 205                 if (hentry->present)
 206                         elog(WARNING, "page %u of relation %s was uninitialized",
 207                                  hentry->key.blkno, path);
 208                 else
 209                         elog(WARNING, "page %u of relation %s did not exist",
 210                                  hentry->key.blkno, path);
 211                 pfree(path);
 212                 foundone = true;
 213         }
 214
 215         if (foundone)
 216                 elog(PANIC, "WAL contains references to invalid pages");
 217
 218         hash_destroy(invalid_page_tab);
 219         invalid_page_tab = NULL;
 220 }
 221
 222 /*
 223  * XLogReadBuffer
 224  *              Read a page during XLOG replay.
 225  *
 226  * This is a shorthand of XLogReadBufferExtended() followed by
 227  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main
 228  * fork.
 229  *
 230  * (Getting the buffer lock is not really necessary during single-process
 231  * crash recovery, but some subroutines such as MarkBufferDirty will complain
 232  * if we don't have the lock.  In hot standby mode it's definitely necessary.)
 233  *
 234  * The returned buffer is exclusively-locked.
 235  *
 236  * For historical reasons, instead of a ReadBufferMode argument, this only
 237  * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
 238  */
 239 Buffer
 240 XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
 241 {
 242         Buffer          buf;
 243
 244         buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
 245                                                                  init ? RBM_ZERO : RBM_NORMAL);
 246         if (BufferIsValid(buf))
 247                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 248
 249         return buf;
 250 }
 251
 252 /*
 253  * XLogReadBufferExtended
 254  *              Read a page during XLOG replay
 255  *
  256  * This is functionally comparable to ReadBufferExtended. There are some
  257  * differences in the behavior with respect to the "mode" argument:
 258  *
 259  * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
 260  * return InvalidBuffer. In this case the caller should silently skip the
 261  * update on this page. (In this situation, we expect that the page was later
 262  * dropped or truncated. If we don't see evidence of that later in the WAL
 263  * sequence, we'll complain at the end of WAL replay.)
 264  *
 265  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
 266  * relation is extended with all-zeroes pages up to the given block number.
 267  */
 268 Buffer
 269 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 270                                            BlockNumber blkno, ReadBufferMode mode)
 271 {
 272         BlockNumber lastblock;
 273         Buffer          buffer;
 274         SMgrRelation smgr;
 275
 276         Assert(blkno != P_NEW);
 277
 278         /* Open the relation at smgr level */
 279         smgr = smgropen(rnode, InvalidBackendId);
 280
 281         /*
 282          * Create the target file if it doesn't already exist.  This lets us cope
 283          * if the replay sequence contains writes to a relation that is later
 284          * deleted.  (The original coding of this routine would instead suppress
 285          * the writes, but that seems like it risks losing valuable data if the
 286          * filesystem loses an inode during a crash.  Better to write the data
 287          * until we are actually told to delete the file.)
 288          */
 289         smgrcreate(smgr, forknum, true);
 290
 291         lastblock = smgrnblocks(smgr, forknum);
 292
 293         if (blkno < lastblock)
 294         {
 295                 /* page exists in file */
 296                 buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
 297                                                                                    mode, NULL);
 298         }
 299         else
 300         {
 301                 /* hm, page doesn't exist in file */
 302                 if (mode == RBM_NORMAL)
 303                 {
 304                         log_invalid_page(rnode, forknum, blkno, false);
 305                         return InvalidBuffer;
 306                 }
 307                 /* OK to extend the file */
 308                 /* we do this in recovery only - no rel-extension lock needed */
 309                 Assert(InRecovery);
 310                 buffer = InvalidBuffer;
 311                 while (blkno >= lastblock)
 312                 {
 313                         if (buffer != InvalidBuffer)
 314                                 ReleaseBuffer(buffer);
 315                         buffer = ReadBufferWithoutRelcache(rnode, forknum,
 316                                                                                            P_NEW, mode, NULL);
 317                         lastblock++;
 318                 }
 319                 Assert(BufferGetBlockNumber(buffer) == blkno);
 320         }
 321
 322         if (mode == RBM_NORMAL)
 323         {
 324                 /* check that page has been initialized */
 325                 Page            page = (Page) BufferGetPage(buffer);
 326
 327                 /*
 328                  * We assume that PageIsNew is safe without a lock. During recovery,
 329                  * there should be no other backends that could modify the buffer at
 330                  * the same time.
 331                  */
 332                 if (PageIsNew(page))
 333                 {
 334                         ReleaseBuffer(buffer);
 335                         log_invalid_page(rnode, forknum, blkno, true);
 336                         return InvalidBuffer;
 337                 }
 338         }
 339
 340         return buffer;
 341 }
 342
 343
 344 /*
 345  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
 346  * return type is Relation.
 347  */
 348 typedef struct
 349 {
 350         RelationData reldata;           /* Note: this must be first */
 351         FormData_pg_class pgc;
 352 } FakeRelCacheEntryData;
 353
 354 typedef FakeRelCacheEntryData *FakeRelCacheEntry;
 355
 356 /*
 357  * Create a fake relation cache entry for a physical relation
 358  *
 359  * It's often convenient to use the same functions in XLOG replay as in the
 360  * main codepath, but those functions typically work with a relcache entry.
 361  * We don't have a working relation cache during XLOG replay, but this
 362  * function can be used to create a fake relcache entry instead. Only the
 363  * fields related to physical storage, like rd_rel, are initialized, so the
 364  * fake entry is only usable in low-level operations like ReadBuffer().
 365  *
 366  * Caller must free the returned entry with FreeFakeRelcacheEntry().
 367  */
 368 Relation
 369 CreateFakeRelcacheEntry(RelFileNode rnode)
 370 {
 371         FakeRelCacheEntry fakeentry;
 372         Relation        rel;
 373
 374         /* Allocate the Relation struct and all related space in one block. */
 375         fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
 376         rel = (Relation) fakeentry;
 377
 378         rel->rd_rel = &fakeentry->pgc;
 379         rel->rd_node = rnode;
 380         /* We will never be working with temp rels during recovery */
 381         rel->rd_backend = InvalidBackendId;
 382
 383         /* We don't know the name of the relation; use relfilenode instead */
 384         sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
 385
 386         /*
 387          * We set up the lockRelId in case anything tries to lock the dummy
 388          * relation.  Note that this is fairly bogus since relNode may be
 389          * different from the relation's OID.  It shouldn't really matter though,
 390          * since we are presumably running by ourselves and can't have any lock
 391          * conflicts ...
 392          */
 393         rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
 394         rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
 395
 396         rel->rd_smgr = NULL;
 397
 398         return rel;
 399 }
 400
 401 /*
 402  * Free a fake relation cache entry.
 403  */
 404 void
 405 FreeFakeRelcacheEntry(Relation fakerel)
 406 {
 407         pfree(fakerel);
 408 }
 409
 410 /*
 411  * Drop a relation during XLOG replay
 412  *
 413  * This is called when the relation is about to be deleted; we need to remove
 414  * any open "invalid-page" records for the relation.
 415  */
 416 void
 417 XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
 418 {
 419         forget_invalid_pages(rnode, forknum, 0);
 420 }
 421
 422 /*
 423  * Drop a whole database during XLOG replay
 424  *
 425  * As above, but for DROP DATABASE instead of dropping a single rel
 426  */
 427 void
 428 XLogDropDatabase(Oid dbid)
 429 {
 430         /*
 431          * This is unnecessarily heavy-handed, as it will close SMgrRelation
 432          * objects for other databases as well. DROP DATABASE occurs seldom enough
 433          * that it's not worth introducing a variant of smgrclose for just this
 434          * purpose. XXX: Or should we rather leave the smgr entries dangling?
 435          */
 436         smgrcloseall();
 437
 438         forget_invalid_pages_db(dbid);
 439 }
 440
 441 /*
 442  * Truncate a relation during XLOG replay
 443  *
 444  * We need to clean up any open "invalid-page" records for the dropped pages.
 445  */
 446 void
 447 XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 448                                          BlockNumber nblocks)
 449 {
 450         forget_invalid_pages(rnode, forkNum, nblocks);
 451 }