]> granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlogutils.c
Restructure some header files a bit, in particular heapam.h, by removing some
[postgresql] / src / backend / access / transam / xlogutils.c
1 /*-------------------------------------------------------------------------
2  *
3  * xlogutils.c
4  *
5  * PostgreSQL transaction log manager utility routines
6  *
7  * This file contains support routines that are used by XLOG replay functions.
8  * None of this code is used during normal system operation.
9  *
10  *
11  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.52 2008/05/12 00:00:46 alvherre Exp $
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19
20 #include "access/xlogutils.h"
21 #include "storage/bufmgr.h"
22 #include "storage/bufpage.h"
23 #include "storage/smgr.h"
24 #include "utils/hsearch.h"
25
26
27 /*
28  * During XLOG replay, we may see XLOG records for incremental updates of
29  * pages that no longer exist, because their relation was later dropped or
30  * truncated.  (Note: this is only possible when full_page_writes = OFF,
31  * since when it's ON, the first reference we see to a page should always
32  * be a full-page rewrite not an incremental update.)  Rather than simply
33  * ignoring such records, we make a note of the referenced page, and then
34  * complain if we don't actually see a drop or truncate covering the page
35  * later in replay.
36  */
/*
 * Hash key identifying one page referenced by a WAL record: the relation's
 * RelFileNode plus the block number within that relation.
 *
 * log_invalid_page() fills in both fields and hands the struct to
 * hash_search() as the lookup key, under the assumption (noted there) that
 * the struct contains no padding.
 */
typedef struct xl_invalid_page_key
{
        RelFileNode node;                       /* the relation */
        BlockNumber blkno;                      /* the page */
} xl_invalid_page_key;
42
/*
 * One entry of the invalid-page hash table.
 *
 * "present" distinguishes the two cases reported by log_invalid_page():
 * true means the page existed but was uninitialized, false means the page
 * did not exist at all.  It is set when the entry is first created and is
 * left unchanged on repeat references to the same page.
 */
typedef struct xl_invalid_page
{
        xl_invalid_page_key key;        /* hash key ... must be first */
        bool            present;                /* page existed but contained zeroes */
} xl_invalid_page;
48
/*
 * Hash table of invalid-page references, keyed by xl_invalid_page_key.
 * Stays NULL until log_invalid_page() creates it on first use.
 */
static HTAB *invalid_page_tab = NULL;
50
51
52 /* Log a reference to an invalid page */
53 static void
54 log_invalid_page(RelFileNode node, BlockNumber blkno, bool present)
55 {
56         xl_invalid_page_key key;
57         xl_invalid_page *hentry;
58         bool            found;
59
60         /*
61          * Log references to invalid pages at DEBUG1 level.  This allows some
62          * tracing of the cause (note the elog context mechanism will tell us
63          * something about the XLOG record that generated the reference).
64          */
65         if (present)
66                 elog(DEBUG1, "page %u of relation %u/%u/%u is uninitialized",
67                          blkno, node.spcNode, node.dbNode, node.relNode);
68         else
69                 elog(DEBUG1, "page %u of relation %u/%u/%u does not exist",
70                          blkno, node.spcNode, node.dbNode, node.relNode);
71
72         if (invalid_page_tab == NULL)
73         {
74                 /* create hash table when first needed */
75                 HASHCTL         ctl;
76
77                 memset(&ctl, 0, sizeof(ctl));
78                 ctl.keysize = sizeof(xl_invalid_page_key);
79                 ctl.entrysize = sizeof(xl_invalid_page);
80                 ctl.hash = tag_hash;
81
82                 invalid_page_tab = hash_create("XLOG invalid-page table",
83                                                                            100,
84                                                                            &ctl,
85                                                                            HASH_ELEM | HASH_FUNCTION);
86         }
87
88         /* we currently assume xl_invalid_page_key contains no padding */
89         key.node = node;
90         key.blkno = blkno;
91         hentry = (xl_invalid_page *)
92                 hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
93
94         if (!found)
95         {
96                 /* hash_search already filled in the key */
97                 hentry->present = present;
98         }
99         else
100         {
101                 /* repeat reference ... leave "present" as it was */
102         }
103 }
104
105 /* Forget any invalid pages >= minblkno, because they've been dropped */
106 static void
107 forget_invalid_pages(RelFileNode node, BlockNumber minblkno)
108 {
109         HASH_SEQ_STATUS status;
110         xl_invalid_page *hentry;
111
112         if (invalid_page_tab == NULL)
113                 return;                                 /* nothing to do */
114
115         hash_seq_init(&status, invalid_page_tab);
116
117         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
118         {
119                 if (RelFileNodeEquals(hentry->key.node, node) &&
120                         hentry->key.blkno >= minblkno)
121                 {
122                         elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
123                                  hentry->key.blkno, hentry->key.node.spcNode,
124                                  hentry->key.node.dbNode, hentry->key.node.relNode);
125
126                         if (hash_search(invalid_page_tab,
127                                                         (void *) &hentry->key,
128                                                         HASH_REMOVE, NULL) == NULL)
129                                 elog(ERROR, "hash table corrupted");
130                 }
131         }
132 }
133
134 /* Forget any invalid pages in a whole database */
135 static void
136 forget_invalid_pages_db(Oid dbid)
137 {
138         HASH_SEQ_STATUS status;
139         xl_invalid_page *hentry;
140
141         if (invalid_page_tab == NULL)
142                 return;                                 /* nothing to do */
143
144         hash_seq_init(&status, invalid_page_tab);
145
146         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
147         {
148                 if (hentry->key.node.dbNode == dbid)
149                 {
150                         elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
151                                  hentry->key.blkno, hentry->key.node.spcNode,
152                                  hentry->key.node.dbNode, hentry->key.node.relNode);
153
154                         if (hash_search(invalid_page_tab,
155                                                         (void *) &hentry->key,
156                                                         HASH_REMOVE, NULL) == NULL)
157                                 elog(ERROR, "hash table corrupted");
158                 }
159         }
160 }
161
162 /* Complain about any remaining invalid-page entries */
163 void
164 XLogCheckInvalidPages(void)
165 {
166         HASH_SEQ_STATUS status;
167         xl_invalid_page *hentry;
168         bool            foundone = false;
169
170         if (invalid_page_tab == NULL)
171                 return;                                 /* nothing to do */
172
173         hash_seq_init(&status, invalid_page_tab);
174
175         /*
176          * Our strategy is to emit WARNING messages for all remaining entries and
177          * only PANIC after we've dumped all the available info.
178          */
179         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
180         {
181                 if (hentry->present)
182                         elog(WARNING, "page %u of relation %u/%u/%u was uninitialized",
183                                  hentry->key.blkno, hentry->key.node.spcNode,
184                                  hentry->key.node.dbNode, hentry->key.node.relNode);
185                 else
186                         elog(WARNING, "page %u of relation %u/%u/%u did not exist",
187                                  hentry->key.blkno, hentry->key.node.spcNode,
188                                  hentry->key.node.dbNode, hentry->key.node.relNode);
189                 foundone = true;
190         }
191
192         if (foundone)
193                 elog(PANIC, "WAL contains references to invalid pages");
194 }
195
196
197 /*
198  * XLogReadBuffer
199  *              Read a page during XLOG replay
200  *
201  * This is functionally comparable to ReadBuffer followed by
202  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
203  * and locked buffer.  (Getting the lock is not really necessary, since we
204  * expect that this is only used during single-process XLOG replay, but
205  * some subroutines such as MarkBufferDirty will complain if we don't.)
206  *
207  * If "init" is true then the caller intends to rewrite the page fully
208  * using the info in the XLOG record.  In this case we will extend the
209  * relation if needed to make the page exist, and we will not complain about
210  * the page being "new" (all zeroes); in fact, we usually will supply a
211  * zeroed buffer without reading the page at all, so as to avoid unnecessary
212  * failure if the page is present on disk but has corrupt headers.
213  *
214  * If "init" is false then the caller needs the page to be valid already.
215  * If the page doesn't exist or contains zeroes, we return InvalidBuffer.
216  * In this case the caller should silently skip the update on this page.
217  * (In this situation, we expect that the page was later dropped or truncated.
218  * If we don't see evidence of that later in the WAL sequence, we'll complain
219  * at the end of WAL replay.)
220  */
221 Buffer
222 XLogReadBuffer(Relation reln, BlockNumber blkno, bool init)
223 {
224         BlockNumber lastblock = RelationGetNumberOfBlocks(reln);
225         Buffer          buffer;
226
227         Assert(blkno != P_NEW);
228
229         if (blkno < lastblock)
230         {
231                 /* page exists in file */
232                 if (init)
233                         buffer = ReadOrZeroBuffer(reln, blkno);
234                 else
235                         buffer = ReadBuffer(reln, blkno);
236         }
237         else
238         {
239                 /* hm, page doesn't exist in file */
240                 if (!init)
241                 {
242                         log_invalid_page(reln->rd_node, blkno, false);
243                         return InvalidBuffer;
244                 }
245                 /* OK to extend the file */
246                 /* we do this in recovery only - no rel-extension lock needed */
247                 Assert(InRecovery);
248                 buffer = InvalidBuffer;
249                 while (blkno >= lastblock)
250                 {
251                         if (buffer != InvalidBuffer)
252                                 ReleaseBuffer(buffer);
253                         buffer = ReadBuffer(reln, P_NEW);
254                         lastblock++;
255                 }
256                 Assert(BufferGetBlockNumber(buffer) == blkno);
257         }
258
259         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
260
261         if (!init)
262         {
263                 /* check that page has been initialized */
264                 Page            page = (Page) BufferGetPage(buffer);
265
266                 if (PageIsNew((PageHeader) page))
267                 {
268                         UnlockReleaseBuffer(buffer);
269                         log_invalid_page(reln->rd_node, blkno, true);
270                         return InvalidBuffer;
271                 }
272         }
273
274         return buffer;
275 }
276
277
278 /*
279  * Lightweight "Relation" cache --- this substitutes for the normal relcache
280  * during XLOG replay.
281  */
282
/*
 * One slot of the lightweight relation cache: a RelationData plus the links
 * of a circular doubly-linked list kept in order of use.
 *
 * Slot 0 of _xlrelarr acts as the list head; _xl_init_rel_cache() makes it
 * point at itself in both directions.  XLogOpenRelation() links the slot it
 * returns in just before the head (head.lessRecently), and
 * _xl_new_reldesc() recycles the slot found at head.moreRecently.
 */
typedef struct XLogRelDesc
{
        RelationData reldata;
        struct XLogRelDesc *lessRecently;
        struct XLogRelDesc *moreRecently;
} XLogRelDesc;
289
/*
 * Entry of the _xlrelcache hash table: maps a RelFileNode (the hash key,
 * see keysize in _xl_init_rel_cache) to the cache slot describing it.
 */
typedef struct XLogRelCacheEntry
{
        RelFileNode rnode;
        XLogRelDesc *rdesc;
} XLogRelCacheEntry;
295
/* Hash table from RelFileNode to XLogRelCacheEntry; created by _xl_init_rel_cache */
static HTAB *_xlrelcache;
/* Array of _xlcnt cache slots; slot 0 is used as the recency-list head */
static XLogRelDesc *_xlrelarr = NULL;
/* Array of _xlcnt pg_class structs; slot i's reldata.rd_rel points at element i */
static Form_pg_class _xlpgcarr = NULL;
/* Index of the last slot handed out so far by _xl_new_reldesc */
static int      _xlast = 0;
/* Number of elements allocated in _xlrelarr and _xlpgcarr */
static int      _xlcnt = 0;

/* Number of slots allocated for the XLOG relation cache */
#define _XLOG_RELCACHESIZE      512
303
304 static void
305 _xl_init_rel_cache(void)
306 {
307         HASHCTL         ctl;
308
309         _xlcnt = _XLOG_RELCACHESIZE;
310         _xlast = 0;
311         _xlrelarr = (XLogRelDesc *) malloc(sizeof(XLogRelDesc) * _xlcnt);
312         memset(_xlrelarr, 0, sizeof(XLogRelDesc) * _xlcnt);
313         _xlpgcarr = (Form_pg_class) malloc(sizeof(FormData_pg_class) * _xlcnt);
314         memset(_xlpgcarr, 0, sizeof(FormData_pg_class) * _xlcnt);
315
316         _xlrelarr[0].moreRecently = &(_xlrelarr[0]);
317         _xlrelarr[0].lessRecently = &(_xlrelarr[0]);
318
319         memset(&ctl, 0, sizeof(ctl));
320         ctl.keysize = sizeof(RelFileNode);
321         ctl.entrysize = sizeof(XLogRelCacheEntry);
322         ctl.hash = tag_hash;
323
324         _xlrelcache = hash_create("XLOG relcache", _XLOG_RELCACHESIZE,
325                                                           &ctl, HASH_ELEM | HASH_FUNCTION);
326 }
327
328 static void
329 _xl_remove_hash_entry(XLogRelDesc *rdesc)
330 {
331         Form_pg_class tpgc = rdesc->reldata.rd_rel;
332         XLogRelCacheEntry *hentry;
333
334         rdesc->lessRecently->moreRecently = rdesc->moreRecently;
335         rdesc->moreRecently->lessRecently = rdesc->lessRecently;
336
337         hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache,
338                                           (void *) &(rdesc->reldata.rd_node), HASH_REMOVE, NULL);
339         if (hentry == NULL)
340                 elog(PANIC, "_xl_remove_hash_entry: file was not found in cache");
341
342         RelationCloseSmgr(&(rdesc->reldata));
343
344         memset(rdesc, 0, sizeof(XLogRelDesc));
345         memset(tpgc, 0, sizeof(FormData_pg_class));
346         rdesc->reldata.rd_rel = tpgc;
347 }
348
349 static XLogRelDesc *
350 _xl_new_reldesc(void)
351 {
352         XLogRelDesc *res;
353
354         _xlast++;
355         if (_xlast < _xlcnt)
356         {
357                 _xlrelarr[_xlast].reldata.rd_rel = &(_xlpgcarr[_xlast]);
358                 return &(_xlrelarr[_xlast]);
359         }
360
361         /* reuse */
362         res = _xlrelarr[0].moreRecently;
363
364         _xl_remove_hash_entry(res);
365
366         _xlast--;
367         return res;
368 }
369
370
371 void
372 XLogInitRelationCache(void)
373 {
374         _xl_init_rel_cache();
375         invalid_page_tab = NULL;
376 }
377
378 void
379 XLogCloseRelationCache(void)
380 {
381         HASH_SEQ_STATUS status;
382         XLogRelCacheEntry *hentry;
383
384         if (!_xlrelarr)
385                 return;
386
387         hash_seq_init(&status, _xlrelcache);
388
389         while ((hentry = (XLogRelCacheEntry *) hash_seq_search(&status)) != NULL)
390                 _xl_remove_hash_entry(hentry->rdesc);
391
392         hash_destroy(_xlrelcache);
393
394         free(_xlrelarr);
395         free(_xlpgcarr);
396
397         _xlrelarr = NULL;
398 }
399
400 /*
401  * Open a relation during XLOG replay
402  *
403  * Note: this once had an API that allowed NULL return on failure, but it
404  * no longer does; any failure results in elog().
405  */
406 Relation
407 XLogOpenRelation(RelFileNode rnode)
408 {
409         XLogRelDesc *res;
410         XLogRelCacheEntry *hentry;
411         bool            found;
412
413         hentry = (XLogRelCacheEntry *)
414                 hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);
415
416         if (hentry)
417         {
418                 res = hentry->rdesc;
419
420                 res->lessRecently->moreRecently = res->moreRecently;
421                 res->moreRecently->lessRecently = res->lessRecently;
422         }
423         else
424         {
425                 res = _xl_new_reldesc();
426
427                 sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode);
428
429                 res->reldata.rd_node = rnode;
430
431                 /*
432                  * We set up the lockRelId in case anything tries to lock the dummy
433                  * relation.  Note that this is fairly bogus since relNode may be
434                  * different from the relation's OID.  It shouldn't really matter
435                  * though, since we are presumably running by ourselves and can't have
436                  * any lock conflicts ...
437                  */
438                 res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode;
439                 res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;
440
441                 hentry = (XLogRelCacheEntry *)
442                         hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found);
443
444                 if (found)
445                         elog(PANIC, "xlog relation already present on insert into cache");
446
447                 hentry->rdesc = res;
448
449                 res->reldata.rd_targblock = InvalidBlockNumber;
450                 res->reldata.rd_smgr = NULL;
451                 RelationOpenSmgr(&(res->reldata));
452
453                 /*
454                  * Create the target file if it doesn't already exist.  This lets us
455                  * cope if the replay sequence contains writes to a relation that is
456                  * later deleted.  (The original coding of this routine would instead
457                  * return NULL, causing the writes to be suppressed. But that seems
458                  * like it risks losing valuable data if the filesystem loses an inode
459                  * during a crash.      Better to write the data until we are actually
460                  * told to delete the file.)
461                  */
462                 smgrcreate(res->reldata.rd_smgr, res->reldata.rd_istemp, true);
463         }
464
465         res->moreRecently = &(_xlrelarr[0]);
466         res->lessRecently = _xlrelarr[0].lessRecently;
467         _xlrelarr[0].lessRecently = res;
468         res->lessRecently->moreRecently = res;
469
470         return &(res->reldata);
471 }
472
473 /*
474  * Drop a relation during XLOG replay
475  *
476  * This is called when the relation is about to be deleted; we need to ensure
477  * that there is no dangling smgr reference in the xlog relation cache.
478  *
479  * Currently, we don't bother to physically remove the relation from the
480  * cache, we just let it age out normally.
481  *
482  * This also takes care of removing any open "invalid-page" records for
483  * the relation.
484  */
485 void
486 XLogDropRelation(RelFileNode rnode)
487 {
488         XLogRelCacheEntry *hentry;
489
490         hentry = (XLogRelCacheEntry *)
491                 hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);
492
493         if (hentry)
494         {
495                 XLogRelDesc *rdesc = hentry->rdesc;
496
497                 RelationCloseSmgr(&(rdesc->reldata));
498         }
499
500         forget_invalid_pages(rnode, 0);
501 }
502
503 /*
504  * Drop a whole database during XLOG replay
505  *
506  * As above, but for DROP DATABASE instead of dropping a single rel
507  */
508 void
509 XLogDropDatabase(Oid dbid)
510 {
511         HASH_SEQ_STATUS status;
512         XLogRelCacheEntry *hentry;
513
514         hash_seq_init(&status, _xlrelcache);
515
516         while ((hentry = (XLogRelCacheEntry *) hash_seq_search(&status)) != NULL)
517         {
518                 XLogRelDesc *rdesc = hentry->rdesc;
519
520                 if (hentry->rnode.dbNode == dbid)
521                         RelationCloseSmgr(&(rdesc->reldata));
522         }
523
524         forget_invalid_pages_db(dbid);
525 }
526
/*
 * Truncate a relation during XLOG replay
 *
 * We don't need to do anything to the fake relcache, but we do need to
 * clean up any open "invalid-page" records for the dropped pages.
 *
 * rnode identifies the relation.  nblocks is passed to
 * forget_invalid_pages() as its minimum block number, so remembered
 * invalid pages of this relation with block number >= nblocks are forgotten.
 */
void
XLogTruncateRelation(RelFileNode rnode, BlockNumber nblocks)
{
        forget_invalid_pages(rnode, nblocks);
}
535 {
536         forget_invalid_pages(rnode, nblocks);
537 }