1 /*-------------------------------------------------------------------------
4 * WAL replay logic for GiST.
7 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gist/gistxlog.c
12 *-------------------------------------------------------------------------
16 #include "access/gist_private.h"
17 #include "access/xlogutils.h"
18 #include "utils/memutils.h"
28 gistxlogPageSplit *data;
/*
 * opCtx is assigned in gist_xlog_startup(), made the current memory context
 * for the duration of each gist_redo() call and reset right after it, and
 * deleted in gist_xlog_cleanup().
 */
32 static MemoryContext opCtx; /* working memory for operations */
35 * Replay the clearing of F_FOLLOW_RIGHT flag on a child page.
37 * Even if the WAL record includes a full-page image, we have to update the
38 * follow-right flag, because that change is not included in the full-page
39 * image. To be sure that the intermediate state with the wrong flag value is
40 * not visible to concurrent Hot Standby queries, this function handles
41 * restoring the full-page image as well as updating the flag. (Note that
42 * we never need to do anything else to the child page in the current WAL
/*
 * gistRedoClearFollowRight
 *
 * Sets the NSN of one page to the record's LSN and clears its follow-right
 * flag, then stamps the page LSN and marks the buffer dirty.
 *
 *	lsn			 LSN of the record being replayed; written as both NSN and
 *				 page LSN.
 *	record		 WAL record; xl_info is tested for a backup block in slot
 *				 block_index.
 *	block_index	 backup-block slot to restore from, when present.
 *	node, childblkno  relation and block passed to XLogReadBuffer when the
 *				 record carries no backup block for that slot.
 *
 * NOTE(review): the embedded line numbers in this listing skip values, so
 * the return type, braces, local declarations and (presumably) an "else"
 * between the two buffer-acquisition paths are not visible here.  Confirm
 * the exact control flow against the complete file.
 */
46 gistRedoClearFollowRight(XLogRecPtr lsn, XLogRecord *record, int block_index,
47 RelFileNode node, BlockNumber childblkno)
/* The record carries a backup block in this slot: obtain the buffer from it. */
52 if (record->xl_info & XLR_BKP_BLOCK(block_index))
53 buffer = RestoreBackupBlock(lsn, record, block_index, false, true);
/* Otherwise (presumably the else branch) read the block; give up if invalid. */
56 buffer = XLogReadBuffer(node, childblkno, false);
57 if (!BufferIsValid(buffer))
58 return; /* page was deleted, nothing to do */
60 page = (Page) BufferGetPage(buffer);
/* The test below is ">=", not ">": an equal page LSN still gets the update. */
63 * Note that we still update the page even if page LSN is equal to the LSN
64 * of this record, because the updated NSN is not included in the full
67 if (lsn >= PageGetLSN(page))
69 GistPageSetNSN(page, lsn);
70 GistClearFollowRight(page);
72 PageSetLSN(page, lsn);
73 MarkBufferDirty(buffer);
/* The buffer is released whether or not the page was modified. */
75 UnlockReleaseBuffer(buffer);
79 * redo any page update (except page split)
/*
 * gistRedoPageUpdateRecord
 *
 * Replays the record type dispatched from gist_redo() for
 * XLOG_GIST_PAGE_UPDATE.  As read below, the record payload is laid out as:
 *
 *	gistxlogPageUpdate header
 *	xldata->ntodelete OffsetNumber entries (offsets to delete)
 *	zero or more index tuples, up to record->xl_len
 *
 * Visible steps: obtain the target buffer (backup block 0 or
 * XLogReadBuffer), clear follow-right on the left child when
 * xldata->leftchild is valid (backup-block slot 1), stop early when the
 * target buffer is invalid, when the page was restored from a backup block,
 * or when the page LSN is already >= lsn; otherwise delete, insert, fix up
 * flags, reset rightlink, and stamp the LSN.
 *
 * NOTE(review): the embedded line numbers skip values, so braces, local
 * declarations, "else" lines and the "return" statements of the early exits
 * are not visible in this listing.  Confirm against the complete file.
 */
82 gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record)
84 char *begin = XLogRecGetData(record);
85 gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin;
91 * We need to acquire and hold lock on target page while updating the left
92 * child page. If we have a full-page image of target page, getting the
93 * lock is a side-effect of restoring that image. Note that even if the
94 * target page no longer exists, we'll still attempt to replay the change
97 if (record->xl_info & XLR_BKP_BLOCK(0))
98 buffer = RestoreBackupBlock(lsn, record, 0, false, true);
100 buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
102 /* Fix follow-right data on left child page */
103 if (BlockNumberIsValid(xldata->leftchild))
104 gistRedoClearFollowRight(lsn, record, 1,
105 xldata->node, xldata->leftchild);
107 /* Done if target page no longer exists */
108 if (!BufferIsValid(buffer))
111 /* nothing more to do if page was backed up (and no info to do it with) */
112 if (record->xl_info & XLR_BKP_BLOCK(0))
114 UnlockReleaseBuffer(buffer);
118 page = (Page) BufferGetPage(buffer);
120 /* nothing more to do if change already applied */
121 if (lsn <= PageGetLSN(page))
123 UnlockReleaseBuffer(buffer);
/* "data" is the read cursor; it starts just past the fixed-size header. */
127 data = begin + sizeof(gistxlogPageUpdate);
129 /* Delete old tuples */
130 if (xldata->ntodelete > 0)
133 OffsetNumber *todelete = (OffsetNumber *) data;
/* Advance the cursor past the offset array before walking it. */
135 data += sizeof(OffsetNumber) * xldata->ntodelete;
137 for (i = 0; i < xldata->ntodelete; i++)
138 PageIndexTupleDelete(page, todelete[i]);
139 if (GistPageIsLeaf(page))
140 GistMarkTuplesDeleted(page);
/* Any bytes left before xl_len are index tuples to add to the page. */
144 if (data - begin < record->xl_len)
/* First insertion offset: FirstOffsetNumber on an empty page, else max + 1. */
146 OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
147 OffsetNumberNext(PageGetMaxOffsetNumber(page));
149 while (data - begin < record->xl_len)
151 IndexTuple itup = (IndexTuple) data;
152 Size sz = IndexTupleSize(itup);
/*
 * NOTE(review): the lines that advance "data" and "off" are not visible in
 * this listing; the loop condition above implies "data" moves forward by sz.
 */
157 l = PageAddItem(page, (Item) itup, sz, off, false, false);
158 if (l == InvalidOffsetNumber)
159 elog(ERROR, "failed to add item to GiST index page, size %d bytes",
167 * special case: leafpage, nothing to insert, nothing to delete, then
170 if (GistPageIsLeaf(page) && xldata->ntodelete == 0)
171 GistClearTuplesDeleted(page);
/* A non-leaf root block with no items left is turned back into a leaf. */
174 if (!GistPageIsLeaf(page) &&
175 PageGetMaxOffsetNumber(page) == InvalidOffsetNumber &&
176 xldata->blkno == GIST_ROOT_BLKNO)
179 * all links on non-leaf root page was deleted by vacuum full, so root
180 * page becomes a leaf
182 GistPageSetLeaf(page);
/* Finish: reset rightlink, stamp the LSN, dirty and release the buffer. */
185 GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
186 PageSetLSN(page, lsn);
187 MarkBufferDirty(buffer);
188 UnlockReleaseBuffer(buffer);
/*
 * decodePageSplitRecord
 *
 * Builds an in-memory index over a page-split record without copying tuple
 * data: decoded->data, each page[i].header and each page[i].itup[j] are
 * pointers into the record payload returned by XLogRecGetData().  Only the
 * page array and the per-page itup pointer arrays are palloc'd.
 *
 * Payload layout as walked below: a gistxlogPageSplit header, then for each
 * of npage pages a gistxlogPage header followed by header->num index tuples.
 *
 * NOTE(review): the embedded line numbers skip values; the declarations of
 * ptr/i/j and the initialisation and increment of j are not visible here.
 */
192 decodePageSplitRecord(PageSplitRecord *decoded, XLogRecord *record)
194 char *begin = XLogRecGetData(record),
199 decoded->data = (gistxlogPageSplit *) begin;
200 decoded->page = (NewPage *) palloc(sizeof(NewPage) * decoded->data->npage);
/* ptr is the read cursor, starting just past the fixed-size split header. */
202 ptr = begin + sizeof(gistxlogPageSplit);
203 for (i = 0; i < decoded->data->npage; i++)
/* Assertion-only check that the cursor is still inside the record. */
205 Assert(ptr - begin < record->xl_len);
206 decoded->page[i].header = (gistxlogPage *) ptr;
207 ptr += sizeof(gistxlogPage);
209 decoded->page[i].itup = (IndexTuple *)
210 palloc(sizeof(IndexTuple) * decoded->page[i].header->num);
212 while (j < decoded->page[i].header->num)
214 Assert(ptr - begin < record->xl_len);
215 decoded->page[i].itup[j] = (IndexTuple) ptr;
/* Tuples are variable length; step by the size stored in the tuple itself. */
216 ptr += IndexTupleSize((IndexTuple) ptr);
/*
 * gistRedoPageSplitRecord
 *
 * Replays the record type dispatched from gist_redo() for
 * XLOG_GIST_PAGE_SPLIT.  The record is decoded with decodePageSplitRecord();
 * each listed page is then (re)initialised, filled with its tuples, given
 * its rightlink / NSN / follow-right state, stamped with lsn and marked
 * dirty.  Afterwards the left child's follow-right flag is cleared (using
 * backup-block slot 0) when xldata->leftchild is valid, and finally
 * firstbuffer is released.
 *
 * NOTE(review): the embedded line numbers skip values.  Not visible in this
 * listing: braces, several local declarations, the body of the first
 * GIST_ROOT_BLKNO test (presumably where isrootsplit is set), the assignment
 * of "flags", and the condition choosing between "firstbuffer = buffer" and
 * the immediate UnlockReleaseBuffer (presumably i == 0, per the locking
 * comment below).  Confirm against the complete file.
 */
223 gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record)
225 gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record);
226 PageSplitRecord xlrec;
227 Buffer firstbuffer = InvalidBuffer;
231 bool isrootsplit = false;
233 decodePageSplitRecord(&xlrec, record);
236 * We must hold lock on the first-listed page throughout the action,
237 * including while updating the left child page (if any). We can unlock
238 * remaining pages in the list as soon as they've been written, because
239 * there is no path for concurrent queries to reach those pages without
240 * first visiting the first-listed page.
243 /* loop around all pages */
244 for (i = 0; i < xlrec.data->npage; i++)
246 NewPage *newpage = xlrec.page + i;
249 if (newpage->header->blkno == GIST_ROOT_BLKNO)
/* Last argument is true here, unlike the "false" used by the update redo. */
255 buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true);
256 Assert(BufferIsValid(buffer));
257 page = (Page) BufferGetPage(buffer);
259 /* ok, clear buffer */
260 if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO)
264 GISTInitBuffer(buffer, flags);
/* Add this page's decoded tuples starting at the first offset. */
267 gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber);
/* Root block: no right sibling and follow-right cleared. */
269 if (newpage->header->blkno == GIST_ROOT_BLKNO)
271 GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
272 GistPageSetNSN(page, xldata->orignsn);
273 GistClearFollowRight(page);
/*
 * Non-root page: chain to the next listed page, or to the original page's
 * old rightlink for the last one.
 */
277 if (i < xlrec.data->npage - 1)
278 GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno;
280 GistPageGetOpaque(page)->rightlink = xldata->origrlink;
281 GistPageSetNSN(page, xldata->orignsn);
/*
 * Follow-right is set only on non-last pages, outside a root split, and
 * only when the record requests it.
 */
282 if (i < xlrec.data->npage - 1 && !isrootsplit &&
283 xldata->markfollowright)
284 GistMarkFollowRight(page);
286 GistClearFollowRight(page);
289 PageSetLSN(page, lsn);
290 MarkBufferDirty(buffer);
293 firstbuffer = buffer;
295 UnlockReleaseBuffer(buffer);
298 /* Fix follow-right data on left child page, if any */
299 if (BlockNumberIsValid(xldata->leftchild))
300 gistRedoClearFollowRight(lsn, record, 0,
301 xldata->node, xldata->leftchild);
303 /* Finally, release lock on the first page */
304 UnlockReleaseBuffer(firstbuffer);
/*
 * gistRedoCreateIndex
 *
 * Replays the record type dispatched from gist_redo() for
 * XLOG_GIST_CREATE_INDEX.  The record payload is a RelFileNode; the block
 * GIST_ROOT_BLKNO of that relation is initialised with the F_LEAF flag,
 * stamped with lsn, marked dirty and released.
 *
 * NOTE(review): the embedded line numbers skip values; the return type,
 * braces and the declarations of buffer/page are not visible here.
 */
308 gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
310 RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
314 /* Backup blocks are not used in create_index records */
315 Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
317 buffer = XLogReadBuffer(*node, GIST_ROOT_BLKNO, true);
318 Assert(BufferIsValid(buffer));
319 page = (Page) BufferGetPage(buffer);
321 GISTInitBuffer(buffer, F_LEAF);
323 PageSetLSN(page, lsn);
325 MarkBufferDirty(buffer);
326 UnlockReleaseBuffer(buffer);
/*
 * gist_redo
 *
 * Entry point for replaying one GiST WAL record.  The operation code is
 * taken from xl_info with the XLR_INFO_MASK bits removed and routed to the
 * matching redo routine; an unrecognised code raises PANIC.  Work is done
 * with opCtx as the current memory context, which is reset after every
 * record, so allocations made by the redo routines do not outlive the call.
 *
 * NOTE(review): the embedded line numbers skip values; the "switch",
 * "break" and "default" lines and the braces are not visible in this
 * listing.
 */
330 gist_redo(XLogRecPtr lsn, XLogRecord *record)
332 uint8 info = record->xl_info & ~XLR_INFO_MASK;
333 MemoryContext oldCxt;
336 * GiST indexes do not require any conflict processing. NB: If we ever
337 * implement a similar optimization we have in b-tree, and remove killed
338 * tuples outside VACUUM, we'll need to handle that here.
341 oldCxt = MemoryContextSwitchTo(opCtx);
344 case XLOG_GIST_PAGE_UPDATE:
345 gistRedoPageUpdateRecord(lsn, record);
347 case XLOG_GIST_PAGE_SPLIT:
348 gistRedoPageSplitRecord(lsn, record);
350 case XLOG_GIST_CREATE_INDEX:
351 gistRedoCreateIndex(lsn, record);
354 elog(PANIC, "gist_redo: unknown op code %u", info);
/* Restore the caller's context, then drop everything allocated in opCtx. */
357 MemoryContextSwitchTo(oldCxt);
358 MemoryContextReset(opCtx);
/*
 * gist_xlog_startup
 *
 * Creates the working memory context (opCtx) that gist_redo() switches into
 * for each record.
 */
362 gist_xlog_startup(void)
364 opCtx = createTempGistContext();
/*
 * gist_xlog_cleanup
 *
 * Deletes the working memory context assigned in gist_xlog_startup().
 */
368 gist_xlog_cleanup(void)
370 MemoryContextDelete(opCtx);
374 * Write WAL record of a page split.
/*
 * gistXLogSplit
 *
 * Assembles an XLogRecData chain describing a page split and inserts it as
 * an XLOG_GIST_PAGE_SPLIT record.  Chain contents, in order:
 *
 *	rdata[0]	the gistxlogPageSplit header (not tied to a buffer)
 *	optional	an entry with no data that references leftchildbuf
 *	per page	two entries from the "dist" list: the gistxlogPage block
 *				header (ptr->block) and the tuple list (ptr->list,
 *				ptr->lenlist bytes)
 *
 * This is why the array is sized npage * 2 + 2.
 *
 * NOTE(review): the embedded line numbers skip values.  Not visible in this
 * listing: the return type, local declarations, the body of the counting
 * loop (presumably npage++), the initialisation and increments of "cur", the
 * left-hand side of the leftchild assignment, and whatever follows
 * XLogInsert (cleanup / return).  Confirm against the complete file.
 */
377 gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
378 SplitedPageLayout *dist,
379 BlockNumber origrlink, GistNSN orignsn,
380 Buffer leftchildbuf, bool markfollowright)
383 gistxlogPageSplit xlrec;
384 SplitedPageLayout *ptr;
/* First pass over the list (body not visible; presumably counts pages). */
389 for (ptr = dist; ptr; ptr = ptr->next)
392 rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (npage * 2 + 2));
395 xlrec.origblkno = blkno;
396 xlrec.origrlink = origrlink;
397 xlrec.orignsn = orignsn;
398 xlrec.origleaf = page_is_leaf;
/* npage is narrowed to uint16 for the on-disk header. */
399 xlrec.npage = (uint16) npage;
401 BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
402 xlrec.markfollowright = markfollowright;
404 rdata[0].data = (char *) &xlrec;
405 rdata[0].len = sizeof(gistxlogPageSplit);
406 rdata[0].buffer = InvalidBuffer;
411 * Include a full page image of the child buf. (only necessary if a
412 * checkpoint happened since the child page was split)
414 if (BufferIsValid(leftchildbuf))
416 rdata[cur - 1].next = &(rdata[cur]);
417 rdata[cur].data = NULL;
419 rdata[cur].buffer = leftchildbuf;
420 rdata[cur].buffer_std = true;
/* Second pass: append header and tuple-list entries for every new page. */
424 for (ptr = dist; ptr; ptr = ptr->next)
426 rdata[cur - 1].next = &(rdata[cur]);
427 rdata[cur].buffer = InvalidBuffer;
428 rdata[cur].data = (char *) &(ptr->block);
429 rdata[cur].len = sizeof(gistxlogPage);
432 rdata[cur - 1].next = &(rdata[cur]);
433 rdata[cur].buffer = InvalidBuffer;
434 rdata[cur].data = (char *) (ptr->list);
435 rdata[cur].len = ptr->lenlist;
/* Terminate the chain at the last entry that was filled in. */
438 rdata[cur - 1].next = NULL;
440 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata);
447 * Write XLOG record describing a page update. The update can include any
448 * number of deletions and/or insertions of tuples on a single index page.
450 * If this update inserts a downlink for a split page, also record that
451 * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set.
453 * Note that both the todelete array and the tuples are marked as belonging
454 * to the target buffer; they need not be stored in XLOG if XLogInsert decides
455 * to log the whole buffer contents instead. Also, we take care that there's
456 * at least one rdata item referencing the buffer, even when ntodelete and
457 * ituplen are both zero; this ensures that XLogInsert knows about the buffer.
/*
 * gistXLogUpdate
 *
 * Assembles an XLogRecData chain describing a single-page update and inserts
 * it as an XLOG_GIST_PAGE_UPDATE record.  Chain contents, in order:
 *
 *	rdata[0]	the gistxlogPageUpdate header (not tied to a buffer)
 *	rdata[1]	the todelete offset array, tied to "buffer"; its length is
 *				sizeof(OffsetNumber) * ntodelete, so it can be zero
 *	ituplen		one entry per index tuple, each tied to "buffer"
 *	optional	an entry with no data that references leftchildbuf
 *
 * This is why the array is sized 3 + ituplen.
 *
 * NOTE(review): the embedded line numbers skip values.  Not visible in this
 * listing: the return type, the end of the parameter list (leftchildbuf is
 * used below but its declaration line is missing), local declarations, the
 * initialisation and increments of "cur", the left-hand side of the
 * leftchild assignment, and whatever follows XLogInsert (cleanup / return).
 * Confirm against the complete file.
 */
460 gistXLogUpdate(RelFileNode node, Buffer buffer,
461 OffsetNumber *todelete, int ntodelete,
462 IndexTuple *itup, int ituplen,
466 gistxlogPageUpdate xlrec;
471 rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (3 + ituplen));
474 xlrec.blkno = BufferGetBlockNumber(buffer);
475 xlrec.ntodelete = ntodelete;
477 BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
479 rdata[0].data = (char *) &xlrec;
480 rdata[0].len = sizeof(gistxlogPageUpdate);
481 rdata[0].buffer = InvalidBuffer;
482 rdata[0].next = &(rdata[1]);
/* Always present, even when ntodelete is zero, and tied to "buffer". */
484 rdata[1].data = (char *) todelete;
485 rdata[1].len = sizeof(OffsetNumber) * ntodelete;
486 rdata[1].buffer = buffer;
487 rdata[1].buffer_std = true;
/* New tuples: one chain entry each, also tied to the target buffer. */
492 for (i = 0; i < ituplen; i++)
494 rdata[cur - 1].next = &(rdata[cur]);
495 rdata[cur].data = (char *) (itup[i]);
496 rdata[cur].len = IndexTupleSize(itup[i]);
497 rdata[cur].buffer = buffer;
498 rdata[cur].buffer_std = true;
503 * Include a full page image of the child buf. (only necessary if a
504 * checkpoint happened since the child page was split)
506 if (BufferIsValid(leftchildbuf))
508 rdata[cur - 1].next = &(rdata[cur]);
509 rdata[cur].data = NULL;
511 rdata[cur].buffer = leftchildbuf;
512 rdata[cur].buffer_std = true;
/* Terminate the chain at the last entry that was filled in. */
515 rdata[cur - 1].next = NULL;
517 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);