1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  *        Implementation of Lehman and Yao's btree management algorithm for
5  *        Postgres.
6  *
7  * NOTES
8  *        This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  *        src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/progress.h"
26 #include "commands/vacuum.h"
27 #include "miscadmin.h"
28 #include "nodes/execnodes.h"
29 #include "pgstat.h"
30 #include "postmaster/autovacuum.h"
31 #include "storage/condition_variable.h"
32 #include "storage/indexfsm.h"
33 #include "storage/ipc.h"
34 #include "storage/lmgr.h"
35 #include "storage/smgr.h"
36 #include "utils/builtins.h"
37 #include "utils/index_selfuncs.h"
38 #include "utils/memutils.h"
39
40
41 /* Working state needed by btvacuumpage */
42 typedef struct
43 {
44         IndexVacuumInfo *info;
45         IndexBulkDeleteResult *stats;
46         IndexBulkDeleteCallback callback;
47         void       *callback_state;
48         BTCycleId       cycleid;
49         BlockNumber lastBlockVacuumed;  /* highest blkno actually vacuumed */
50         BlockNumber lastBlockLocked;    /* highest blkno we've cleanup-locked */
51         BlockNumber totFreePages;       /* true total # of free pages */
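        /* oldest btpo.xact among all deleted pages seen during the scan */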
52         TransactionId oldestBtpoXact;
53         MemoryContext pagedelcontext;
54 } BTVacState;
55
56 /*
57  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
58  *
59  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
60  * a new page; others must wait.
61  *
62  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
63  * to a new page; some process can start doing that.
64  *
65  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
66  * We reach this state once for every distinct combination of array keys.
67  */
68 typedef enum
69 {
70         BTPARALLEL_NOT_INITIALIZED,
71         BTPARALLEL_ADVANCING,
72         BTPARALLEL_IDLE,
73         BTPARALLEL_DONE
74 } BTPS_State;
75
76 /*
77  * BTParallelScanDescData contains btree specific shared information required
78  * for parallel scan.
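 *
 * A single BTParallelScanDescData lives in the parallel scan's shared memory
 * area (sized by btestimateparallelscan and set up by btinitparallelscan);
 * backends locate it through parallel_scan->ps_offset, as in the
 * OffsetToPointer() calls below.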
79  */
80 typedef struct BTParallelScanDescData
81 {
82         BlockNumber btps_scanPage;      /* latest or next page to be scanned */
83         BTPS_State      btps_pageStatus;        /* indicates whether next page is
84                                                                          * available for scan. see above for
85                                                                          * possible states of parallel scan. */
86         int                     btps_arrayKeyCount; /* count indicating number of array scan
87                                                                          * keys processed by parallel scan */
88         slock_t         btps_mutex;             /* protects above variables */
89         ConditionVariable btps_cv;      /* used to synchronize parallel scan */
90 }                       BTParallelScanDescData;
91
92 typedef struct BTParallelScanDescData *BTParallelScanDesc;
93
94
95 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
96                                                  IndexBulkDeleteCallback callback, void *callback_state,
97                                                  BTCycleId cycleid, TransactionId *oldestBtpoXact);
98 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
99                                                  BlockNumber orig_blkno);
100
101
102 /*
103  * Btree handler function: return IndexAmRoutine with access method parameters
104  * and callbacks.
105  */
106 Datum
107 bthandler(PG_FUNCTION_ARGS)
108 {
109         IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
110
111         amroutine->amstrategies = BTMaxStrategyNumber;
112         amroutine->amsupport = BTNProcs;
113         amroutine->amcanorder = true;
114         amroutine->amcanorderbyop = false;
115         amroutine->amcanbackward = true;
116         amroutine->amcanunique = true;
117         amroutine->amcanmulticol = true;
118         amroutine->amoptionalkey = true;
119         amroutine->amsearcharray = true;
120         amroutine->amsearchnulls = true;
121         amroutine->amstorage = false;
122         amroutine->amclusterable = true;
123         amroutine->ampredlocks = true;
124         amroutine->amcanparallel = true;
125         amroutine->amcaninclude = true;
126         amroutine->amkeytype = InvalidOid;
127
128         amroutine->ambuild = btbuild;
129         amroutine->ambuildempty = btbuildempty;
130         amroutine->aminsert = btinsert;
131         amroutine->ambulkdelete = btbulkdelete;
132         amroutine->amvacuumcleanup = btvacuumcleanup;
133         amroutine->amcanreturn = btcanreturn;
134         amroutine->amcostestimate = btcostestimate;
135         amroutine->amoptions = btoptions;
136         amroutine->amproperty = btproperty;
137         amroutine->ambuildphasename = btbuildphasename;
138         amroutine->amvalidate = btvalidate;
139         amroutine->ambeginscan = btbeginscan;
140         amroutine->amrescan = btrescan;
141         amroutine->amgettuple = btgettuple;
142         amroutine->amgetbitmap = btgetbitmap;
143         amroutine->amendscan = btendscan;
144         amroutine->ammarkpos = btmarkpos;
145         amroutine->amrestrpos = btrestrpos;
146         amroutine->amestimateparallelscan = btestimateparallelscan;
147         amroutine->aminitparallelscan = btinitparallelscan;
148         amroutine->amparallelrescan = btparallelrescan;
149
150         PG_RETURN_POINTER(amroutine);
151 }
152
153 /*
154  *      btbuildempty() -- build an empty btree index in the initialization fork
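 *
 *      For an unlogged index, the init fork built here is copied over the main
 *      fork when unlogged relations are reset at the end of crash recovery, so
 *      an empty metapage is all we need to write.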
155  */
156 void
157 btbuildempty(Relation index)
158 {
159         Page            metapage;
160
161         /* Construct metapage. */
162         metapage = (Page) palloc(BLCKSZ);
163         _bt_initmetapage(metapage, P_NONE, 0);
164
165         /*
166          * Write the page and log it.  It might seem that an immediate sync would
167          * be sufficient to guarantee that the file exists on disk, but recovery
168          * itself might remove it while replaying, for example, an
169          * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
170          * this even when wal_level=minimal.
171          */
172         PageSetChecksumInplace(metapage, BTREE_METAPAGE);
173         smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
174                           (char *) metapage, true);
175         log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
176                                 BTREE_METAPAGE, metapage, true);
177
178         /*
179          * An immediate sync is required even if we xlog'd the page, because the
180          * write did not go through shared_buffers and therefore a concurrent
181          * checkpoint may have moved the redo pointer past our xlog record.
182          */
183         smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
184 }
185
186 /*
187  *      btinsert() -- insert an index tuple into a btree.
188  *
189  *              Descend the tree recursively, find the appropriate location for our
190  *              new tuple, and put it there.
191  */
192 bool
193 btinsert(Relation rel, Datum *values, bool *isnull,
194                  ItemPointer ht_ctid, Relation heapRel,
195                  IndexUniqueCheck checkUnique,
196                  IndexInfo *indexInfo)
197 {
198         bool            result;
199         IndexTuple      itup;
200
201         /* generate an index tuple */
202         itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
203         itup->t_tid = *ht_ctid;
204
205         result = _bt_doinsert(rel, itup, checkUnique, heapRel);
206
207         pfree(itup);
208
209         return result;
210 }
211
212 /*
213  *      btgettuple() -- Get the next tuple in the scan.
214  */
215 bool
216 btgettuple(IndexScanDesc scan, ScanDirection dir)
217 {
218         BTScanOpaque so = (BTScanOpaque) scan->opaque;
219         bool            res;
220
221         /* btree indexes are never lossy */
222         scan->xs_recheck = false;
223
224         /*
225          * If we have any array keys, initialize them during first call for a
226          * scan.  We can't do this in btrescan because we don't know the scan
227          * direction at that time.
228          */
229         if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
230         {
231                 /* punt if we have any unsatisfiable array keys */
232                 if (so->numArrayKeys < 0)
233                         return false;
234
235                 _bt_start_array_keys(scan, dir);
236         }
237
238         /* This loop handles advancing to the next array elements, if any */
239         do
240         {
241                 /*
242                  * If we've already initialized this scan, we can just advance it in
243                  * the appropriate direction.  If we haven't done so yet, we call
244                  * _bt_first() to get the first item in the scan.
245                  */
246                 if (!BTScanPosIsValid(so->currPos))
247                         res = _bt_first(scan, dir);
248                 else
249                 {
250                         /*
251                          * Check to see if we should kill the previously-fetched tuple.
252                          */
253                         if (scan->kill_prior_tuple)
254                         {
255                                 /*
256                                  * Yes, remember it for later. (We'll deal with all such
257                                  * tuples at once right before leaving the index page.)  The
258                                  * test for numKilled overrun is not just paranoia: if the
259                                  * caller reverses direction in the indexscan then the same
260                                  * item might get entered multiple times. It's not worth
261                                  * trying to optimize that, so we don't detect it, but instead
262                                  * just forget any excess entries.
263                                  */
264                                 if (so->killedItems == NULL)
265                                         so->killedItems = (int *)
266                                                 palloc(MaxIndexTuplesPerPage * sizeof(int));
267                                 if (so->numKilled < MaxIndexTuplesPerPage)
268                                         so->killedItems[so->numKilled++] = so->currPos.itemIndex;
269                         }
270
271                         /*
272                          * Now continue the scan.
273                          */
274                         res = _bt_next(scan, dir);
275                 }
276
277                 /* If we have a tuple, return it ... */
278                 if (res)
279                         break;
280                 /* ... otherwise see if we have more array keys to deal with */
281         } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
282
283         return res;
284 }
285
286 /*
287  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
288  */
289 int64
290 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
291 {
292         BTScanOpaque so = (BTScanOpaque) scan->opaque;
293         int64           ntids = 0;
294         ItemPointer heapTid;
295
296         /*
297          * If we have any array keys, initialize them.
298          */
299         if (so->numArrayKeys)
300         {
301                 /* punt if we have any unsatisfiable array keys */
302                 if (so->numArrayKeys < 0)
303                         return ntids;
304
305                 _bt_start_array_keys(scan, ForwardScanDirection);
306         }
307
308         /* This loop handles advancing to the next array elements, if any */
309         do
310         {
311                 /* Fetch the first page & tuple */
312                 if (_bt_first(scan, ForwardScanDirection))
313                 {
314                         /* Save tuple ID, and continue scanning */
315                         heapTid = &scan->xs_heaptid;
316                         tbm_add_tuples(tbm, heapTid, 1, false);
317                         ntids++;
318
319                         for (;;)
320                         {
321                                 /*
322                                  * Advance to next tuple within page.  This is the same as the
323                                  * easy case in _bt_next().
324                                  */
325                                 if (++so->currPos.itemIndex > so->currPos.lastItem)
326                                 {
327                                         /* let _bt_next do the heavy lifting */
328                                         if (!_bt_next(scan, ForwardScanDirection))
329                                                 break;
330                                 }
331
332                                 /* Save tuple ID, and continue scanning */
333                                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
334                                 tbm_add_tuples(tbm, heapTid, 1, false);
335                                 ntids++;
336                         }
337                 }
338                 /* Now see if we have more array keys to deal with */
339         } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
340
341         return ntids;
342 }
343
344 /*
345  *      btbeginscan() -- start a scan on a btree index
346  */
347 IndexScanDesc
348 btbeginscan(Relation rel, int nkeys, int norderbys)
349 {
350         IndexScanDesc scan;
351         BTScanOpaque so;
352
353         /* no order by operators allowed */
354         Assert(norderbys == 0);
355
356         /* get the scan */
357         scan = RelationGetIndexScan(rel, nkeys, norderbys);
358
359         /* allocate private workspace */
360         so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
361         BTScanPosInvalidate(so->currPos);
362         BTScanPosInvalidate(so->markPos);
363         if (scan->numberOfKeys > 0)
364                 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
365         else
366                 so->keyData = NULL;
367
368         so->arrayKeyData = NULL;        /* assume no array keys for now */
369         so->numArrayKeys = 0;
370         so->arrayKeys = NULL;
371         so->arrayContext = NULL;
372
373         so->killedItems = NULL;         /* until needed */
374         so->numKilled = 0;
375
376         /*
377          * We don't know yet whether the scan will be index-only, so we do not
378          * allocate the tuple workspace arrays until btrescan.  However, we set up
379          * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
380          */
381         so->currTuples = so->markTuples = NULL;
382
383         scan->xs_itupdesc = RelationGetDescr(rel);
384
385         scan->opaque = so;
386
387         return scan;
388 }
389
390 /*
391  *      btrescan() -- rescan an index relation
392  */
393 void
394 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
395                  ScanKey orderbys, int norderbys)
396 {
397         BTScanOpaque so = (BTScanOpaque) scan->opaque;
398
399         /* we aren't holding any read locks, but gotta drop the pins */
400         if (BTScanPosIsValid(so->currPos))
401         {
402                 /* Before leaving current page, deal with any killed items */
403                 if (so->numKilled > 0)
404                         _bt_killitems(scan);
405                 BTScanPosUnpinIfPinned(so->currPos);
406                 BTScanPosInvalidate(so->currPos);
407         }
408
409         so->markItemIndex = -1;
410         so->arrayKeyCount = 0;
411         BTScanPosUnpinIfPinned(so->markPos);
412         BTScanPosInvalidate(so->markPos);
413
414         /*
415          * Allocate tuple workspace arrays, if needed for an index-only scan and
416          * not already done in a previous rescan call.  To save on palloc
417          * overhead, both workspaces are allocated as one palloc block; only this
418          * function and btendscan know that.
419          *
420          * NOTE: this data structure also makes it safe to return data from a
421          * "name" column, even though btree name_ops uses an underlying storage
422          * datatype of cstring.  The risk there is that "name" is supposed to be
423          * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
424          * However, since we only return data out of tuples sitting in the
425          * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
426          * data out of the markTuples array --- running off the end of memory for
427          * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
428          * adding special-case treatment for name_ops elsewhere.
429          */
430         if (scan->xs_want_itup && so->currTuples == NULL)
431         {
432                 so->currTuples = (char *) palloc(BLCKSZ * 2);
433                 so->markTuples = so->currTuples + BLCKSZ;
434         }
435
436         /*
437          * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
438          * - vadim 05/05/97
439          */
440         if (scankey && scan->numberOfKeys > 0)
441                 memmove(scan->keyData,
442                                 scankey,
443                                 scan->numberOfKeys * sizeof(ScanKeyData));
444         so->numberOfKeys = 0;           /* until _bt_preprocess_keys sets it */
445
446         /* If any keys are SK_SEARCHARRAY type, set up array-key info */
447         _bt_preprocess_array_keys(scan);
448 }
449
450 /*
451  *      btendscan() -- close down a scan
452  */
453 void
454 btendscan(IndexScanDesc scan)
455 {
456         BTScanOpaque so = (BTScanOpaque) scan->opaque;
457
458         /* we aren't holding any read locks, but gotta drop the pins */
459         if (BTScanPosIsValid(so->currPos))
460         {
461                 /* Before leaving current page, deal with any killed items */
462                 if (so->numKilled > 0)
463                         _bt_killitems(scan);
464                 BTScanPosUnpinIfPinned(so->currPos);
465         }
466
467         so->markItemIndex = -1;
468         BTScanPosUnpinIfPinned(so->markPos);
469
470         /* No need to invalidate positions, the RAM is about to be freed. */
471
472         /* Release storage */
473         if (so->keyData != NULL)
474                 pfree(so->keyData);
475         /* so->arrayKeyData and so->arrayKeys are in arrayContext */
476         if (so->arrayContext != NULL)
477                 MemoryContextDelete(so->arrayContext);
478         if (so->killedItems != NULL)
479                 pfree(so->killedItems);
480         if (so->currTuples != NULL)
481                 pfree(so->currTuples);
482         /* so->markTuples should not be pfree'd, see btrescan */
483         pfree(so);
484 }
485
486 /*
487  *      btmarkpos() -- save current scan position
488  */
489 void
490 btmarkpos(IndexScanDesc scan)
491 {
492         BTScanOpaque so = (BTScanOpaque) scan->opaque;
493
494         /* There may be an old mark with a pin (but no lock). */
495         BTScanPosUnpinIfPinned(so->markPos);
496
497         /*
498          * Just record the current itemIndex.  If we later step to next page
499          * before releasing the marked position, _bt_steppage makes a full copy of
500          * the currPos struct in markPos.  If (as often happens) the mark is moved
501          * before we leave the page, we don't have to do that work.
502          */
503         if (BTScanPosIsValid(so->currPos))
504                 so->markItemIndex = so->currPos.itemIndex;
505         else
506         {
507                 BTScanPosInvalidate(so->markPos);
508                 so->markItemIndex = -1;
509         }
510
511         /* Also record the current positions of any array keys */
512         if (so->numArrayKeys)
513                 _bt_mark_array_keys(scan);
514 }
515
516 /*
517  *      btrestrpos() -- restore scan to last saved position
518  */
519 void
520 btrestrpos(IndexScanDesc scan)
521 {
522         BTScanOpaque so = (BTScanOpaque) scan->opaque;
523
524         /* Restore the marked positions of any array keys */
525         if (so->numArrayKeys)
526                 _bt_restore_array_keys(scan);
527
528         if (so->markItemIndex >= 0)
529         {
530                 /*
531                  * The scan has never moved to a new page since the last mark.  Just
532                  * restore the itemIndex.
533                  *
534                  * NB: In this case we can't count on anything in so->markPos to be
535                  * accurate.
536                  */
537                 so->currPos.itemIndex = so->markItemIndex;
538         }
539         else
540         {
541                 /*
542                  * The scan moved to a new page after last mark or restore, and we are
543                  * now restoring to the marked page.  We aren't holding any read
544                  * locks, but if we're still holding the pin for the current position,
545                  * we must drop it.
546                  */
547                 if (BTScanPosIsValid(so->currPos))
548                 {
549                         /* Before leaving current page, deal with any killed items */
550                         if (so->numKilled > 0)
551                                 _bt_killitems(scan);
552                         BTScanPosUnpinIfPinned(so->currPos);
553                 }
554
555                 if (BTScanPosIsValid(so->markPos))
556                 {
557                         /* bump pin on mark buffer for assignment to current buffer */
558                         if (BTScanPosIsPinned(so->markPos))
559                                 IncrBufferRefCount(so->markPos.buf);
560                         memcpy(&so->currPos, &so->markPos,
561                                    offsetof(BTScanPosData, items[1]) +
562                                    so->markPos.lastItem * sizeof(BTScanPosItem));
563                         if (so->currTuples)
564                                 memcpy(so->currTuples, so->markTuples,
565                                            so->markPos.nextTupleOffset);
566                 }
567                 else
568                         BTScanPosInvalidate(so->currPos);
569         }
570 }
571
572 /*
573  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
574  */
575 Size
576 btestimateparallelscan(void)
577 {
578         return sizeof(BTParallelScanDescData);
579 }
580
581 /*
582  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
583  */
584 void
585 btinitparallelscan(void *target)
586 {
587         BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
588
589         SpinLockInit(&bt_target->btps_mutex);
590         bt_target->btps_scanPage = InvalidBlockNumber;
591         bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
592         bt_target->btps_arrayKeyCount = 0;
593         ConditionVariableInit(&bt_target->btps_cv);
594 }
595
596 /*
597  *      btparallelrescan() -- reset parallel scan
598  */
599 void
600 btparallelrescan(IndexScanDesc scan)
601 {
602         BTParallelScanDesc btscan;
603         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
604
605         Assert(parallel_scan);
606
607         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
608                                                                                                   parallel_scan->ps_offset);
609
610         /*
611          * In theory, we don't need to acquire the spinlock here, because there
612          * shouldn't be any other workers running at this point, but we do so for
613          * consistency.
614          */
615         SpinLockAcquire(&btscan->btps_mutex);
616         btscan->btps_scanPage = InvalidBlockNumber;
617         btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
618         btscan->btps_arrayKeyCount = 0;
619         SpinLockRelease(&btscan->btps_mutex);
620 }
621
622 /*
623  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
624  *              page.  Other scans must wait until we call _bt_parallel_release()
625  *              or _bt_parallel_done().
626  *
627  * The return value is true if we successfully seized the scan and false
628  * if we did not.  The latter case occurs if no pages remain for the current
629  * set of scankeys.
630  *
631  * If the return value is true, *pageno returns the next or current page
632  * of the scan (depending on the scan direction).  An invalid block number
633  * means the scan hasn't yet started, and P_NONE means we've reached the end.
634  * The first time a participating process reaches the last page, it will return
635  * true and set *pageno to P_NONE; after that, further attempts to seize the
636  * scan will return false.
637  *
638  * Callers should ignore the value of pageno if the return value is false.
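 *
 * Callers in nbtsearch.c seize the scan just before moving to a new page,
 * then either publish that page with _bt_parallel_release() or finish the
 * current set of array keys with _bt_parallel_done().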
639  */
640 bool
641 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
642 {
643         BTScanOpaque so = (BTScanOpaque) scan->opaque;
644         BTPS_State      pageStatus;
645         bool            exit_loop = false;
646         bool            status = true;
647         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
648         BTParallelScanDesc btscan;
649
650         *pageno = P_NONE;
651
652         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
653                                                                                                   parallel_scan->ps_offset);
654
655         while (1)
656         {
657                 SpinLockAcquire(&btscan->btps_mutex);
658                 pageStatus = btscan->btps_pageStatus;
659
660                 if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
661                 {
662                         /* Parallel scan has already advanced to a new set of scankeys. */
663                         status = false;
664                 }
665                 else if (pageStatus == BTPARALLEL_DONE)
666                 {
667                         /*
668                          * We're done with this set of scankeys.  This may be the end, or
669                          * there could be more sets to try.
670                          */
671                         status = false;
672                 }
673                 else if (pageStatus != BTPARALLEL_ADVANCING)
674                 {
675                         /*
676                          * We have successfully seized control of the scan for the purpose
677                          * of advancing it to a new page!
678                          */
679                         btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
680                         *pageno = btscan->btps_scanPage;
681                         exit_loop = true;
682                 }
683                 SpinLockRelease(&btscan->btps_mutex);
684                 if (exit_loop || !status)
685                         break;
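                /* another backend is advancing the scan; wait for it to finish */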
686                 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
687         }
688         ConditionVariableCancelSleep();
689
690         return status;
691 }
692
693 /*
694  * _bt_parallel_release() -- Complete the process of advancing the scan to a
695  *              new page.  We now have the new value btps_scanPage; some other backend
696  *              can now begin advancing the scan.
697  */
698 void
699 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
700 {
701         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
702         BTParallelScanDesc btscan;
703
704         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
705                                                                                                   parallel_scan->ps_offset);
706
707         SpinLockAcquire(&btscan->btps_mutex);
708         btscan->btps_scanPage = scan_page;
709         btscan->btps_pageStatus = BTPARALLEL_IDLE;
710         SpinLockRelease(&btscan->btps_mutex);
711         ConditionVariableSignal(&btscan->btps_cv);
712 }
713
714 /*
715  * _bt_parallel_done() -- Mark the parallel scan as complete.
716  *
717  * When there are no pages left to scan, this function should be called to
718  * notify other workers.  Otherwise, they might wait forever for the scan to
719  * advance to the next page.
720  */
721 void
722 _bt_parallel_done(IndexScanDesc scan)
723 {
724         BTScanOpaque so = (BTScanOpaque) scan->opaque;
725         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
726         BTParallelScanDesc btscan;
727         bool            status_changed = false;
728
729         /* Do nothing, for non-parallel scans */
730         if (parallel_scan == NULL)
731                 return;
732
733         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
734                                                                                                   parallel_scan->ps_offset);
735
736         /*
737          * Mark the parallel scan as done for this combination of scan keys,
738          * unless some other process already did so.  See also
739          * _bt_advance_array_keys.
740          */
741         SpinLockAcquire(&btscan->btps_mutex);
742         if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
743                 btscan->btps_pageStatus != BTPARALLEL_DONE)
744         {
745                 btscan->btps_pageStatus = BTPARALLEL_DONE;
746                 status_changed = true;
747         }
748         SpinLockRelease(&btscan->btps_mutex);
749
750         /* wake up all the workers associated with this parallel scan */
751         if (status_changed)
752                 ConditionVariableBroadcast(&btscan->btps_cv);
753 }
754
755 /*
756  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
757  *                      keys.
758  *
759  * Updates the count of array keys processed for both local and parallel
760  * scans.
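 *
 * This is called from _bt_advance_array_keys() when a parallel scan moves on
 * to the next set of array keys.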
761  */
762 void
763 _bt_parallel_advance_array_keys(IndexScanDesc scan)
764 {
765         BTScanOpaque so = (BTScanOpaque) scan->opaque;
766         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
767         BTParallelScanDesc btscan;
768
769         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
770                                                                                                   parallel_scan->ps_offset);
771
772         so->arrayKeyCount++;
773         SpinLockAcquire(&btscan->btps_mutex);
774         if (btscan->btps_pageStatus == BTPARALLEL_DONE)
775         {
776                 btscan->btps_scanPage = InvalidBlockNumber;
777                 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
778                 btscan->btps_arrayKeyCount++;
779         }
780         SpinLockRelease(&btscan->btps_mutex);
781 }
782
783 /*
784  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
785  *                      btbulkdelete() wasn't called.
786  */
787 static bool
788 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
789 {
790         Buffer          metabuf;
791         Page            metapg;
792         BTMetaPageData *metad;
793         bool            result = false;
794
795         metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
796         metapg = BufferGetPage(metabuf);
797         metad = BTPageGetMeta(metapg);
798
799         if (metad->btm_version < BTREE_NOVAC_VERSION)
800         {
801                 /*
802                  * Do cleanup if metapage needs upgrade, because we don't have
803                  * cleanup-related meta-information yet.
804                  */
805                 result = true;
806         }
807         else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
808                          TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
809                                                                    RecentGlobalXmin))
810         {
811                 /*
812                  * If oldest btpo.xact in the deleted pages is older than
813                  * RecentGlobalXmin, then at least one deleted page can be recycled.
814                  */
815                 result = true;
816         }
817         else
818         {
819                 StdRdOptions *relopts;
820                 float8          cleanup_scale_factor;
821                 float8          prev_num_heap_tuples;
822
823                 /*
824                  * If the table has received enough insertions and no cleanup has been
825                  * performed, the index would appear to have stale statistics.  If the
826                  * scale factor is set, we avoid that by performing cleanup once the
827                  * number of inserted tuples exceeds a vacuum_cleanup_index_scale_factor
828                  * fraction of the heap tuple count recorded at the last cleanup.
829                  */
830                 relopts = (StdRdOptions *) info->index->rd_options;
831                 cleanup_scale_factor = (relopts &&
832                                                                 relopts->vacuum_cleanup_index_scale_factor >= 0)
833                         ? relopts->vacuum_cleanup_index_scale_factor
834                         : vacuum_cleanup_index_scale_factor;
835                 prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
836
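                /*
                 * For example, with the default vacuum_cleanup_index_scale_factor
                 * of 0.1, a cleanup pass is forced once the heap has grown by more
                 * than 10% since the last cleanup, or when no previous tuple count
                 * was recorded.
                 */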
837                 if (cleanup_scale_factor <= 0 ||
838                         prev_num_heap_tuples <= 0 ||
839                         (info->num_heap_tuples - prev_num_heap_tuples) /
840                         prev_num_heap_tuples >= cleanup_scale_factor)
841                         result = true;
842         }
843
844         _bt_relbuf(info->index, metabuf);
845         return result;
846 }
847
848 /*
849  * Bulk deletion of all index entries pointing to a set of heap tuples.
850  * The set of target tuples is specified via a callback routine that tells
851  * whether any given heap tuple (identified by ItemPointer) is being deleted.
852  *
853  * Result: a palloc'd struct containing statistical info for VACUUM displays.
854  */
855 IndexBulkDeleteResult *
856 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
857                          IndexBulkDeleteCallback callback, void *callback_state)
858 {
859         Relation        rel = info->index;
860         BTCycleId       cycleid;
861
862         /* allocate stats if first time through, else re-use existing struct */
863         if (stats == NULL)
864                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
865
866         /* Establish the vacuum cycle ID to use for this scan */
867         /* The ENSURE stuff ensures we clean up shared memory on failure */
868         PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
869         {
870                 TransactionId oldestBtpoXact;
871
872                 cycleid = _bt_start_vacuum(rel);
873
874                 btvacuumscan(info, stats, callback, callback_state, cycleid,
875                                          &oldestBtpoXact);
876
877                 /*
878                  * Update cleanup-related information in the metapage.  This information
879                  * is used only for cleanup, but keeping it up to date can avoid an
880                  * unnecessary cleanup pass even after a bulkdelete.
881                  */
882                 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
883                                                                          info->num_heap_tuples);
884         }
885         PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
886         _bt_end_vacuum(rel);
887
888         return stats;
889 }
890
891 /*
892  * Post-VACUUM cleanup.
893  *
894  * Result: a palloc'd struct containing statistical info for VACUUM displays.
895  */
896 IndexBulkDeleteResult *
897 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
898 {
899         /* No-op in ANALYZE ONLY mode */
900         if (info->analyze_only)
901                 return stats;
902
903         /*
904          * If btbulkdelete was called, we need not do anything, just return the
905          * stats from the latest btbulkdelete call.  If it wasn't called, we might
906          * still need to do a pass over the index, to recycle any newly-recyclable
907          * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
908          * determines if either is needed.
909          *
910          * Since we aren't going to actually delete any leaf items, there's no
911          * need to go through all the vacuum-cycle-ID pushups.
912          */
913         if (stats == NULL)
914         {
915                 TransactionId oldestBtpoXact;
916
917                 /* Check if we need a cleanup */
918                 if (!_bt_vacuum_needs_cleanup(info))
919                         return NULL;
920
921                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
922                 btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
923
924                 /* Update cleanup-related information in the metapage */
925                 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
926                                                                          info->num_heap_tuples);
927         }
928
929         /*
930          * It's quite possible for us to be fooled by concurrent page splits into
931          * double-counting some index tuples, so disbelieve any total that exceeds
932          * the underlying heap's count ... if we know that accurately.  Otherwise
933          * this might just make matters worse.
934          */
935         if (!info->estimated_count)
936         {
937                 if (stats->num_index_tuples > info->num_heap_tuples)
938                         stats->num_index_tuples = info->num_heap_tuples;
939         }
940
941         return stats;
942 }
943
944 /*
945  * btvacuumscan --- scan the index for VACUUMing purposes
946  *
947  * This combines the functions of looking for leaf tuples that are deletable
948  * according to the vacuum callback, looking for empty pages that can be
949  * deleted, and looking for old deleted pages that can be recycled.  Both
950  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
951  * btbulkdelete call occurred).
952  *
953  * The caller is responsible for initially allocating/zeroing a stats struct
954  * and for obtaining a vacuum cycle ID if necessary.
955  */
956 static void
957 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
958                          IndexBulkDeleteCallback callback, void *callback_state,
959                          BTCycleId cycleid, TransactionId *oldestBtpoXact)
960 {
961         Relation        rel = info->index;
962         BTVacState      vstate;
963         BlockNumber num_pages;
964         BlockNumber blkno;
965         bool            needLock;
966
967         /*
968          * Reset counts that will be incremented during the scan; needed in case
969          * of multiple scans during a single VACUUM command
970          */
971         stats->estimated_count = false;
972         stats->num_index_tuples = 0;
973         stats->pages_deleted = 0;
974
975         /* Set up info to pass down to btvacuumpage */
976         vstate.info = info;
977         vstate.stats = stats;
978         vstate.callback = callback;
979         vstate.callback_state = callback_state;
980         vstate.cycleid = cycleid;
981         vstate.lastBlockVacuumed = BTREE_METAPAGE;      /* Initialise at first block */
982         vstate.lastBlockLocked = BTREE_METAPAGE;
983         vstate.totFreePages = 0;
984         vstate.oldestBtpoXact = InvalidTransactionId;
985
986         /* Create a temporary memory context to run _bt_pagedel in */
987         vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
988                                                                                                   "_bt_pagedel",
989                                                                                                   ALLOCSET_DEFAULT_SIZES);
990
991         /*
992          * The outer loop iterates over all index pages except the metapage, in
993          * physical order (we hope the kernel will cooperate in providing
994          * read-ahead for speed).  It is critical that we visit all leaf pages,
995          * including ones added after we start the scan, else we might fail to
996          * delete some deletable tuples.  Hence, we must repeatedly check the
997          * relation length.  We must acquire the relation-extension lock while
998          * doing so to avoid a race condition: if someone else is extending the
999          * relation, there is a window where bufmgr/smgr have created a new
1000          * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1001          * we manage to scan such a page here, we'll improperly assume it can be
1002          * recycled.  Taking the lock synchronizes things enough to prevent a
1003          * problem: either num_pages won't include the new page, or _bt_getbuf
1004          * already has write lock on the buffer and it will be fully initialized
1005          * before we can examine it.  (See also vacuumlazy.c, which has the same
1006          * issue.)      Also, we need not worry if a page is added immediately after
1007          * we look; the page splitting code already has write-lock on the left
1008          * page before it adds a right page, so we must already have processed any
1009          * tuples due to be moved into such a page.
1010          *
1011          * We can skip locking for new or temp relations, however, since no one
1012          * else could be accessing them.
1013          */
1014         needLock = !RELATION_IS_LOCAL(rel);
1015
1016         blkno = BTREE_METAPAGE + 1;
1017         for (;;)
1018         {
1019                 /* Get the current relation length */
1020                 if (needLock)
1021                         LockRelationForExtension(rel, ExclusiveLock);
1022                 num_pages = RelationGetNumberOfBlocks(rel);
1023                 if (needLock)
1024                         UnlockRelationForExtension(rel, ExclusiveLock);
1025
1026                 if (info->report_progress)
1027                         pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1028                                                                                  num_pages);
1029
1030                 /* Quit if we've scanned the whole relation */
1031                 if (blkno >= num_pages)
1032                         break;
1033                 /* Iterate over pages, then loop back to recheck length */
1034                 for (; blkno < num_pages; blkno++)
1035                 {
1036                         btvacuumpage(&vstate, blkno, blkno);
1037                         if (info->report_progress)
1038                                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1039                                                                                          blkno);
1040                 }
1041         }
1042
1043         /*
1044          * Check to see if we need to issue one final WAL record for this index,
1045          * which may be needed for correctness on a hot standby node when non-MVCC
1046          * index scans could take place.
1047          *
1048          * If the WAL is replayed in hot standby, the replay process needs to get
1049          * cleanup locks on all index leaf pages, just as we've been doing here.
1050          * However, we won't issue any WAL records about pages that have no items
1051          * to be deleted.  For pages between pages we've vacuumed, the replay code
1052          * will take locks under the direction of the lastBlockVacuumed fields in
1053          * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
1054          * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1055          * against the last leaf page in the index, if that one wasn't vacuumed.
1056          */
1057         if (XLogStandbyInfoActive() &&
1058                 vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1059         {
1060                 Buffer          buf;
1061
1062                 /*
1063                  * The page should be valid, but we can't use _bt_getbuf() because we
1064                  * want to use a nondefault buffer access strategy.  Since we aren't
1065                  * going to delete any items, getting cleanup lock again is probably
1066                  * overkill, but for consistency do that anyway.
1067                  */
1068                 buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1069                                                                  RBM_NORMAL, info->strategy);
1070                 LockBufferForCleanup(buf);
1071                 _bt_checkpage(rel, buf);
1072                 _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1073                 _bt_relbuf(rel, buf);
1074         }
1075
1076         MemoryContextDelete(vstate.pagedelcontext);
1077
1078         /*
1079          * If we found any recyclable pages (and recorded them in the FSM), then
1080          * forcibly update the upper-level FSM pages to ensure that searchers can
1081          * find them.  It's possible that the pages were also found during
1082          * previous scans and so this is a waste of time, but it's cheap enough
1083          * relative to scanning the index that it shouldn't matter much, and
1084          * making sure that free pages are available sooner not later seems
1085          * worthwhile.
1086          *
1087          * Note that if no recyclable pages exist, we don't bother vacuuming the
1088          * FSM at all.
1089          */
1090         if (vstate.totFreePages > 0)
1091                 IndexFreeSpaceMapVacuum(rel);
1092
1093         /* update statistics */
1094         stats->num_pages = num_pages;
1095         stats->pages_free = vstate.totFreePages;
1096
1097         if (oldestBtpoXact)
1098                 *oldestBtpoXact = vstate.oldestBtpoXact;
1099 }
1100
1101 /*
1102  * btvacuumpage --- VACUUM one page
1103  *
1104  * This processes a single page for btvacuumscan().  In some cases we
1105  * must go back and re-examine previously-scanned pages; this routine
1106  * recurses when necessary to handle that case.
1107  *
1108  * blkno is the page to process.  orig_blkno is the highest block number
1109  * reached by the outer btvacuumscan loop (the same as blkno, unless we
1110  * are recursing to re-examine a previous page).
1111  */
1112 static void
1113 btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1114 {
1115         IndexVacuumInfo *info = vstate->info;
1116         IndexBulkDeleteResult *stats = vstate->stats;
1117         IndexBulkDeleteCallback callback = vstate->callback;
1118         void       *callback_state = vstate->callback_state;
1119         Relation        rel = info->index;
1120         bool            delete_now;
1121         BlockNumber recurse_to;
1122         Buffer          buf;
1123         Page            page;
1124         BTPageOpaque opaque = NULL;
1125
1126 restart:
1127         delete_now = false;
1128         recurse_to = P_NONE;
1129
1130         /* call vacuum_delay_point while not holding any buffer lock */
1131         vacuum_delay_point();
1132
1133         /*
1134          * We can't use _bt_getbuf() here because it always applies
1135          * _bt_checkpage(), which will barf on an all-zero page. We want to
1136          * recycle all-zero pages, not fail.  Also, we want to use a nondefault
1137          * buffer access strategy.
1138          */
1139         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1140                                                          info->strategy);
1141         LockBuffer(buf, BT_READ);
1142         page = BufferGetPage(buf);
1143         if (!PageIsNew(page))
1144         {
1145                 _bt_checkpage(rel, buf);
1146                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1147         }
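        /*
         * Note that opaque is left NULL only for an all-zero (new) page, and
         * _bt_page_recyclable() treats such a page as recyclable before opaque
         * would ever be examined.
         */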
1148
1149         /*
1150          * If we are recursing, the only case we want to do anything with is a
1151          * live leaf page having the current vacuum cycle ID.  Any other state
1152          * implies we already saw the page (eg, deleted it as being empty).
1153          */
1154         if (blkno != orig_blkno)
1155         {
1156                 if (_bt_page_recyclable(page) ||
1157                         P_IGNORE(opaque) ||
1158                         !P_ISLEAF(opaque) ||
1159                         opaque->btpo_cycleid != vstate->cycleid)
1160                 {
1161                         _bt_relbuf(rel, buf);
1162                         return;
1163                 }
1164         }
1165
1166         /* Page is valid, see what to do with it */
1167         if (_bt_page_recyclable(page))
1168         {
1169                 /* Okay to recycle this page */
1170                 RecordFreeIndexPage(rel, blkno);
1171                 vstate->totFreePages++;
1172                 stats->pages_deleted++;
1173         }
1174         else if (P_ISDELETED(opaque))
1175         {
1176                 /* Already deleted, but can't recycle yet */
1177                 stats->pages_deleted++;
1178
1179                 /* Update the oldest btpo.xact */
1180                 if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1181                         TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1182                         vstate->oldestBtpoXact = opaque->btpo.xact;
1183         }
1184         else if (P_ISHALFDEAD(opaque))
1185         {
1186                 /* Half-dead, try to delete */
1187                 delete_now = true;
1188         }
1189         else if (P_ISLEAF(opaque))
1190         {
1191                 OffsetNumber deletable[MaxOffsetNumber];
1192                 int                     ndeletable;
1193                 OffsetNumber offnum,
1194                                         minoff,
1195                                         maxoff;
1196
1197                 /*
1198                  * Trade in the initial read lock for a super-exclusive write lock on
1199                  * this page.  We must get such a lock on every leaf page over the
1200                  * course of the vacuum scan, whether or not it actually contains any
1201                  * deletable tuples --- see nbtree/README.
1202                  */
1203                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1204                 LockBufferForCleanup(buf);
1205
1206                 /*
1207                  * Remember highest leaf page number we've taken cleanup lock on; see
1208                  * notes in btvacuumscan
1209                  */
1210                 if (blkno > vstate->lastBlockLocked)
1211                         vstate->lastBlockLocked = blkno;
1212
1213                 /*
1214                  * Check whether we need to recurse back to earlier pages.  What we
1215                  * are concerned about is a page split that happened since we started
1216                  * the vacuum scan.  If the split moved some tuples to a lower page
1217                  * then we might have missed 'em.  If so, set up for tail recursion.
1218                  * (Must do this before possibly clearing btpo_cycleid below!)
1219                  */
1220                 if (vstate->cycleid != 0 &&
1221                         opaque->btpo_cycleid == vstate->cycleid &&
1222                         !(opaque->btpo_flags & BTP_SPLIT_END) &&
1223                         !P_RIGHTMOST(opaque) &&
1224                         opaque->btpo_next < orig_blkno)
1225                         recurse_to = opaque->btpo_next;
1226
1227                 /*
1228                  * Scan over all items to see which ones need to be deleted according
1229                  * to the callback function.
1230                  */
1231                 ndeletable = 0;
1232                 minoff = P_FIRSTDATAKEY(opaque);
1233                 maxoff = PageGetMaxOffsetNumber(page);
1234                 if (callback)
1235                 {
1236                         for (offnum = minoff;
1237                                  offnum <= maxoff;
1238                                  offnum = OffsetNumberNext(offnum))
1239                         {
1240                                 IndexTuple      itup;
1241                                 ItemPointer htup;
1242
1243                                 itup = (IndexTuple) PageGetItem(page,
1244                                                                                                 PageGetItemId(page, offnum));
1245                                 htup = &(itup->t_tid);
1246
1247                                 /*
1248                                  * During Hot Standby we currently assume that
1249                                  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1250                                  * only true as long as the callback function depends only
1251                                  * upon whether the index tuple refers to heap tuples removed
1252                                  * in the initial heap scan. When vacuum starts it derives a
1253                                  * value of OldestXmin. Backends taking later snapshots could
1254                                  * have a RecentGlobalXmin with a later xid than the vacuum's
1255                                  * OldestXmin, so it is possible that row versions deleted
1256                                  * after OldestXmin could be marked as killed by other
1257                                  * backends. The callback function *could* look at the index
1258                                  * tuple state in isolation and decide to delete the index
1259                                  * tuple, though currently it does not. If it ever did, we
1260                                  * would need to reconsider whether XLOG_BTREE_VACUUM records
1261                                  * should cause conflicts. If they did cause conflicts they
1262                                  * would be fairly harsh conflicts, since we haven't yet
1263                                  * worked out a way to pass a useful value for
1264                                  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1265                                  * applies to *any* type of index that marks index tuples as
1266                                  * killed.
1267                                  */
1268                                 if (callback(htup, callback_state))
1269                                         deletable[ndeletable++] = offnum;
1270                         }
1271                 }
1272
1273                 /*
1274                  * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
1275                  * call per page, so as to minimize WAL traffic.
1276                  */
1277                 if (ndeletable > 0)
1278                 {
1279                         /*
1280                          * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1281                          * all the information the replay code needs to get a cleanup
1282                          * lock on all pages between the previous lastBlockVacuumed and
1283                          * this page. This ensures that WAL replay locks all leaf pages at
1284                          * some point, which is important should non-MVCC scans be
1285                          * requested. This is currently unused on standby, but we record
1286                          * it anyway, so that the WAL contains the required information.
1287                          *
1288                          * Since we can visit leaf pages out-of-order when recursing,
1289                          * replay might end up locking such pages an extra time, but it
1290                          * doesn't seem worth the amount of bookkeeping it'd take to avoid
1291                          * that.
1292                          */
1293                         _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1294                                                                 vstate->lastBlockVacuumed);
1295
1296                         /*
1297                          * Remember highest leaf page number we've issued a
1298                          * XLOG_BTREE_VACUUM WAL record for.
1299                          */
1300                         if (blkno > vstate->lastBlockVacuumed)
1301                                 vstate->lastBlockVacuumed = blkno;
1302
1303                         stats->tuples_removed += ndeletable;
1304                         /* must recompute maxoff */
1305                         maxoff = PageGetMaxOffsetNumber(page);
1306                 }
1307                 else
1308                 {
1309                         /*
1310                          * If the page has been split during this vacuum cycle, it seems
1311                          * worth expending a write to clear btpo_cycleid even if we don't
1312                          * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
1313                          * care of this.)  This ensures we won't process the page again.
1314                          *
1315                          * We treat this like a hint-bit update because there's no need to
1316                          * WAL-log it.
1317                          */
1318                         if (vstate->cycleid != 0 &&
1319                                 opaque->btpo_cycleid == vstate->cycleid)
1320                         {
1321                                 opaque->btpo_cycleid = 0;
1322                                 MarkBufferDirtyHint(buf, true);
1323                         }
1324                 }
1325
1326                 /*
1327                  * If it's now empty, try to delete; else count the live tuples. We
1328                  * don't delete when recursing, though, to avoid putting entries into
1329                  * freePages out-of-order (doesn't seem worth any extra code to handle
1330                  * the case).
1331                  */
1332                 if (minoff > maxoff)
1333                         delete_now = (blkno == orig_blkno);
1334                 else
1335                         stats->num_index_tuples += maxoff - minoff + 1;
1336         }
1337
1338         if (delete_now)
1339         {
1340                 MemoryContext oldcontext;
1341                 int                     ndel;
1342
1343                 /* Run pagedel in a temp context to avoid memory leakage */
1344                 MemoryContextReset(vstate->pagedelcontext);
1345                 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1346
1347                 ndel = _bt_pagedel(rel, buf);
1348
1349                 /* count only this page, else may double-count parent */
1350                 if (ndel)
1351                 {
1352                         stats->pages_deleted++;
1353                         if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1354                                 TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1355                                 vstate->oldestBtpoXact = opaque->btpo.xact;
1356                 }
1357
1358                 MemoryContextSwitchTo(oldcontext);
1359                 /* pagedel released buffer, so we shouldn't */
1360         }
1361         else
1362                 _bt_relbuf(rel, buf);
1363
1364         /*
1365          * This is really tail recursion, but if the compiler is too stupid to
1366          * optimize it as such, we'd eat an uncomfortably large amount of stack
1367          * space per recursion level (due to the deletable[] array). A failure is
1368          * improbable since the number of levels isn't likely to be large ... but
1369          * just in case, let's hand-optimize into a loop.
1370          */
1371         if (recurse_to != P_NONE)
1372         {
1373                 blkno = recurse_to;
1374                 goto restart;
1375         }
1376 }
1377
1378 /*
1379  *      btcanreturn() -- Check whether btree indexes support index-only scans.
1380  *
1381  * btrees always do, so this is trivial.
1382  */
1383 bool
1384 btcanreturn(Relation index, int attno)
1385 {
1386         return true;
1387 }