1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  *        Implementation of Lehman and Yao's btree management algorithm for
5  *        Postgres.
6  *
7  * NOTES
8  *        This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  *        src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/vacuum.h"
26 #include "miscadmin.h"
27 #include "nodes/execnodes.h"
28 #include "pgstat.h"
29 #include "postmaster/autovacuum.h"
30 #include "storage/condition_variable.h"
31 #include "storage/indexfsm.h"
32 #include "storage/ipc.h"
33 #include "storage/lmgr.h"
34 #include "storage/smgr.h"
35 #include "utils/builtins.h"
36 #include "utils/index_selfuncs.h"
37 #include "utils/memutils.h"
38
39
40 /* Working state needed by btvacuumpage */
41 typedef struct
42 {
43         IndexVacuumInfo *info;
44         IndexBulkDeleteResult *stats;
45         IndexBulkDeleteCallback callback;
46         void       *callback_state;
47         BTCycleId       cycleid;
48         BlockNumber lastBlockVacuumed;  /* highest blkno actually vacuumed */
49         BlockNumber lastBlockLocked;    /* highest blkno we've cleanup-locked */
50         BlockNumber totFreePages;       /* true total # of free pages */
51         TransactionId oldestBtpoXact;
52         MemoryContext pagedelcontext;
53 } BTVacState;
54
55 /*
56  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
57  *
58  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
59  * a new page; others must wait.
60  *
61  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
62  * to a new page; some process can start doing that.
63  *
64  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
65  * We reach this state once for every distinct combination of array keys.
66  */
67 typedef enum
68 {
69         BTPARALLEL_NOT_INITIALIZED,
70         BTPARALLEL_ADVANCING,
71         BTPARALLEL_IDLE,
72         BTPARALLEL_DONE
73 } BTPS_State;
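
/*
 * Summary of the state transitions, as implemented by the _bt_parallel_*
 * functions later in this file: _bt_parallel_seize() moves the scan from
 * BTPARALLEL_NOT_INITIALIZED or BTPARALLEL_IDLE to BTPARALLEL_ADVANCING,
 * _bt_parallel_release() moves it from BTPARALLEL_ADVANCING back to
 * BTPARALLEL_IDLE, _bt_parallel_done() marks it BTPARALLEL_DONE, and
 * _bt_parallel_advance_array_keys() resets BTPARALLEL_DONE to
 * BTPARALLEL_NOT_INITIALIZED when another set of array keys remains to be
 * scanned.
 */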
74
75 /*
76  * BTParallelScanDescData contains btree specific shared information required
77  * for parallel scan.
78  */
79 typedef struct BTParallelScanDescData
80 {
81         BlockNumber btps_scanPage;      /* latest or next page to be scanned */
82         BTPS_State      btps_pageStatus;        /* indicates whether next page is
83                                                                          * available for scan.  See above for
84                                                                          * possible states of parallel scan. */
85         int                     btps_arrayKeyCount; /* count indicating number of array scan
86                                                                          * keys processed by parallel scan */
87         slock_t         btps_mutex;             /* protects above variables */
88         ConditionVariable btps_cv;      /* used to synchronize parallel scan */
89 }                       BTParallelScanDescData;
90
91 typedef struct BTParallelScanDescData *BTParallelScanDesc;
92
93
94 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
95                          IndexBulkDeleteCallback callback, void *callback_state,
96                          BTCycleId cycleid, TransactionId *oldestBtpoXact);
97 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
98                          BlockNumber orig_blkno);
99
100
101 /*
102  * Btree handler function: return IndexAmRoutine with access method parameters
103  * and callbacks.
104  */
105 Datum
106 bthandler(PG_FUNCTION_ARGS)
107 {
108         IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
109
110         amroutine->amstrategies = BTMaxStrategyNumber;
111         amroutine->amsupport = BTNProcs;
112         amroutine->amcanorder = true;
113         amroutine->amcanorderbyop = false;
114         amroutine->amcanbackward = true;
115         amroutine->amcanunique = true;
116         amroutine->amcanmulticol = true;
117         amroutine->amoptionalkey = true;
118         amroutine->amsearcharray = true;
119         amroutine->amsearchnulls = true;
120         amroutine->amstorage = false;
121         amroutine->amclusterable = true;
122         amroutine->ampredlocks = true;
123         amroutine->amcanparallel = true;
124         amroutine->amcaninclude = true;
125         amroutine->amkeytype = InvalidOid;
126
127         amroutine->ambuild = btbuild;
128         amroutine->ambuildempty = btbuildempty;
129         amroutine->aminsert = btinsert;
130         amroutine->ambulkdelete = btbulkdelete;
131         amroutine->amvacuumcleanup = btvacuumcleanup;
132         amroutine->amcanreturn = btcanreturn;
133         amroutine->amcostestimate = btcostestimate;
134         amroutine->amoptions = btoptions;
135         amroutine->amproperty = btproperty;
136         amroutine->amvalidate = btvalidate;
137         amroutine->ambeginscan = btbeginscan;
138         amroutine->amrescan = btrescan;
139         amroutine->amgettuple = btgettuple;
140         amroutine->amgetbitmap = btgetbitmap;
141         amroutine->amendscan = btendscan;
142         amroutine->ammarkpos = btmarkpos;
143         amroutine->amrestrpos = btrestrpos;
144         amroutine->amestimateparallelscan = btestimateparallelscan;
145         amroutine->aminitparallelscan = btinitparallelscan;
146         amroutine->amparallelrescan = btparallelrescan;
147
148         PG_RETURN_POINTER(amroutine);
149 }
150
151 /*
152  *      btbuildempty() -- build an empty btree index in the initialization fork
153  */
154 void
155 btbuildempty(Relation index)
156 {
157         Page            metapage;
158
159         /* Construct metapage. */
160         metapage = (Page) palloc(BLCKSZ);
161         _bt_initmetapage(metapage, P_NONE, 0);
162
163         /*
164          * Write the page and log it.  It might seem that an immediate sync would
165          * be sufficient to guarantee that the file exists on disk, but recovery
166          * itself might remove it while replaying, for example, an
167          * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
168          * this even when wal_level=minimal.
169          */
170         PageSetChecksumInplace(metapage, BTREE_METAPAGE);
171         smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
172                           (char *) metapage, true);
173         log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
174                                 BTREE_METAPAGE, metapage, true);
175
176         /*
177          * An immediate sync is required even if we xlog'd the page, because the
178          * write did not go through shared_buffers and therefore a concurrent
179          * checkpoint may have moved the redo pointer past our xlog record.
180          */
181         smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
182 }
183
184 /*
185  *      btinsert() -- insert an index tuple into a btree.
186  *
187  *              Descend the tree recursively, find the appropriate location for our
188  *              new tuple, and put it there.
189  */
190 bool
191 btinsert(Relation rel, Datum *values, bool *isnull,
192                  ItemPointer ht_ctid, Relation heapRel,
193                  IndexUniqueCheck checkUnique,
194                  IndexInfo *indexInfo)
195 {
196         bool            result;
197         IndexTuple      itup;
198
199         /* generate an index tuple */
200         itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
201         itup->t_tid = *ht_ctid;
202
203         result = _bt_doinsert(rel, itup, checkUnique, heapRel);
204
205         pfree(itup);
206
207         return result;
208 }
209
210 /*
211  *      btgettuple() -- Get the next tuple in the scan.
212  */
213 bool
214 btgettuple(IndexScanDesc scan, ScanDirection dir)
215 {
216         BTScanOpaque so = (BTScanOpaque) scan->opaque;
217         bool            res;
218
219         /* btree indexes are never lossy */
220         scan->xs_recheck = false;
221
222         /*
223          * If we have any array keys, initialize them during first call for a
224          * scan.  We can't do this in btrescan because we don't know the scan
225          * direction at that time.
226          */
227         if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
228         {
229                 /* punt if we have any unsatisfiable array keys */
230                 if (so->numArrayKeys < 0)
231                         return false;
232
233                 _bt_start_array_keys(scan, dir);
234         }
235
236         /* This loop handles advancing to the next array elements, if any */
237         do
238         {
239                 /*
240                  * If we've already initialized this scan, we can just advance it in
241                  * the appropriate direction.  If we haven't done so yet, we call
242                  * _bt_first() to get the first item in the scan.
243                  */
244                 if (!BTScanPosIsValid(so->currPos))
245                         res = _bt_first(scan, dir);
246                 else
247                 {
248                         /*
249                          * Check to see if we should kill the previously-fetched tuple.
250                          */
251                         if (scan->kill_prior_tuple)
252                         {
253                                 /*
254                                  * Yes, remember it for later. (We'll deal with all such
255                                  * tuples at once right before leaving the index page.)  The
256                                  * test for numKilled overrun is not just paranoia: if the
257                                  * caller reverses direction in the indexscan then the same
258                                  * item might get entered multiple times. It's not worth
259                                  * trying to optimize that, so we don't detect it, but instead
260                                  * just forget any excess entries.
261                                  */
262                                 if (so->killedItems == NULL)
263                                         so->killedItems = (int *)
264                                                 palloc(MaxIndexTuplesPerPage * sizeof(int));
265                                 if (so->numKilled < MaxIndexTuplesPerPage)
266                                         so->killedItems[so->numKilled++] = so->currPos.itemIndex;
267                         }
268
269                         /*
270                          * Now continue the scan.
271                          */
272                         res = _bt_next(scan, dir);
273                 }
274
275                 /* If we have a tuple, return it ... */
276                 if (res)
277                         break;
278                 /* ... otherwise see if we have more array keys to deal with */
279         } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
280
281         return res;
282 }
283
284 /*
285  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
286  */
287 int64
288 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
289 {
290         BTScanOpaque so = (BTScanOpaque) scan->opaque;
291         int64           ntids = 0;
292         ItemPointer heapTid;
293
294         /*
295          * If we have any array keys, initialize them.
296          */
297         if (so->numArrayKeys)
298         {
299                 /* punt if we have any unsatisfiable array keys */
300                 if (so->numArrayKeys < 0)
301                         return ntids;
302
303                 _bt_start_array_keys(scan, ForwardScanDirection);
304         }
305
306         /* This loop handles advancing to the next array elements, if any */
307         do
308         {
309                 /* Fetch the first page & tuple */
310                 if (_bt_first(scan, ForwardScanDirection))
311                 {
312                         /* Save tuple ID, and continue scanning */
313                         heapTid = &scan->xs_heaptid;
314                         tbm_add_tuples(tbm, heapTid, 1, false);
315                         ntids++;
316
317                         for (;;)
318                         {
319                                 /*
320                                  * Advance to next tuple within page.  This is the same as the
321                                  * easy case in _bt_next().
322                                  */
323                                 if (++so->currPos.itemIndex > so->currPos.lastItem)
324                                 {
325                                         /* let _bt_next do the heavy lifting */
326                                         if (!_bt_next(scan, ForwardScanDirection))
327                                                 break;
328                                 }
329
330                                 /* Save tuple ID, and continue scanning */
331                                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
332                                 tbm_add_tuples(tbm, heapTid, 1, false);
333                                 ntids++;
334                         }
335                 }
336                 /* Now see if we have more array keys to deal with */
337         } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
338
339         return ntids;
340 }
341
342 /*
343  *      btbeginscan() -- start a scan on a btree index
344  */
345 IndexScanDesc
346 btbeginscan(Relation rel, int nkeys, int norderbys)
347 {
348         IndexScanDesc scan;
349         BTScanOpaque so;
350
351         /* no order by operators allowed */
352         Assert(norderbys == 0);
353
354         /* get the scan */
355         scan = RelationGetIndexScan(rel, nkeys, norderbys);
356
357         /* allocate private workspace */
358         so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
359         BTScanPosInvalidate(so->currPos);
360         BTScanPosInvalidate(so->markPos);
361         if (scan->numberOfKeys > 0)
362                 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
363         else
364                 so->keyData = NULL;
365
366         so->arrayKeyData = NULL;        /* assume no array keys for now */
367         so->numArrayKeys = 0;
368         so->arrayKeys = NULL;
369         so->arrayContext = NULL;
370
371         so->killedItems = NULL;         /* until needed */
372         so->numKilled = 0;
373
374         /*
375          * We don't know yet whether the scan will be index-only, so we do not
376          * allocate the tuple workspace arrays until btrescan.  However, we set up
377          * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
378          */
379         so->currTuples = so->markTuples = NULL;
380
381         scan->xs_itupdesc = RelationGetDescr(rel);
382
383         scan->opaque = so;
384
385         return scan;
386 }
387
388 /*
389  *      btrescan() -- rescan an index relation
390  */
391 void
392 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
393                  ScanKey orderbys, int norderbys)
394 {
395         BTScanOpaque so = (BTScanOpaque) scan->opaque;
396
397         /* we aren't holding any read locks, but gotta drop the pins */
398         if (BTScanPosIsValid(so->currPos))
399         {
400                 /* Before leaving current page, deal with any killed items */
401                 if (so->numKilled > 0)
402                         _bt_killitems(scan);
403                 BTScanPosUnpinIfPinned(so->currPos);
404                 BTScanPosInvalidate(so->currPos);
405         }
406
407         so->markItemIndex = -1;
408         so->arrayKeyCount = 0;
409         BTScanPosUnpinIfPinned(so->markPos);
410         BTScanPosInvalidate(so->markPos);
411
412         /*
413          * Allocate tuple workspace arrays, if needed for an index-only scan and
414          * not already done in a previous rescan call.  To save on palloc
415          * overhead, both workspaces are allocated as one palloc block; only this
416          * function and btendscan know that.
417          *
418          * NOTE: this data structure also makes it safe to return data from a
419          * "name" column, even though btree name_ops uses an underlying storage
420          * datatype of cstring.  The risk there is that "name" is supposed to be
421          * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
422          * However, since we only return data out of tuples sitting in the
423          * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
424          * data out of the markTuples array --- running off the end of memory for
425          * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
426          * adding special-case treatment for name_ops elsewhere.
427          */
428         if (scan->xs_want_itup && so->currTuples == NULL)
429         {
430                 so->currTuples = (char *) palloc(BLCKSZ * 2);
431                 so->markTuples = so->currTuples + BLCKSZ;
432         }
433
434         /*
435          * Reset the scan keys.  Note that the scan-key ordering logic moved to
436          * _bt_first.  - vadim 05/05/97
437          */
438         if (scankey && scan->numberOfKeys > 0)
439                 memmove(scan->keyData,
440                                 scankey,
441                                 scan->numberOfKeys * sizeof(ScanKeyData));
442         so->numberOfKeys = 0;           /* until _bt_preprocess_keys sets it */
443
444         /* If any keys are SK_SEARCHARRAY type, set up array-key info */
445         _bt_preprocess_array_keys(scan);
446 }
447
448 /*
449  *      btendscan() -- close down a scan
450  */
451 void
452 btendscan(IndexScanDesc scan)
453 {
454         BTScanOpaque so = (BTScanOpaque) scan->opaque;
455
456         /* we aren't holding any read locks, but gotta drop the pins */
457         if (BTScanPosIsValid(so->currPos))
458         {
459                 /* Before leaving current page, deal with any killed items */
460                 if (so->numKilled > 0)
461                         _bt_killitems(scan);
462                 BTScanPosUnpinIfPinned(so->currPos);
463         }
464
465         so->markItemIndex = -1;
466         BTScanPosUnpinIfPinned(so->markPos);
467
468         /* No need to invalidate positions, the RAM is about to be freed. */
469
470         /* Release storage */
471         if (so->keyData != NULL)
472                 pfree(so->keyData);
473         /* so->arrayKeyData and so->arrayKeys are in arrayContext */
474         if (so->arrayContext != NULL)
475                 MemoryContextDelete(so->arrayContext);
476         if (so->killedItems != NULL)
477                 pfree(so->killedItems);
478         if (so->currTuples != NULL)
479                 pfree(so->currTuples);
480         /* so->markTuples should not be pfree'd, see btrescan */
481         pfree(so);
482 }
483
484 /*
485  *      btmarkpos() -- save current scan position
486  */
487 void
488 btmarkpos(IndexScanDesc scan)
489 {
490         BTScanOpaque so = (BTScanOpaque) scan->opaque;
491
492         /* There may be an old mark with a pin (but no lock). */
493         BTScanPosUnpinIfPinned(so->markPos);
494
495         /*
496          * Just record the current itemIndex.  If we later step to next page
497          * before releasing the marked position, _bt_steppage makes a full copy of
498          * the currPos struct in markPos.  If (as often happens) the mark is moved
499          * before we leave the page, we don't have to do that work.
500          */
501         if (BTScanPosIsValid(so->currPos))
502                 so->markItemIndex = so->currPos.itemIndex;
503         else
504         {
505                 BTScanPosInvalidate(so->markPos);
506                 so->markItemIndex = -1;
507         }
508
509         /* Also record the current positions of any array keys */
510         if (so->numArrayKeys)
511                 _bt_mark_array_keys(scan);
512 }
513
514 /*
515  *      btrestrpos() -- restore scan to last saved position
516  */
517 void
518 btrestrpos(IndexScanDesc scan)
519 {
520         BTScanOpaque so = (BTScanOpaque) scan->opaque;
521
522         /* Restore the marked positions of any array keys */
523         if (so->numArrayKeys)
524                 _bt_restore_array_keys(scan);
525
526         if (so->markItemIndex >= 0)
527         {
528                 /*
529                  * The scan has never moved to a new page since the last mark.  Just
530                  * restore the itemIndex.
531                  *
532                  * NB: In this case we can't count on anything in so->markPos to be
533                  * accurate.
534                  */
535                 so->currPos.itemIndex = so->markItemIndex;
536         }
537         else
538         {
539                 /*
540                  * The scan moved to a new page after the last mark or restore, and we
541                  * are now restoring to the marked page.  We aren't holding any read
542                  * locks, but if we're still holding the pin for the current position,
543                  * we must drop it.
544                  */
545                 if (BTScanPosIsValid(so->currPos))
546                 {
547                         /* Before leaving current page, deal with any killed items */
548                         if (so->numKilled > 0)
549                                 _bt_killitems(scan);
550                         BTScanPosUnpinIfPinned(so->currPos);
551                 }
552
553                 if (BTScanPosIsValid(so->markPos))
554                 {
555                         /* bump pin on mark buffer for assignment to current buffer */
556                         if (BTScanPosIsPinned(so->markPos))
557                                 IncrBufferRefCount(so->markPos.buf);
558                         memcpy(&so->currPos, &so->markPos,
559                                    offsetof(BTScanPosData, items[1]) +
560                                    so->markPos.lastItem * sizeof(BTScanPosItem));
561                         if (so->currTuples)
562                                 memcpy(so->currTuples, so->markTuples,
563                                            so->markPos.nextTupleOffset);
564                 }
565                 else
566                         BTScanPosInvalidate(so->currPos);
567         }
568 }
569
570 /*
571  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
572  */
573 Size
574 btestimateparallelscan(void)
575 {
576         return sizeof(BTParallelScanDescData);
577 }
578
579 /*
580  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
581  */
582 void
583 btinitparallelscan(void *target)
584 {
585         BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
586
587         SpinLockInit(&bt_target->btps_mutex);
588         bt_target->btps_scanPage = InvalidBlockNumber;
589         bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
590         bt_target->btps_arrayKeyCount = 0;
591         ConditionVariableInit(&bt_target->btps_cv);
592 }
593
594 /*
595  *      btparallelrescan() -- reset parallel scan
596  */
597 void
598 btparallelrescan(IndexScanDesc scan)
599 {
600         BTParallelScanDesc btscan;
601         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
602
603         Assert(parallel_scan);
604
605         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
606                                                                                                   parallel_scan->ps_offset);
607
608         /*
609          * In theory, we don't need to acquire the spinlock here, because there
610          * shouldn't be any other workers running at this point, but we do so for
611          * consistency.
612          */
613         SpinLockAcquire(&btscan->btps_mutex);
614         btscan->btps_scanPage = InvalidBlockNumber;
615         btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
616         btscan->btps_arrayKeyCount = 0;
617         SpinLockRelease(&btscan->btps_mutex);
618 }
619
620 /*
621  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
622  *              page.  Other scans must wait until we call _bt_parallel_release()
623  *              or _bt_parallel_done().
624  *
625  * The return value is true if we successfully seized the scan and false
626  * if we did not.  The latter case occurs if no pages remain for the current
627  * set of scankeys.
628  *
629  * If the return value is true, *pageno returns the next or current page
630  * of the scan (depending on the scan direction).  An invalid block number
631  * means the scan hasn't yet started, and P_NONE means we've reached the end.
632  * The first time a participating process reaches the last page, it will return
633  * true and set *pageno to P_NONE; after that, further attempts to seize the
634  * scan will return false.
635  *
636  * Callers should ignore the value of pageno if the return value is false.
637  */
638 bool
639 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
640 {
641         BTScanOpaque so = (BTScanOpaque) scan->opaque;
642         BTPS_State      pageStatus;
643         bool            exit_loop = false;
644         bool            status = true;
645         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
646         BTParallelScanDesc btscan;
647
648         *pageno = P_NONE;
649
650         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
651                                                                                                   parallel_scan->ps_offset);
652
653         while (1)
654         {
655                 SpinLockAcquire(&btscan->btps_mutex);
656                 pageStatus = btscan->btps_pageStatus;
657
658                 if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
659                 {
660                         /* Parallel scan has already advanced to a new set of scankeys. */
661                         status = false;
662                 }
663                 else if (pageStatus == BTPARALLEL_DONE)
664                 {
665                         /*
666                          * We're done with this set of scankeys.  This may be the end, or
667                          * there could be more sets to try.
668                          */
669                         status = false;
670                 }
671                 else if (pageStatus != BTPARALLEL_ADVANCING)
672                 {
673                         /*
674                          * We have successfully seized control of the scan for the purpose
675                          * of advancing it to a new page!
676                          */
677                         btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
678                         *pageno = btscan->btps_scanPage;
679                         exit_loop = true;
680                 }
681                 SpinLockRelease(&btscan->btps_mutex);
682                 if (exit_loop || !status)
683                         break;
684                 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
685         }
686         ConditionVariableCancelSleep();
687
688         return status;
689 }
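
/*
 * A simplified sketch of how a participating backend is expected to use
 * _bt_parallel_seize(), _bt_parallel_release() and _bt_parallel_done(),
 * based only on the contracts in their header comments (the real callers
 * live in nbtsearch.c; "next_blkno" here is illustrative):
 *
 *     BlockNumber blkno;
 *
 *     if (!_bt_parallel_seize(scan, &blkno))
 *         return false;             // nothing left for this set of scankeys
 *     if (blkno == P_NONE)
 *     {
 *         _bt_parallel_done(scan);  // reached the end; notify other workers
 *         return false;
 *     }
 *     // ... read page blkno and work out the scan's next page ...
 *     _bt_parallel_release(scan, next_blkno);
 *     // ... only now process the items found on page blkno ...
 */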
690
691 /*
692  * _bt_parallel_release() -- Complete the process of advancing the scan to a
693  *              new page.  We now have the new value btps_scanPage; some other backend
694  *              can now begin advancing the scan.
695  */
696 void
697 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
698 {
699         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
700         BTParallelScanDesc btscan;
701
702         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
703                                                                                                   parallel_scan->ps_offset);
704
705         SpinLockAcquire(&btscan->btps_mutex);
706         btscan->btps_scanPage = scan_page;
707         btscan->btps_pageStatus = BTPARALLEL_IDLE;
708         SpinLockRelease(&btscan->btps_mutex);
709         ConditionVariableSignal(&btscan->btps_cv);
710 }
711
712 /*
713  * _bt_parallel_done() -- Mark the parallel scan as complete.
714  *
715  * When there are no pages left to scan, this function should be called to
716  * notify other workers.  Otherwise, they might wait forever for the scan to
717  * advance to the next page.
718  */
719 void
720 _bt_parallel_done(IndexScanDesc scan)
721 {
722         BTScanOpaque so = (BTScanOpaque) scan->opaque;
723         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
724         BTParallelScanDesc btscan;
725         bool            status_changed = false;
726
727         /* Do nothing, for non-parallel scans */
728         if (parallel_scan == NULL)
729                 return;
730
731         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
732                                                                                                   parallel_scan->ps_offset);
733
734         /*
735          * Mark the parallel scan as done for this combination of scan keys,
736          * unless some other process already did so.  See also
737          * _bt_advance_array_keys.
738          */
739         SpinLockAcquire(&btscan->btps_mutex);
740         if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
741                 btscan->btps_pageStatus != BTPARALLEL_DONE)
742         {
743                 btscan->btps_pageStatus = BTPARALLEL_DONE;
744                 status_changed = true;
745         }
746         SpinLockRelease(&btscan->btps_mutex);
747
748         /* wake up all the workers associated with this parallel scan */
749         if (status_changed)
750                 ConditionVariableBroadcast(&btscan->btps_cv);
751 }
752
753 /*
754  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
755  *                      keys.
756  *
757  * Updates the count of array keys processed for both local and parallel
758  * scans.
759  */
760 void
761 _bt_parallel_advance_array_keys(IndexScanDesc scan)
762 {
763         BTScanOpaque so = (BTScanOpaque) scan->opaque;
764         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
765         BTParallelScanDesc btscan;
766
767         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
768                                                                                                   parallel_scan->ps_offset);
769
770         so->arrayKeyCount++;
771         SpinLockAcquire(&btscan->btps_mutex);
772         if (btscan->btps_pageStatus == BTPARALLEL_DONE)
773         {
774                 btscan->btps_scanPage = InvalidBlockNumber;
775                 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
776                 btscan->btps_arrayKeyCount++;
777         }
778         SpinLockRelease(&btscan->btps_mutex);
779 }
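
/*
 * Note the interplay with _bt_parallel_seize() and _bt_parallel_done()
 * above: a backend whose local arrayKeyCount lags behind the shared
 * btps_arrayKeyCount cannot seize the scan, and its _bt_parallel_done()
 * calls are ignored, so a process still finishing an earlier set of array
 * keys cannot disturb a scan that has already advanced to a later set.
 */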
780
781 /*
782  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
783  *                      btbulkdelete() wasn't called.
784  */
785 static bool
786 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
787 {
788         Buffer          metabuf;
789         Page            metapg;
790         BTMetaPageData *metad;
791         bool            result = false;
792
793         metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
794         metapg = BufferGetPage(metabuf);
795         metad = BTPageGetMeta(metapg);
796
797         if (metad->btm_version < BTREE_NOVAC_VERSION)
798         {
799                 /*
800                  * Do cleanup if metapage needs upgrade, because we don't have
801                  * cleanup-related meta-information yet.
802                  */
803                 result = true;
804         }
805         else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
806                          TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
807                                                                    RecentGlobalXmin))
808         {
809                 /*
810                  * If oldest btpo.xact in the deleted pages is older than
811                  * RecentGlobalXmin, then at least one deleted page can be recycled.
812                  */
813                 result = true;
814         }
815         else
816         {
817                 StdRdOptions *relopts;
818                 float8          cleanup_scale_factor;
819                 float8          prev_num_heap_tuples;
820
821                 /*
822                  * If the table receives enough insertions and no cleanup is performed,
823                  * the index will appear to have stale statistics.  If a scale factor
824                  * is set, we avoid that by performing cleanup once the number of
825                  * inserted tuples exceeds the vacuum_cleanup_index_scale_factor
826                  * fraction of the original tuple count.
827                  */
828                 relopts = (StdRdOptions *) info->index->rd_options;
829                 cleanup_scale_factor = (relopts &&
830                                                                 relopts->vacuum_cleanup_index_scale_factor >= 0)
831                         ? relopts->vacuum_cleanup_index_scale_factor
832                         : vacuum_cleanup_index_scale_factor;
833                 prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
834
835                 if (cleanup_scale_factor <= 0 ||
836                         prev_num_heap_tuples < 0 ||
837                         (info->num_heap_tuples - prev_num_heap_tuples) /
838                         prev_num_heap_tuples >= cleanup_scale_factor)
839                         result = true;
840         }
841
842         _bt_relbuf(info->index, metabuf);
843         return result;
844 }
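
/*
 * Worked example of the last branch above, assuming the default
 * vacuum_cleanup_index_scale_factor of 0.1: if the metapage recorded
 * 1,000,000 heap tuples at the last cleanup, a VACUUM that performs no
 * bulk deletion triggers an index cleanup scan once info->num_heap_tuples
 * reaches 1,100,000, since (1,100,000 - 1,000,000) / 1,000,000 = 0.1,
 * which is >= the 0.1 scale factor.
 */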
845
846 /*
847  * Bulk deletion of all index entries pointing to a set of heap tuples.
848  * The set of target tuples is specified via a callback routine that tells
849  * whether any given heap tuple (identified by ItemPointer) is being deleted.
850  *
851  * Result: a palloc'd struct containing statistical info for VACUUM displays.
852  */
853 IndexBulkDeleteResult *
854 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
855                          IndexBulkDeleteCallback callback, void *callback_state)
856 {
857         Relation        rel = info->index;
858         BTCycleId       cycleid;
859
860         /* allocate stats if first time through, else re-use existing struct */
861         if (stats == NULL)
862                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
863
864         /* Establish the vacuum cycle ID to use for this scan */
865         /* The ENSURE stuff ensures we clean up shared memory on failure */
866         PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
867         {
868                 TransactionId oldestBtpoXact;
869
870                 cycleid = _bt_start_vacuum(rel);
871
872                 btvacuumscan(info, stats, callback, callback_state, cycleid,
873                                          &oldestBtpoXact);
874
875                 /*
876                  * Update cleanup-related information in the metapage.  This
877                  * information is used only for cleanup, but keeping it up to date can
878                  * avoid unnecessary cleanup even after a bulkdelete.
879                  */
880                 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
881                                                                          info->num_heap_tuples);
882         }
883         PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
884         _bt_end_vacuum(rel);
885
886         return stats;
887 }
888
889 /*
890  * Post-VACUUM cleanup.
891  *
892  * Result: a palloc'd struct containing statistical info for VACUUM displays.
893  */
894 IndexBulkDeleteResult *
895 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
896 {
897         /* No-op in ANALYZE ONLY mode */
898         if (info->analyze_only)
899                 return stats;
900
901         /*
902          * If btbulkdelete was called, we need not do anything, just return the
903          * stats from the latest btbulkdelete call.  If it wasn't called, we might
904          * still need to do a pass over the index, to recycle any newly-recyclable
905          * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
906          * determines if either is needed.
907          *
908          * Since we aren't going to actually delete any leaf items, there's no
909          * need to go through all the vacuum-cycle-ID pushups.
910          */
911         if (stats == NULL)
912         {
913                 TransactionId oldestBtpoXact;
914
915                 /* Check if we need a cleanup */
916                 if (!_bt_vacuum_needs_cleanup(info))
917                         return NULL;
918
919                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
920                 btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
921
922                 /* Update cleanup-related information in the metapage */
923                 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
924                                                                          info->num_heap_tuples);
925         }
926
927         /*
928          * It's quite possible for us to be fooled by concurrent page splits into
929          * double-counting some index tuples, so disbelieve any total that exceeds
930          * the underlying heap's count ... if we know that accurately.  Otherwise
931          * this might just make matters worse.
932          */
933         if (!info->estimated_count)
934         {
935                 if (stats->num_index_tuples > info->num_heap_tuples)
936                         stats->num_index_tuples = info->num_heap_tuples;
937         }
938
939         return stats;
940 }
941
942 /*
943  * btvacuumscan --- scan the index for VACUUMing purposes
944  *
945  * This combines the functions of looking for leaf tuples that are deletable
946  * according to the vacuum callback, looking for empty pages that can be
947  * deleted, and looking for old deleted pages that can be recycled.  Both
948  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
949  * btbulkdelete call occurred).
950  *
951  * The caller is responsible for initially allocating/zeroing a stats struct
952  * and for obtaining a vacuum cycle ID if necessary.
953  */
954 static void
955 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
956                          IndexBulkDeleteCallback callback, void *callback_state,
957                          BTCycleId cycleid, TransactionId *oldestBtpoXact)
958 {
959         Relation        rel = info->index;
960         BTVacState      vstate;
961         BlockNumber num_pages;
962         BlockNumber blkno;
963         bool            needLock;
964
965         /*
966          * Reset counts that will be incremented during the scan; needed in case
967          * of multiple scans during a single VACUUM command
968          */
969         stats->estimated_count = false;
970         stats->num_index_tuples = 0;
971         stats->pages_deleted = 0;
972
973         /* Set up info to pass down to btvacuumpage */
974         vstate.info = info;
975         vstate.stats = stats;
976         vstate.callback = callback;
977         vstate.callback_state = callback_state;
978         vstate.cycleid = cycleid;
979         vstate.lastBlockVacuumed = BTREE_METAPAGE;      /* Initialize at first block */
980         vstate.lastBlockLocked = BTREE_METAPAGE;
981         vstate.totFreePages = 0;
982         vstate.oldestBtpoXact = InvalidTransactionId;
983
984         /* Create a temporary memory context to run _bt_pagedel in */
985         vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
986                                                                                                   "_bt_pagedel",
987                                                                                                   ALLOCSET_DEFAULT_SIZES);
988
989         /*
990          * The outer loop iterates over all index pages except the metapage, in
991          * physical order (we hope the kernel will cooperate in providing
992          * read-ahead for speed).  It is critical that we visit all leaf pages,
993          * including ones added after we start the scan, else we might fail to
994          * delete some deletable tuples.  Hence, we must repeatedly check the
995          * relation length.  We must acquire the relation-extension lock while
996          * doing so to avoid a race condition: if someone else is extending the
997          * relation, there is a window where bufmgr/smgr have created a new
998          * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
999          * we manage to scan such a page here, we'll improperly assume it can be
1000          * recycled.  Taking the lock synchronizes things enough to prevent a
1001          * problem: either num_pages won't include the new page, or _bt_getbuf
1002          * already has write lock on the buffer and it will be fully initialized
1003          * before we can examine it.  (See also vacuumlazy.c, which has the same
1004          * issue.)      Also, we need not worry if a page is added immediately after
1005          * we look; the page splitting code already has write-lock on the left
1006          * page before it adds a right page, so we must already have processed any
1007          * tuples due to be moved into such a page.
1008          *
1009          * We can skip locking for new or temp relations, however, since no one
1010          * else could be accessing them.
1011          */
1012         needLock = !RELATION_IS_LOCAL(rel);
1013
1014         blkno = BTREE_METAPAGE + 1;
1015         for (;;)
1016         {
1017                 /* Get the current relation length */
1018                 if (needLock)
1019                         LockRelationForExtension(rel, ExclusiveLock);
1020                 num_pages = RelationGetNumberOfBlocks(rel);
1021                 if (needLock)
1022                         UnlockRelationForExtension(rel, ExclusiveLock);
1023
1024                 /* Quit if we've scanned the whole relation */
1025                 if (blkno >= num_pages)
1026                         break;
1027                 /* Iterate over pages, then loop back to recheck length */
1028                 for (; blkno < num_pages; blkno++)
1029                 {
1030                         btvacuumpage(&vstate, blkno, blkno);
1031                 }
1032         }
1033
1034         /*
1035          * Check to see if we need to issue one final WAL record for this index,
1036          * which may be needed for correctness on a hot standby node when non-MVCC
1037          * index scans could take place.
1038          *
1039          * If the WAL is replayed in hot standby, the replay process needs to get
1040          * cleanup locks on all index leaf pages, just as we've been doing here.
1041          * However, we won't issue any WAL records about pages that have no items
1042          * to be deleted.  For pages between pages we've vacuumed, the replay code
1043          * will take locks under the direction of the lastBlockVacuumed fields in
1044          * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
1045          * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1046          * against the last leaf page in the index, if that one wasn't vacuumed.
1047          */
1048         if (XLogStandbyInfoActive() &&
1049                 vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1050         {
1051                 Buffer          buf;
1052
1053                 /*
1054                  * The page should be valid, but we can't use _bt_getbuf() because we
1055                  * want to use a nondefault buffer access strategy.  Since we aren't
1056                  * going to delete any items, getting cleanup lock again is probably
1057                  * overkill, but for consistency do that anyway.
1058                  */
1059                 buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1060                                                                  RBM_NORMAL, info->strategy);
1061                 LockBufferForCleanup(buf);
1062                 _bt_checkpage(rel, buf);
1063                 _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1064                 _bt_relbuf(rel, buf);
1065         }
1066
1067         MemoryContextDelete(vstate.pagedelcontext);
1068
1069         /*
1070          * If we found any recyclable pages (and recorded them in the FSM), then
1071          * forcibly update the upper-level FSM pages to ensure that searchers can
1072          * find them.  It's possible that the pages were also found during
1073          * previous scans and so this is a waste of time, but it's cheap enough
1074          * relative to scanning the index that it shouldn't matter much, and
1075          * making sure that free pages are available sooner not later seems
1076          * worthwhile.
1077          *
1078          * Note that if no recyclable pages exist, we don't bother vacuuming the
1079          * FSM at all.
1080          */
1081         if (vstate.totFreePages > 0)
1082                 IndexFreeSpaceMapVacuum(rel);
1083
1084         /* update statistics */
1085         stats->num_pages = num_pages;
1086         stats->pages_free = vstate.totFreePages;
1087
1088         if (oldestBtpoXact)
1089                 *oldestBtpoXact = vstate.oldestBtpoXact;
1090 }
1091
1092 /*
1093  * btvacuumpage --- VACUUM one page
1094  *
1095  * This processes a single page for btvacuumscan().  In some cases we
1096  * must go back and re-examine previously-scanned pages; this routine
1097  * recurses when necessary to handle that case.
1098  *
1099  * blkno is the page to process.  orig_blkno is the highest block number
1100  * reached by the outer btvacuumscan loop (the same as blkno, unless we
1101  * are recursing to re-examine a previous page).
1102  */
1103 static void
1104 btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1105 {
1106         IndexVacuumInfo *info = vstate->info;
1107         IndexBulkDeleteResult *stats = vstate->stats;
1108         IndexBulkDeleteCallback callback = vstate->callback;
1109         void       *callback_state = vstate->callback_state;
1110         Relation        rel = info->index;
1111         bool            delete_now;
1112         BlockNumber recurse_to;
1113         Buffer          buf;
1114         Page            page;
1115         BTPageOpaque opaque = NULL;
1116
1117 restart:
1118         delete_now = false;
1119         recurse_to = P_NONE;
1120
1121         /* call vacuum_delay_point while not holding any buffer lock */
1122         vacuum_delay_point();
1123
1124         /*
1125          * We can't use _bt_getbuf() here because it always applies
1126          * _bt_checkpage(), which will barf on an all-zero page. We want to
1127          * recycle all-zero pages, not fail.  Also, we want to use a nondefault
1128          * buffer access strategy.
1129          */
1130         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1131                                                          info->strategy);
1132         LockBuffer(buf, BT_READ);
1133         page = BufferGetPage(buf);
1134         if (!PageIsNew(page))
1135         {
1136                 _bt_checkpage(rel, buf);
1137                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1138         }
1139
1140         /*
1141          * If we are recursing, the only case we want to do anything with is a
1142          * live leaf page having the current vacuum cycle ID.  Any other state
1143          * implies we already saw the page (eg, deleted it as being empty).
1144          */
1145         if (blkno != orig_blkno)
1146         {
1147                 if (_bt_page_recyclable(page) ||
1148                         P_IGNORE(opaque) ||
1149                         !P_ISLEAF(opaque) ||
1150                         opaque->btpo_cycleid != vstate->cycleid)
1151                 {
1152                         _bt_relbuf(rel, buf);
1153                         return;
1154                 }
1155         }
1156
1157         /* Page is valid, see what to do with it */
1158         if (_bt_page_recyclable(page))
1159         {
1160                 /* Okay to recycle this page */
1161                 RecordFreeIndexPage(rel, blkno);
1162                 vstate->totFreePages++;
1163                 stats->pages_deleted++;
1164         }
1165         else if (P_ISDELETED(opaque))
1166         {
1167                 /* Already deleted, but can't recycle yet */
1168                 stats->pages_deleted++;
1169
1170                 /* Update the oldest btpo.xact */
1171                 if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1172                         TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1173                         vstate->oldestBtpoXact = opaque->btpo.xact;
1174         }
1175         else if (P_ISHALFDEAD(opaque))
1176         {
1177                 /* Half-dead, try to delete */
1178                 delete_now = true;
1179         }
1180         else if (P_ISLEAF(opaque))
1181         {
1182                 OffsetNumber deletable[MaxOffsetNumber];
1183                 int                     ndeletable;
1184                 OffsetNumber offnum,
1185                                         minoff,
1186                                         maxoff;
1187
1188                 /*
1189                  * Trade in the initial read lock for a super-exclusive write lock on
1190                  * this page.  We must get such a lock on every leaf page over the
1191                  * course of the vacuum scan, whether or not it actually contains any
1192                  * deletable tuples --- see nbtree/README.
1193                  */
1194                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1195                 LockBufferForCleanup(buf);
1196
1197                 /*
1198                  * Remember highest leaf page number we've taken cleanup lock on; see
1199                  * notes in btvacuumscan
1200                  */
1201                 if (blkno > vstate->lastBlockLocked)
1202                         vstate->lastBlockLocked = blkno;
1203
1204                 /*
1205                  * Check whether we need to recurse back to earlier pages.  What we
1206                  * are concerned about is a page split that happened since we started
1207                  * the vacuum scan.  If the split moved some tuples to a lower page
1208                  * then we might have missed 'em.  If so, set up for tail recursion.
1209                  * (Must do this before possibly clearing btpo_cycleid below!)
1210                  */
1211                 if (vstate->cycleid != 0 &&
1212                         opaque->btpo_cycleid == vstate->cycleid &&
1213                         !(opaque->btpo_flags & BTP_SPLIT_END) &&
1214                         !P_RIGHTMOST(opaque) &&
1215                         opaque->btpo_next < orig_blkno)
1216                         recurse_to = opaque->btpo_next;
1217
1218                 /*
1219                  * Scan over all items to see which ones need to be deleted according
1220                  * to the callback function.
1221                  */
1222                 ndeletable = 0;
1223                 minoff = P_FIRSTDATAKEY(opaque);
1224                 maxoff = PageGetMaxOffsetNumber(page);
1225                 if (callback)
1226                 {
1227                         for (offnum = minoff;
1228                                  offnum <= maxoff;
1229                                  offnum = OffsetNumberNext(offnum))
1230                         {
1231                                 IndexTuple      itup;
1232                                 ItemPointer htup;
1233
1234                                 itup = (IndexTuple) PageGetItem(page,
1235                                                                                                 PageGetItemId(page, offnum));
1236                                 htup = &(itup->t_tid);
1237
1238                                 /*
1239                                  * During Hot Standby we currently assume that
1240                                  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1241                                  * only true as long as the callback function depends only
1242                                  * upon whether the index tuple refers to heap tuples removed
1243                                  * in the initial heap scan. When vacuum starts it derives a
1244                                  * value of OldestXmin. Backends taking later snapshots could
1245                                  * have a RecentGlobalXmin with a later xid than the vacuum's
1246                                  * OldestXmin, so it is possible that row versions deleted
1247                                  * after OldestXmin could be marked as killed by other
1248                                  * backends. The callback function *could* look at the index
1249                                  * tuple state in isolation and decide to delete the index
1250                                  * tuple, though currently it does not. If it ever did, we
1251                                  * would need to reconsider whether XLOG_BTREE_VACUUM records
1252                                  * should cause conflicts. If they did cause conflicts they
1253                                  * would be fairly harsh conflicts, since we haven't yet
1254                                  * worked out a way to pass a useful value for
1255                                  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1256                                  * applies to *any* type of index that marks index tuples as
1257                                  * killed.
1258                                  */
1259                                 if (callback(htup, callback_state))
1260                                         deletable[ndeletable++] = offnum;
1261                         }
1262                 }
1263
1264                 /*
1265                  * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
1266                  * call per page, so as to minimize WAL traffic.
1267                  */
1268                 if (ndeletable > 0)
1269                 {
1270                         /*
1271                          * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1272                          * all information to the replay code to allow it to get a cleanup
1273                          * lock on all pages between the previous lastBlockVacuumed and
1274                          * this page. This ensures that WAL replay locks all leaf pages at
1275                          * some point, which is important should non-MVCC scans be
1276                          * requested. This is currently unused on standby, but we record
1277                          * it anyway, so that the WAL contains the required information.
1278                          *
1279                          * Since we can visit leaf pages out-of-order when recursing,
1280                          * replay might end up locking such pages an extra time, but it
1281                          * doesn't seem worth the amount of bookkeeping it'd take to avoid
1282                          * that.
1283                          */
1284                         _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1285                                                                 vstate->lastBlockVacuumed);
1286
1287                         /*
1288                          * Remember highest leaf page number we've issued a
1289                          * XLOG_BTREE_VACUUM WAL record for.
1290                          */
1291                         if (blkno > vstate->lastBlockVacuumed)
1292                                 vstate->lastBlockVacuumed = blkno;
1293
1294                         stats->tuples_removed += ndeletable;
1295                         /* must recompute maxoff */
1296                         maxoff = PageGetMaxOffsetNumber(page);
1297                 }
1298                 else
1299                 {
1300                         /*
1301                          * If the page has been split during this vacuum cycle, it seems
1302                          * worth expending a write to clear btpo_cycleid even if we don't
1303                          * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
1304                          * care of this.)  This ensures we won't process the page again.
1305                          *
1306                          * We treat this like a hint-bit update because there's no need to
1307                          * WAL-log it.
1308                          */
1309                         if (vstate->cycleid != 0 &&
1310                                 opaque->btpo_cycleid == vstate->cycleid)
1311                         {
1312                                 opaque->btpo_cycleid = 0;
1313                                 MarkBufferDirtyHint(buf, true);
1314                         }
1315                 }
1316
1317                 /*
1318                  * If it's now empty, try to delete; else count the live tuples. We
1319                  * don't delete when recursing, though, to avoid putting entries into
1320                  * freePages out-of-order (doesn't seem worth any extra code to handle
1321                  * the case).
1322                  */
1323                 if (minoff > maxoff)
1324                         delete_now = (blkno == orig_blkno);
1325                 else
1326                         stats->num_index_tuples += maxoff - minoff + 1;
1327         }
1328
1329         if (delete_now)
1330         {
1331                 MemoryContext oldcontext;
1332                 int                     ndel;
1333
1334                 /* Run pagedel in a temp context to avoid memory leakage */
1335                 MemoryContextReset(vstate->pagedelcontext);
1336                 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1337
1338                 ndel = _bt_pagedel(rel, buf);
1339
1340                 /* count only this page, else may double-count parent */
1341                 if (ndel)
1342                 {
1343                         stats->pages_deleted++;
1344                         if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1345                                 TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1346                                 vstate->oldestBtpoXact = opaque->btpo.xact;
1347                 }
1348
1349                 MemoryContextSwitchTo(oldcontext);
1350                 /* pagedel released buffer, so we shouldn't */
1351         }
1352         else
1353                 _bt_relbuf(rel, buf);
1354
1355         /*
1356          * This is really tail recursion, but if the compiler is too stupid to
1357          * optimize it as such, we'd eat an uncomfortably large amount of stack
1358          * space per recursion level (due to the deletable[] array). A failure is
1359          * improbable since the number of levels isn't likely to be large ... but
1360          * just in case, let's hand-optimize into a loop.
1361          */
1362         if (recurse_to != P_NONE)
1363         {
1364                 blkno = recurse_to;
1365                 goto restart;
1366         }
1367 }
1368
1369 /*
1370  *      btcanreturn() -- Check whether btree indexes support index-only scans.
1371  *
1372  * btrees always do, so this is trivial.
1373  */
1374 bool
1375 btcanreturn(Relation index, int attno)
1376 {
1377         return true;
1378 }