granicus.if.org Git - postgresql/blob - src/backend/access/nbtree/nbtree.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtree.c
   4  *        Implementation of Lehman and Yao's btree management algorithm for
   5  *        Postgres.
   6  *
   7  * NOTES
   8  *        This file contains only the public interface routines.
   9  *
  10  *
  11  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  12  * Portions Copyright (c) 1994, Regents of the University of California
  13  *
  14  * IDENTIFICATION
  15  *        src/backend/access/nbtree/nbtree.c
  16  *
  17  *-------------------------------------------------------------------------
  18  */
  19 #include "postgres.h"
  20
  21 #include "access/nbtree.h"
  22 #include "access/nbtxlog.h"
  23 #include "access/relscan.h"
  24 #include "access/xlog.h"
  25 #include "commands/progress.h"
  26 #include "commands/vacuum.h"
  27 #include "miscadmin.h"
  28 #include "nodes/execnodes.h"
  29 #include "pgstat.h"
  30 #include "postmaster/autovacuum.h"
  31 #include "storage/condition_variable.h"
  32 #include "storage/indexfsm.h"
  33 #include "storage/ipc.h"
  34 #include "storage/lmgr.h"
  35 #include "storage/smgr.h"
  36 #include "utils/builtins.h"
  37 #include "utils/index_selfuncs.h"
  38 #include "utils/memutils.h"
  39
  40
  41 /* Working state needed by btvacuumpage */
  42 typedef struct
  43 {
  44         IndexVacuumInfo *info;
  45         IndexBulkDeleteResult *stats;
  46         IndexBulkDeleteCallback callback;
  47         void       *callback_state;
  48         BTCycleId       cycleid;
  49         BlockNumber lastBlockVacuumed;  /* highest blkno actually vacuumed */
  50         BlockNumber lastBlockLocked;    /* highest blkno we've cleanup-locked */
  51         BlockNumber totFreePages;       /* true total # of free pages */
  52         TransactionId oldestBtpoXact;
  53         MemoryContext pagedelcontext;
  54 } BTVacState;
  55
  56 /*
  57  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
  58  *
  59  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
  60  * a new page; others must wait.
  61  *
  62  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
  63  * to a new page; some process can start doing that.
  64  *
  65  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
  66  * We reach this state once for every distinct combination of array keys.
  67  */
  68 typedef enum
  69 {
  70         BTPARALLEL_NOT_INITIALIZED,
  71         BTPARALLEL_ADVANCING,
  72         BTPARALLEL_IDLE,
  73         BTPARALLEL_DONE
  74 } BTPS_State;
  75
  76 /*
  77  * BTParallelScanDescData contains btree specific shared information required
  78  * for parallel scan.
  79  */
  80 typedef struct BTParallelScanDescData
  81 {
  82         BlockNumber btps_scanPage;      /* latest or next page to be scanned */
  83         BTPS_State      btps_pageStatus;        /* indicates whether next page is
  84                                                                          * available for scan. see above for
  85                                                                          * possible states of parallel scan. */
  86         int                     btps_arrayKeyCount; /* count indicating number of array scan
  87                                                                          * keys processed by parallel scan */
  88         slock_t         btps_mutex;             /* protects above variables */
  89         ConditionVariable btps_cv;      /* used to synchronize parallel scan */
  90 }                       BTParallelScanDescData;
  91
  92 typedef struct BTParallelScanDescData *BTParallelScanDesc;
  93
  94
  95 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
  96                                                  IndexBulkDeleteCallback callback, void *callback_state,
  97                                                  BTCycleId cycleid, TransactionId *oldestBtpoXact);
  98 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
  99                                                  BlockNumber orig_blkno);
 100
 101
 102 /*
 103  * Btree handler function: return IndexAmRoutine with access method parameters
 104  * and callbacks.
 105  */
 106 Datum
 107 bthandler(PG_FUNCTION_ARGS)
 108 {
 109         IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
 110
 111         amroutine->amstrategies = BTMaxStrategyNumber;
 112         amroutine->amsupport = BTNProcs;
 113         amroutine->amcanorder = true;
 114         amroutine->amcanorderbyop = false;
 115         amroutine->amcanbackward = true;
 116         amroutine->amcanunique = true;
 117         amroutine->amcanmulticol = true;
 118         amroutine->amoptionalkey = true;
 119         amroutine->amsearcharray = true;
 120         amroutine->amsearchnulls = true;
 121         amroutine->amstorage = false;
 122         amroutine->amclusterable = true;
 123         amroutine->ampredlocks = true;
 124         amroutine->amcanparallel = true;
 125         amroutine->amcaninclude = true;
 126         amroutine->amkeytype = InvalidOid;
 127
 128         amroutine->ambuild = btbuild;
 129         amroutine->ambuildempty = btbuildempty;
 130         amroutine->aminsert = btinsert;
 131         amroutine->ambulkdelete = btbulkdelete;
 132         amroutine->amvacuumcleanup = btvacuumcleanup;
 133         amroutine->amcanreturn = btcanreturn;
 134         amroutine->amcostestimate = btcostestimate;
 135         amroutine->amoptions = btoptions;
 136         amroutine->amproperty = btproperty;
 137         amroutine->ambuildphasename = btbuildphasename;
 138         amroutine->amvalidate = btvalidate;
 139         amroutine->ambeginscan = btbeginscan;
 140         amroutine->amrescan = btrescan;
 141         amroutine->amgettuple = btgettuple;
 142         amroutine->amgetbitmap = btgetbitmap;
 143         amroutine->amendscan = btendscan;
 144         amroutine->ammarkpos = btmarkpos;
 145         amroutine->amrestrpos = btrestrpos;
 146         amroutine->amestimateparallelscan = btestimateparallelscan;
 147         amroutine->aminitparallelscan = btinitparallelscan;
 148         amroutine->amparallelrescan = btparallelrescan;
 149
 150         PG_RETURN_POINTER(amroutine);
 151 }
 152
 153 /*
 154  *      btbuildempty() -- build an empty btree index in the initialization fork
 155  */
 156 void
 157 btbuildempty(Relation index)
 158 {
 159         Page            metapage;
 160
 161         /* Construct metapage. */
 162         metapage = (Page) palloc(BLCKSZ);
 163         _bt_initmetapage(metapage, P_NONE, 0);
 164
 165         /*
 166          * Write the page and log it.  It might seem that an immediate sync would
 167          * be sufficient to guarantee that the file exists on disk, but recovery
 168          * itself might remove it while replaying, for example, an
 169          * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
 170          * this even when wal_level=minimal.
 171          */
 172         PageSetChecksumInplace(metapage, BTREE_METAPAGE);
 173         smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
 174                           (char *) metapage, true);
 175         log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
 176                                 BTREE_METAPAGE, metapage, true);
 177
 178         /*
 179          * An immediate sync is required even if we xlog'd the page, because the
 180          * write did not go through shared_buffers and therefore a concurrent
 181          * checkpoint may have moved the redo pointer past our xlog record.
 182          */
 183         smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
 184 }
 185
 186 /*
 187  *      btinsert() -- insert an index tuple into a btree.
 188  *
 189  *              Descend the tree recursively, find the appropriate location for our
 190  *              new tuple, and put it there.
 191  */
 192 bool
 193 btinsert(Relation rel, Datum *values, bool *isnull,
 194                  ItemPointer ht_ctid, Relation heapRel,
 195                  IndexUniqueCheck checkUnique,
 196                  IndexInfo *indexInfo)
 197 {
 198         bool            result;
 199         IndexTuple      itup;
 200
 201         /* generate an index tuple */
 202         itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
 203         itup->t_tid = *ht_ctid;
 204
 205         result = _bt_doinsert(rel, itup, checkUnique, heapRel);
 206
 207         pfree(itup);
 208
 209         return result;
 210 }
 211
 212 /*
 213  *      btgettuple() -- Get the next tuple in the scan.
 214  */
 215 bool
 216 btgettuple(IndexScanDesc scan, ScanDirection dir)
 217 {
 218         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 219         bool            res;
 220
 221         /* btree indexes are never lossy */
 222         scan->xs_recheck = false;
 223
 224         /*
 225          * If we have any array keys, initialize them during first call for a
 226          * scan.  We can't do this in btrescan because we don't know the scan
 227          * direction at that time.
 228          */
 229         if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
 230         {
 231                 /* punt if we have any unsatisfiable array keys */
 232                 if (so->numArrayKeys < 0)
 233                         return false;
 234
 235                 _bt_start_array_keys(scan, dir);
 236         }
 237
 238         /* This loop handles advancing to the next array elements, if any */
 239         do
 240         {
 241                 /*
 242                  * If we've already initialized this scan, we can just advance it in
 243                  * the appropriate direction.  If we haven't done so yet, we call
 244                  * _bt_first() to get the first item in the scan.
 245                  */
 246                 if (!BTScanPosIsValid(so->currPos))
 247                         res = _bt_first(scan, dir);
 248                 else
 249                 {
 250                         /*
 251                          * Check to see if we should kill the previously-fetched tuple.
 252                          */
 253                         if (scan->kill_prior_tuple)
 254                         {
 255                                 /*
 256                                  * Yes, remember it for later. (We'll deal with all such
 257                                  * tuples at once right before leaving the index page.)  The
 258                                  * test for numKilled overrun is not just paranoia: if the
 259                                  * caller reverses direction in the indexscan then the same
 260                                  * item might get entered multiple times. It's not worth
 261                                  * trying to optimize that, so we don't detect it, but instead
 262                                  * just forget any excess entries.
 263                                  */
 264                                 if (so->killedItems == NULL)
 265                                         so->killedItems = (int *)
 266                                                 palloc(MaxIndexTuplesPerPage * sizeof(int));
 267                                 if (so->numKilled < MaxIndexTuplesPerPage)
 268                                         so->killedItems[so->numKilled++] = so->currPos.itemIndex;
 269                         }
 270
 271                         /*
 272                          * Now continue the scan.
 273                          */
 274                         res = _bt_next(scan, dir);
 275                 }
 276
 277                 /* If we have a tuple, return it ... */
 278                 if (res)
 279                         break;
 280                 /* ... otherwise see if we have more array keys to deal with */
 281         } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
 282
 283         return res;
 284 }
 285
 286 /*
 287  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 288  */
 289 int64
 290 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 291 {
 292         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 293         int64           ntids = 0;
 294         ItemPointer heapTid;
 295
 296         /*
 297          * If we have any array keys, initialize them.
 298          */
 299         if (so->numArrayKeys)
 300         {
 301                 /* punt if we have any unsatisfiable array keys */
 302                 if (so->numArrayKeys < 0)
 303                         return ntids;
 304
 305                 _bt_start_array_keys(scan, ForwardScanDirection);
 306         }
 307
 308         /* This loop handles advancing to the next array elements, if any */
 309         do
 310         {
 311                 /* Fetch the first page & tuple */
 312                 if (_bt_first(scan, ForwardScanDirection))
 313                 {
 314                         /* Save tuple ID, and continue scanning */
 315                         heapTid = &scan->xs_heaptid;
 316                         tbm_add_tuples(tbm, heapTid, 1, false);
 317                         ntids++;
 318
 319                         for (;;)
 320                         {
 321                                 /*
 322                                  * Advance to next tuple within page.  This is the same as the
 323                                  * easy case in _bt_next().
 324                                  */
 325                                 if (++so->currPos.itemIndex > so->currPos.lastItem)
 326                                 {
 327                                         /* let _bt_next do the heavy lifting */
 328                                         if (!_bt_next(scan, ForwardScanDirection))
 329                                                 break;
 330                                 }
 331
 332                                 /* Save tuple ID, and continue scanning */
 333                                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
 334                                 tbm_add_tuples(tbm, heapTid, 1, false);
 335                                 ntids++;
 336                         }
 337                 }
 338                 /* Now see if we have more array keys to deal with */
 339         } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
 340
 341         return ntids;
 342 }
 343
 344 /*
 345  *      btbeginscan() -- start a scan on a btree index
 346  */
 347 IndexScanDesc
 348 btbeginscan(Relation rel, int nkeys, int norderbys)
 349 {
 350         IndexScanDesc scan;
 351         BTScanOpaque so;
 352
 353         /* no order by operators allowed */
 354         Assert(norderbys == 0);
 355
 356         /* get the scan */
 357         scan = RelationGetIndexScan(rel, nkeys, norderbys);
 358
 359         /* allocate private workspace */
 360         so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
 361         BTScanPosInvalidate(so->currPos);
 362         BTScanPosInvalidate(so->markPos);
 363         if (scan->numberOfKeys > 0)
 364                 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
 365         else
 366                 so->keyData = NULL;
 367
 368         so->arrayKeyData = NULL;        /* assume no array keys for now */
 369         so->numArrayKeys = 0;
 370         so->arrayKeys = NULL;
 371         so->arrayContext = NULL;
 372
 373         so->killedItems = NULL;         /* until needed */
 374         so->numKilled = 0;
 375
 376         /*
 377          * We don't know yet whether the scan will be index-only, so we do not
 378          * allocate the tuple workspace arrays until btrescan.  However, we set up
 379          * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
 380          */
 381         so->currTuples = so->markTuples = NULL;
 382
 383         scan->xs_itupdesc = RelationGetDescr(rel);
 384
 385         scan->opaque = so;
 386
 387         return scan;
 388 }
 389
 390 /*
 391  *      btrescan() -- rescan an index relation
 392  */
 393 void
 394 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 395                  ScanKey orderbys, int norderbys)
 396 {
 397         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 398
 399         /* we aren't holding any read locks, but gotta drop the pins */
 400         if (BTScanPosIsValid(so->currPos))
 401         {
 402                 /* Before leaving current page, deal with any killed items */
 403                 if (so->numKilled > 0)
 404                         _bt_killitems(scan);
 405                 BTScanPosUnpinIfPinned(so->currPos);
 406                 BTScanPosInvalidate(so->currPos);
 407         }
 408
 409         so->markItemIndex = -1;
 410         so->arrayKeyCount = 0;
 411         BTScanPosUnpinIfPinned(so->markPos);
 412         BTScanPosInvalidate(so->markPos);
 413
 414         /*
 415          * Allocate tuple workspace arrays, if needed for an index-only scan and
 416          * not already done in a previous rescan call.  To save on palloc
 417          * overhead, both workspaces are allocated as one palloc block; only this
 418          * function and btendscan know that.
 419          *
 420          * NOTE: this data structure also makes it safe to return data from a
 421          * "name" column, even though btree name_ops uses an underlying storage
 422          * datatype of cstring.  The risk there is that "name" is supposed to be
 423          * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
 424          * However, since we only return data out of tuples sitting in the
 425          * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
 426          * data out of the markTuples array --- running off the end of memory for
 427          * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
 428          * adding special-case treatment for name_ops elsewhere.
 429          */
 430         if (scan->xs_want_itup && so->currTuples == NULL)
 431         {
 432                 so->currTuples = (char *) palloc(BLCKSZ * 2);
 433                 so->markTuples = so->currTuples + BLCKSZ;
 434         }
 435
 436         /*
 437          * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
 438          * - vadim 05/05/97
 439          */
 440         if (scankey && scan->numberOfKeys > 0)
 441                 memmove(scan->keyData,
 442                                 scankey,
 443                                 scan->numberOfKeys * sizeof(ScanKeyData));
 444         so->numberOfKeys = 0;           /* until _bt_preprocess_keys sets it */
 445
 446         /* If any keys are SK_SEARCHARRAY type, set up array-key info */
 447         _bt_preprocess_array_keys(scan);
 448 }
 449
 450 /*
 451  *      btendscan() -- close down a scan
 452  */
 453 void
 454 btendscan(IndexScanDesc scan)
 455 {
 456         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 457
 458         /* we aren't holding any read locks, but gotta drop the pins */
 459         if (BTScanPosIsValid(so->currPos))
 460         {
 461                 /* Before leaving current page, deal with any killed items */
 462                 if (so->numKilled > 0)
 463                         _bt_killitems(scan);
 464                 BTScanPosUnpinIfPinned(so->currPos);
 465         }
 466
 467         so->markItemIndex = -1;
 468         BTScanPosUnpinIfPinned(so->markPos);
 469
 470         /* No need to invalidate positions, the RAM is about to be freed. */
 471
 472         /* Release storage */
 473         if (so->keyData != NULL)
 474                 pfree(so->keyData);
 475         /* so->arrayKeyData and so->arrayKeys are in arrayContext */
 476         if (so->arrayContext != NULL)
 477                 MemoryContextDelete(so->arrayContext);
 478         if (so->killedItems != NULL)
 479                 pfree(so->killedItems);
 480         if (so->currTuples != NULL)
 481                 pfree(so->currTuples);
 482         /* so->markTuples should not be pfree'd, see btrescan */
 483         pfree(so);
 484 }
 485
 486 /*
 487  *      btmarkpos() -- save current scan position
 488  */
 489 void
 490 btmarkpos(IndexScanDesc scan)
 491 {
 492         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 493
 494         /* There may be an old mark with a pin (but no lock). */
 495         BTScanPosUnpinIfPinned(so->markPos);
 496
 497         /*
 498          * Just record the current itemIndex.  If we later step to next page
 499          * before releasing the marked position, _bt_steppage makes a full copy of
 500          * the currPos struct in markPos.  If (as often happens) the mark is moved
 501          * before we leave the page, we don't have to do that work.
 502          */
 503         if (BTScanPosIsValid(so->currPos))
 504                 so->markItemIndex = so->currPos.itemIndex;
 505         else
 506         {
 507                 BTScanPosInvalidate(so->markPos);
 508                 so->markItemIndex = -1;
 509         }
 510
 511         /* Also record the current positions of any array keys */
 512         if (so->numArrayKeys)
 513                 _bt_mark_array_keys(scan);
 514 }
 515
 516 /*
 517  *      btrestrpos() -- restore scan to last saved position
 518  */
 519 void
 520 btrestrpos(IndexScanDesc scan)
 521 {
 522         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 523
 524         /* Restore the marked positions of any array keys */
 525         if (so->numArrayKeys)
 526                 _bt_restore_array_keys(scan);
 527
 528         if (so->markItemIndex >= 0)
 529         {
 530                 /*
 531                  * The scan has never moved to a new page since the last mark.  Just
 532                  * restore the itemIndex.
 533                  *
 534                  * NB: In this case we can't count on anything in so->markPos to be
 535                  * accurate.
 536                  */
 537                 so->currPos.itemIndex = so->markItemIndex;
 538         }
 539         else
 540         {
 541                 /*
 542                  * The scan moved to a new page after last mark or restore, and we are
 543                  * now restoring to the marked page.  We aren't holding any read
 544                  * locks, but if we're still holding the pin for the current position,
 545                  * we must drop it.
 546                  */
 547                 if (BTScanPosIsValid(so->currPos))
 548                 {
 549                         /* Before leaving current page, deal with any killed items */
 550                         if (so->numKilled > 0)
 551                                 _bt_killitems(scan);
 552                         BTScanPosUnpinIfPinned(so->currPos);
 553                 }
 554
 555                 if (BTScanPosIsValid(so->markPos))
 556                 {
 557                         /* bump pin on mark buffer for assignment to current buffer */
 558                         if (BTScanPosIsPinned(so->markPos))
 559                                 IncrBufferRefCount(so->markPos.buf);
 560                         memcpy(&so->currPos, &so->markPos,
 561                                    offsetof(BTScanPosData, items[1]) +
 562                                    so->markPos.lastItem * sizeof(BTScanPosItem));
 563                         if (so->currTuples)
 564                                 memcpy(so->currTuples, so->markTuples,
 565                                            so->markPos.nextTupleOffset);
 566                 }
 567                 else
 568                         BTScanPosInvalidate(so->currPos);
 569         }
 570 }
 571
 572 /*
 573  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 574  */
 575 Size
 576 btestimateparallelscan(void)
 577 {
 578         return sizeof(BTParallelScanDescData);
 579 }
 580
 581 /*
 582  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 583  */
 584 void
 585 btinitparallelscan(void *target)
 586 {
 587         BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
 588
 589         SpinLockInit(&bt_target->btps_mutex);
 590         bt_target->btps_scanPage = InvalidBlockNumber;
 591         bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 592         bt_target->btps_arrayKeyCount = 0;
 593         ConditionVariableInit(&bt_target->btps_cv);
 594 }
 595
 596 /*
 597  *      btparallelrescan() -- reset parallel scan
 598  */
 599 void
 600 btparallelrescan(IndexScanDesc scan)
 601 {
 602         BTParallelScanDesc btscan;
 603         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 604
 605         Assert(parallel_scan);
 606
 607         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 608                                                                                                   parallel_scan->ps_offset);
 609
 610         /*
 611          * In theory, we don't need to acquire the spinlock here, because there
 612          * shouldn't be any other workers running at this point, but we do so for
 613          * consistency.
 614          */
 615         SpinLockAcquire(&btscan->btps_mutex);
 616         btscan->btps_scanPage = InvalidBlockNumber;
 617         btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 618         btscan->btps_arrayKeyCount = 0;
 619         SpinLockRelease(&btscan->btps_mutex);
 620 }
 621
 622 /*
 623  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 624  *              page.  Other scans must wait until we call _bt_parallel_release()
 625  *              or _bt_parallel_done().
 626  *
 627  * The return value is true if we successfully seized the scan and false
 628  * if we did not.  The latter case occurs if no pages remain for the current
 629  * set of scankeys.
 630  *
 631  * If the return value is true, *pageno returns the next or current page
 632  * of the scan (depending on the scan direction).  An invalid block number
 633  * means the scan hasn't yet started, and P_NONE means we've reached the end.
 634  * The first time a participating process reaches the last page, it will return
 635  * true and set *pageno to P_NONE; after that, further attempts to seize the
 636  * scan will return false.
 637  *
 638  * Callers should ignore the value of pageno if the return value is false.
 639  */
 640 bool
 641 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
 642 {
 643         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 644         BTPS_State      pageStatus;
 645         bool            exit_loop = false;
 646         bool            status = true;
 647         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 648         BTParallelScanDesc btscan;
 649
 650         *pageno = P_NONE;
 651
 652         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 653                                                                                                   parallel_scan->ps_offset);
 654
 655         while (1)
 656         {
 657                 SpinLockAcquire(&btscan->btps_mutex);
 658                 pageStatus = btscan->btps_pageStatus;
 659
 660                 if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
 661                 {
 662                         /* Parallel scan has already advanced to a new set of scankeys. */
 663                         status = false;
 664                 }
 665                 else if (pageStatus == BTPARALLEL_DONE)
 666                 {
 667                         /*
 668                          * We're done with this set of scankeys.  This may be the end, or
 669                          * there could be more sets to try.
 670                          */
 671                         status = false;
 672                 }
 673                 else if (pageStatus != BTPARALLEL_ADVANCING)
 674                 {
 675                         /*
 676                          * We have successfully seized control of the scan for the purpose
 677                          * of advancing it to a new page!
 678                          */
 679                         btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
 680                         *pageno = btscan->btps_scanPage;
 681                         exit_loop = true;
 682                 }
 683                 SpinLockRelease(&btscan->btps_mutex);
 684                 if (exit_loop || !status)
 685                         break;
 686                 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
 687         }
 688         ConditionVariableCancelSleep();
 689
 690         return status;
 691 }
 692
 693 /*
 694  * _bt_parallel_release() -- Complete the process of advancing the scan to a
 695  *              new page.  We now have the new value btps_scanPage; some other backend
 696  *              can now begin advancing the scan.
 697  */
 698 void
 699 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
 700 {
 701         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 702         BTParallelScanDesc btscan;
 703
 704         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 705                                                                                                   parallel_scan->ps_offset);
 706
 707         SpinLockAcquire(&btscan->btps_mutex);
 708         btscan->btps_scanPage = scan_page;
 709         btscan->btps_pageStatus = BTPARALLEL_IDLE;
 710         SpinLockRelease(&btscan->btps_mutex);
 711         ConditionVariableSignal(&btscan->btps_cv);
 712 }
 713
 714 /*
 715  * _bt_parallel_done() -- Mark the parallel scan as complete.
 716  *
 717  * When there are no pages left to scan, this function should be called to
 718  * notify other workers.  Otherwise, they might wait forever for the scan to
 719  * advance to the next page.
 720  */
 721 void
 722 _bt_parallel_done(IndexScanDesc scan)
 723 {
 724         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 725         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 726         BTParallelScanDesc btscan;
 727         bool            status_changed = false;
 728
 729         /* Do nothing, for non-parallel scans */
 730         if (parallel_scan == NULL)
 731                 return;
 732
 733         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 734                                                                                                   parallel_scan->ps_offset);
 735
 736         /*
 737          * Mark the parallel scan as done for this combination of scan keys,
 738          * unless some other process already did so.  See also
 739          * _bt_advance_array_keys.
 740          */
 741         SpinLockAcquire(&btscan->btps_mutex);
 742         if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
 743                 btscan->btps_pageStatus != BTPARALLEL_DONE)
 744         {
 745                 btscan->btps_pageStatus = BTPARALLEL_DONE;
 746                 status_changed = true;
 747         }
 748         SpinLockRelease(&btscan->btps_mutex);
 749
 750         /* wake up all the workers associated with this parallel scan */
 751         if (status_changed)
 752                 ConditionVariableBroadcast(&btscan->btps_cv);
 753 }
 754
 755 /*
 756  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 757  *                      keys.
 758  *
 759  * Updates the count of array keys processed for both local and parallel
 760  * scans.
 761  */
 762 void
 763 _bt_parallel_advance_array_keys(IndexScanDesc scan)
 764 {
 765         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 766         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 767         BTParallelScanDesc btscan;
 768
 769         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 770                                                                                                   parallel_scan->ps_offset);
 771
 772         so->arrayKeyCount++;
 773         SpinLockAcquire(&btscan->btps_mutex);
 774         if (btscan->btps_pageStatus == BTPARALLEL_DONE)
 775         {
 776                 btscan->btps_scanPage = InvalidBlockNumber;
 777                 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 778                 btscan->btps_arrayKeyCount++;
 779         }
 780         SpinLockRelease(&btscan->btps_mutex);
 781 }
 782
 783 /*
 784  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
 785  *                      btbulkdelete() wasn't called.
 786  */
 787 static bool
 788 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
 789 {
 790         Buffer          metabuf;
 791         Page            metapg;
 792         BTMetaPageData *metad;
 793         bool            result = false;
 794
 795         metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
 796         metapg = BufferGetPage(metabuf);
 797         metad = BTPageGetMeta(metapg);
 798
 799         if (metad->btm_version < BTREE_NOVAC_VERSION)
 800         {
 801                 /*
 802                  * Do cleanup if metapage needs upgrade, because we don't have
 803                  * cleanup-related meta-information yet.
 804                  */
 805                 result = true;
 806         }
 807         else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
 808                          TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
 809                                                                    RecentGlobalXmin))
 810         {
 811                 /*
 812                  * If oldest btpo.xact in the deleted pages is older than
 813                  * RecentGlobalXmin, then at least one deleted page can be recycled.
 814                  */
 815                 result = true;
 816         }
 817         else
 818         {
 819                 StdRdOptions *relopts;
 820                 float8          cleanup_scale_factor;
 821                 float8          prev_num_heap_tuples;
 822
 823                 /*
 824                  * If table receives enough insertions and no cleanup was performed,
 825                  * then index would appear have stale statistics.  If scale factor is
 826                  * set, we avoid that by performing cleanup if the number of inserted
 827                  * tuples exceeds vacuum_cleanup_index_scale_factor fraction of
 828                  * original tuples count.
 829                  */
 830                 relopts = (StdRdOptions *) info->index->rd_options;
 831                 cleanup_scale_factor = (relopts &&
 832                                                                 relopts->vacuum_cleanup_index_scale_factor >= 0)
 833                         ? relopts->vacuum_cleanup_index_scale_factor
 834                         : vacuum_cleanup_index_scale_factor;
 835                 prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
 836
 837                 if (cleanup_scale_factor <= 0 ||
 838                         prev_num_heap_tuples <= 0 ||
 839                         (info->num_heap_tuples - prev_num_heap_tuples) /
 840                         prev_num_heap_tuples >= cleanup_scale_factor)
 841                         result = true;
 842         }
 843
 844         _bt_relbuf(info->index, metabuf);
 845         return result;
 846 }
 847
 848 /*
 849  * Bulk deletion of all index entries pointing to a set of heap tuples.
 850  * The set of target tuples is specified via a callback routine that tells
 851  * whether any given heap tuple (identified by ItemPointer) is being deleted.
 852  *
 853  * Result: a palloc'd struct containing statistical info for VACUUM displays.
 854  */
 855 IndexBulkDeleteResult *
 856 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 857                          IndexBulkDeleteCallback callback, void *callback_state)
 858 {
 859         Relation        rel = info->index;
 860         BTCycleId       cycleid;
 861
 862         /* allocate stats if first time through, else re-use existing struct */
 863         if (stats == NULL)
 864                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 865
 866         /* Establish the vacuum cycle ID to use for this scan */
 867         /* The ENSURE stuff ensures we clean up shared memory on failure */
 868         PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 869         {
 870                 TransactionId oldestBtpoXact;
 871
 872                 cycleid = _bt_start_vacuum(rel);
 873
 874                 btvacuumscan(info, stats, callback, callback_state, cycleid,
 875                                          &oldestBtpoXact);
 876
 877                 /*
 878                  * Update cleanup-related information in metapage. This information is
 879                  * used only for cleanup but keeping them up to date can avoid
 880                  * unnecessary cleanup even after bulkdelete.
 881                  */
 882                 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
 883                                                                          info->num_heap_tuples);
 884         }
 885         PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 886         _bt_end_vacuum(rel);
 887
 888         return stats;
 889 }
 890
 891 /*
 892  * Post-VACUUM cleanup.
 893  *
 894  * Result: a palloc'd struct containing statistical info for VACUUM displays.
 895  */
 896 IndexBulkDeleteResult *
 897 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 898 {
 899         /* No-op in ANALYZE ONLY mode */
 900         if (info->analyze_only)
 901                 return stats;
 902
 903         /*
 904          * If btbulkdelete was called, we need not do anything, just return the
 905          * stats from the latest btbulkdelete call.  If it wasn't called, we might
 906          * still need to do a pass over the index, to recycle any newly-recyclable
 907          * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
 908          * determines if either are needed.
 909          *
 910          * Since we aren't going to actually delete any leaf items, there's no
 911          * need to go through all the vacuum-cycle-ID pushups.
 912          */
 913         if (stats == NULL)
 914         {
 915                 TransactionId oldestBtpoXact;
 916
 917                 /* Check if we need a cleanup */
 918                 if (!_bt_vacuum_needs_cleanup(info))
 919                         return NULL;
 920
 921                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 922                 btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
 923
 924                 /* Update cleanup-related information in the metapage */
 925                 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
 926                                                                          info->num_heap_tuples);
 927         }
 928
 929         /*
 930          * It's quite possible for us to be fooled by concurrent page splits into
 931          * double-counting some index tuples, so disbelieve any total that exceeds
 932          * the underlying heap's count ... if we know that accurately.  Otherwise
 933          * this might just make matters worse.
 934          */
 935         if (!info->estimated_count)
 936         {
 937                 if (stats->num_index_tuples > info->num_heap_tuples)
 938                         stats->num_index_tuples = info->num_heap_tuples;
 939         }
 940
 941         return stats;
 942 }
 943
 944 /*
 945  * btvacuumscan --- scan the index for VACUUMing purposes
 946  *
 947  * This combines the functions of looking for leaf tuples that are deletable
 948  * according to the vacuum callback, looking for empty pages that can be
 949  * deleted, and looking for old deleted pages that can be recycled.  Both
 950  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 951  * btbulkdelete call occurred).
 952  *
 953  * The caller is responsible for initially allocating/zeroing a stats struct
 954  * and for obtaining a vacuum cycle ID if necessary.
 955  */
 956 static void
 957 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 958                          IndexBulkDeleteCallback callback, void *callback_state,
 959                          BTCycleId cycleid, TransactionId *oldestBtpoXact)
 960 {
 961         Relation        rel = info->index;
 962         BTVacState      vstate;
 963         BlockNumber num_pages;
 964         BlockNumber blkno;
 965         bool            needLock;
 966
 967         /*
 968          * Reset counts that will be incremented during the scan; needed in case
 969          * of multiple scans during a single VACUUM command
 970          */
 971         stats->estimated_count = false;
 972         stats->num_index_tuples = 0;
 973         stats->pages_deleted = 0;
 974
 975         /* Set up info to pass down to btvacuumpage */
 976         vstate.info = info;
 977         vstate.stats = stats;
 978         vstate.callback = callback;
 979         vstate.callback_state = callback_state;
 980         vstate.cycleid = cycleid;
 981         vstate.lastBlockVacuumed = BTREE_METAPAGE;      /* Initialise at first block */
 982         vstate.lastBlockLocked = BTREE_METAPAGE;
 983         vstate.totFreePages = 0;
 984         vstate.oldestBtpoXact = InvalidTransactionId;
 985
 986         /* Create a temporary memory context to run _bt_pagedel in */
 987         vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
 988                                                                                                   "_bt_pagedel",
 989                                                                                                   ALLOCSET_DEFAULT_SIZES);
 990
 991         /*
 992          * The outer loop iterates over all index pages except the metapage, in
 993          * physical order (we hope the kernel will cooperate in providing
 994          * read-ahead for speed).  It is critical that we visit all leaf pages,
 995          * including ones added after we start the scan, else we might fail to
 996          * delete some deletable tuples.  Hence, we must repeatedly check the
 997          * relation length.  We must acquire the relation-extension lock while
 998          * doing so to avoid a race condition: if someone else is extending the
 999          * relation, there is a window where bufmgr/smgr have created a new
1000          * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1001          * we manage to scan such a page here, we'll improperly assume it can be
1002          * recycled.  Taking the lock synchronizes things enough to prevent a
1003          * problem: either num_pages won't include the new page, or _bt_getbuf
1004          * already has write lock on the buffer and it will be fully initialized
1005          * before we can examine it.  (See also vacuumlazy.c, which has the same
1006          * issue.)      Also, we need not worry if a page is added immediately after
1007          * we look; the page splitting code already has write-lock on the left
1008          * page before it adds a right page, so we must already have processed any
1009          * tuples due to be moved into such a page.
1010          *
1011          * We can skip locking for new or temp relations, however, since no one
1012          * else could be accessing them.
1013          */
1014         needLock = !RELATION_IS_LOCAL(rel);
1015
1016         blkno = BTREE_METAPAGE + 1;
1017         for (;;)
1018         {
1019                 /* Get the current relation length */
1020                 if (needLock)
1021                         LockRelationForExtension(rel, ExclusiveLock);
1022                 num_pages = RelationGetNumberOfBlocks(rel);
1023                 if (needLock)
1024                         UnlockRelationForExtension(rel, ExclusiveLock);
1025
1026                 if (info->report_progress)
1027                         pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1028                                                                                  num_pages);
1029
1030                 /* Quit if we've scanned the whole relation */
1031                 if (blkno >= num_pages)
1032                         break;
1033                 /* Iterate over pages, then loop back to recheck length */
1034                 for (; blkno < num_pages; blkno++)
1035                 {
1036                         btvacuumpage(&vstate, blkno, blkno);
1037                         if (info->report_progress)
1038                                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1039                                                                                          blkno);
1040                 }
1041         }
1042
1043         /*
1044          * Check to see if we need to issue one final WAL record for this index,
1045          * which may be needed for correctness on a hot standby node when non-MVCC
1046          * index scans could take place.
1047          *
1048          * If the WAL is replayed in hot standby, the replay process needs to get
1049          * cleanup locks on all index leaf pages, just as we've been doing here.
1050          * However, we won't issue any WAL records about pages that have no items
1051          * to be deleted.  For pages between pages we've vacuumed, the replay code
1052          * will take locks under the direction of the lastBlockVacuumed fields in
1053          * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
1054          * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1055          * against the last leaf page in the index, if that one wasn't vacuumed.
1056          */
1057         if (XLogStandbyInfoActive() &&
1058                 vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1059         {
1060                 Buffer          buf;
1061
1062                 /*
1063                  * The page should be valid, but we can't use _bt_getbuf() because we
1064                  * want to use a nondefault buffer access strategy.  Since we aren't
1065                  * going to delete any items, getting cleanup lock again is probably
1066                  * overkill, but for consistency do that anyway.
1067                  */
1068                 buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1069                                                                  RBM_NORMAL, info->strategy);
1070                 LockBufferForCleanup(buf);
1071                 _bt_checkpage(rel, buf);
1072                 _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1073                 _bt_relbuf(rel, buf);
1074         }
1075
1076         MemoryContextDelete(vstate.pagedelcontext);
1077
1078         /*
1079          * If we found any recyclable pages (and recorded them in the FSM), then
1080          * forcibly update the upper-level FSM pages to ensure that searchers can
1081          * find them.  It's possible that the pages were also found during
1082          * previous scans and so this is a waste of time, but it's cheap enough
1083          * relative to scanning the index that it shouldn't matter much, and
1084          * making sure that free pages are available sooner not later seems
1085          * worthwhile.
1086          *
1087          * Note that if no recyclable pages exist, we don't bother vacuuming the
1088          * FSM at all.
1089          */
1090         if (vstate.totFreePages > 0)
1091                 IndexFreeSpaceMapVacuum(rel);
1092
1093         /* update statistics */
1094         stats->num_pages = num_pages;
1095         stats->pages_free = vstate.totFreePages;
1096
1097         if (oldestBtpoXact)
1098                 *oldestBtpoXact = vstate.oldestBtpoXact;
1099 }
1100
1101 /*
1102  * btvacuumpage --- VACUUM one page
1103  *
1104  * This processes a single page for btvacuumscan().  In some cases we
1105  * must go back and re-examine previously-scanned pages; this routine
1106  * recurses when necessary to handle that case.
1107  *
1108  * blkno is the page to process.  orig_blkno is the highest block number
1109  * reached by the outer btvacuumscan loop (the same as blkno, unless we
1110  * are recursing to re-examine a previous page).
1111  */
1112 static void
1113 btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1114 {
1115         IndexVacuumInfo *info = vstate->info;
1116         IndexBulkDeleteResult *stats = vstate->stats;
1117         IndexBulkDeleteCallback callback = vstate->callback;
1118         void       *callback_state = vstate->callback_state;
1119         Relation        rel = info->index;
1120         bool            delete_now;
1121         BlockNumber recurse_to;
1122         Buffer          buf;
1123         Page            page;
1124         BTPageOpaque opaque = NULL;
1125
1126 restart:
1127         delete_now = false;
1128         recurse_to = P_NONE;
1129
1130         /* call vacuum_delay_point while not holding any buffer lock */
1131         vacuum_delay_point();
1132
1133         /*
1134          * We can't use _bt_getbuf() here because it always applies
1135          * _bt_checkpage(), which will barf on an all-zero page. We want to
1136          * recycle all-zero pages, not fail.  Also, we want to use a nondefault
1137          * buffer access strategy.
1138          */
1139         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1140                                                          info->strategy);
1141         LockBuffer(buf, BT_READ);
1142         page = BufferGetPage(buf);
1143         if (!PageIsNew(page))
1144         {
1145                 _bt_checkpage(rel, buf);
1146                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1147         }
1148
1149         /*
1150          * If we are recursing, the only case we want to do anything with is a
1151          * live leaf page having the current vacuum cycle ID.  Any other state
1152          * implies we already saw the page (eg, deleted it as being empty).
1153          */
1154         if (blkno != orig_blkno)
1155         {
1156                 if (_bt_page_recyclable(page) ||
1157                         P_IGNORE(opaque) ||
1158                         !P_ISLEAF(opaque) ||
1159                         opaque->btpo_cycleid != vstate->cycleid)
1160                 {
1161                         _bt_relbuf(rel, buf);
1162                         return;
1163                 }
1164         }
1165
1166         /* Page is valid, see what to do with it */
1167         if (_bt_page_recyclable(page))
1168         {
1169                 /* Okay to recycle this page */
1170                 RecordFreeIndexPage(rel, blkno);
1171                 vstate->totFreePages++;
1172                 stats->pages_deleted++;
1173         }
1174         else if (P_ISDELETED(opaque))
1175         {
1176                 /* Already deleted, but can't recycle yet */
1177                 stats->pages_deleted++;
1178
1179                 /* Update the oldest btpo.xact */
1180                 if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1181                         TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1182                         vstate->oldestBtpoXact = opaque->btpo.xact;
1183         }
1184         else if (P_ISHALFDEAD(opaque))
1185         {
1186                 /* Half-dead, try to delete */
1187                 delete_now = true;
1188         }
1189         else if (P_ISLEAF(opaque))
1190         {
1191                 OffsetNumber deletable[MaxOffsetNumber];
1192                 int                     ndeletable;
1193                 OffsetNumber offnum,
1194                                         minoff,
1195                                         maxoff;
1196
1197                 /*
1198                  * Trade in the initial read lock for a super-exclusive write lock on
1199                  * this page.  We must get such a lock on every leaf page over the
1200                  * course of the vacuum scan, whether or not it actually contains any
1201                  * deletable tuples --- see nbtree/README.
1202                  */
1203                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1204                 LockBufferForCleanup(buf);
1205
1206                 /*
1207                  * Remember highest leaf page number we've taken cleanup lock on; see
1208                  * notes in btvacuumscan
1209                  */
1210                 if (blkno > vstate->lastBlockLocked)
1211                         vstate->lastBlockLocked = blkno;
1212
1213                 /*
1214                  * Check whether we need to recurse back to earlier pages.  What we
1215                  * are concerned about is a page split that happened since we started
1216                  * the vacuum scan.  If the split moved some tuples to a lower page
1217                  * then we might have missed 'em.  If so, set up for tail recursion.
1218                  * (Must do this before possibly clearing btpo_cycleid below!)
1219                  */
1220                 if (vstate->cycleid != 0 &&
1221                         opaque->btpo_cycleid == vstate->cycleid &&
1222                         !(opaque->btpo_flags & BTP_SPLIT_END) &&
1223                         !P_RIGHTMOST(opaque) &&
1224                         opaque->btpo_next < orig_blkno)
1225                         recurse_to = opaque->btpo_next;
1226
1227                 /*
1228                  * Scan over all items to see which ones need deleted according to the
1229                  * callback function.
1230                  */
1231                 ndeletable = 0;
1232                 minoff = P_FIRSTDATAKEY(opaque);
1233                 maxoff = PageGetMaxOffsetNumber(page);
1234                 if (callback)
1235                 {
1236                         for (offnum = minoff;
1237                                  offnum <= maxoff;
1238                                  offnum = OffsetNumberNext(offnum))
1239                         {
1240                                 IndexTuple      itup;
1241                                 ItemPointer htup;
1242
1243                                 itup = (IndexTuple) PageGetItem(page,
1244                                                                                                 PageGetItemId(page, offnum));
1245                                 htup = &(itup->t_tid);
1246
1247                                 /*
1248                                  * During Hot Standby we currently assume that
1249                                  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1250                                  * only true as long as the callback function depends only
1251                                  * upon whether the index tuple refers to heap tuples removed
1252                                  * in the initial heap scan. When vacuum starts it derives a
1253                                  * value of OldestXmin. Backends taking later snapshots could
1254                                  * have a RecentGlobalXmin with a later xid than the vacuum's
1255                                  * OldestXmin, so it is possible that row versions deleted
1256                                  * after OldestXmin could be marked as killed by other
1257                                  * backends. The callback function *could* look at the index
1258                                  * tuple state in isolation and decide to delete the index
1259                                  * tuple, though currently it does not. If it ever did, we
1260                                  * would need to reconsider whether XLOG_BTREE_VACUUM records
1261                                  * should cause conflicts. If they did cause conflicts they
1262                                  * would be fairly harsh conflicts, since we haven't yet
1263                                  * worked out a way to pass a useful value for
1264                                  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1265                                  * applies to *any* type of index that marks index tuples as
1266                                  * killed.
1267                                  */
1268                                 if (callback(htup, callback_state))
1269                                         deletable[ndeletable++] = offnum;
1270                         }
1271                 }
1272
1273                 /*
1274                  * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
1275                  * call per page, so as to minimize WAL traffic.
1276                  */
1277                 if (ndeletable > 0)
1278                 {
1279                         /*
1280                          * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1281                          * all information to the replay code to allow it to get a cleanup
1282                          * lock on all pages between the previous lastBlockVacuumed and
1283                          * this page. This ensures that WAL replay locks all leaf pages at
1284                          * some point, which is important should non-MVCC scans be
1285                          * requested. This is currently unused on standby, but we record
1286                          * it anyway, so that the WAL contains the required information.
1287                          *
1288                          * Since we can visit leaf pages out-of-order when recursing,
1289                          * replay might end up locking such pages an extra time, but it
1290                          * doesn't seem worth the amount of bookkeeping it'd take to avoid
1291                          * that.
1292                          */
1293                         _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1294                                                                 vstate->lastBlockVacuumed);
1295
1296                         /*
1297                          * Remember highest leaf page number we've issued a
1298                          * XLOG_BTREE_VACUUM WAL record for.
1299                          */
1300                         if (blkno > vstate->lastBlockVacuumed)
1301                                 vstate->lastBlockVacuumed = blkno;
1302
1303                         stats->tuples_removed += ndeletable;
1304                         /* must recompute maxoff */
1305                         maxoff = PageGetMaxOffsetNumber(page);
1306                 }
1307                 else
1308                 {
1309                         /*
1310                          * If the page has been split during this vacuum cycle, it seems
1311                          * worth expending a write to clear btpo_cycleid even if we don't
1312                          * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
1313                          * care of this.)  This ensures we won't process the page again.
1314                          *
1315                          * We treat this like a hint-bit update because there's no need to
1316                          * WAL-log it.
1317                          */
1318                         if (vstate->cycleid != 0 &&
1319                                 opaque->btpo_cycleid == vstate->cycleid)
1320                         {
1321                                 opaque->btpo_cycleid = 0;
1322                                 MarkBufferDirtyHint(buf, true);
1323                         }
1324                 }
1325
1326                 /*
1327                  * If it's now empty, try to delete; else count the live tuples. We
1328                  * don't delete when recursing, though, to avoid putting entries into
1329                  * freePages out-of-order (doesn't seem worth any extra code to handle
1330                  * the case).
1331                  */
1332                 if (minoff > maxoff)
1333                         delete_now = (blkno == orig_blkno);
1334                 else
1335                         stats->num_index_tuples += maxoff - minoff + 1;
1336         }
1337
1338         if (delete_now)
1339         {
1340                 MemoryContext oldcontext;
1341                 int                     ndel;
1342
1343                 /* Run pagedel in a temp context to avoid memory leakage */
1344                 MemoryContextReset(vstate->pagedelcontext);
1345                 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1346
1347                 ndel = _bt_pagedel(rel, buf);
1348
1349                 /* count only this page, else may double-count parent */
1350                 if (ndel)
1351                 {
1352                         stats->pages_deleted++;
1353                         if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1354                                 TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1355                                 vstate->oldestBtpoXact = opaque->btpo.xact;
1356                 }
1357
1358                 MemoryContextSwitchTo(oldcontext);
1359                 /* pagedel released buffer, so we shouldn't */
1360         }
1361         else
1362                 _bt_relbuf(rel, buf);
1363
1364         /*
1365          * This is really tail recursion, but if the compiler is too stupid to
1366          * optimize it as such, we'd eat an uncomfortably large amount of stack
1367          * space per recursion level (due to the deletable[] array). A failure is
1368          * improbable since the number of levels isn't likely to be large ... but
1369          * just in case, let's hand-optimize into a loop.
1370          */
1371         if (recurse_to != P_NONE)
1372         {
1373                 blkno = recurse_to;
1374                 goto restart;
1375         }
1376 }
1377
1378 /*
1379  *      btcanreturn() -- Check whether btree indexes support index-only scans.
1380  *
1381  * btrees always do, so this is trivial.
1382  */
1383 bool
1384 btcanreturn(Relation index, int attno)
1385 {
1386         return true;
1387 }