/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

/* Working state needed by btvacuumpage */
typedef struct
{
	IndexVacuumInfo *info;
	IndexBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;
	BTCycleId	cycleid;
	BlockNumber lastBlockVacuumed;	/* highest blkno actually vacuumed */
	BlockNumber lastBlockLocked;	/* highest blkno we've cleanup-locked */
	BlockNumber totFreePages;		/* true total # of free pages */
	TransactionId oldestBtpoXact;
	MemoryContext pagedelcontext;
} BTVacState;

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 * We reach this state once for every distinct combination of array keys.
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE
} BTPS_State;
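
/*
 * Added summary (not in the original comments): over the lifetime of one set
 * of array keys the status normally moves
 *
 *		NOT_INITIALIZED -> ADVANCING -> IDLE -> ADVANCING -> ... -> DONE
 *
 * _bt_parallel_seize() performs the transitions into ADVANCING,
 * _bt_parallel_release() the transitions back to IDLE, _bt_parallel_done()
 * the transition to DONE, and _bt_parallel_advance_array_keys() resets DONE
 * to NOT_INITIALIZED for the next combination of array keys.
 */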

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
	BTPS_State	btps_pageStatus;	/* indicates whether next page is
									 * available for scan. see above for
									 * possible states of parallel scan. */
	int			btps_arrayKeyCount; /* count indicating number of array scan
									 * keys processed by parallel scan */
	slock_t		btps_mutex;		/* protects above variables */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;

static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
						 IndexBulkDeleteCallback callback, void *callback_state,
						 BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
						 BlockNumber orig_blkno);

/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->amvalidate = btvalidate;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}

/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);
	return result;
}

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we have any array keys, initialize them during first call for a
	 * scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later. (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times.  It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxIndexTuplesPerPage * sizeof(int));
				if (so->numKilled < MaxIndexTuplesPerPage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return res;
}
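
#ifdef NOT_USED
/*
 * Illustrative sketch (added; not part of the original file): how the
 * executor-level index AM API ultimately reaches btgettuple() through the
 * amgettuple callback.  The equality key on attribute 1 using F_INT4EQ is an
 * arbitrary assumption, and compiling this would additionally require
 * access/genam.h, utils/fmgroids.h and utils/snapmgr.h.
 */
static void
bt_scan_usage_sketch(Relation heapRel, Relation indexRel)
{
	IndexScanDesc scan;
	ScanKeyData skey;
	ItemPointer tid;

	/* WHERE indexed_col = 42, assuming an int4 key column */
	ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(42));

	scan = index_beginscan(heapRel, indexRel, GetTransactionSnapshot(), 1, 0);
	index_rescan(scan, &skey, 1, NULL, 0);

	/* each call here ends up in btgettuple() via amroutine->amgettuple */
	while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
	{
		/* consume *tid */
	}

	index_endscan(scan);
}
#endif							/* NOT_USED */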

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/*
	 * If we have any array keys, initialize them.
	 */
	if (so->numArrayKeys)
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return ntids;

		_bt_start_array_keys(scan, ForwardScanDirection);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_heaptid;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

	return ntids;
}

/*
 *	btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys.  Note that keys ordering stuff moved to _bt_first.
	 * - vadim 05/05/97
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}

/*
 *	btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}

	/* Also record the current positions of any array keys */
	if (so->numArrayKeys)
		_bt_mark_array_keys(scan);
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Restore the marked positions of any array keys */
	if (so->numArrayKeys)
		_bt_restore_array_keys(scan);

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
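			/*
			 * Added note: the memcpy below copies only the BTScanPosData
			 * header plus the items[] entries up to lastItem, not the whole
			 * MaxIndexTuplesPerPage-sized array.
			 */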
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(void)
{
	return sizeof(BTParallelScanDescData);
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_scanPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	bt_target->btps_arrayKeyCount = 0;
	ConditionVariableInit(&bt_target->btps_cv);
}
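
/*
 * Added note: these three callbacks work as a set.  btestimateparallelscan()
 * sizes the btree-specific part of the shared parallel-scan area,
 * btinitparallelscan() initializes it once before any worker attaches, and
 * btparallelrescan() below resets it when the parallel scan as a whole is
 * restarted.
 */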

/*
 * btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	btscan->btps_arrayKeyCount = 0;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs if no pages remain for the current
 * set of scankeys.
 *
 * If the return value is true, *pageno returns the next or current page
 * of the scan (depending on the scan direction).  An invalid block number
 * means the scan hasn't yet started, and P_NONE means we've reached the end.
 * The first time a participating process reaches the last page, it will return
 * true and set *pageno to P_NONE; after that, further attempts to seize the
 * scan will return false.
 *
 * Callers should ignore the value of pageno if the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTPS_State	pageStatus;
	bool		exit_loop = false;
	bool		status = true;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*pageno = P_NONE;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);
		pageStatus = btscan->btps_pageStatus;

		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
		{
			/* Parallel scan has already advanced to a new set of scankeys. */
			status = false;
		}
		else if (pageStatus == BTPARALLEL_DONE)
		{
			/*
			 * We're done with this set of scankeys.  This may be the end, or
			 * there could be more sets to try.
			 */
			status = false;
		}
		else if (pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			*pageno = btscan->btps_scanPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	return status;
}
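
#ifdef NOT_USED
/*
 * Illustrative sketch (added; not part of the original file): the calling
 * pattern a parallel scan participant follows around the _bt_parallel_*
 * routines above.  In the real code this lives in nbtsearch.c, the first
 * seize can also return InvalidBlockNumber (scan not yet started), and the
 * page published via _bt_parallel_release() is the right-link read off the
 * page just processed; next_blkno below is only a placeholder.
 */
static void
bt_parallel_usage_sketch(IndexScanDesc scan)
{
	BlockNumber blkno;
	BlockNumber next_blkno;

	while (_bt_parallel_seize(scan, &blkno))
	{
		if (blkno == P_NONE)
		{
			/* no pages remain for this set of keys; tell everyone else */
			_bt_parallel_done(scan);
			break;
		}

		/*
		 * Publish the page the next participant should pick up, then return
		 * the matching tuples from blkno ourselves.
		 */
		next_blkno = P_NONE;	/* placeholder for the real right-link */
		_bt_parallel_release(scan, next_blkno);
	}
}
#endif							/* NOT_USED */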

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_scanPage; some other backend
 *		can now begin advancing the scan.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = scan_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	/* Do nothing for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done for this combination of scan keys,
	 * unless some other process already did so.  See also
	 * _bt_advance_array_keys.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
		btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 *			keys.
 *
 * Updates the count of array keys processed for both local and parallel
 * scans.
 */
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	so->arrayKeyCount++;
	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
		btscan->btps_arrayKeyCount++;
	}
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
 *			btbulkdelete() wasn't called.
 */
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		result = false;

	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_NOVAC_VERSION)
	{
		/*
		 * Do cleanup if metapage needs upgrade, because we don't have
		 * cleanup-related meta-information yet.
		 */
		result = true;
	}
	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
								   RecentGlobalXmin))
	{
		/*
		 * If oldest btpo.xact in the deleted pages is older than
		 * RecentGlobalXmin, then at least one deleted page can be recycled.
		 */
		result = true;
	}
	else
	{
		StdRdOptions *relopts;
		float8		cleanup_scale_factor;
		float8		prev_num_heap_tuples;

		/*
		 * If the table receives enough insertions and no cleanup was
		 * performed, then the index would appear to have stale statistics.
		 * If a scale factor is set, we avoid that by performing cleanup if
		 * the number of inserted tuples exceeds the
		 * vacuum_cleanup_index_scale_factor fraction of the original tuple
		 * count.
		 */
		relopts = (StdRdOptions *) info->index->rd_options;
		cleanup_scale_factor = (relopts &&
								relopts->vacuum_cleanup_index_scale_factor >= 0)
			? relopts->vacuum_cleanup_index_scale_factor
			: vacuum_cleanup_index_scale_factor;
		prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;

		if (cleanup_scale_factor <= 0 ||
			prev_num_heap_tuples < 0 ||
			(info->num_heap_tuples - prev_num_heap_tuples) /
			prev_num_heap_tuples >= cleanup_scale_factor)
			result = true;
	}

	_bt_relbuf(info->index, metabuf);
	return result;
}
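
/*
 * Worked example (added; not in the original source): with
 * vacuum_cleanup_index_scale_factor = 0.1 and btm_last_cleanup_num_heap_tuples
 * = 1,000,000, the test above requests a cleanup-only scan once
 * info->num_heap_tuples exceeds 1,100,000, i.e. after roughly 100,000 tuples
 * have been inserted since the last cleanup recorded in the metapage.
 */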

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		TransactionId oldestBtpoXact;

		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid,
					 &oldestBtpoXact);

		/*
		 * Update cleanup-related information in the metapage.  This
		 * information is used only for cleanup, but keeping it up to date
		 * can avoid unnecessary cleanup even after a bulkdelete.
		 */
		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
									 info->num_heap_tuples);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything, just return the
	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
	 * still need to do a pass over the index, to recycle any newly-recyclable
	 * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
	 * determines whether either is needed.
	 *
	 * Since we aren't going to actually delete any leaf items, there's no
	 * need to go through all the vacuum-cycle-ID pushups.
	 */
	if (stats == NULL)
	{
		TransactionId oldestBtpoXact;

		/* Check if we need a cleanup */
		if (!_bt_vacuum_needs_cleanup(info))
			return NULL;

		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);

		/* Update cleanup-related information in the metapage */
		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
									 info->num_heap_tuples);
	}

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid, TransactionId *oldestBtpoXact)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber blkno;
	bool		needLock;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	stats->estimated_count = false;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;
	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
	vstate.lastBlockLocked = BTREE_METAPAGE;
	vstate.totFreePages = 0;
	vstate.oldestBtpoXact = InvalidTransactionId;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "Vacuum",
												  ALLOCSET_DEFAULT_SIZES);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)  Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed any
	 * tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	blkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			btvacuumpage(&vstate, blkno, blkno);
		}
	}

	/*
	 * Check to see if we need to issue one final WAL record for this index,
	 * which may be needed for correctness on a hot standby node when non-MVCC
	 * index scans could take place.
	 *
	 * If the WAL is replayed in hot standby, the replay process needs to get
	 * cleanup locks on all index leaf pages, just as we've been doing here.
	 * However, we won't issue any WAL records about pages that have no items
	 * to be deleted.  For pages between pages we've vacuumed, the replay code
	 * will take locks under the direction of the lastBlockVacuumed fields in
	 * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
	 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
	 * against the last leaf page in the index, if that one wasn't vacuumed.
	 */
	if (XLogStandbyInfoActive() &&
		vstate.lastBlockVacuumed < vstate.lastBlockLocked)
	{
		Buffer		buf;

		/*
		 * The page should be valid, but we can't use _bt_getbuf() because we
		 * want to use a nondefault buffer access strategy.  Since we aren't
		 * going to delete any items, getting cleanup lock again is probably
		 * overkill, but for consistency do that anyway.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
								 RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_bt_checkpage(rel, buf);
		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
		_bt_relbuf(rel, buf);
	}

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If we found any recyclable pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no recyclable pages exist, we don't bother vacuuming the
	 * FSM at all.
	 */
	if (vstate.totFreePages > 0)
		IndexFreeSpaceMapVacuum(rel);

	/* update statistics */
	stats->num_pages = num_pages;
	stats->pages_free = vstate.totFreePages;

	if (oldestBtpoXact)
		*oldestBtpoXact = vstate.oldestBtpoXact;
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;

		/* Update the oldest btpo.xact */
		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
			vstate->oldestBtpoXact = opaque->btpo.xact;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts.  That
				 * is only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan.  When vacuum starts it derives a
				 * value of OldestXmin.  Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends.  The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not.  If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts.  If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records.  This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all the information the replay code needs to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page.  This ensures that WAL replay locks all leaf pages
			 * at some point, which is important should non-MVCC scans be
			 * requested.  This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued an
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
			 * care of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples.  We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf);

		/* count only this page, else may double-count parent */
		if (ndel)
		{
			stats->pages_deleted++;
			if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
				TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
				vstate->oldestBtpoXact = opaque->btpo.xact;
		}

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}

/*
 *	btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}