1 /*-------------------------------------------------------------------------
4 * Routines to support bitmapped scans of relations
6 * NOTE: it is critical that this plan type only be used with MVCC-compliant
7 * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
8 * special snapshots). The reason is that since index and heap scans are
9 * decoupled, there can be no assurance that the index tuple prompting a
10 * visit to a particular heap TID still exists when the visit is made.
11 * Therefore the tuple might not exist anymore either (which is OK because
12 * heap_fetch will cope) --- but worse, the tuple slot could have been
13 * re-used for a newer tuple. With an MVCC snapshot the newer tuple is
14 * certain to fail the time qual and so it will not be mistakenly returned,
15 * but with anything else we might return a tuple that doesn't meet the
16 * required index qual conditions.
19 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
20 * Portions Copyright (c) 1994, Regents of the University of California
24 * src/backend/executor/nodeBitmapHeapscan.c
26 *-------------------------------------------------------------------------
30 * ExecBitmapHeapScan scans a relation using bitmap info
31 * ExecBitmapHeapNext workhorse for above
32 * ExecInitBitmapHeapScan creates and initializes state info.
33 * ExecReScanBitmapHeapScan prepares to rescan the plan.
34 * ExecEndBitmapHeapScan releases all storage.
38 #include "access/relscan.h"
39 #include "access/transam.h"
40 #include "executor/execdebug.h"
41 #include "executor/nodeBitmapHeapscan.h"
43 #include "storage/bufmgr.h"
44 #include "storage/predicate.h"
45 #include "utils/memutils.h"
46 #include "utils/rel.h"
47 #include "utils/spccache.h"
48 #include "utils/snapmgr.h"
49 #include "utils/tqual.h"
52 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
53 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
56 /* ----------------------------------------------------------------
59 * Retrieve next tuple from the BitmapHeapScan node's currentRelation
60 * ----------------------------------------------------------------
/*
 * BitmapHeapNext
 *
 * Workhorse for ExecBitmapHeapScan: returns the next visible heap tuple
 * identified by the bitmap built by the outer (BitmapIndexScan/BitmapAnd/
 * BitmapOr) subplan, or an empty slot when the scan is exhausted.
 *
 * NOTE(review): this excerpt is elided — the opening brace, several local
 * declarations (scan, tbm, slot, dp, lp), loop/branch headers, and some
 * statements are missing between the visible lines.  Comments below describe
 * only the code that is visible here.
 */
62 static TupleTableSlot *
63 BitmapHeapNext(BitmapHeapScanState *node)
65 ExprContext *econtext;
68 TBMIterator *tbmiterator;
69 TBMIterateResult *tbmres;
72 TBMIterator *prefetch_iterator;
74 OffsetNumber targoffset;
78 * extract necessary information from index scan node
80 econtext = node->ss.ps.ps_ExprContext;
81 slot = node->ss.ss_ScanTupleSlot;
82 scan = node->ss.ss_currentScanDesc;
84 tbmiterator = node->tbmiterator;
85 tbmres = node->tbmres;
87 prefetch_iterator = node->prefetch_iterator;
91 * If we haven't yet performed the underlying index scan, do it, and begin
92 * the iteration over the bitmap.
94 * For prefetching, we use *two* iterators, one for the pages we are
95 * actually scanning and another that runs ahead of the first for
96 * prefetching. node->prefetch_pages tracks exactly how many pages ahead
97 * the prefetch iterator is. Also, node->prefetch_target tracks the
98 * desired prefetch distance, which starts small and increases up to the
99 * node->prefetch_maximum. This is to avoid doing a lot of prefetching in
100 * a scan that stops after a few tuples because of a LIMIT.
/* Run the subplan to completion; it must hand back a TIDBitmap node. */
104 tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
106 if (!tbm || !IsA(tbm, TIDBitmap))
107 elog(ERROR, "unrecognized result from subplan");
110 node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
111 node->tbmres = tbmres = NULL;
/* prefetch_maximum > 0 means prefetching is enabled for this scan */
114 if (node->prefetch_maximum > 0)
116 node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
117 node->prefetch_pages = 0;
/* -1 so the first increment lands on 0; see comment further below */
118 node->prefetch_target = -1;
120 #endif   /* USE_PREFETCH */
129 * Get next page of results if needed
133 node->tbmres = tbmres = node->tbmres = tbmres = tbm_iterate(tbmiterator);
136 /* no more entries in the bitmap */
/* Keep the prefetch iterator exactly prefetch_pages ahead of the main one */
141 if (node->prefetch_pages > 0)
143 /* The main iterator has closed the distance by one page */
144 node->prefetch_pages--;
146 else if (prefetch_iterator)
148 /* Do not let the prefetch iterator get behind the main one */
149 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
151 if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
152 elog(ERROR, "prefetch and main iterators are out of sync");
154 #endif   /* USE_PREFETCH */
157 * Ignore any claimed entries past what we think is the end of the
158 * relation. (This is probably not necessary given that we got at
159 * least AccessShareLock on the table before performing any of the
160 * indexscans, but let's be safe.)
162 if (tbmres->blockno >= scan->rs_nblocks)
164 node->tbmres = tbmres = NULL;
169 * Fetch the current heap page and identify candidate tuples.
171 bitgetpage(scan, tbmres);
/* ntuples >= 0 means an exact (non-lossy) bitmap page */
173 if (tbmres->ntuples >= 0)
179 * Set rs_cindex to first slot to examine
186 * Increase prefetch target if it's not yet at the max. Note that
187 * we will increase it to zero after fetching the very first
188 * page/tuple, then to one after the second tuple is fetched, then
189 * it doubles as later pages are fetched.
191 if (node->prefetch_target >= node->prefetch_maximum)
192 /* don't increase any further */ ;
193 else if (node->prefetch_target >= node->prefetch_maximum / 2)
194 node->prefetch_target = node->prefetch_maximum;
195 else if (node->prefetch_target > 0)
196 node->prefetch_target *= 2;
198 node->prefetch_target++;
199 #endif   /* USE_PREFETCH */
204 * Continuing in previously obtained page; advance rs_cindex
211 * Try to prefetch at least a few pages even before we get to the
212 * second page if we don't stop reading after the first tuple.
214 if (node->prefetch_target < node->prefetch_maximum)
215 node->prefetch_target++;
216 #endif   /* USE_PREFETCH */
220 * Out of range? If so, nothing more to look at on this page
222 if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
224 node->tbmres = tbmres = NULL;
231 * We issue prefetch requests *after* fetching the current page to try
232 * to avoid having prefetching interfere with the main I/O. Also, this
233 * should happen only when we have determined there is still something
234 * to do on the current page, else we may uselessly prefetch the same
235 * page we are just about to request for real.
237 if (prefetch_iterator)
239 while (node->prefetch_pages < node->prefetch_target)
241 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
245 /* No more pages to prefetch */
246 tbm_end_iterate(prefetch_iterator);
247 node->prefetch_iterator = prefetch_iterator = NULL;
250 node->prefetch_pages++;
251 PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
254 #endif   /* USE_PREFETCH */
257 * Okay to fetch the tuple
/* rs_vistuples[] was filled by bitgetpage() with visible offsets */
259 targoffset = scan->rs_vistuples[scan->rs_cindex];
260 dp = BufferGetPage(scan->rs_cbuf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
261 lp = PageGetItemId(dp, targoffset);
262 Assert(ItemIdIsNormal(lp));
264 scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
265 scan->rs_ctup.t_len = ItemIdGetLength(lp);
266 scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
267 ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
269 pgstat_count_heap_fetch(scan->rs_rd);
272 * Set up the result slot to point to this tuple. Note that the slot
273 * acquires a pin on the buffer.
275 ExecStoreTuple(&scan->rs_ctup,
281 * If we are using lossy info, we have to recheck the qual conditions
286 econtext->ecxt_scantuple = slot;
287 ResetExprContext(econtext);
289 if (!ExecQual(node->bitmapqualorig, econtext, false))
291 /* Fails recheck, so drop it and loop back for another */
292 InstrCountFiltered2(node, 1);
293 ExecClearTuple(slot);
298 /* OK to return this tuple */
303 * if we get here it means we are at the end of the scan..
305 return ExecClearTuple(slot);
309 * bitgetpage - subroutine for BitmapHeapNext()
311 * This routine reads and pins the specified page of the relation, then
312 * builds an array indicating which tuples on the page are both potentially
313 * interesting according to the bitmap, and visible according to the snapshot.
/*
 * bitgetpage
 *
 * Pin the heap page named by tbmres->blockno and fill scan->rs_vistuples[]
 * (and scan->rs_ntuples) with the offsets of tuples that are both listed in
 * the bitmap and visible under the scan's snapshot.
 *
 * NOTE(review): excerpt is elided — the "static void" return-type line,
 * braces, and several local declarations (buffer, snapshot, ntup, tid,
 * curslot, offnum, lp, valid) are missing from this view.
 */
316 bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
318 BlockNumber page = tbmres->blockno;
324 * Acquire pin on the target heap page, trading in any pin we held before.
326 Assert(page < scan->rs_nblocks);
328 scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
331 buffer = scan->rs_cbuf;
332 snapshot = scan->rs_snapshot;
337 * Prune and repair fragmentation for the whole page, if possible.
339 heap_page_prune_opt(scan->rs_rd, buffer);
342 * We must hold share lock on the buffer content while examining tuple
343 * visibility. Afterwards, however, the tuples we have found to be
344 * visible are guaranteed good as long as we hold the buffer pin.
346 LockBuffer(buffer, BUFFER_LOCK_SHARE);
349 * We need two separate strategies for lossy and non-lossy cases.
/* ntuples >= 0: exact page, bitmap lists individual offsets */
351 if (tbmres->ntuples >= 0)
354 * Bitmap is non-lossy, so we just look through the offsets listed in
355 * tbmres; but we have to follow any HOT chain starting at each such
360 for (curslot = 0; curslot < tbmres->ntuples; curslot++)
362 OffsetNumber offnum = tbmres->offsets[curslot];
364 HeapTupleData heapTuple;
366 ItemPointerSet(&tid, page, offnum);
/* follows the HOT chain; on success, tid holds the visible member */
367 if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
368 &heapTuple, NULL, true))
369 scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
375 * Bitmap is lossy, so we must examine each item pointer on the page.
376 * But we can ignore HOT chains, since we'll check each tuple anyway.
378 Page dp = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
379 OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
382 for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
385 HeapTupleData loctup;
/* skip unused, dead, and redirect line pointers */
388 lp = PageGetItemId(dp, offnum);
389 if (!ItemIdIsNormal(lp))
391 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
392 loctup.t_len = ItemIdGetLength(lp);
393 loctup.t_tableOid = scan->rs_rd->rd_id;
394 ItemPointerSet(&loctup.t_self, page, offnum);
395 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
398 scan->rs_vistuples[ntup++] = offnum;
/* SSI bookkeeping: lock the tuple and detect rw-conflicts */
399 PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
401 CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
406 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
408 Assert(ntup <= MaxHeapTuplesPerPage);
409 scan->rs_ntuples = ntup;
413 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
/*
 * BitmapHeapRecheck
 *
 * EvalPlanQual recheck: re-evaluate the original bitmap quals
 * (node->bitmapqualorig) against the tuple in 'slot'; returns true if the
 * tuple still satisfies them.
 *
 * NOTE(review): the "static bool" return-type line and braces are elided
 * from this excerpt.
 */
416 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
418 ExprContext *econtext;
421 * extract necessary information from index scan node
423 econtext = node->ss.ps.ps_ExprContext;
425 /* Does the tuple meet the original qual conditions? */
426 econtext->ecxt_scantuple = slot;
428 ResetExprContext(econtext);
430 return ExecQual(node->bitmapqualorig, econtext, false);
433 /* ----------------------------------------------------------------
434 * ExecBitmapHeapScan(node)
435 * ----------------------------------------------------------------
/*
 * ExecBitmapHeapScan
 *
 * Entry point for this node type: delegates to the generic ExecScan
 * machinery, supplying BitmapHeapNext as the access method and
 * BitmapHeapRecheck for EvalPlanQual rechecks.
 *
 * NOTE(review): the return-type line and braces are elided from this excerpt.
 */
438 ExecBitmapHeapScan(BitmapHeapScanState *node)
440 return ExecScan(&node->ss,
441 (ExecScanAccessMtd) BitmapHeapNext,
442 (ExecScanRecheckMtd) BitmapHeapRecheck);
445 /* ----------------------------------------------------------------
446 * ExecReScanBitmapHeapScan(node)
447 * ----------------------------------------------------------------
/*
 * ExecReScanBitmapHeapScan
 *
 * Reset the node for a rescan: release the heap page pin, shut down both
 * bitmap iterators, clear cached iteration state, and (unless the subplan's
 * chgParam will force it anyway) rescan the outer subplan so a fresh bitmap
 * is built on the next fetch.
 *
 * NOTE(review): excerpt is elided — lines freeing/clearing node->tbm and
 * node->tbmres appear to be missing between the visible lines.
 */
450 ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
452 PlanState *outerPlan = outerPlanState(node);
454 /* rescan to release any page pin */
455 heap_rescan(node->ss.ss_currentScanDesc, NULL);
457 if (node->tbmiterator)
458 tbm_end_iterate(node->tbmiterator);
459 if (node->prefetch_iterator)
460 tbm_end_iterate(node->prefetch_iterator);
464 node->tbmiterator = NULL;
466 node->prefetch_iterator = NULL;
468 ExecScanReScan(&node->ss);
471 * if chgParam of subnode is not null then plan will be re-scanned by
472 * first ExecProcNode.
474 if (outerPlan->chgParam == NULL)
475 ExecReScan(outerPlan);
478 /* ----------------------------------------------------------------
479 * ExecEndBitmapHeapScan
480 * ----------------------------------------------------------------
/*
 * ExecEndBitmapHeapScan
 *
 * Shut the node down: free the expression context, clear tuple slots, end
 * the outer subplan, end any bitmap iterators, close the heap scan, and
 * close the scan relation.
 *
 * NOTE(review): excerpt is elided — the "void" return-type line, the
 * Relation declaration, and any tbm_free() of node->tbm are not visible.
 */
483 ExecEndBitmapHeapScan(BitmapHeapScanState *node)
486 HeapScanDesc scanDesc;
489 * extract information from the node
491 relation = node->ss.ss_currentRelation;
492 scanDesc = node->ss.ss_currentScanDesc;
495 * Free the exprcontext
497 ExecFreeExprContext(&node->ss.ps);
500 * clear out tuple table slots
502 ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
503 ExecClearTuple(node->ss.ss_ScanTupleSlot);
506 * close down subplans
508 ExecEndNode(outerPlanState(node));
511 * release bitmap if any
513 if (node->tbmiterator)
514 tbm_end_iterate(node->tbmiterator);
515 if (node->prefetch_iterator)
516 tbm_end_iterate(node->prefetch_iterator);
523 heap_endscan(scanDesc);
526 * close the heap relation.
528 ExecCloseScanRelation(relation);
531 /* ----------------------------------------------------------------
532 * ExecInitBitmapHeapScan
534 * Initializes the scan's state information.
535 * ----------------------------------------------------------------
/*
 * ExecInitBitmapHeapScan
 *
 * Build and return the BitmapHeapScanState for plan node 'node': allocate
 * the state struct, initialize expressions and tuple slots, open the scan
 * relation, size the prefetch window from the tablespace's IO concurrency
 * (falling back to the GUC-derived target_prefetch_pages), start the
 * underlying heap scan, and finally initialize the outer subplan.
 *
 * NOTE(review): excerpt is elided — locals such as io_concurrency/maximum,
 * the trailing arguments of heap_beginscan_bm(), and the final
 * "return scanstate;" are not visible here.
 */
537 BitmapHeapScanState *
538 ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
540 BitmapHeapScanState *scanstate;
541 Relation currentRelation;
544 /* check for unsupported flags */
545 Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
548 * Assert caller didn't ask for an unsafe snapshot --- see comments at
551 Assert(IsMVCCSnapshot(estate->es_snapshot));
554 * create state structure
556 scanstate = makeNode(BitmapHeapScanState);
557 scanstate->ss.ps.plan = (Plan *) node;
558 scanstate->ss.ps.state = estate;
560 scanstate->tbm = NULL;
561 scanstate->tbmiterator = NULL;
562 scanstate->tbmres = NULL;
563 scanstate->exact_pages = 0;
564 scanstate->lossy_pages = 0;
565 scanstate->prefetch_iterator = NULL;
566 scanstate->prefetch_pages = 0;
567 scanstate->prefetch_target = 0;
568 /* may be updated below */
569 scanstate->prefetch_maximum = target_prefetch_pages;
572 * Miscellaneous initialization
574 * create expression context for node
576 ExecAssignExprContext(estate, &scanstate->ss.ps);
578 scanstate->ss.ps.ps_TupFromTlist = false;
581 * initialize child expressions
583 scanstate->ss.ps.targetlist = (List *)
584 ExecInitExpr((Expr *) node->scan.plan.targetlist,
585 (PlanState *) scanstate);
586 scanstate->ss.ps.qual = (List *)
587 ExecInitExpr((Expr *) node->scan.plan.qual,
588 (PlanState *) scanstate);
/* bitmapqualorig is kept for the lossy-page recheck in BitmapHeapNext */
589 scanstate->bitmapqualorig = (List *)
590 ExecInitExpr((Expr *) node->bitmapqualorig,
591 (PlanState *) scanstate);
594 * tuple table initialization
596 ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
597 ExecInitScanTupleSlot(estate, &scanstate->ss);
600 * open the base relation and acquire appropriate lock on it.
602 currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
605 * Determine the maximum for prefetch_target. If the tablespace has a
606 * specific IO concurrency set, use that to compute the corresponding
607 * maximum value; otherwise, we already initialized to the value computed
608 * by the GUC machinery.
611 get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
612 if (io_concurrency != effective_io_concurrency)
616 if (ComputeIoConcurrency(io_concurrency, &maximum))
617 scanstate->prefetch_maximum = rint(maximum);
620 scanstate->ss.ss_currentRelation = currentRelation;
623 * Even though we aren't going to do a conventional seqscan, it is useful
624 * to create a HeapScanDesc --- most of the fields in it are usable.
626 scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
632 * get the scan type from the relation descriptor.
634 ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));
637 * Initialize result tuple type and projection info.
639 ExecAssignResultTypeFromTL(&scanstate->ss.ps);
640 ExecAssignScanProjectionInfo(&scanstate->ss);
643 * initialize child nodes
645 * We do this last because the child nodes will open indexscans on our
646 * relation's indexes, and we want to be sure we have acquired a lock on
647 * the relation first.
649 outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);