]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeBitmapHeapscan.c
449aacb6e740a75fa9c85356eeaef0aa080f0115
[postgresql] / src / backend / executor / nodeBitmapHeapscan.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeBitmapHeapscan.c
4  *        Routines to support bitmapped scans of relations
5  *
6  * NOTE: it is critical that this plan type only be used with MVCC-compliant
7  * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
8  * special snapshots).  The reason is that since index and heap scans are
9  * decoupled, there can be no assurance that the index tuple prompting a
10  * visit to a particular heap TID still exists when the visit is made.
11  * Therefore the tuple might not exist anymore either (which is OK because
12  * heap_fetch will cope) --- but worse, the tuple slot could have been
13  * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
14  * certain to fail the time qual and so it will not be mistakenly returned,
15  * but with anything else we might return a tuple that doesn't meet the
16  * required index qual conditions.
17  *
18  *
19  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
20  * Portions Copyright (c) 1994, Regents of the University of California
21  *
22  *
23  * IDENTIFICATION
24  *        src/backend/executor/nodeBitmapHeapscan.c
25  *
26  *-------------------------------------------------------------------------
27  */
28 /*
29  * INTERFACE ROUTINES
30  *              ExecBitmapHeapScan                      scans a relation using bitmap info
31  *              ExecBitmapHeapNext                      workhorse for above
32  *              ExecInitBitmapHeapScan          creates and initializes state info.
33  *              ExecReScanBitmapHeapScan        prepares to rescan the plan.
34  *              ExecEndBitmapHeapScan           releases all storage.
35  */
36 #include "postgres.h"
37
38 #include "access/relscan.h"
39 #include "access/transam.h"
40 #include "executor/execdebug.h"
41 #include "executor/nodeBitmapHeapscan.h"
42 #include "pgstat.h"
43 #include "storage/bufmgr.h"
44 #include "storage/predicate.h"
45 #include "utils/memutils.h"
46 #include "utils/rel.h"
47 #include "utils/spccache.h"
48 #include "utils/snapmgr.h"
49 #include "utils/tqual.h"
50
51
52 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
53 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
54
55
56 /* ----------------------------------------------------------------
57  *              BitmapHeapNext
58  *
59  *              Retrieve next tuple from the BitmapHeapScan node's currentRelation
60  * ----------------------------------------------------------------
61  */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	ExprContext *econtext;
	HeapScanDesc scan;
	TIDBitmap  *tbm;
	TBMIterator *tbmiterator;
	TBMIterateResult *tbmres;

#ifdef USE_PREFETCH
	TBMIterator *prefetch_iterator;
#endif
	OffsetNumber targoffset;
	TupleTableSlot *slot;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scan = node->ss.ss_currentScanDesc;
	tbm = node->tbm;
	tbmiterator = node->tbmiterator;
	tbmres = node->tbmres;
#ifdef USE_PREFETCH
	prefetch_iterator = node->prefetch_iterator;
#endif

	/*
	 * If we haven't yet performed the underlying index scan, do it, and begin
	 * the iteration over the bitmap.
	 *
	 * For prefetching, we use *two* iterators, one for the pages we are
	 * actually scanning and another that runs ahead of the first for
	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
	 * desired prefetch distance, which starts small and increases up to the
	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
	 * a scan that stops after a few tuples because of a LIMIT.
	 */
	if (tbm == NULL)
	{
		/* Run the subplan (BitmapIndexScan/BitmapAnd/BitmapOr) to completion */
		tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

		if (!tbm || !IsA(tbm, TIDBitmap))
			elog(ERROR, "unrecognized result from subplan");

		node->tbm = tbm;
		node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
		node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
		if (node->prefetch_maximum > 0)
		{
			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
			node->prefetch_pages = 0;
			/*
			 * Start at -1 so that the first "increase" below brings the
			 * target to 0, delaying actual prefetching until we know the
			 * scan won't stop after a single tuple.
			 */
			node->prefetch_target = -1;
		}
#endif   /* USE_PREFETCH */
	}

	for (;;)
	{
		Page		dp;
		ItemId		lp;

		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			node->tbmres = tbmres = tbm_iterate(tbmiterator);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

#ifdef USE_PREFETCH
			if (node->prefetch_pages > 0)
			{
				/* The main iterator has closed the distance by one page */
				node->prefetch_pages--;
			}
			else if (prefetch_iterator)
			{
				/* Do not let the prefetch iterator get behind the main one */
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				/* Both iterators walk the same bitmap, so pages must match */
				if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
					elog(ERROR, "prefetch and main iterators are out of sync");
			}
#endif   /* USE_PREFETCH */

			/*
			 * Ignore any claimed entries past what we think is the end of the
			 * relation.  (This is probably not necessary given that we got at
			 * least AccessShareLock on the table before performing any of the
			 * indexscans, but let's be safe.)
			 */
			if (tbmres->blockno >= scan->rs_nblocks)
			{
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * Fetch the current heap page and identify candidate tuples.
			 */
			bitgetpage(scan, tbmres);

			/* ntuples < 0 means the bitmap page was lossy (offsets unknown) */
			if (tbmres->ntuples >= 0)
				node->exact_pages++;
			else
				node->lossy_pages++;

			/*
			 * Set rs_cindex to first slot to examine
			 */
			scan->rs_cindex = 0;

#ifdef USE_PREFETCH

			/*
			 * Increase prefetch target if it's not yet at the max.  Note that
			 * we will increase it to zero after fetching the very first
			 * page/tuple, then to one after the second tuple is fetched, then
			 * it doubles as later pages are fetched.
			 */
			if (node->prefetch_target >= node->prefetch_maximum)
				 /* don't increase any further */ ;
			else if (node->prefetch_target >= node->prefetch_maximum / 2)
				node->prefetch_target = node->prefetch_maximum;
			else if (node->prefetch_target > 0)
				node->prefetch_target *= 2;
			else
				node->prefetch_target++;
#endif   /* USE_PREFETCH */
		}
		else
		{
			/*
			 * Continuing in previously obtained page; advance rs_cindex
			 */
			scan->rs_cindex++;

#ifdef USE_PREFETCH

			/*
			 * Try to prefetch at least a few pages even before we get to the
			 * second page if we don't stop reading after the first tuple.
			 */
			if (node->prefetch_target < node->prefetch_maximum)
				node->prefetch_target++;
#endif   /* USE_PREFETCH */
		}

		/*
		 * Out of range?  If so, nothing more to look at on this page
		 */
		if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
		{
			node->tbmres = tbmres = NULL;
			continue;
		}

#ifdef USE_PREFETCH

		/*
		 * We issue prefetch requests *after* fetching the current page to try
		 * to avoid having prefetching interfere with the main I/O. Also, this
		 * should happen only when we have determined there is still something
		 * to do on the current page, else we may uselessly prefetch the same
		 * page we are just about to request for real.
		 */
		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;
				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
#endif   /* USE_PREFETCH */

		/*
		 * Okay to fetch the tuple
		 */
		targoffset = scan->rs_vistuples[scan->rs_cindex];
		dp = (Page) BufferGetPage(scan->rs_cbuf);
		lp = PageGetItemId(dp, targoffset);
		/* bitgetpage only records offsets of normal (in-use) line pointers */
		Assert(ItemIdIsNormal(lp));

		scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
		scan->rs_ctup.t_len = ItemIdGetLength(lp);
		scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
		ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);

		pgstat_count_heap_fetch(scan->rs_rd);

		/*
		 * Set up the result slot to point to this tuple. Note that the slot
		 * acquires a pin on the buffer.  The final "false" means the slot
		 * must not pfree the tuple, which points into the shared buffer.
		 */
		ExecStoreTuple(&scan->rs_ctup,
					   slot,
					   scan->rs_cbuf,
					   false);

		/*
		 * If we are using lossy info, we have to recheck the qual conditions
		 * at every tuple.
		 */
		if (tbmres->recheck)
		{
			econtext->ecxt_scantuple = slot;
			ResetExprContext(econtext);

			if (!ExecQual(node->bitmapqualorig, econtext, false))
			{
				/* Fails recheck, so drop it and loop back for another */
				InstrCountFiltered2(node, 1);
				ExecClearTuple(slot);
				continue;
			}
		}

		/* OK to return this tuple */
		return slot;
	}

	/*
	 * if we get here it means we are at the end of the scan..
	 */
	return ExecClearTuple(slot);
}
307
308 /*
309  * bitgetpage - subroutine for BitmapHeapNext()
310  *
311  * This routine reads and pins the specified page of the relation, then
312  * builds an array indicating which tuples on the page are both potentially
313  * interesting according to the bitmap, and visible according to the snapshot.
314  */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
	BlockNumber page = tbmres->blockno;
	Buffer		buffer;
	Snapshot	snapshot;
	int			ntup;

	/*
	 * Acquire pin on the target heap page, trading in any pin we held before.
	 */
	Assert(page < scan->rs_nblocks);

	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
										 scan->rs_rd,
										 page);
	buffer = scan->rs_cbuf;
	snapshot = scan->rs_snapshot;

	/* count of visible tuples found on this page */
	ntup = 0;

	/*
	 * Prune and repair fragmentation for the whole page, if possible.
	 */
	heap_page_prune_opt(scan->rs_rd, buffer);

	/*
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.  Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/*
	 * We need two separate strategies for lossy and non-lossy cases.
	 */
	if (tbmres->ntuples >= 0)
	{
		/*
		 * Bitmap is non-lossy, so we just look through the offsets listed in
		 * tbmres; but we have to follow any HOT chain starting at each such
		 * offset.
		 */
		int			curslot;

		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
		{
			OffsetNumber offnum = tbmres->offsets[curslot];
			ItemPointerData tid;
			HeapTupleData heapTuple;

			ItemPointerSet(&tid, page, offnum);
			/*
			 * On success, heap_hot_search_buffer has updated tid to the
			 * offset of the visible member of the HOT chain; record that
			 * offset, not the chain root's.
			 */
			if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
									   &heapTuple, NULL, true))
				scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
		}
	}
	else
	{
		/*
		 * Bitmap is lossy, so we must examine each item pointer on the page.
		 * But we can ignore HOT chains, since we'll check each tuple anyway.
		 */
		Page		dp = (Page) BufferGetPage(buffer);
		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
		OffsetNumber offnum;

		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
		{
			ItemId		lp;
			HeapTupleData loctup;
			bool		valid;

			lp = PageGetItemId(dp, offnum);
			/* skip unused, redirect, and dead line pointers */
			if (!ItemIdIsNormal(lp))
				continue;
			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
			loctup.t_len = ItemIdGetLength(lp);
			loctup.t_tableOid = scan->rs_rd->rd_id;
			ItemPointerSet(&loctup.t_self, page, offnum);
			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
			if (valid)
			{
				scan->rs_vistuples[ntup++] = offnum;
				PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
			}
			/* report even invisible tuples for serializable-conflict checks */
			CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
											buffer, snapshot);
		}
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	Assert(ntup <= MaxHeapTuplesPerPage);
	scan->rs_ntuples = ntup;
}
411
412 /*
413  * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
414  */
415 static bool
416 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
417 {
418         ExprContext *econtext;
419
420         /*
421          * extract necessary information from index scan node
422          */
423         econtext = node->ss.ps.ps_ExprContext;
424
425         /* Does the tuple meet the original qual conditions? */
426         econtext->ecxt_scantuple = slot;
427
428         ResetExprContext(econtext);
429
430         return ExecQual(node->bitmapqualorig, econtext, false);
431 }
432
433 /* ----------------------------------------------------------------
434  *              ExecBitmapHeapScan(node)
435  * ----------------------------------------------------------------
436  */
437 TupleTableSlot *
438 ExecBitmapHeapScan(BitmapHeapScanState *node)
439 {
440         return ExecScan(&node->ss,
441                                         (ExecScanAccessMtd) BitmapHeapNext,
442                                         (ExecScanRecheckMtd) BitmapHeapRecheck);
443 }
444
445 /* ----------------------------------------------------------------
446  *              ExecReScanBitmapHeapScan(node)
447  * ----------------------------------------------------------------
448  */
449 void
450 ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
451 {
452         PlanState  *outerPlan = outerPlanState(node);
453
454         /* rescan to release any page pin */
455         heap_rescan(node->ss.ss_currentScanDesc, NULL);
456
457         if (node->tbmiterator)
458                 tbm_end_iterate(node->tbmiterator);
459         if (node->prefetch_iterator)
460                 tbm_end_iterate(node->prefetch_iterator);
461         if (node->tbm)
462                 tbm_free(node->tbm);
463         node->tbm = NULL;
464         node->tbmiterator = NULL;
465         node->tbmres = NULL;
466         node->prefetch_iterator = NULL;
467
468         ExecScanReScan(&node->ss);
469
470         /*
471          * if chgParam of subnode is not null then plan will be re-scanned by
472          * first ExecProcNode.
473          */
474         if (outerPlan->chgParam == NULL)
475                 ExecReScan(outerPlan);
476 }
477
478 /* ----------------------------------------------------------------
479  *              ExecEndBitmapHeapScan
480  * ----------------------------------------------------------------
481  */
482 void
483 ExecEndBitmapHeapScan(BitmapHeapScanState *node)
484 {
485         Relation        relation;
486         HeapScanDesc scanDesc;
487
488         /*
489          * extract information from the node
490          */
491         relation = node->ss.ss_currentRelation;
492         scanDesc = node->ss.ss_currentScanDesc;
493
494         /*
495          * Free the exprcontext
496          */
497         ExecFreeExprContext(&node->ss.ps);
498
499         /*
500          * clear out tuple table slots
501          */
502         ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
503         ExecClearTuple(node->ss.ss_ScanTupleSlot);
504
505         /*
506          * close down subplans
507          */
508         ExecEndNode(outerPlanState(node));
509
510         /*
511          * release bitmap if any
512          */
513         if (node->tbmiterator)
514                 tbm_end_iterate(node->tbmiterator);
515         if (node->prefetch_iterator)
516                 tbm_end_iterate(node->prefetch_iterator);
517         if (node->tbm)
518                 tbm_free(node->tbm);
519
520         /*
521          * close heap scan
522          */
523         heap_endscan(scanDesc);
524
525         /*
526          * close the heap relation.
527          */
528         ExecCloseScanRelation(relation);
529 }
530
531 /* ----------------------------------------------------------------
532  *              ExecInitBitmapHeapScan
533  *
534  *              Initializes the scan's state information.
535  * ----------------------------------------------------------------
536  */
537 BitmapHeapScanState *
538 ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
539 {
540         BitmapHeapScanState *scanstate;
541         Relation        currentRelation;
542         int                     io_concurrency;
543
544         /* check for unsupported flags */
545         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
546
547         /*
548          * Assert caller didn't ask for an unsafe snapshot --- see comments at
549          * head of file.
550          */
551         Assert(IsMVCCSnapshot(estate->es_snapshot));
552
553         /*
554          * create state structure
555          */
556         scanstate = makeNode(BitmapHeapScanState);
557         scanstate->ss.ps.plan = (Plan *) node;
558         scanstate->ss.ps.state = estate;
559
560         scanstate->tbm = NULL;
561         scanstate->tbmiterator = NULL;
562         scanstate->tbmres = NULL;
563         scanstate->exact_pages = 0;
564         scanstate->lossy_pages = 0;
565         scanstate->prefetch_iterator = NULL;
566         scanstate->prefetch_pages = 0;
567         scanstate->prefetch_target = 0;
568         /* may be updated below */
569         scanstate->prefetch_maximum = target_prefetch_pages;
570
571         /*
572          * Miscellaneous initialization
573          *
574          * create expression context for node
575          */
576         ExecAssignExprContext(estate, &scanstate->ss.ps);
577
578         scanstate->ss.ps.ps_TupFromTlist = false;
579
580         /*
581          * initialize child expressions
582          */
583         scanstate->ss.ps.targetlist = (List *)
584                 ExecInitExpr((Expr *) node->scan.plan.targetlist,
585                                          (PlanState *) scanstate);
586         scanstate->ss.ps.qual = (List *)
587                 ExecInitExpr((Expr *) node->scan.plan.qual,
588                                          (PlanState *) scanstate);
589         scanstate->bitmapqualorig = (List *)
590                 ExecInitExpr((Expr *) node->bitmapqualorig,
591                                          (PlanState *) scanstate);
592
593         /*
594          * tuple table initialization
595          */
596         ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
597         ExecInitScanTupleSlot(estate, &scanstate->ss);
598
599         /*
600          * open the base relation and acquire appropriate lock on it.
601          */
602         currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
603
604         /*
605          * Determine the maximum for prefetch_target.  If the tablespace has a
606          * specific IO concurrency set, use that to compute the corresponding
607          * maximum value; otherwise, we already initialized to the value computed
608          * by the GUC machinery.
609          */
610         io_concurrency =
611                 get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
612         if (io_concurrency != effective_io_concurrency)
613         {
614                 double          maximum;
615
616                 if (ComputeIoConcurrency(io_concurrency, &maximum))
617                         scanstate->prefetch_maximum = rint(maximum);
618         }
619
620         scanstate->ss.ss_currentRelation = currentRelation;
621
622         /*
623          * Even though we aren't going to do a conventional seqscan, it is useful
624          * to create a HeapScanDesc --- most of the fields in it are usable.
625          */
626         scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
627                                                                                                                  estate->es_snapshot,
628                                                                                                                  0,
629                                                                                                                  NULL);
630
631         /*
632          * get the scan type from the relation descriptor.
633          */
634         ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));
635
636         /*
637          * Initialize result tuple type and projection info.
638          */
639         ExecAssignResultTypeFromTL(&scanstate->ss.ps);
640         ExecAssignScanProjectionInfo(&scanstate->ss);
641
642         /*
643          * initialize child nodes
644          *
645          * We do this last because the child nodes will open indexscans on our
646          * relation's indexes, and we want to be sure we have acquired a lock on
647          * the relation first.
648          */
649         outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
650
651         /*
652          * all done.
653          */
654         return scanstate;
655 }