/*-------------------------------------------------------------------------
 *
 * nodeIndexonlyscan.c
 *	  Routines to support index-only scans
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeIndexonlyscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecIndexOnlyScan			scans an index
 *		IndexOnlyNext				retrieve next tuple
 *		ExecInitIndexOnlyScan		creates and initializes state info.
 *		ExecReScanIndexOnlyScan		rescans the indexed relation.
 *		ExecEndIndexOnlyScan		releases all storage.
 *		ExecIndexOnlyMarkPos		marks scan position.
 *		ExecIndexOnlyRestrPos		restores scan position.
 *		ExecIndexOnlyScanEstimate	estimates DSM space needed for
 *						parallel index-only scan
 *		ExecIndexOnlyScanInitializeDSM	initialize DSM for parallel
 *						index-only scan
 *		ExecIndexOnlyScanInitializeWorker attach to DSM info in parallel worker
 */
32 #include "access/relscan.h"
33 #include "access/visibilitymap.h"
34 #include "executor/execdebug.h"
35 #include "executor/nodeIndexonlyscan.h"
36 #include "executor/nodeIndexscan.h"
37 #include "storage/bufmgr.h"
38 #include "storage/predicate.h"
39 #include "utils/memutils.h"
40 #include "utils/rel.h"
43 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
44 static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
48 /* ----------------------------------------------------------------
51 * Retrieve a tuple from the IndexOnlyScan node's index.
52 * ----------------------------------------------------------------
54 static TupleTableSlot *
55 IndexOnlyNext(IndexOnlyScanState *node)
58 ExprContext *econtext;
59 ScanDirection direction;
60 IndexScanDesc scandesc;
65 * extract necessary information from index scan node
67 estate = node->ss.ps.state;
68 direction = estate->es_direction;
69 /* flip direction if this is an overall backward scan */
70 if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
72 if (ScanDirectionIsForward(direction))
73 direction = BackwardScanDirection;
74 else if (ScanDirectionIsBackward(direction))
75 direction = ForwardScanDirection;
77 scandesc = node->ioss_ScanDesc;
78 econtext = node->ss.ps.ps_ExprContext;
79 slot = node->ss.ss_ScanTupleSlot;
82 * OK, now that we have what we need, fetch the next tuple.
84 while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
86 HeapTuple tuple = NULL;
89 * We can skip the heap fetch if the TID references a heap page on
90 * which all tuples are known visible to everybody. In any case,
91 * we'll use the index tuple not the heap tuple as the data source.
93 * Note on Memory Ordering Effects: visibilitymap_get_status does not
94 * lock the visibility map buffer, and therefore the result we read
95 * here could be slightly stale. However, it can't be stale enough to
98 * We need to detect clearing a VM bit due to an insert right away,
99 * because the tuple is present in the index page but not visible. The
100 * reading of the TID by this scan (using a shared lock on the index
101 * buffer) is serialized with the insert of the TID into the index
102 * (using an exclusive lock on the index buffer). Because the VM bit
103 * is cleared before updating the index, and locking/unlocking of the
104 * index page acts as a full memory barrier, we are sure to see the
105 * cleared bit if we see a recently-inserted TID.
107 * Deletes do not update the index page (only VACUUM will clear out
108 * the TID), so the clearing of the VM bit by a delete is not
109 * serialized with this test below, and we may see a value that is
110 * significantly stale. However, we don't care about the delete right
111 * away, because the tuple is still visible until the deleting
112 * transaction commits or the statement ends (if it's our
113 * transaction). In either case, the lock on the VM buffer will have
114 * been released (acting as a write barrier) after clearing the bit.
115 * And for us to have a snapshot that includes the deleting
116 * transaction (making the tuple invisible), we must have acquired
117 * ProcArrayLock after that time, acting as a read barrier.
119 * It's worth going through this complexity to avoid needing to lock
120 * the VM buffer, which could cause significant contention.
122 if (!VM_ALL_VISIBLE(scandesc->heapRelation,
123 ItemPointerGetBlockNumber(tid),
124 &node->ioss_VMBuffer))
127 * Rats, we have to visit the heap to check visibility.
129 node->ioss_HeapFetches++;
130 tuple = index_fetch_heap(scandesc);
132 continue; /* no visible tuple, try next index entry */
135 * Only MVCC snapshots are supported here, so there should be no
136 * need to keep following the HOT chain once a visible entry has
137 * been found. If we did want to allow that, we'd need to keep
138 * more state to remember not to call index_getnext_tid next time.
140 if (scandesc->xs_continue_hot)
141 elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
144 * Note: at this point we are holding a pin on the heap page, as
145 * recorded in scandesc->xs_cbuf. We could release that pin now,
146 * but it's not clear whether it's a win to do so. The next index
147 * entry might require a visit to the same heap page.
152 * Fill the scan tuple slot with data from the index. This might be
153 * provided in either HeapTuple or IndexTuple format. Conceivably an
154 * index AM might fill both fields, in which case we prefer the heap
155 * format, since it's probably a bit cheaper to fill a slot from.
157 if (scandesc->xs_hitup)
160 * We don't take the trouble to verify that the provided tuple has
161 * exactly the slot's format, but it seems worth doing a quick
162 * check on the number of fields.
164 Assert(slot->tts_tupleDescriptor->natts ==
165 scandesc->xs_hitupdesc->natts);
166 ExecStoreTuple(scandesc->xs_hitup, slot, InvalidBuffer, false);
168 else if (scandesc->xs_itup)
169 StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
171 elog(ERROR, "no data returned for index-only scan");
174 * If the index was lossy, we have to recheck the index quals.
175 * (Currently, this can never happen, but we should support the case
176 * for possible future use, eg with GiST indexes.)
178 if (scandesc->xs_recheck)
180 econtext->ecxt_scantuple = slot;
181 ResetExprContext(econtext);
182 if (!ExecQual(node->indexqual, econtext, false))
184 /* Fails recheck, so drop it and loop back for another */
185 InstrCountFiltered2(node, 1);
191 * We don't currently support rechecking ORDER BY distances. (In
192 * principle, if the index can support retrieval of the originally
193 * indexed value, it should be able to produce an exact distance
194 * calculation too. So it's not clear that adding code here for
195 * recheck/re-sort would be worth the trouble. But we should at least
196 * throw an error if someone tries it.)
198 if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
200 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
201 errmsg("lossy distance functions are not supported in index-only scans")));
204 * Predicate locks for index-only scans must be acquired at the page
205 * level when the heap is not accessed, since tuple-level predicate
206 * locks need the tuple's xmin value. If we had to visit the tuple
207 * anyway, then we already have the tuple-level lock and can skip the
211 PredicateLockPage(scandesc->heapRelation,
212 ItemPointerGetBlockNumber(tid),
213 estate->es_snapshot);
219 * if we get here it means the index scan failed so we are at the end of
222 return ExecClearTuple(slot);
227 * Fill the slot with data from the index tuple.
229 * At some point this might be generally-useful functionality, but
230 * right now we don't need it elsewhere.
233 StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
235 int nindexatts = itupdesc->natts;
236 Datum *values = slot->tts_values;
237 bool *isnull = slot->tts_isnull;
241 * Note: we must use the tupdesc supplied by the AM in index_getattr, not
242 * the slot's tupdesc, in case the latter has different datatypes (this
243 * happens for btree name_ops in particular). They'd better have the same
244 * number of columns though, as well as being datatype-compatible which is
245 * something we can't so easily check.
247 Assert(slot->tts_tupleDescriptor->natts == nindexatts);
249 ExecClearTuple(slot);
250 for (i = 0; i < nindexatts; i++)
251 values[i] = index_getattr(itup, i + 1, itupdesc, &isnull[i]);
252 ExecStoreVirtualTuple(slot);
256 * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
258 * This can't really happen, since an index can't supply CTID which would
259 * be necessary data for any potential EvalPlanQual target relation. If it
260 * did happen, the EPQ code would pass us the wrong data, namely a heap
261 * tuple not an index tuple. So throw an error.
264 IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
266 elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
267 return false; /* keep compiler quiet */
270 /* ----------------------------------------------------------------
271 * ExecIndexOnlyScan(node)
272 * ----------------------------------------------------------------
275 ExecIndexOnlyScan(IndexOnlyScanState *node)
278 * If we have runtime keys and they've not already been set up, do it now.
280 if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
281 ExecReScan((PlanState *) node);
283 return ExecScan(&node->ss,
284 (ExecScanAccessMtd) IndexOnlyNext,
285 (ExecScanRecheckMtd) IndexOnlyRecheck);
288 /* ----------------------------------------------------------------
289 * ExecReScanIndexOnlyScan(node)
291 * Recalculates the values of any scan keys whose value depends on
292 * information known at runtime, then rescans the indexed relation.
294 * Updating the scan key was formerly done separately in
295 * ExecUpdateIndexScanKeys. Integrating it into ReScan makes
296 * rescans of indices and relations/general streams more uniform.
297 * ----------------------------------------------------------------
300 ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
302 bool reset_parallel_scan = true;
305 * If we are here to just update the scan keys, then don't reset parallel
306 * scan. For detailed reason behind this look in the comments for
307 * ExecReScanIndexScan.
309 if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
310 reset_parallel_scan = false;
313 * If we are doing runtime key calculations (ie, any of the index key
314 * values weren't simple Consts), compute the new key values. But first,
315 * reset the context so we don't leak memory as each outer tuple is
316 * scanned. Note this assumes that we will recalculate *all* runtime keys
319 if (node->ioss_NumRuntimeKeys != 0)
321 ExprContext *econtext = node->ioss_RuntimeContext;
323 ResetExprContext(econtext);
324 ExecIndexEvalRuntimeKeys(econtext,
325 node->ioss_RuntimeKeys,
326 node->ioss_NumRuntimeKeys);
328 node->ioss_RuntimeKeysReady = true;
330 /* reset index scan */
331 if (node->ioss_ScanDesc)
334 index_rescan(node->ioss_ScanDesc,
335 node->ioss_ScanKeys, node->ioss_NumScanKeys,
336 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
338 if (reset_parallel_scan && node->ioss_ScanDesc->parallel_scan)
339 index_parallelrescan(node->ioss_ScanDesc);
341 ExecScanReScan(&node->ss);
345 /* ----------------------------------------------------------------
346 * ExecEndIndexOnlyScan
347 * ----------------------------------------------------------------
350 ExecEndIndexOnlyScan(IndexOnlyScanState *node)
352 Relation indexRelationDesc;
353 IndexScanDesc indexScanDesc;
357 * extract information from the node
359 indexRelationDesc = node->ioss_RelationDesc;
360 indexScanDesc = node->ioss_ScanDesc;
361 relation = node->ss.ss_currentRelation;
363 /* Release VM buffer pin, if any. */
364 if (node->ioss_VMBuffer != InvalidBuffer)
366 ReleaseBuffer(node->ioss_VMBuffer);
367 node->ioss_VMBuffer = InvalidBuffer;
371 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
374 ExecFreeExprContext(&node->ss.ps);
375 if (node->ioss_RuntimeContext)
376 FreeExprContext(node->ioss_RuntimeContext, true);
380 * clear out tuple table slots
382 ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
383 ExecClearTuple(node->ss.ss_ScanTupleSlot);
386 * close the index relation (no-op if we didn't open it)
389 index_endscan(indexScanDesc);
390 if (indexRelationDesc)
391 index_close(indexRelationDesc, NoLock);
394 * close the heap relation.
396 ExecCloseScanRelation(relation);
399 /* ----------------------------------------------------------------
400 * ExecIndexOnlyMarkPos
401 * ----------------------------------------------------------------
404 ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
406 index_markpos(node->ioss_ScanDesc);
409 /* ----------------------------------------------------------------
410 * ExecIndexOnlyRestrPos
411 * ----------------------------------------------------------------
414 ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
416 index_restrpos(node->ioss_ScanDesc);
419 /* ----------------------------------------------------------------
420 * ExecInitIndexOnlyScan
422 * Initializes the index scan's state information, creates
423 * scan keys, and opens the base and index relations.
425 * Note: index scans have 2 sets of state information because
426 * we have to keep track of the base relation and the
428 * ----------------------------------------------------------------
431 ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
433 IndexOnlyScanState *indexstate;
434 Relation currentRelation;
439 * create state structure
441 indexstate = makeNode(IndexOnlyScanState);
442 indexstate->ss.ps.plan = (Plan *) node;
443 indexstate->ss.ps.state = estate;
444 indexstate->ioss_HeapFetches = 0;
447 * Miscellaneous initialization
449 * create expression context for node
451 ExecAssignExprContext(estate, &indexstate->ss.ps);
454 * initialize child expressions
456 * Note: we don't initialize all of the indexorderby expression, only the
457 * sub-parts corresponding to runtime keys (see below).
459 indexstate->ss.ps.targetlist = (List *)
460 ExecInitExpr((Expr *) node->scan.plan.targetlist,
461 (PlanState *) indexstate);
462 indexstate->ss.ps.qual = (List *)
463 ExecInitExpr((Expr *) node->scan.plan.qual,
464 (PlanState *) indexstate);
465 indexstate->indexqual = (List *)
466 ExecInitExpr((Expr *) node->indexqual,
467 (PlanState *) indexstate);
470 * tuple table initialization
472 ExecInitResultTupleSlot(estate, &indexstate->ss.ps);
473 ExecInitScanTupleSlot(estate, &indexstate->ss);
476 * open the base relation and acquire appropriate lock on it.
478 currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
480 indexstate->ss.ss_currentRelation = currentRelation;
481 indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */
484 * Build the scan tuple type using the indextlist generated by the
485 * planner. We use this, rather than the index's physical tuple
486 * descriptor, because the latter contains storage column types not the
487 * types of the original datums. (It's the AM's responsibility to return
488 * suitable data anyway.)
490 tupDesc = ExecTypeFromTL(node->indextlist, false);
491 ExecAssignScanType(&indexstate->ss, tupDesc);
494 * Initialize result tuple type and projection info. The node's
495 * targetlist will contain Vars with varno = INDEX_VAR, referencing the
498 ExecAssignResultTypeFromTL(&indexstate->ss.ps);
499 ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR);
502 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
503 * here. This allows an index-advisor plugin to EXPLAIN a plan containing
504 * references to nonexistent indexes.
506 if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
510 * Open the index relation.
512 * If the parent table is one of the target relations of the query, then
513 * InitPlan already opened and write-locked the index, so we can avoid
514 * taking another lock here. Otherwise we need a normal reader's lock.
516 relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
517 indexstate->ioss_RelationDesc = index_open(node->indexid,
518 relistarget ? NoLock : AccessShareLock);
521 * Initialize index-specific scan state
523 indexstate->ioss_RuntimeKeysReady = false;
524 indexstate->ioss_RuntimeKeys = NULL;
525 indexstate->ioss_NumRuntimeKeys = 0;
528 * build the index scan keys from the index qualification
530 ExecIndexBuildScanKeys((PlanState *) indexstate,
531 indexstate->ioss_RelationDesc,
534 &indexstate->ioss_ScanKeys,
535 &indexstate->ioss_NumScanKeys,
536 &indexstate->ioss_RuntimeKeys,
537 &indexstate->ioss_NumRuntimeKeys,
538 NULL, /* no ArrayKeys */
542 * any ORDER BY exprs have to be turned into scankeys in the same way
544 ExecIndexBuildScanKeys((PlanState *) indexstate,
545 indexstate->ioss_RelationDesc,
548 &indexstate->ioss_OrderByKeys,
549 &indexstate->ioss_NumOrderByKeys,
550 &indexstate->ioss_RuntimeKeys,
551 &indexstate->ioss_NumRuntimeKeys,
552 NULL, /* no ArrayKeys */
556 * If we have runtime keys, we need an ExprContext to evaluate them. The
557 * node's standard context won't do because we want to reset that context
558 * for every tuple. So, build another context just like the other one...
561 if (indexstate->ioss_NumRuntimeKeys != 0)
563 ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
565 ExecAssignExprContext(estate, &indexstate->ss.ps);
566 indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
567 indexstate->ss.ps.ps_ExprContext = stdecontext;
571 indexstate->ioss_RuntimeContext = NULL;
575 * Initialize scan descriptor.
577 if (!node->scan.plan.parallel_aware)
579 indexstate->ioss_ScanDesc = index_beginscan(currentRelation,
580 indexstate->ioss_RelationDesc,
582 indexstate->ioss_NumScanKeys,
583 indexstate->ioss_NumOrderByKeys);
586 /* Set it up for index-only scan */
587 indexstate->ioss_ScanDesc->xs_want_itup = true;
588 indexstate->ioss_VMBuffer = InvalidBuffer;
591 * If no run-time keys to calculate, go ahead and pass the scankeys to
594 if (indexstate->ioss_NumRuntimeKeys == 0)
595 index_rescan(indexstate->ioss_ScanDesc,
596 indexstate->ioss_ScanKeys,
597 indexstate->ioss_NumScanKeys,
598 indexstate->ioss_OrderByKeys,
599 indexstate->ioss_NumOrderByKeys);
608 /* ----------------------------------------------------------------
609 * Parallel Index-only Scan Support
610 * ----------------------------------------------------------------
613 /* ----------------------------------------------------------------
614 * ExecIndexOnlyScanEstimate
616 * estimates the space required to serialize index-only scan node.
617 * ----------------------------------------------------------------
620 ExecIndexOnlyScanEstimate(IndexOnlyScanState *node,
621 ParallelContext *pcxt)
623 EState *estate = node->ss.ps.state;
625 node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc,
626 estate->es_snapshot);
627 shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen);
628 shm_toc_estimate_keys(&pcxt->estimator, 1);
631 /* ----------------------------------------------------------------
632 * ExecIndexOnlyScanInitializeDSM
634 * Set up a parallel index-only scan descriptor.
635 * ----------------------------------------------------------------
638 ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
639 ParallelContext *pcxt)
641 EState *estate = node->ss.ps.state;
642 ParallelIndexScanDesc piscan;
644 piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen);
645 index_parallelscan_initialize(node->ss.ss_currentRelation,
646 node->ioss_RelationDesc,
649 shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
650 node->ioss_ScanDesc =
651 index_beginscan_parallel(node->ss.ss_currentRelation,
652 node->ioss_RelationDesc,
653 node->ioss_NumScanKeys,
654 node->ioss_NumOrderByKeys,
656 node->ioss_ScanDesc->xs_want_itup = true;
657 node->ioss_VMBuffer = InvalidBuffer;
660 * If no run-time keys to calculate, go ahead and pass the scankeys to
663 if (node->ioss_NumRuntimeKeys == 0)
664 index_rescan(node->ioss_ScanDesc,
665 node->ioss_ScanKeys, node->ioss_NumScanKeys,
666 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
669 /* ----------------------------------------------------------------
670 * ExecIndexOnlyScanInitializeWorker
672 * Copy relevant information from TOC into planstate.
673 * ----------------------------------------------------------------
676 ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, shm_toc *toc)
678 ParallelIndexScanDesc piscan;
680 piscan = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id);
681 node->ioss_ScanDesc =
682 index_beginscan_parallel(node->ss.ss_currentRelation,
683 node->ioss_RelationDesc,
684 node->ioss_NumScanKeys,
685 node->ioss_NumOrderByKeys,
687 node->ioss_ScanDesc->xs_want_itup = true;
690 * If no run-time keys to calculate, go ahead and pass the scankeys to the
693 if (node->ioss_NumRuntimeKeys == 0)
694 index_rescan(node->ioss_ScanDesc,
695 node->ioss_ScanKeys, node->ioss_NumScanKeys,
696 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);