]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeIndexonlyscan.c
Change the format of the VM fork to add a second bit per page.
[postgresql] / src / backend / executor / nodeIndexonlyscan.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeIndexonlyscan.c
4  *        Routines to support index-only scans
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/executor/nodeIndexonlyscan.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * INTERFACE ROUTINES
17  *              ExecIndexOnlyScan                       scans an index
18  *              IndexOnlyNext                           retrieve next tuple
19  *              ExecInitIndexOnlyScan           creates and initializes state info.
20  *              ExecReScanIndexOnlyScan         rescans the indexed relation.
21  *              ExecEndIndexOnlyScan            releases all storage.
22  *              ExecIndexOnlyMarkPos            marks scan position.
23  *              ExecIndexOnlyRestrPos           restores scan position.
24  */
25 #include "postgres.h"
26
27 #include "access/relscan.h"
28 #include "access/visibilitymap.h"
29 #include "executor/execdebug.h"
30 #include "executor/nodeIndexonlyscan.h"
31 #include "executor/nodeIndexscan.h"
32 #include "storage/bufmgr.h"
33 #include "storage/predicate.h"
34 #include "utils/memutils.h"
35 #include "utils/rel.h"
36
37
38 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
39 static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
40                                 TupleDesc itupdesc);
41
42
43 /* ----------------------------------------------------------------
44  *              IndexOnlyNext
45  *
46  *              Retrieve a tuple from the IndexOnlyScan node's index.
47  * ----------------------------------------------------------------
48  */
49 static TupleTableSlot *
50 IndexOnlyNext(IndexOnlyScanState *node)
51 {
52         EState     *estate;
53         ExprContext *econtext;
54         ScanDirection direction;
55         IndexScanDesc scandesc;
56         TupleTableSlot *slot;
57         ItemPointer tid;
58
59         /*
60          * extract necessary information from index scan node
61          */
62         estate = node->ss.ps.state;
63         direction = estate->es_direction;
64         /* flip direction if this is an overall backward scan */
65         if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
66         {
67                 if (ScanDirectionIsForward(direction))
68                         direction = BackwardScanDirection;
69                 else if (ScanDirectionIsBackward(direction))
70                         direction = ForwardScanDirection;
71         }
72         scandesc = node->ioss_ScanDesc;
73         econtext = node->ss.ps.ps_ExprContext;
74         slot = node->ss.ss_ScanTupleSlot;
75
76         /*
77          * OK, now that we have what we need, fetch the next tuple.
78          */
79         while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
80         {
81                 HeapTuple       tuple = NULL;
82
83                 /*
84                  * We can skip the heap fetch if the TID references a heap page on
85                  * which all tuples are known visible to everybody.  In any case,
86                  * we'll use the index tuple not the heap tuple as the data source.
87                  *
88                  * Note on Memory Ordering Effects: visibilitymap_get_status does not
89                  * lock the visibility map buffer, and therefore the result we read
90                  * here could be slightly stale.  However, it can't be stale enough to
91                  * matter.
92                  *
93                  * We need to detect clearing a VM bit due to an insert right away,
94                  * because the tuple is present in the index page but not visible. The
95                  * reading of the TID by this scan (using a shared lock on the index
96                  * buffer) is serialized with the insert of the TID into the index
97                  * (using an exclusive lock on the index buffer). Because the VM bit
98                  * is cleared before updating the index, and locking/unlocking of the
99                  * index page acts as a full memory barrier, we are sure to see the
100                  * cleared bit if we see a recently-inserted TID.
101                  *
102                  * Deletes do not update the index page (only VACUUM will clear out
103                  * the TID), so the clearing of the VM bit by a delete is not
104                  * serialized with this test below, and we may see a value that is
105                  * significantly stale. However, we don't care about the delete right
106                  * away, because the tuple is still visible until the deleting
107                  * transaction commits or the statement ends (if it's our
108                  * transaction). In either case, the lock on the VM buffer will have
109                  * been released (acting as a write barrier) after clearing the bit.
110                  * And for us to have a snapshot that includes the deleting
111                  * transaction (making the tuple invisible), we must have acquired
112                  * ProcArrayLock after that time, acting as a read barrier.
113                  *
114                  * It's worth going through this complexity to avoid needing to lock
115                  * the VM buffer, which could cause significant contention.
116                  */
117                 if (!VM_ALL_VISIBLE(scandesc->heapRelation,
118                                                         ItemPointerGetBlockNumber(tid),
119                                                         &node->ioss_VMBuffer))
120                 {
121                         /*
122                          * Rats, we have to visit the heap to check visibility.
123                          */
124                         node->ioss_HeapFetches++;
125                         tuple = index_fetch_heap(scandesc);
126                         if (tuple == NULL)
127                                 continue;               /* no visible tuple, try next index entry */
128
129                         /*
130                          * Only MVCC snapshots are supported here, so there should be no
131                          * need to keep following the HOT chain once a visible entry has
132                          * been found.  If we did want to allow that, we'd need to keep
133                          * more state to remember not to call index_getnext_tid next time.
134                          */
135                         if (scandesc->xs_continue_hot)
136                                 elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
137
138                         /*
139                          * Note: at this point we are holding a pin on the heap page, as
140                          * recorded in scandesc->xs_cbuf.  We could release that pin now,
141                          * but it's not clear whether it's a win to do so.  The next index
142                          * entry might require a visit to the same heap page.
143                          */
144                 }
145
146                 /*
147                  * Fill the scan tuple slot with data from the index.
148                  */
149                 StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
150
151                 /*
152                  * If the index was lossy, we have to recheck the index quals.
153                  * (Currently, this can never happen, but we should support the case
154                  * for possible future use, eg with GiST indexes.)
155                  */
156                 if (scandesc->xs_recheck)
157                 {
158                         econtext->ecxt_scantuple = slot;
159                         ResetExprContext(econtext);
160                         if (!ExecQual(node->indexqual, econtext, false))
161                         {
162                                 /* Fails recheck, so drop it and loop back for another */
163                                 InstrCountFiltered2(node, 1);
164                                 continue;
165                         }
166                 }
167
168                 /*
169                  * We don't currently support rechecking ORDER BY distances.  (In
170                  * principle, if the index can support retrieval of the originally
171                  * indexed value, it should be able to produce an exact distance
172                  * calculation too.  So it's not clear that adding code here for
173                  * recheck/re-sort would be worth the trouble.  But we should at least
174                  * throw an error if someone tries it.)
175                  */
176                 if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
177                         ereport(ERROR,
178                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
179                                          errmsg("lossy distance functions are not supported in index-only scans")));
180
181                 /*
182                  * Predicate locks for index-only scans must be acquired at the page
183                  * level when the heap is not accessed, since tuple-level predicate
184                  * locks need the tuple's xmin value.  If we had to visit the tuple
185                  * anyway, then we already have the tuple-level lock and can skip the
186                  * page lock.
187                  */
188                 if (tuple == NULL)
189                         PredicateLockPage(scandesc->heapRelation,
190                                                           ItemPointerGetBlockNumber(tid),
191                                                           estate->es_snapshot);
192
193                 return slot;
194         }
195
196         /*
197          * if we get here it means the index scan failed so we are at the end of
198          * the scan..
199          */
200         return ExecClearTuple(slot);
201 }
202
203 /*
204  * StoreIndexTuple
205  *              Fill the slot with data from the index tuple.
206  *
207  * At some point this might be generally-useful functionality, but
208  * right now we don't need it elsewhere.
209  */
210 static void
211 StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
212 {
213         int                     nindexatts = itupdesc->natts;
214         Datum      *values = slot->tts_values;
215         bool       *isnull = slot->tts_isnull;
216         int                     i;
217
218         /*
219          * Note: we must use the tupdesc supplied by the AM in index_getattr, not
220          * the slot's tupdesc, in case the latter has different datatypes (this
221          * happens for btree name_ops in particular).  They'd better have the same
222          * number of columns though, as well as being datatype-compatible which is
223          * something we can't so easily check.
224          */
225         Assert(slot->tts_tupleDescriptor->natts == nindexatts);
226
227         ExecClearTuple(slot);
228         for (i = 0; i < nindexatts; i++)
229                 values[i] = index_getattr(itup, i + 1, itupdesc, &isnull[i]);
230         ExecStoreVirtualTuple(slot);
231 }
232
233 /*
234  * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
235  *
236  * This can't really happen, since an index can't supply CTID which would
237  * be necessary data for any potential EvalPlanQual target relation.  If it
238  * did happen, the EPQ code would pass us the wrong data, namely a heap
239  * tuple not an index tuple.  So throw an error.
240  */
241 static bool
242 IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
243 {
244         elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
245         return false;                           /* keep compiler quiet */
246 }
247
248 /* ----------------------------------------------------------------
249  *              ExecIndexOnlyScan(node)
250  * ----------------------------------------------------------------
251  */
252 TupleTableSlot *
253 ExecIndexOnlyScan(IndexOnlyScanState *node)
254 {
255         /*
256          * If we have runtime keys and they've not already been set up, do it now.
257          */
258         if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
259                 ExecReScan((PlanState *) node);
260
261         return ExecScan(&node->ss,
262                                         (ExecScanAccessMtd) IndexOnlyNext,
263                                         (ExecScanRecheckMtd) IndexOnlyRecheck);
264 }
265
266 /* ----------------------------------------------------------------
267  *              ExecReScanIndexOnlyScan(node)
268  *
269  *              Recalculates the values of any scan keys whose value depends on
270  *              information known at runtime, then rescans the indexed relation.
271  *
272  *              Updating the scan key was formerly done separately in
273  *              ExecUpdateIndexScanKeys. Integrating it into ReScan makes
274  *              rescans of indices and relations/general streams more uniform.
275  * ----------------------------------------------------------------
276  */
277 void
278 ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
279 {
280         /*
281          * If we are doing runtime key calculations (ie, any of the index key
282          * values weren't simple Consts), compute the new key values.  But first,
283          * reset the context so we don't leak memory as each outer tuple is
284          * scanned.  Note this assumes that we will recalculate *all* runtime keys
285          * on each call.
286          */
287         if (node->ioss_NumRuntimeKeys != 0)
288         {
289                 ExprContext *econtext = node->ioss_RuntimeContext;
290
291                 ResetExprContext(econtext);
292                 ExecIndexEvalRuntimeKeys(econtext,
293                                                                  node->ioss_RuntimeKeys,
294                                                                  node->ioss_NumRuntimeKeys);
295         }
296         node->ioss_RuntimeKeysReady = true;
297
298         /* reset index scan */
299         index_rescan(node->ioss_ScanDesc,
300                                  node->ioss_ScanKeys, node->ioss_NumScanKeys,
301                                  node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
302
303         ExecScanReScan(&node->ss);
304 }
305
306
307 /* ----------------------------------------------------------------
308  *              ExecEndIndexOnlyScan
309  * ----------------------------------------------------------------
310  */
311 void
312 ExecEndIndexOnlyScan(IndexOnlyScanState *node)
313 {
314         Relation        indexRelationDesc;
315         IndexScanDesc indexScanDesc;
316         Relation        relation;
317
318         /*
319          * extract information from the node
320          */
321         indexRelationDesc = node->ioss_RelationDesc;
322         indexScanDesc = node->ioss_ScanDesc;
323         relation = node->ss.ss_currentRelation;
324
325         /* Release VM buffer pin, if any. */
326         if (node->ioss_VMBuffer != InvalidBuffer)
327         {
328                 ReleaseBuffer(node->ioss_VMBuffer);
329                 node->ioss_VMBuffer = InvalidBuffer;
330         }
331
332         /*
333          * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
334          */
335 #ifdef NOT_USED
336         ExecFreeExprContext(&node->ss.ps);
337         if (node->ioss_RuntimeContext)
338                 FreeExprContext(node->ioss_RuntimeContext, true);
339 #endif
340
341         /*
342          * clear out tuple table slots
343          */
344         ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
345         ExecClearTuple(node->ss.ss_ScanTupleSlot);
346
347         /*
348          * close the index relation (no-op if we didn't open it)
349          */
350         if (indexScanDesc)
351                 index_endscan(indexScanDesc);
352         if (indexRelationDesc)
353                 index_close(indexRelationDesc, NoLock);
354
355         /*
356          * close the heap relation.
357          */
358         ExecCloseScanRelation(relation);
359 }
360
361 /* ----------------------------------------------------------------
362  *              ExecIndexOnlyMarkPos
363  * ----------------------------------------------------------------
364  */
365 void
366 ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
367 {
368         index_markpos(node->ioss_ScanDesc);
369 }
370
371 /* ----------------------------------------------------------------
372  *              ExecIndexOnlyRestrPos
373  * ----------------------------------------------------------------
374  */
375 void
376 ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
377 {
378         index_restrpos(node->ioss_ScanDesc);
379 }
380
381 /* ----------------------------------------------------------------
382  *              ExecInitIndexOnlyScan
383  *
384  *              Initializes the index scan's state information, creates
385  *              scan keys, and opens the base and index relations.
386  *
387  *              Note: index scans have 2 sets of state information because
388  *                        we have to keep track of the base relation and the
389  *                        index relation.
390  * ----------------------------------------------------------------
391  */
392 IndexOnlyScanState *
393 ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
394 {
395         IndexOnlyScanState *indexstate;
396         Relation        currentRelation;
397         bool            relistarget;
398         TupleDesc       tupDesc;
399
400         /*
401          * create state structure
402          */
403         indexstate = makeNode(IndexOnlyScanState);
404         indexstate->ss.ps.plan = (Plan *) node;
405         indexstate->ss.ps.state = estate;
406         indexstate->ioss_HeapFetches = 0;
407
408         /*
409          * Miscellaneous initialization
410          *
411          * create expression context for node
412          */
413         ExecAssignExprContext(estate, &indexstate->ss.ps);
414
415         indexstate->ss.ps.ps_TupFromTlist = false;
416
417         /*
418          * initialize child expressions
419          *
420          * Note: we don't initialize all of the indexorderby expression, only the
421          * sub-parts corresponding to runtime keys (see below).
422          */
423         indexstate->ss.ps.targetlist = (List *)
424                 ExecInitExpr((Expr *) node->scan.plan.targetlist,
425                                          (PlanState *) indexstate);
426         indexstate->ss.ps.qual = (List *)
427                 ExecInitExpr((Expr *) node->scan.plan.qual,
428                                          (PlanState *) indexstate);
429         indexstate->indexqual = (List *)
430                 ExecInitExpr((Expr *) node->indexqual,
431                                          (PlanState *) indexstate);
432
433         /*
434          * tuple table initialization
435          */
436         ExecInitResultTupleSlot(estate, &indexstate->ss.ps);
437         ExecInitScanTupleSlot(estate, &indexstate->ss);
438
439         /*
440          * open the base relation and acquire appropriate lock on it.
441          */
442         currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
443
444         indexstate->ss.ss_currentRelation = currentRelation;
445         indexstate->ss.ss_currentScanDesc = NULL;       /* no heap scan here */
446
447         /*
448          * Build the scan tuple type using the indextlist generated by the
449          * planner.  We use this, rather than the index's physical tuple
450          * descriptor, because the latter contains storage column types not the
451          * types of the original datums.  (It's the AM's responsibility to return
452          * suitable data anyway.)
453          */
454         tupDesc = ExecTypeFromTL(node->indextlist, false);
455         ExecAssignScanType(&indexstate->ss, tupDesc);
456
457         /*
458          * Initialize result tuple type and projection info.  The node's
459          * targetlist will contain Vars with varno = INDEX_VAR, referencing the
460          * scan tuple.
461          */
462         ExecAssignResultTypeFromTL(&indexstate->ss.ps);
463         ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR);
464
465         /*
466          * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
467          * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
468          * references to nonexistent indexes.
469          */
470         if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
471                 return indexstate;
472
473         /*
474          * Open the index relation.
475          *
476          * If the parent table is one of the target relations of the query, then
477          * InitPlan already opened and write-locked the index, so we can avoid
478          * taking another lock here.  Otherwise we need a normal reader's lock.
479          */
480         relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
481         indexstate->ioss_RelationDesc = index_open(node->indexid,
482                                                                          relistarget ? NoLock : AccessShareLock);
483
484         /*
485          * Initialize index-specific scan state
486          */
487         indexstate->ioss_RuntimeKeysReady = false;
488         indexstate->ioss_RuntimeKeys = NULL;
489         indexstate->ioss_NumRuntimeKeys = 0;
490
491         /*
492          * build the index scan keys from the index qualification
493          */
494         ExecIndexBuildScanKeys((PlanState *) indexstate,
495                                                    indexstate->ioss_RelationDesc,
496                                                    node->indexqual,
497                                                    false,
498                                                    &indexstate->ioss_ScanKeys,
499                                                    &indexstate->ioss_NumScanKeys,
500                                                    &indexstate->ioss_RuntimeKeys,
501                                                    &indexstate->ioss_NumRuntimeKeys,
502                                                    NULL,        /* no ArrayKeys */
503                                                    NULL);
504
505         /*
506          * any ORDER BY exprs have to be turned into scankeys in the same way
507          */
508         ExecIndexBuildScanKeys((PlanState *) indexstate,
509                                                    indexstate->ioss_RelationDesc,
510                                                    node->indexorderby,
511                                                    true,
512                                                    &indexstate->ioss_OrderByKeys,
513                                                    &indexstate->ioss_NumOrderByKeys,
514                                                    &indexstate->ioss_RuntimeKeys,
515                                                    &indexstate->ioss_NumRuntimeKeys,
516                                                    NULL,        /* no ArrayKeys */
517                                                    NULL);
518
519         /*
520          * If we have runtime keys, we need an ExprContext to evaluate them. The
521          * node's standard context won't do because we want to reset that context
522          * for every tuple.  So, build another context just like the other one...
523          * -tgl 7/11/00
524          */
525         if (indexstate->ioss_NumRuntimeKeys != 0)
526         {
527                 ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
528
529                 ExecAssignExprContext(estate, &indexstate->ss.ps);
530                 indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
531                 indexstate->ss.ps.ps_ExprContext = stdecontext;
532         }
533         else
534         {
535                 indexstate->ioss_RuntimeContext = NULL;
536         }
537
538         /*
539          * Initialize scan descriptor.
540          */
541         indexstate->ioss_ScanDesc = index_beginscan(currentRelation,
542                                                                                                 indexstate->ioss_RelationDesc,
543                                                                                                 estate->es_snapshot,
544                                                                                                 indexstate->ioss_NumScanKeys,
545                                                                                         indexstate->ioss_NumOrderByKeys);
546
547         /* Set it up for index-only scan */
548         indexstate->ioss_ScanDesc->xs_want_itup = true;
549         indexstate->ioss_VMBuffer = InvalidBuffer;
550
551         /*
552          * If no run-time keys to calculate, go ahead and pass the scankeys to the
553          * index AM.
554          */
555         if (indexstate->ioss_NumRuntimeKeys == 0)
556                 index_rescan(indexstate->ioss_ScanDesc,
557                                          indexstate->ioss_ScanKeys,
558                                          indexstate->ioss_NumScanKeys,
559                                          indexstate->ioss_OrderByKeys,
560                                          indexstate->ioss_NumOrderByKeys);
561
562         /*
563          * all done.
564          */
565         return indexstate;
566 }