]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeSamplescan.c
Faster expression evaluation and targetlist projection.
[postgresql] / src / backend / executor / nodeSamplescan.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeSamplescan.c
4  *        Support routines for sample scans of relations (table sampling).
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/executor/nodeSamplescan.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include "access/hash.h"
18 #include "access/relscan.h"
19 #include "access/tsmapi.h"
20 #include "executor/executor.h"
21 #include "executor/nodeSamplescan.h"
22 #include "miscadmin.h"
23 #include "pgstat.h"
24 #include "storage/predicate.h"
25 #include "utils/builtins.h"
26 #include "utils/rel.h"
27 #include "utils/tqual.h"
28
29 static void InitScanRelation(SampleScanState *node, EState *estate, int eflags);
30 static TupleTableSlot *SampleNext(SampleScanState *node);
31 static void tablesample_init(SampleScanState *scanstate);
32 static HeapTuple tablesample_getnext(SampleScanState *scanstate);
33 static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
34                                    HeapScanDesc scan);
35
36 /* ----------------------------------------------------------------
37  *                                              Scan Support
38  * ----------------------------------------------------------------
39  */
40
41 /* ----------------------------------------------------------------
42  *              SampleNext
43  *
44  *              This is a workhorse for ExecSampleScan
45  * ----------------------------------------------------------------
46  */
47 static TupleTableSlot *
48 SampleNext(SampleScanState *node)
49 {
50         HeapTuple       tuple;
51         TupleTableSlot *slot;
52
53         /*
54          * if this is first call within a scan, initialize
55          */
56         if (!node->begun)
57                 tablesample_init(node);
58
59         /*
60          * get the next tuple, and store it in our result slot
61          */
62         tuple = tablesample_getnext(node);
63
64         slot = node->ss.ss_ScanTupleSlot;
65
66         if (tuple)
67                 ExecStoreTuple(tuple,   /* tuple to store */
68                                            slot,        /* slot to store in */
69                                            node->ss.ss_currentScanDesc->rs_cbuf,        /* tuple's buffer */
70                                            false);      /* don't pfree this pointer */
71         else
72                 ExecClearTuple(slot);
73
74         return slot;
75 }
76
77 /*
78  * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual
79  */
80 static bool
81 SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
82 {
83         /*
84          * No need to recheck for SampleScan, since like SeqScan we don't pass any
85          * checkable keys to heap_beginscan.
86          */
87         return true;
88 }
89
90 /* ----------------------------------------------------------------
91  *              ExecSampleScan(node)
92  *
93  *              Scans the relation using the sampling method and returns
94  *              the next qualifying tuple.
95  *              We call the ExecScan() routine and pass it the appropriate
96  *              access method functions.
97  * ----------------------------------------------------------------
98  */
99 TupleTableSlot *
100 ExecSampleScan(SampleScanState *node)
101 {
102         return ExecScan((ScanState *) node,
103                                         (ExecScanAccessMtd) SampleNext,
104                                         (ExecScanRecheckMtd) SampleRecheck);
105 }
106
107 /* ----------------------------------------------------------------
108  *              InitScanRelation
109  *
110  *              Set up to access the scan relation.
111  * ----------------------------------------------------------------
112  */
113 static void
114 InitScanRelation(SampleScanState *node, EState *estate, int eflags)
115 {
116         Relation        currentRelation;
117
118         /*
119          * get the relation object id from the relid'th entry in the range table,
120          * open that relation and acquire appropriate lock on it.
121          */
122         currentRelation = ExecOpenScanRelation(estate,
123                                                    ((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
124                                                                                    eflags);
125
126         node->ss.ss_currentRelation = currentRelation;
127
128         /* we won't set up the HeapScanDesc till later */
129         node->ss.ss_currentScanDesc = NULL;
130
131         /* and report the scan tuple slot's rowtype */
132         ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation));
133 }
134
135
136 /* ----------------------------------------------------------------
137  *              ExecInitSampleScan
138  * ----------------------------------------------------------------
139  */
140 SampleScanState *
141 ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
142 {
143         SampleScanState *scanstate;
144         TableSampleClause *tsc = node->tablesample;
145         TsmRoutine *tsm;
146
147         Assert(outerPlan(node) == NULL);
148         Assert(innerPlan(node) == NULL);
149
150         /*
151          * create state structure
152          */
153         scanstate = makeNode(SampleScanState);
154         scanstate->ss.ps.plan = (Plan *) node;
155         scanstate->ss.ps.state = estate;
156
157         /*
158          * Miscellaneous initialization
159          *
160          * create expression context for node
161          */
162         ExecAssignExprContext(estate, &scanstate->ss.ps);
163
164         /*
165          * initialize child expressions
166          */
167         scanstate->ss.ps.qual =
168                 ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
169
170         scanstate->args = ExecInitExprList(tsc->args, (PlanState *) scanstate);
171         scanstate->repeatable =
172                 ExecInitExpr(tsc->repeatable, (PlanState *) scanstate);
173
174         /*
175          * tuple table initialization
176          */
177         ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
178         ExecInitScanTupleSlot(estate, &scanstate->ss);
179
180         /*
181          * initialize scan relation
182          */
183         InitScanRelation(scanstate, estate, eflags);
184
185         /*
186          * Initialize result tuple type and projection info.
187          */
188         ExecAssignResultTypeFromTL(&scanstate->ss.ps);
189         ExecAssignScanProjectionInfo(&scanstate->ss);
190
191         /*
192          * If we don't have a REPEATABLE clause, select a random seed.  We want to
193          * do this just once, since the seed shouldn't change over rescans.
194          */
195         if (tsc->repeatable == NULL)
196                 scanstate->seed = random();
197
198         /*
199          * Finally, initialize the TABLESAMPLE method handler.
200          */
201         tsm = GetTsmRoutine(tsc->tsmhandler);
202         scanstate->tsmroutine = tsm;
203         scanstate->tsm_state = NULL;
204
205         if (tsm->InitSampleScan)
206                 tsm->InitSampleScan(scanstate, eflags);
207
208         /* We'll do BeginSampleScan later; we can't evaluate params yet */
209         scanstate->begun = false;
210
211         return scanstate;
212 }
213
214 /* ----------------------------------------------------------------
215  *              ExecEndSampleScan
216  *
217  *              frees any storage allocated through C routines.
218  * ----------------------------------------------------------------
219  */
220 void
221 ExecEndSampleScan(SampleScanState *node)
222 {
223         /*
224          * Tell sampling function that we finished the scan.
225          */
226         if (node->tsmroutine->EndSampleScan)
227                 node->tsmroutine->EndSampleScan(node);
228
229         /*
230          * Free the exprcontext
231          */
232         ExecFreeExprContext(&node->ss.ps);
233
234         /*
235          * clean out the tuple table
236          */
237         ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
238         ExecClearTuple(node->ss.ss_ScanTupleSlot);
239
240         /*
241          * close heap scan
242          */
243         if (node->ss.ss_currentScanDesc)
244                 heap_endscan(node->ss.ss_currentScanDesc);
245
246         /*
247          * close the heap relation.
248          */
249         ExecCloseScanRelation(node->ss.ss_currentRelation);
250 }
251
252 /* ----------------------------------------------------------------
253  *              ExecReScanSampleScan
254  *
255  *              Rescans the relation.
256  *
257  * ----------------------------------------------------------------
258  */
259 void
260 ExecReScanSampleScan(SampleScanState *node)
261 {
262         /* Remember we need to do BeginSampleScan again (if we did it at all) */
263         node->begun = false;
264
265         ExecScanReScan(&node->ss);
266 }
267
268
269 /*
270  * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
271  */
272 static void
273 tablesample_init(SampleScanState *scanstate)
274 {
275         TsmRoutine *tsm = scanstate->tsmroutine;
276         ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
277         Datum      *params;
278         Datum           datum;
279         bool            isnull;
280         uint32          seed;
281         bool            allow_sync;
282         int                     i;
283         ListCell   *arg;
284
285         params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
286
287         i = 0;
288         foreach(arg, scanstate->args)
289         {
290                 ExprState  *argstate = (ExprState *) lfirst(arg);
291
292                 params[i] = ExecEvalExprSwitchContext(argstate,
293                                                                                           econtext,
294                                                                                           &isnull);
295                 if (isnull)
296                         ereport(ERROR,
297                                         (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
298                                          errmsg("TABLESAMPLE parameter cannot be null")));
299                 i++;
300         }
301
302         if (scanstate->repeatable)
303         {
304                 datum = ExecEvalExprSwitchContext(scanstate->repeatable,
305                                                                                   econtext,
306                                                                                   &isnull);
307                 if (isnull)
308                         ereport(ERROR,
309                                         (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
310                                  errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
311
312                 /*
313                  * The REPEATABLE parameter has been coerced to float8 by the parser.
314                  * The reason for using float8 at the SQL level is that it will
315                  * produce unsurprising results both for users used to databases that
316                  * accept only integers in the REPEATABLE clause and for those who
317                  * might expect that REPEATABLE works like setseed() (a float in the
318                  * range from -1 to 1).
319                  *
320                  * We use hashfloat8() to convert the supplied value into a suitable
321                  * seed.  For regression-testing purposes, that has the convenient
322                  * property that REPEATABLE(0) gives a machine-independent result.
323                  */
324                 seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
325         }
326         else
327         {
328                 /* Use the seed selected by ExecInitSampleScan */
329                 seed = scanstate->seed;
330         }
331
332         /* Set default values for params that BeginSampleScan can adjust */
333         scanstate->use_bulkread = true;
334         scanstate->use_pagemode = true;
335
336         /* Let tablesample method do its thing */
337         tsm->BeginSampleScan(scanstate,
338                                                  params,
339                                                  list_length(scanstate->args),
340                                                  seed);
341
342         /* We'll use syncscan if there's no NextSampleBlock function */
343         allow_sync = (tsm->NextSampleBlock == NULL);
344
345         /* Now we can create or reset the HeapScanDesc */
346         if (scanstate->ss.ss_currentScanDesc == NULL)
347         {
348                 scanstate->ss.ss_currentScanDesc =
349                         heap_beginscan_sampling(scanstate->ss.ss_currentRelation,
350                                                                         scanstate->ss.ps.state->es_snapshot,
351                                                                         0, NULL,
352                                                                         scanstate->use_bulkread,
353                                                                         allow_sync,
354                                                                         scanstate->use_pagemode);
355         }
356         else
357         {
358                 heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
359                                                            scanstate->use_bulkread,
360                                                            allow_sync,
361                                                            scanstate->use_pagemode);
362         }
363
364         pfree(params);
365
366         /* And we're initialized. */
367         scanstate->begun = true;
368 }
369
370 /*
371  * Get next tuple from TABLESAMPLE method.
372  *
373  * Note: an awful lot of this is copied-and-pasted from heapam.c.  It would
374  * perhaps be better to refactor to share more code.
375  */
376 static HeapTuple
377 tablesample_getnext(SampleScanState *scanstate)
378 {
379         TsmRoutine *tsm = scanstate->tsmroutine;
380         HeapScanDesc scan = scanstate->ss.ss_currentScanDesc;
381         HeapTuple       tuple = &(scan->rs_ctup);
382         Snapshot        snapshot = scan->rs_snapshot;
383         bool            pagemode = scan->rs_pageatatime;
384         BlockNumber blockno;
385         Page            page;
386         bool            all_visible;
387         OffsetNumber maxoffset;
388
389         if (!scan->rs_inited)
390         {
391                 /*
392                  * return null immediately if relation is empty
393                  */
394                 if (scan->rs_nblocks == 0)
395                 {
396                         Assert(!BufferIsValid(scan->rs_cbuf));
397                         tuple->t_data = NULL;
398                         return NULL;
399                 }
400                 if (tsm->NextSampleBlock)
401                 {
402                         blockno = tsm->NextSampleBlock(scanstate);
403                         if (!BlockNumberIsValid(blockno))
404                         {
405                                 tuple->t_data = NULL;
406                                 return NULL;
407                         }
408                 }
409                 else
410                         blockno = scan->rs_startblock;
411                 Assert(blockno < scan->rs_nblocks);
412                 heapgetpage(scan, blockno);
413                 scan->rs_inited = true;
414         }
415         else
416         {
417                 /* continue from previously returned page/tuple */
418                 blockno = scan->rs_cblock;              /* current page */
419         }
420
421         /*
422          * When not using pagemode, we must lock the buffer during tuple
423          * visibility checks.
424          */
425         if (!pagemode)
426                 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
427
428         page = (Page) BufferGetPage(scan->rs_cbuf);
429         all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
430         maxoffset = PageGetMaxOffsetNumber(page);
431
432         for (;;)
433         {
434                 OffsetNumber tupoffset;
435                 bool            finished;
436
437                 CHECK_FOR_INTERRUPTS();
438
439                 /* Ask the tablesample method which tuples to check on this page. */
440                 tupoffset = tsm->NextSampleTuple(scanstate,
441                                                                                  blockno,
442                                                                                  maxoffset);
443
444                 if (OffsetNumberIsValid(tupoffset))
445                 {
446                         ItemId          itemid;
447                         bool            visible;
448
449                         /* Skip invalid tuple pointers. */
450                         itemid = PageGetItemId(page, tupoffset);
451                         if (!ItemIdIsNormal(itemid))
452                                 continue;
453
454                         tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
455                         tuple->t_len = ItemIdGetLength(itemid);
456                         ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
457
458                         if (all_visible)
459                                 visible = true;
460                         else
461                                 visible = SampleTupleVisible(tuple, tupoffset, scan);
462
463                         /* in pagemode, heapgetpage did this for us */
464                         if (!pagemode)
465                                 CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
466                                                                                                 scan->rs_cbuf, snapshot);
467
468                         if (visible)
469                         {
470                                 /* Found visible tuple, return it. */
471                                 if (!pagemode)
472                                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
473                                 break;
474                         }
475                         else
476                         {
477                                 /* Try next tuple from same page. */
478                                 continue;
479                         }
480                 }
481
482                 /*
483                  * if we get here, it means we've exhausted the items on this page and
484                  * it's time to move to the next.
485                  */
486                 if (!pagemode)
487                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
488
489                 if (tsm->NextSampleBlock)
490                 {
491                         blockno = tsm->NextSampleBlock(scanstate);
492                         Assert(!scan->rs_syncscan);
493                         finished = !BlockNumberIsValid(blockno);
494                 }
495                 else
496                 {
497                         /* Without NextSampleBlock, just do a plain forward seqscan. */
498                         blockno++;
499                         if (blockno >= scan->rs_nblocks)
500                                 blockno = 0;
501
502                         /*
503                          * Report our new scan position for synchronization purposes.
504                          *
505                          * Note: we do this before checking for end of scan so that the
506                          * final state of the position hint is back at the start of the
507                          * rel.  That's not strictly necessary, but otherwise when you run
508                          * the same query multiple times the starting position would shift
509                          * a little bit backwards on every invocation, which is confusing.
510                          * We don't guarantee any specific ordering in general, though.
511                          */
512                         if (scan->rs_syncscan)
513                                 ss_report_location(scan->rs_rd, blockno);
514
515                         finished = (blockno == scan->rs_startblock);
516                 }
517
518                 /*
519                  * Reached end of scan?
520                  */
521                 if (finished)
522                 {
523                         if (BufferIsValid(scan->rs_cbuf))
524                                 ReleaseBuffer(scan->rs_cbuf);
525                         scan->rs_cbuf = InvalidBuffer;
526                         scan->rs_cblock = InvalidBlockNumber;
527                         tuple->t_data = NULL;
528                         scan->rs_inited = false;
529                         return NULL;
530                 }
531
532                 Assert(blockno < scan->rs_nblocks);
533                 heapgetpage(scan, blockno);
534
535                 /* Re-establish state for new page */
536                 if (!pagemode)
537                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
538
539                 page = (Page) BufferGetPage(scan->rs_cbuf);
540                 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
541                 maxoffset = PageGetMaxOffsetNumber(page);
542         }
543
544         /* Count successfully-fetched tuples as heap fetches */
545         pgstat_count_heap_getnext(scan->rs_rd);
546
547         return &(scan->rs_ctup);
548 }
549
550 /*
551  * Check visibility of the tuple.
552  */
553 static bool
554 SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
555 {
556         if (scan->rs_pageatatime)
557         {
558                 /*
559                  * In pageatatime mode, heapgetpage() already did visibility checks,
560                  * so just look at the info it left in rs_vistuples[].
561                  *
562                  * We use a binary search over the known-sorted array.  Note: we could
563                  * save some effort if we insisted that NextSampleTuple select tuples
564                  * in increasing order, but it's not clear that there would be enough
565                  * gain to justify the restriction.
566                  */
567                 int                     start = 0,
568                                         end = scan->rs_ntuples - 1;
569
570                 while (start <= end)
571                 {
572                         int                     mid = (start + end) / 2;
573                         OffsetNumber curoffset = scan->rs_vistuples[mid];
574
575                         if (tupoffset == curoffset)
576                                 return true;
577                         else if (tupoffset < curoffset)
578                                 end = mid - 1;
579                         else
580                                 start = mid + 1;
581                 }
582
583                 return false;
584         }
585         else
586         {
587                 /* Otherwise, we have to check the tuple individually. */
588                 return HeapTupleSatisfiesVisibility(tuple,
589                                                                                         scan->rs_snapshot,
590                                                                                         scan->rs_cbuf);
591         }
592 }