]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeSamplescan.c
9ce7c02aff415c2239f42f48e55e1735ee75519c
[postgresql] / src / backend / executor / nodeSamplescan.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeSamplescan.c
4  *        Support routines for sample scans of relations (table sampling).
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/executor/nodeSamplescan.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include "access/hash.h"
18 #include "access/relscan.h"
19 #include "access/tsmapi.h"
20 #include "executor/executor.h"
21 #include "executor/nodeSamplescan.h"
22 #include "miscadmin.h"
23 #include "pgstat.h"
24 #include "storage/predicate.h"
25 #include "utils/rel.h"
26 #include "utils/tqual.h"
27
28 static void InitScanRelation(SampleScanState *node, EState *estate, int eflags);
29 static TupleTableSlot *SampleNext(SampleScanState *node);
30 static void tablesample_init(SampleScanState *scanstate);
31 static HeapTuple tablesample_getnext(SampleScanState *scanstate);
32 static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
33                                    HeapScanDesc scan);
34
35 /* ----------------------------------------------------------------
36  *                                              Scan Support
37  * ----------------------------------------------------------------
38  */
39
40 /* ----------------------------------------------------------------
41  *              SampleNext
42  *
43  *              This is a workhorse for ExecSampleScan
44  * ----------------------------------------------------------------
45  */
46 static TupleTableSlot *
47 SampleNext(SampleScanState *node)
48 {
49         HeapTuple       tuple;
50         TupleTableSlot *slot;
51
52         /*
53          * if this is first call within a scan, initialize
54          */
55         if (!node->begun)
56                 tablesample_init(node);
57
58         /*
59          * get the next tuple, and store it in our result slot
60          */
61         tuple = tablesample_getnext(node);
62
63         slot = node->ss.ss_ScanTupleSlot;
64
65         if (tuple)
66                 ExecStoreTuple(tuple,   /* tuple to store */
67                                            slot,        /* slot to store in */
68                                            node->ss.ss_currentScanDesc->rs_cbuf,        /* tuple's buffer */
69                                            false);      /* don't pfree this pointer */
70         else
71                 ExecClearTuple(slot);
72
73         return slot;
74 }
75
76 /*
77  * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual
78  */
79 static bool
80 SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
81 {
82         /*
83          * No need to recheck for SampleScan, since like SeqScan we don't pass any
84          * checkable keys to heap_beginscan.
85          */
86         return true;
87 }
88
89 /* ----------------------------------------------------------------
90  *              ExecSampleScan(node)
91  *
92  *              Scans the relation using the sampling method and returns
93  *              the next qualifying tuple.
94  *              We call the ExecScan() routine and pass it the appropriate
95  *              access method functions.
96  * ----------------------------------------------------------------
97  */
98 TupleTableSlot *
99 ExecSampleScan(SampleScanState *node)
100 {
101         return ExecScan((ScanState *) node,
102                                         (ExecScanAccessMtd) SampleNext,
103                                         (ExecScanRecheckMtd) SampleRecheck);
104 }
105
106 /* ----------------------------------------------------------------
107  *              InitScanRelation
108  *
109  *              Set up to access the scan relation.
110  * ----------------------------------------------------------------
111  */
112 static void
113 InitScanRelation(SampleScanState *node, EState *estate, int eflags)
114 {
115         Relation        currentRelation;
116
117         /*
118          * get the relation object id from the relid'th entry in the range table,
119          * open that relation and acquire appropriate lock on it.
120          */
121         currentRelation = ExecOpenScanRelation(estate,
122                                                    ((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
123                                                                                    eflags);
124
125         node->ss.ss_currentRelation = currentRelation;
126
127         /* we won't set up the HeapScanDesc till later */
128         node->ss.ss_currentScanDesc = NULL;
129
130         /* and report the scan tuple slot's rowtype */
131         ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation));
132 }
133
134
135 /* ----------------------------------------------------------------
136  *              ExecInitSampleScan
137  * ----------------------------------------------------------------
138  */
139 SampleScanState *
140 ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
141 {
142         SampleScanState *scanstate;
143         TableSampleClause *tsc = node->tablesample;
144         TsmRoutine *tsm;
145
146         Assert(outerPlan(node) == NULL);
147         Assert(innerPlan(node) == NULL);
148
149         /*
150          * create state structure
151          */
152         scanstate = makeNode(SampleScanState);
153         scanstate->ss.ps.plan = (Plan *) node;
154         scanstate->ss.ps.state = estate;
155
156         /*
157          * Miscellaneous initialization
158          *
159          * create expression context for node
160          */
161         ExecAssignExprContext(estate, &scanstate->ss.ps);
162
163         /*
164          * initialize child expressions
165          */
166         scanstate->ss.ps.targetlist = (List *)
167                 ExecInitExpr((Expr *) node->scan.plan.targetlist,
168                                          (PlanState *) scanstate);
169         scanstate->ss.ps.qual = (List *)
170                 ExecInitExpr((Expr *) node->scan.plan.qual,
171                                          (PlanState *) scanstate);
172
173         scanstate->args = (List *)
174                 ExecInitExpr((Expr *) tsc->args,
175                                          (PlanState *) scanstate);
176         scanstate->repeatable =
177                 ExecInitExpr(tsc->repeatable,
178                                          (PlanState *) scanstate);
179
180         /*
181          * tuple table initialization
182          */
183         ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
184         ExecInitScanTupleSlot(estate, &scanstate->ss);
185
186         /*
187          * initialize scan relation
188          */
189         InitScanRelation(scanstate, estate, eflags);
190
191         scanstate->ss.ps.ps_TupFromTlist = false;
192
193         /*
194          * Initialize result tuple type and projection info.
195          */
196         ExecAssignResultTypeFromTL(&scanstate->ss.ps);
197         ExecAssignScanProjectionInfo(&scanstate->ss);
198
199         /*
200          * If we don't have a REPEATABLE clause, select a random seed.  We want to
201          * do this just once, since the seed shouldn't change over rescans.
202          */
203         if (tsc->repeatable == NULL)
204                 scanstate->seed = random();
205
206         /*
207          * Finally, initialize the TABLESAMPLE method handler.
208          */
209         tsm = GetTsmRoutine(tsc->tsmhandler);
210         scanstate->tsmroutine = tsm;
211         scanstate->tsm_state = NULL;
212
213         if (tsm->InitSampleScan)
214                 tsm->InitSampleScan(scanstate, eflags);
215
216         /* We'll do BeginSampleScan later; we can't evaluate params yet */
217         scanstate->begun = false;
218
219         return scanstate;
220 }
221
222 /* ----------------------------------------------------------------
223  *              ExecEndSampleScan
224  *
225  *              frees any storage allocated through C routines.
226  * ----------------------------------------------------------------
227  */
228 void
229 ExecEndSampleScan(SampleScanState *node)
230 {
231         /*
232          * Tell sampling function that we finished the scan.
233          */
234         if (node->tsmroutine->EndSampleScan)
235                 node->tsmroutine->EndSampleScan(node);
236
237         /*
238          * Free the exprcontext
239          */
240         ExecFreeExprContext(&node->ss.ps);
241
242         /*
243          * clean out the tuple table
244          */
245         ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
246         ExecClearTuple(node->ss.ss_ScanTupleSlot);
247
248         /*
249          * close heap scan
250          */
251         if (node->ss.ss_currentScanDesc)
252                 heap_endscan(node->ss.ss_currentScanDesc);
253
254         /*
255          * close the heap relation.
256          */
257         ExecCloseScanRelation(node->ss.ss_currentRelation);
258 }
259
260 /* ----------------------------------------------------------------
261  *              ExecReScanSampleScan
262  *
263  *              Rescans the relation.
264  *
265  * ----------------------------------------------------------------
266  */
267 void
268 ExecReScanSampleScan(SampleScanState *node)
269 {
270         /* Remember we need to do BeginSampleScan again (if we did it at all) */
271         node->begun = false;
272
273         ExecScanReScan(&node->ss);
274 }
275
276
277 /*
278  * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
279  */
280 static void
281 tablesample_init(SampleScanState *scanstate)
282 {
283         TsmRoutine *tsm = scanstate->tsmroutine;
284         ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
285         Datum      *params;
286         Datum           datum;
287         bool            isnull;
288         uint32          seed;
289         bool            allow_sync;
290         int                     i;
291         ListCell   *arg;
292
293         params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
294
295         i = 0;
296         foreach(arg, scanstate->args)
297         {
298                 ExprState  *argstate = (ExprState *) lfirst(arg);
299
300                 params[i] = ExecEvalExprSwitchContext(argstate,
301                                                                                           econtext,
302                                                                                           &isnull,
303                                                                                           NULL);
304                 if (isnull)
305                         ereport(ERROR,
306                                         (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
307                                          errmsg("TABLESAMPLE parameter cannot be null")));
308                 i++;
309         }
310
311         if (scanstate->repeatable)
312         {
313                 datum = ExecEvalExprSwitchContext(scanstate->repeatable,
314                                                                                   econtext,
315                                                                                   &isnull,
316                                                                                   NULL);
317                 if (isnull)
318                         ereport(ERROR,
319                                         (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
320                                  errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
321
322                 /*
323                  * The REPEATABLE parameter has been coerced to float8 by the parser.
324                  * The reason for using float8 at the SQL level is that it will
325                  * produce unsurprising results both for users used to databases that
326                  * accept only integers in the REPEATABLE clause and for those who
327                  * might expect that REPEATABLE works like setseed() (a float in the
328                  * range from -1 to 1).
329                  *
330                  * We use hashfloat8() to convert the supplied value into a suitable
331                  * seed.  For regression-testing purposes, that has the convenient
332                  * property that REPEATABLE(0) gives a machine-independent result.
333                  */
334                 seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
335         }
336         else
337         {
338                 /* Use the seed selected by ExecInitSampleScan */
339                 seed = scanstate->seed;
340         }
341
342         /* Set default values for params that BeginSampleScan can adjust */
343         scanstate->use_bulkread = true;
344         scanstate->use_pagemode = true;
345
346         /* Let tablesample method do its thing */
347         tsm->BeginSampleScan(scanstate,
348                                                  params,
349                                                  list_length(scanstate->args),
350                                                  seed);
351
352         /* We'll use syncscan if there's no NextSampleBlock function */
353         allow_sync = (tsm->NextSampleBlock == NULL);
354
355         /* Now we can create or reset the HeapScanDesc */
356         if (scanstate->ss.ss_currentScanDesc == NULL)
357         {
358                 scanstate->ss.ss_currentScanDesc =
359                         heap_beginscan_sampling(scanstate->ss.ss_currentRelation,
360                                                                         scanstate->ss.ps.state->es_snapshot,
361                                                                         0, NULL,
362                                                                         scanstate->use_bulkread,
363                                                                         allow_sync,
364                                                                         scanstate->use_pagemode);
365         }
366         else
367         {
368                 heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
369                                                            scanstate->use_bulkread,
370                                                            allow_sync,
371                                                            scanstate->use_pagemode);
372         }
373
374         pfree(params);
375
376         /* And we're initialized. */
377         scanstate->begun = true;
378 }
379
380 /*
381  * Get next tuple from TABLESAMPLE method.
382  *
383  * Note: an awful lot of this is copied-and-pasted from heapam.c.  It would
384  * perhaps be better to refactor to share more code.
385  */
386 static HeapTuple
387 tablesample_getnext(SampleScanState *scanstate)
388 {
389         TsmRoutine *tsm = scanstate->tsmroutine;
390         HeapScanDesc scan = scanstate->ss.ss_currentScanDesc;
391         HeapTuple       tuple = &(scan->rs_ctup);
392         Snapshot        snapshot = scan->rs_snapshot;
393         bool            pagemode = scan->rs_pageatatime;
394         BlockNumber blockno;
395         Page            page;
396         bool            all_visible;
397         OffsetNumber maxoffset;
398
399         if (!scan->rs_inited)
400         {
401                 /*
402                  * return null immediately if relation is empty
403                  */
404                 if (scan->rs_nblocks == 0)
405                 {
406                         Assert(!BufferIsValid(scan->rs_cbuf));
407                         tuple->t_data = NULL;
408                         return NULL;
409                 }
410                 if (tsm->NextSampleBlock)
411                 {
412                         blockno = tsm->NextSampleBlock(scanstate);
413                         if (!BlockNumberIsValid(blockno))
414                         {
415                                 tuple->t_data = NULL;
416                                 return NULL;
417                         }
418                 }
419                 else
420                         blockno = scan->rs_startblock;
421                 Assert(blockno < scan->rs_nblocks);
422                 heapgetpage(scan, blockno);
423                 scan->rs_inited = true;
424         }
425         else
426         {
427                 /* continue from previously returned page/tuple */
428                 blockno = scan->rs_cblock;              /* current page */
429         }
430
431         /*
432          * When not using pagemode, we must lock the buffer during tuple
433          * visibility checks.
434          */
435         if (!pagemode)
436                 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
437
438         page = (Page) BufferGetPage(scan->rs_cbuf);
439         all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
440         maxoffset = PageGetMaxOffsetNumber(page);
441
442         for (;;)
443         {
444                 OffsetNumber tupoffset;
445                 bool            finished;
446
447                 CHECK_FOR_INTERRUPTS();
448
449                 /* Ask the tablesample method which tuples to check on this page. */
450                 tupoffset = tsm->NextSampleTuple(scanstate,
451                                                                                  blockno,
452                                                                                  maxoffset);
453
454                 if (OffsetNumberIsValid(tupoffset))
455                 {
456                         ItemId          itemid;
457                         bool            visible;
458
459                         /* Skip invalid tuple pointers. */
460                         itemid = PageGetItemId(page, tupoffset);
461                         if (!ItemIdIsNormal(itemid))
462                                 continue;
463
464                         tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
465                         tuple->t_len = ItemIdGetLength(itemid);
466                         ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
467
468                         if (all_visible)
469                                 visible = true;
470                         else
471                                 visible = SampleTupleVisible(tuple, tupoffset, scan);
472
473                         /* in pagemode, heapgetpage did this for us */
474                         if (!pagemode)
475                                 CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
476                                                                                                 scan->rs_cbuf, snapshot);
477
478                         if (visible)
479                         {
480                                 /* Found visible tuple, return it. */
481                                 if (!pagemode)
482                                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
483                                 break;
484                         }
485                         else
486                         {
487                                 /* Try next tuple from same page. */
488                                 continue;
489                         }
490                 }
491
492                 /*
493                  * if we get here, it means we've exhausted the items on this page and
494                  * it's time to move to the next.
495                  */
496                 if (!pagemode)
497                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
498
499                 if (tsm->NextSampleBlock)
500                 {
501                         blockno = tsm->NextSampleBlock(scanstate);
502                         Assert(!scan->rs_syncscan);
503                         finished = !BlockNumberIsValid(blockno);
504                 }
505                 else
506                 {
507                         /* Without NextSampleBlock, just do a plain forward seqscan. */
508                         blockno++;
509                         if (blockno >= scan->rs_nblocks)
510                                 blockno = 0;
511
512                         /*
513                          * Report our new scan position for synchronization purposes.
514                          *
515                          * Note: we do this before checking for end of scan so that the
516                          * final state of the position hint is back at the start of the
517                          * rel.  That's not strictly necessary, but otherwise when you run
518                          * the same query multiple times the starting position would shift
519                          * a little bit backwards on every invocation, which is confusing.
520                          * We don't guarantee any specific ordering in general, though.
521                          */
522                         if (scan->rs_syncscan)
523                                 ss_report_location(scan->rs_rd, blockno);
524
525                         finished = (blockno == scan->rs_startblock);
526                 }
527
528                 /*
529                  * Reached end of scan?
530                  */
531                 if (finished)
532                 {
533                         if (BufferIsValid(scan->rs_cbuf))
534                                 ReleaseBuffer(scan->rs_cbuf);
535                         scan->rs_cbuf = InvalidBuffer;
536                         scan->rs_cblock = InvalidBlockNumber;
537                         tuple->t_data = NULL;
538                         scan->rs_inited = false;
539                         return NULL;
540                 }
541
542                 Assert(blockno < scan->rs_nblocks);
543                 heapgetpage(scan, blockno);
544
545                 /* Re-establish state for new page */
546                 if (!pagemode)
547                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
548
549                 page = (Page) BufferGetPage(scan->rs_cbuf);
550                 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
551                 maxoffset = PageGetMaxOffsetNumber(page);
552         }
553
554         /* Count successfully-fetched tuples as heap fetches */
555         pgstat_count_heap_getnext(scan->rs_rd);
556
557         return &(scan->rs_ctup);
558 }
559
560 /*
561  * Check visibility of the tuple.
562  */
563 static bool
564 SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
565 {
566         if (scan->rs_pageatatime)
567         {
568                 /*
569                  * In pageatatime mode, heapgetpage() already did visibility checks,
570                  * so just look at the info it left in rs_vistuples[].
571                  *
572                  * We use a binary search over the known-sorted array.  Note: we could
573                  * save some effort if we insisted that NextSampleTuple select tuples
574                  * in increasing order, but it's not clear that there would be enough
575                  * gain to justify the restriction.
576                  */
577                 int                     start = 0,
578                                         end = scan->rs_ntuples - 1;
579
580                 while (start <= end)
581                 {
582                         int                     mid = (start + end) / 2;
583                         OffsetNumber curoffset = scan->rs_vistuples[mid];
584
585                         if (tupoffset == curoffset)
586                                 return true;
587                         else if (tupoffset < curoffset)
588                                 end = mid - 1;
589                         else
590                                 start = mid + 1;
591                 }
592
593                 return false;
594         }
595         else
596         {
597                 /* Otherwise, we have to check the tuple individually. */
598                 return HeapTupleSatisfiesVisibility(tuple,
599                                                                                         scan->rs_snapshot,
600                                                                                         scan->rs_cbuf);
601         }
602 }