]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeSamplescan.c
d38265e8104825e382d3fb66dade78edc312858d
[postgresql] / src / backend / executor / nodeSamplescan.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeSamplescan.c
4  *        Support routines for sample scans of relations (table sampling).
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/executor/nodeSamplescan.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include "access/hash.h"
18 #include "access/relscan.h"
19 #include "access/tsmapi.h"
20 #include "executor/executor.h"
21 #include "executor/nodeSamplescan.h"
22 #include "miscadmin.h"
23 #include "pgstat.h"
24 #include "storage/predicate.h"
25 #include "utils/builtins.h"
26 #include "utils/rel.h"
27 #include "utils/tqual.h"
28
29 static void InitScanRelation(SampleScanState *node, EState *estate, int eflags);
30 static TupleTableSlot *SampleNext(SampleScanState *node);
31 static void tablesample_init(SampleScanState *scanstate);
32 static HeapTuple tablesample_getnext(SampleScanState *scanstate);
33 static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
34                                    HeapScanDesc scan);
35
36 /* ----------------------------------------------------------------
37  *                                              Scan Support
38  * ----------------------------------------------------------------
39  */
40
41 /* ----------------------------------------------------------------
42  *              SampleNext
43  *
44  *              This is a workhorse for ExecSampleScan
45  * ----------------------------------------------------------------
46  */
47 static TupleTableSlot *
48 SampleNext(SampleScanState *node)
49 {
50         HeapTuple       tuple;
51         TupleTableSlot *slot;
52
53         /*
54          * if this is first call within a scan, initialize
55          */
56         if (!node->begun)
57                 tablesample_init(node);
58
59         /*
60          * get the next tuple, and store it in our result slot
61          */
62         tuple = tablesample_getnext(node);
63
64         slot = node->ss.ss_ScanTupleSlot;
65
66         if (tuple)
67                 ExecStoreTuple(tuple,   /* tuple to store */
68                                            slot,        /* slot to store in */
69                                            node->ss.ss_currentScanDesc->rs_cbuf,        /* tuple's buffer */
70                                            false);      /* don't pfree this pointer */
71         else
72                 ExecClearTuple(slot);
73
74         return slot;
75 }
76
77 /*
78  * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual
79  */
80 static bool
81 SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
82 {
83         /*
84          * No need to recheck for SampleScan, since like SeqScan we don't pass any
85          * checkable keys to heap_beginscan.
86          */
87         return true;
88 }
89
90 /* ----------------------------------------------------------------
91  *              ExecSampleScan(node)
92  *
93  *              Scans the relation using the sampling method and returns
94  *              the next qualifying tuple.
95  *              We call the ExecScan() routine and pass it the appropriate
96  *              access method functions.
97  * ----------------------------------------------------------------
98  */
99 TupleTableSlot *
100 ExecSampleScan(SampleScanState *node)
101 {
102         return ExecScan((ScanState *) node,
103                                         (ExecScanAccessMtd) SampleNext,
104                                         (ExecScanRecheckMtd) SampleRecheck);
105 }
106
107 /* ----------------------------------------------------------------
108  *              InitScanRelation
109  *
110  *              Set up to access the scan relation.
111  * ----------------------------------------------------------------
112  */
113 static void
114 InitScanRelation(SampleScanState *node, EState *estate, int eflags)
115 {
116         Relation        currentRelation;
117
118         /*
119          * get the relation object id from the relid'th entry in the range table,
120          * open that relation and acquire appropriate lock on it.
121          */
122         currentRelation = ExecOpenScanRelation(estate,
123                                                    ((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
124                                                                                    eflags);
125
126         node->ss.ss_currentRelation = currentRelation;
127
128         /* we won't set up the HeapScanDesc till later */
129         node->ss.ss_currentScanDesc = NULL;
130
131         /* and report the scan tuple slot's rowtype */
132         ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation));
133 }
134
135
136 /* ----------------------------------------------------------------
137  *              ExecInitSampleScan
138  * ----------------------------------------------------------------
139  */
140 SampleScanState *
141 ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
142 {
143         SampleScanState *scanstate;
144         TableSampleClause *tsc = node->tablesample;
145         TsmRoutine *tsm;
146
147         Assert(outerPlan(node) == NULL);
148         Assert(innerPlan(node) == NULL);
149
150         /*
151          * create state structure
152          */
153         scanstate = makeNode(SampleScanState);
154         scanstate->ss.ps.plan = (Plan *) node;
155         scanstate->ss.ps.state = estate;
156
157         /*
158          * Miscellaneous initialization
159          *
160          * create expression context for node
161          */
162         ExecAssignExprContext(estate, &scanstate->ss.ps);
163
164         /*
165          * initialize child expressions
166          */
167         scanstate->ss.ps.targetlist = (List *)
168                 ExecInitExpr((Expr *) node->scan.plan.targetlist,
169                                          (PlanState *) scanstate);
170         scanstate->ss.ps.qual = (List *)
171                 ExecInitExpr((Expr *) node->scan.plan.qual,
172                                          (PlanState *) scanstate);
173
174         scanstate->args = (List *)
175                 ExecInitExpr((Expr *) tsc->args,
176                                          (PlanState *) scanstate);
177         scanstate->repeatable =
178                 ExecInitExpr(tsc->repeatable,
179                                          (PlanState *) scanstate);
180
181         /*
182          * tuple table initialization
183          */
184         ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
185         ExecInitScanTupleSlot(estate, &scanstate->ss);
186
187         /*
188          * initialize scan relation
189          */
190         InitScanRelation(scanstate, estate, eflags);
191
192         /*
193          * Initialize result tuple type and projection info.
194          */
195         ExecAssignResultTypeFromTL(&scanstate->ss.ps);
196         ExecAssignScanProjectionInfo(&scanstate->ss);
197
198         /*
199          * If we don't have a REPEATABLE clause, select a random seed.  We want to
200          * do this just once, since the seed shouldn't change over rescans.
201          */
202         if (tsc->repeatable == NULL)
203                 scanstate->seed = random();
204
205         /*
206          * Finally, initialize the TABLESAMPLE method handler.
207          */
208         tsm = GetTsmRoutine(tsc->tsmhandler);
209         scanstate->tsmroutine = tsm;
210         scanstate->tsm_state = NULL;
211
212         if (tsm->InitSampleScan)
213                 tsm->InitSampleScan(scanstate, eflags);
214
215         /* We'll do BeginSampleScan later; we can't evaluate params yet */
216         scanstate->begun = false;
217
218         return scanstate;
219 }
220
221 /* ----------------------------------------------------------------
222  *              ExecEndSampleScan
223  *
224  *              frees any storage allocated through C routines.
225  * ----------------------------------------------------------------
226  */
227 void
228 ExecEndSampleScan(SampleScanState *node)
229 {
230         /*
231          * Tell sampling function that we finished the scan.
232          */
233         if (node->tsmroutine->EndSampleScan)
234                 node->tsmroutine->EndSampleScan(node);
235
236         /*
237          * Free the exprcontext
238          */
239         ExecFreeExprContext(&node->ss.ps);
240
241         /*
242          * clean out the tuple table
243          */
244         ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
245         ExecClearTuple(node->ss.ss_ScanTupleSlot);
246
247         /*
248          * close heap scan
249          */
250         if (node->ss.ss_currentScanDesc)
251                 heap_endscan(node->ss.ss_currentScanDesc);
252
253         /*
254          * close the heap relation.
255          */
256         ExecCloseScanRelation(node->ss.ss_currentRelation);
257 }
258
259 /* ----------------------------------------------------------------
260  *              ExecReScanSampleScan
261  *
262  *              Rescans the relation.
263  *
264  * ----------------------------------------------------------------
265  */
266 void
267 ExecReScanSampleScan(SampleScanState *node)
268 {
269         /* Remember we need to do BeginSampleScan again (if we did it at all) */
270         node->begun = false;
271
272         ExecScanReScan(&node->ss);
273 }
274
275
276 /*
277  * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
278  */
279 static void
280 tablesample_init(SampleScanState *scanstate)
281 {
282         TsmRoutine *tsm = scanstate->tsmroutine;
283         ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
284         Datum      *params;
285         Datum           datum;
286         bool            isnull;
287         uint32          seed;
288         bool            allow_sync;
289         int                     i;
290         ListCell   *arg;
291
292         params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
293
294         i = 0;
295         foreach(arg, scanstate->args)
296         {
297                 ExprState  *argstate = (ExprState *) lfirst(arg);
298
299                 params[i] = ExecEvalExprSwitchContext(argstate,
300                                                                                           econtext,
301                                                                                           &isnull);
302                 if (isnull)
303                         ereport(ERROR,
304                                         (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
305                                          errmsg("TABLESAMPLE parameter cannot be null")));
306                 i++;
307         }
308
309         if (scanstate->repeatable)
310         {
311                 datum = ExecEvalExprSwitchContext(scanstate->repeatable,
312                                                                                   econtext,
313                                                                                   &isnull);
314                 if (isnull)
315                         ereport(ERROR,
316                                         (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
317                                  errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
318
319                 /*
320                  * The REPEATABLE parameter has been coerced to float8 by the parser.
321                  * The reason for using float8 at the SQL level is that it will
322                  * produce unsurprising results both for users used to databases that
323                  * accept only integers in the REPEATABLE clause and for those who
324                  * might expect that REPEATABLE works like setseed() (a float in the
325                  * range from -1 to 1).
326                  *
327                  * We use hashfloat8() to convert the supplied value into a suitable
328                  * seed.  For regression-testing purposes, that has the convenient
329                  * property that REPEATABLE(0) gives a machine-independent result.
330                  */
331                 seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
332         }
333         else
334         {
335                 /* Use the seed selected by ExecInitSampleScan */
336                 seed = scanstate->seed;
337         }
338
339         /* Set default values for params that BeginSampleScan can adjust */
340         scanstate->use_bulkread = true;
341         scanstate->use_pagemode = true;
342
343         /* Let tablesample method do its thing */
344         tsm->BeginSampleScan(scanstate,
345                                                  params,
346                                                  list_length(scanstate->args),
347                                                  seed);
348
349         /* We'll use syncscan if there's no NextSampleBlock function */
350         allow_sync = (tsm->NextSampleBlock == NULL);
351
352         /* Now we can create or reset the HeapScanDesc */
353         if (scanstate->ss.ss_currentScanDesc == NULL)
354         {
355                 scanstate->ss.ss_currentScanDesc =
356                         heap_beginscan_sampling(scanstate->ss.ss_currentRelation,
357                                                                         scanstate->ss.ps.state->es_snapshot,
358                                                                         0, NULL,
359                                                                         scanstate->use_bulkread,
360                                                                         allow_sync,
361                                                                         scanstate->use_pagemode);
362         }
363         else
364         {
365                 heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
366                                                            scanstate->use_bulkread,
367                                                            allow_sync,
368                                                            scanstate->use_pagemode);
369         }
370
371         pfree(params);
372
373         /* And we're initialized. */
374         scanstate->begun = true;
375 }
376
377 /*
378  * Get next tuple from TABLESAMPLE method.
379  *
380  * Note: an awful lot of this is copied-and-pasted from heapam.c.  It would
381  * perhaps be better to refactor to share more code.
382  */
383 static HeapTuple
384 tablesample_getnext(SampleScanState *scanstate)
385 {
386         TsmRoutine *tsm = scanstate->tsmroutine;
387         HeapScanDesc scan = scanstate->ss.ss_currentScanDesc;
388         HeapTuple       tuple = &(scan->rs_ctup);
389         Snapshot        snapshot = scan->rs_snapshot;
390         bool            pagemode = scan->rs_pageatatime;
391         BlockNumber blockno;
392         Page            page;
393         bool            all_visible;
394         OffsetNumber maxoffset;
395
396         if (!scan->rs_inited)
397         {
398                 /*
399                  * return null immediately if relation is empty
400                  */
401                 if (scan->rs_nblocks == 0)
402                 {
403                         Assert(!BufferIsValid(scan->rs_cbuf));
404                         tuple->t_data = NULL;
405                         return NULL;
406                 }
407                 if (tsm->NextSampleBlock)
408                 {
409                         blockno = tsm->NextSampleBlock(scanstate);
410                         if (!BlockNumberIsValid(blockno))
411                         {
412                                 tuple->t_data = NULL;
413                                 return NULL;
414                         }
415                 }
416                 else
417                         blockno = scan->rs_startblock;
418                 Assert(blockno < scan->rs_nblocks);
419                 heapgetpage(scan, blockno);
420                 scan->rs_inited = true;
421         }
422         else
423         {
424                 /* continue from previously returned page/tuple */
425                 blockno = scan->rs_cblock;              /* current page */
426         }
427
428         /*
429          * When not using pagemode, we must lock the buffer during tuple
430          * visibility checks.
431          */
432         if (!pagemode)
433                 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
434
435         page = (Page) BufferGetPage(scan->rs_cbuf);
436         all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
437         maxoffset = PageGetMaxOffsetNumber(page);
438
439         for (;;)
440         {
441                 OffsetNumber tupoffset;
442                 bool            finished;
443
444                 CHECK_FOR_INTERRUPTS();
445
446                 /* Ask the tablesample method which tuples to check on this page. */
447                 tupoffset = tsm->NextSampleTuple(scanstate,
448                                                                                  blockno,
449                                                                                  maxoffset);
450
451                 if (OffsetNumberIsValid(tupoffset))
452                 {
453                         ItemId          itemid;
454                         bool            visible;
455
456                         /* Skip invalid tuple pointers. */
457                         itemid = PageGetItemId(page, tupoffset);
458                         if (!ItemIdIsNormal(itemid))
459                                 continue;
460
461                         tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
462                         tuple->t_len = ItemIdGetLength(itemid);
463                         ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
464
465                         if (all_visible)
466                                 visible = true;
467                         else
468                                 visible = SampleTupleVisible(tuple, tupoffset, scan);
469
470                         /* in pagemode, heapgetpage did this for us */
471                         if (!pagemode)
472                                 CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
473                                                                                                 scan->rs_cbuf, snapshot);
474
475                         if (visible)
476                         {
477                                 /* Found visible tuple, return it. */
478                                 if (!pagemode)
479                                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
480                                 break;
481                         }
482                         else
483                         {
484                                 /* Try next tuple from same page. */
485                                 continue;
486                         }
487                 }
488
489                 /*
490                  * if we get here, it means we've exhausted the items on this page and
491                  * it's time to move to the next.
492                  */
493                 if (!pagemode)
494                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
495
496                 if (tsm->NextSampleBlock)
497                 {
498                         blockno = tsm->NextSampleBlock(scanstate);
499                         Assert(!scan->rs_syncscan);
500                         finished = !BlockNumberIsValid(blockno);
501                 }
502                 else
503                 {
504                         /* Without NextSampleBlock, just do a plain forward seqscan. */
505                         blockno++;
506                         if (blockno >= scan->rs_nblocks)
507                                 blockno = 0;
508
509                         /*
510                          * Report our new scan position for synchronization purposes.
511                          *
512                          * Note: we do this before checking for end of scan so that the
513                          * final state of the position hint is back at the start of the
514                          * rel.  That's not strictly necessary, but otherwise when you run
515                          * the same query multiple times the starting position would shift
516                          * a little bit backwards on every invocation, which is confusing.
517                          * We don't guarantee any specific ordering in general, though.
518                          */
519                         if (scan->rs_syncscan)
520                                 ss_report_location(scan->rs_rd, blockno);
521
522                         finished = (blockno == scan->rs_startblock);
523                 }
524
525                 /*
526                  * Reached end of scan?
527                  */
528                 if (finished)
529                 {
530                         if (BufferIsValid(scan->rs_cbuf))
531                                 ReleaseBuffer(scan->rs_cbuf);
532                         scan->rs_cbuf = InvalidBuffer;
533                         scan->rs_cblock = InvalidBlockNumber;
534                         tuple->t_data = NULL;
535                         scan->rs_inited = false;
536                         return NULL;
537                 }
538
539                 Assert(blockno < scan->rs_nblocks);
540                 heapgetpage(scan, blockno);
541
542                 /* Re-establish state for new page */
543                 if (!pagemode)
544                         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
545
546                 page = (Page) BufferGetPage(scan->rs_cbuf);
547                 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
548                 maxoffset = PageGetMaxOffsetNumber(page);
549         }
550
551         /* Count successfully-fetched tuples as heap fetches */
552         pgstat_count_heap_getnext(scan->rs_rd);
553
554         return &(scan->rs_ctup);
555 }
556
557 /*
558  * Check visibility of the tuple.
559  */
560 static bool
561 SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
562 {
563         if (scan->rs_pageatatime)
564         {
565                 /*
566                  * In pageatatime mode, heapgetpage() already did visibility checks,
567                  * so just look at the info it left in rs_vistuples[].
568                  *
569                  * We use a binary search over the known-sorted array.  Note: we could
570                  * save some effort if we insisted that NextSampleTuple select tuples
571                  * in increasing order, but it's not clear that there would be enough
572                  * gain to justify the restriction.
573                  */
574                 int                     start = 0,
575                                         end = scan->rs_ntuples - 1;
576
577                 while (start <= end)
578                 {
579                         int                     mid = (start + end) / 2;
580                         OffsetNumber curoffset = scan->rs_vistuples[mid];
581
582                         if (tupoffset == curoffset)
583                                 return true;
584                         else if (tupoffset < curoffset)
585                                 end = mid - 1;
586                         else
587                                 start = mid + 1;
588                 }
589
590                 return false;
591         }
592         else
593         {
594                 /* Otherwise, we have to check the tuple individually. */
595                 return HeapTupleSatisfiesVisibility(tuple,
596                                                                                         scan->rs_snapshot,
597                                                                                         scan->rs_cbuf);
598         }
599 }