* nodeSamplescan.c
* Support routines for sample scans of relations (table sampling).
*
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
*/
#include "postgres.h"
-#include "access/tablesample.h"
+#include "access/hash.h"
+#include "access/heapam.h"
+#include "access/relscan.h"
+#include "access/tsmapi.h"
#include "executor/executor.h"
#include "executor/nodeSamplescan.h"
#include "miscadmin.h"
-#include "parser/parsetree.h"
#include "pgstat.h"
-#include "storage/bufmgr.h"
#include "storage/predicate.h"
+#include "utils/builtins.h"
#include "utils/rel.h"
-#include "utils/syscache.h"
-#include "utils/tqual.h"
-static void InitScanRelation(SampleScanState *node, EState *estate,
- int eflags, TableSampleClause *tablesample);
static TupleTableSlot *SampleNext(SampleScanState *node);
-
+static void tablesample_init(SampleScanState *scanstate);
+static HeapTuple tablesample_getnext(SampleScanState *scanstate);
+static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
+ HeapScanDesc scan);
/* ----------------------------------------------------------------
* Scan Support
static TupleTableSlot *
SampleNext(SampleScanState *node)
{
- TupleTableSlot *slot;
- TableSampleDesc *tsdesc;
- HeapTuple tuple;
+ HeapTuple tuple;
+ TupleTableSlot *slot;
/*
- * get information from the scan state
+ * if this is first call within a scan, initialize
*/
- slot = node->ss.ss_ScanTupleSlot;
- tsdesc = node->tsdesc;
+ if (!node->begun)
+ tablesample_init(node);
- tuple = tablesample_getnext(tsdesc);
+ /*
+ * get the next tuple, and store it in our result slot
+ */
+ tuple = tablesample_getnext(node);
+
+ slot = node->ss.ss_ScanTupleSlot;
if (tuple)
- ExecStoreTuple(tuple, /* tuple to store */
- slot, /* slot to store in */
- tsdesc->heapScan->rs_cbuf, /* buffer associated with this tuple */
- false); /* don't pfree this pointer */
+ ExecStoreBufferHeapTuple(tuple, /* tuple to store */
+ slot, /* slot to store in */
+ node->ss.ss_currentScanDesc->rs_cbuf); /* tuple's buffer */
else
ExecClearTuple(slot);
static bool
SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
{
- /* No need to recheck for SampleScan */
+ /*
+ * No need to recheck for SampleScan, since like SeqScan we don't pass any
+ * checkable keys to heap_beginscan.
+ */
return true;
}
* access method functions.
* ----------------------------------------------------------------
*/
-TupleTableSlot *
-ExecSampleScan(SampleScanState *node)
+static TupleTableSlot *
+ExecSampleScan(PlanState *pstate)
{
- return ExecScan((ScanState *) node,
+ SampleScanState *node = castNode(SampleScanState, pstate);
+
+ return ExecScan(&node->ss,
(ExecScanAccessMtd) SampleNext,
(ExecScanRecheckMtd) SampleRecheck);
}
-/* ----------------------------------------------------------------
- * InitScanRelation
- *
- * Set up to access the scan relation.
- * ----------------------------------------------------------------
- */
-static void
-InitScanRelation(SampleScanState *node, EState *estate, int eflags,
- TableSampleClause *tablesample)
-{
- Relation currentRelation;
-
- /*
- * get the relation object id from the relid'th entry in the range table,
- * open that relation and acquire appropriate lock on it.
- */
- currentRelation = ExecOpenScanRelation(estate,
- ((SampleScan *) node->ss.ps.plan)->scanrelid,
- eflags);
-
- node->ss.ss_currentRelation = currentRelation;
-
- /*
- * Even though we aren't going to do a conventional seqscan, it is useful
- * to create a HeapScanDesc --- many of the fields in it are usable.
- */
- node->ss.ss_currentScanDesc =
- heap_beginscan_sampling(currentRelation, estate->es_snapshot, 0, NULL,
- tablesample->tsmseqscan,
- tablesample->tsmpagemode);
-
- /* and report the scan tuple slot's rowtype */
- ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation));
-}
-
-
/* ----------------------------------------------------------------
* ExecInitSampleScan
* ----------------------------------------------------------------
ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
{
SampleScanState *scanstate;
- RangeTblEntry *rte = rt_fetch(node->scanrelid,
- estate->es_range_table);
+ TableSampleClause *tsc = node->tablesample;
+ TsmRoutine *tsm;
Assert(outerPlan(node) == NULL);
Assert(innerPlan(node) == NULL);
- Assert(rte->tablesample != NULL);
/*
* create state structure
scanstate = makeNode(SampleScanState);
scanstate->ss.ps.plan = (Plan *) node;
scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecSampleScan;
/*
* Miscellaneous initialization
ExecAssignExprContext(estate, &scanstate->ss.ps);
/*
- * initialize child expressions
+ * open the scan relation
*/
- scanstate->ss.ps.targetlist = (List *)
- ExecInitExpr((Expr *) node->plan.targetlist,
- (PlanState *) scanstate);
- scanstate->ss.ps.qual = (List *)
- ExecInitExpr((Expr *) node->plan.qual,
- (PlanState *) scanstate);
+ scanstate->ss.ss_currentRelation =
+ ExecOpenScanRelation(estate,
+ node->scan.scanrelid,
+ eflags);
+
+ /* we won't set up the HeapScanDesc till later */
+ scanstate->ss.ss_currentScanDesc = NULL;
+
+ /* and create slot with appropriate rowtype */
+ ExecInitScanTupleSlot(estate, &scanstate->ss,
+ RelationGetDescr(scanstate->ss.ss_currentRelation),
+ &TTSOpsBufferHeapTuple);
/*
- * tuple table initialization
+ * Initialize result type and projection.
*/
- ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
- ExecInitScanTupleSlot(estate, &scanstate->ss);
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
/*
- * initialize scan relation
+ * initialize child expressions
*/
- InitScanRelation(scanstate, estate, eflags, rte->tablesample);
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
- scanstate->ss.ps.ps_TupFromTlist = false;
+ scanstate->args = ExecInitExprList(tsc->args, (PlanState *) scanstate);
+ scanstate->repeatable =
+ ExecInitExpr(tsc->repeatable, (PlanState *) scanstate);
/*
- * Initialize result tuple type and projection info.
+ * If we don't have a REPEATABLE clause, select a random seed. We want to
+ * do this just once, since the seed shouldn't change over rescans.
*/
- ExecAssignResultTypeFromTL(&scanstate->ss.ps);
- ExecAssignScanProjectionInfo(&scanstate->ss);
+ if (tsc->repeatable == NULL)
+ scanstate->seed = random();
+
+ /*
+ * Finally, initialize the TABLESAMPLE method handler.
+ */
+ tsm = GetTsmRoutine(tsc->tsmhandler);
+ scanstate->tsmroutine = tsm;
+ scanstate->tsm_state = NULL;
- scanstate->tsdesc = tablesample_init(scanstate, rte->tablesample);
+ if (tsm->InitSampleScan)
+ tsm->InitSampleScan(scanstate, eflags);
+
+ /* We'll do BeginSampleScan later; we can't evaluate params yet */
+ scanstate->begun = false;
return scanstate;
}
/*
* Tell sampling function that we finished the scan.
*/
- tablesample_end(node->tsdesc);
+ if (node->tsmroutine->EndSampleScan)
+ node->tsmroutine->EndSampleScan(node);
/*
* Free the exprcontext
/*
* clean out the tuple table
*/
- ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
ExecClearTuple(node->ss.ss_ScanTupleSlot);
/*
* close heap scan
*/
- heap_endscan(node->ss.ss_currentScanDesc);
-
- /*
- * close the heap relation.
- */
- ExecCloseScanRelation(node->ss.ss_currentRelation);
+ if (node->ss.ss_currentScanDesc)
+ heap_endscan(node->ss.ss_currentScanDesc);
}
-/* ----------------------------------------------------------------
- * Join Support
- * ----------------------------------------------------------------
- */
-
/* ----------------------------------------------------------------
* ExecReScanSampleScan
*
void
ExecReScanSampleScan(SampleScanState *node)
{
- heap_rescan(node->ss.ss_currentScanDesc, NULL);
+ /* Remember we need to do BeginSampleScan again (if we did it at all) */
+ node->begun = false;
+
+ ExecScanReScan(&node->ss);
+}
+
+
+/*
+ * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
+ */
+static void
+tablesample_init(SampleScanState *scanstate)
+{
+ TsmRoutine *tsm = scanstate->tsmroutine;
+ ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
+ Datum *params;
+ Datum datum;
+ bool isnull;
+ uint32 seed;
+ bool allow_sync;
+ int i;
+ ListCell *arg;
+
+ params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
+
+ i = 0;
+ foreach(arg, scanstate->args)
+ {
+ ExprState *argstate = (ExprState *) lfirst(arg);
+
+ params[i] = ExecEvalExprSwitchContext(argstate,
+ econtext,
+ &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+ errmsg("TABLESAMPLE parameter cannot be null")));
+ i++;
+ }
+
+ if (scanstate->repeatable)
+ {
+ datum = ExecEvalExprSwitchContext(scanstate->repeatable,
+ econtext,
+ &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
+ errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
+
+ /*
+ * The REPEATABLE parameter has been coerced to float8 by the parser.
+ * The reason for using float8 at the SQL level is that it will
+ * produce unsurprising results both for users used to databases that
+ * accept only integers in the REPEATABLE clause and for those who
+ * might expect that REPEATABLE works like setseed() (a float in the
+ * range from -1 to 1).
+ *
+ * We use hashfloat8() to convert the supplied value into a suitable
+ * seed. For regression-testing purposes, that has the convenient
+ * property that REPEATABLE(0) gives a machine-independent result.
+ */
+ seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
+ }
+ else
+ {
+ /* Use the seed selected by ExecInitSampleScan */
+ seed = scanstate->seed;
+ }
+
+ /* Set default values for params that BeginSampleScan can adjust */
+ scanstate->use_bulkread = true;
+ scanstate->use_pagemode = true;
+
+ /* Let tablesample method do its thing */
+ tsm->BeginSampleScan(scanstate,
+ params,
+ list_length(scanstate->args),
+ seed);
+
+ /* We'll use syncscan if there's no NextSampleBlock function */
+ allow_sync = (tsm->NextSampleBlock == NULL);
+
+ /* Now we can create or reset the HeapScanDesc */
+ if (scanstate->ss.ss_currentScanDesc == NULL)
+ {
+ scanstate->ss.ss_currentScanDesc =
+ heap_beginscan_sampling(scanstate->ss.ss_currentRelation,
+ scanstate->ss.ps.state->es_snapshot,
+ 0, NULL,
+ scanstate->use_bulkread,
+ allow_sync,
+ scanstate->use_pagemode);
+ }
+ else
+ {
+ heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
+ scanstate->use_bulkread,
+ allow_sync,
+ scanstate->use_pagemode);
+ }
+
+ pfree(params);
+
+ /* And we're initialized. */
+ scanstate->begun = true;
+}
+
+/*
+ * Get next tuple from TABLESAMPLE method.
+ *
+ * Note: an awful lot of this is copied-and-pasted from heapam.c. It would
+ * perhaps be better to refactor to share more code.
+ */
+static HeapTuple
+tablesample_getnext(SampleScanState *scanstate)
+{
+ TsmRoutine *tsm = scanstate->tsmroutine;
+ HeapScanDesc scan = scanstate->ss.ss_currentScanDesc;
+ HeapTuple tuple = &(scan->rs_ctup);
+ Snapshot snapshot = scan->rs_snapshot;
+ bool pagemode = scan->rs_pageatatime;
+ BlockNumber blockno;
+ Page page;
+ bool all_visible;
+ OffsetNumber maxoffset;
+
+ if (!scan->rs_inited)
+ {
+ /*
+ * return null immediately if relation is empty
+ */
+ if (scan->rs_nblocks == 0)
+ {
+ Assert(!BufferIsValid(scan->rs_cbuf));
+ tuple->t_data = NULL;
+ return NULL;
+ }
+ if (tsm->NextSampleBlock)
+ {
+ blockno = tsm->NextSampleBlock(scanstate);
+ if (!BlockNumberIsValid(blockno))
+ {
+ tuple->t_data = NULL;
+ return NULL;
+ }
+ }
+ else
+ blockno = scan->rs_startblock;
+ Assert(blockno < scan->rs_nblocks);
+ heapgetpage(scan, blockno);
+ scan->rs_inited = true;
+ }
+ else
+ {
+ /* continue from previously returned page/tuple */
+ blockno = scan->rs_cblock; /* current page */
+ }
/*
- * Tell sampling function to reset its state for rescan.
+ * When not using pagemode, we must lock the buffer during tuple
+ * visibility checks.
*/
- tablesample_reset(node->tsdesc);
+ if (!pagemode)
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+ page = (Page) BufferGetPage(scan->rs_cbuf);
+ all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
+ maxoffset = PageGetMaxOffsetNumber(page);
+
+ for (;;)
+ {
+ OffsetNumber tupoffset;
+ bool finished;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Ask the tablesample method which tuples to check on this page. */
+ tupoffset = tsm->NextSampleTuple(scanstate,
+ blockno,
+ maxoffset);
+
+ if (OffsetNumberIsValid(tupoffset))
+ {
+ ItemId itemid;
+ bool visible;
+
+ /* Skip invalid tuple pointers. */
+ itemid = PageGetItemId(page, tupoffset);
+ if (!ItemIdIsNormal(itemid))
+ continue;
+
+ tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+ tuple->t_len = ItemIdGetLength(itemid);
+ ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
+
+ if (all_visible)
+ visible = true;
+ else
+ visible = SampleTupleVisible(tuple, tupoffset, scan);
+
+ /* in pagemode, heapgetpage did this for us */
+ if (!pagemode)
+ CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
+ scan->rs_cbuf, snapshot);
+
+ if (visible)
+ {
+ /* Found visible tuple, return it. */
+ if (!pagemode)
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ break;
+ }
+ else
+ {
+ /* Try next tuple from same page. */
+ continue;
+ }
+ }
+
+ /*
+ * if we get here, it means we've exhausted the items on this page and
+ * it's time to move to the next.
+ */
+ if (!pagemode)
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ if (tsm->NextSampleBlock)
+ {
+ blockno = tsm->NextSampleBlock(scanstate);
+ Assert(!scan->rs_syncscan);
+ finished = !BlockNumberIsValid(blockno);
+ }
+ else
+ {
+ /* Without NextSampleBlock, just do a plain forward seqscan. */
+ blockno++;
+ if (blockno >= scan->rs_nblocks)
+ blockno = 0;
+
+ /*
+ * Report our new scan position for synchronization purposes.
+ *
+ * Note: we do this before checking for end of scan so that the
+ * final state of the position hint is back at the start of the
+ * rel. That's not strictly necessary, but otherwise when you run
+ * the same query multiple times the starting position would shift
+ * a little bit backwards on every invocation, which is confusing.
+ * We don't guarantee any specific ordering in general, though.
+ */
+ if (scan->rs_syncscan)
+ ss_report_location(scan->rs_rd, blockno);
+
+ finished = (blockno == scan->rs_startblock);
+ }
+
+ /*
+ * Reached end of scan?
+ */
+ if (finished)
+ {
+ if (BufferIsValid(scan->rs_cbuf))
+ ReleaseBuffer(scan->rs_cbuf);
+ scan->rs_cbuf = InvalidBuffer;
+ scan->rs_cblock = InvalidBlockNumber;
+ tuple->t_data = NULL;
+ scan->rs_inited = false;
+ return NULL;
+ }
+
+ Assert(blockno < scan->rs_nblocks);
+ heapgetpage(scan, blockno);
+
+ /* Re-establish state for new page */
+ if (!pagemode)
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+ page = (Page) BufferGetPage(scan->rs_cbuf);
+ all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
+ maxoffset = PageGetMaxOffsetNumber(page);
+ }
+
+ /* Count successfully-fetched tuples as heap fetches */
+ pgstat_count_heap_getnext(scan->rs_rd);
+
+ return &(scan->rs_ctup);
+}
- ExecScanReScan(&node->ss);
+/*
+ * Check visibility of the tuple.
+ */
+static bool
+SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
+{
+ if (scan->rs_pageatatime)
+ {
+ /*
+ * In pageatatime mode, heapgetpage() already did visibility checks,
+ * so just look at the info it left in rs_vistuples[].
+ *
+ * We use a binary search over the known-sorted array. Note: we could
+ * save some effort if we insisted that NextSampleTuple select tuples
+ * in increasing order, but it's not clear that there would be enough
+ * gain to justify the restriction.
+ */
+ int start = 0,
+ end = scan->rs_ntuples - 1;
+
+ while (start <= end)
+ {
+ int mid = (start + end) / 2;
+ OffsetNumber curoffset = scan->rs_vistuples[mid];
+
+ if (tupoffset == curoffset)
+ return true;
+ else if (tupoffset < curoffset)
+ end = mid - 1;
+ else
+ start = mid + 1;
+ }
+
+ return false;
+ }
+ else
+ {
+ /* Otherwise, we have to check the tuple individually. */
+ return HeapTupleSatisfiesVisibility(tuple,
+ scan->rs_snapshot,
+ scan->rs_cbuf);
+ }
}