From 73c954d24896aeb05de0f81d75e891a858e439e9 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sat, 30 Mar 2019 20:18:53 -0700 Subject: [PATCH] tableam: sample scan. This moves sample scan support to below tableam. It's not optional as there is, in contrast to e.g. bitmap heap scans, no alternative way to perform tablesample queries. If an AM can't deal with the block based API, it will have to throw an ERROR. The tableam callbacks for this are block based, but given the current TsmRoutine interface, that seems to be required. The new interface doesn't require TsmRoutines to perform visibility checks anymore - that requires the TsmRoutine to know details about the AM, which we want to avoid. To continue to allow taking the returned number of tuples account SampleScanState now has a donetuples field (which previously e.g. existed in SystemRowsSamplerData), which is only incremented after the visibility check succeeds. Author: Andres Freund Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de --- contrib/tsm_system_rows/tsm_system_rows.c | 86 ++------ contrib/tsm_system_time/tsm_system_time.c | 13 +- doc/src/sgml/tablesample-method.sgml | 9 +- src/backend/access/heap/heapam_handler.c | 229 ++++++++++++++++++++ src/backend/access/table/tableamapi.c | 3 + src/backend/access/tablesample/system.c | 11 +- src/backend/executor/nodeSamplescan.c | 249 +++------------------- src/include/access/tableam.h | 88 ++++++++ src/include/access/tsmapi.h | 3 +- src/include/nodes/execnodes.h | 3 + 10 files changed, 382 insertions(+), 312 deletions(-) diff --git a/contrib/tsm_system_rows/tsm_system_rows.c b/contrib/tsm_system_rows/tsm_system_rows.c index 1d35ea3c53..3611d05833 100644 --- a/contrib/tsm_system_rows/tsm_system_rows.c +++ b/contrib/tsm_system_rows/tsm_system_rows.c @@ -28,7 +28,6 @@ #include "postgres.h" -#include "access/heapam.h" #include "access/relscan.h" #include "access/tsmapi.h" #include "catalog/pg_type.h" @@ -46,7 +45,6 @@ typedef struct { uint32 seed; /* random seed */ int64 ntuples; /* number of tuples to return */ - int64 donetuples; /* number of tuples already returned */ OffsetNumber lt; /* last tuple returned from current block */ BlockNumber doneblocks; /* number of already-scanned blocks */ BlockNumber lb; /* last block visited */ @@ -67,11 +65,10 @@ static void system_rows_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed); -static BlockNumber system_rows_nextsampleblock(SampleScanState *node); +static BlockNumber system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks); static OffsetNumber system_rows_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset); -static bool SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan); static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate); @@ -187,7 +184,6 @@ system_rows_beginsamplescan(SampleScanState *node, sampler->seed = seed; sampler->ntuples = ntuples; - sampler->donetuples = 0; sampler->lt = InvalidOffsetNumber; sampler->doneblocks = 0; /* lb will be initialized during first NextSampleBlock call */ @@ -206,11 +202,9 @@ system_rows_beginsamplescan(SampleScanState *node, * Uses linear probing algorithm for picking next block. */ static BlockNumber -system_rows_nextsampleblock(SampleScanState *node) +system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks) { SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; /* First call within scan? */ if (sampler->doneblocks == 0) @@ -222,14 +216,14 @@ system_rows_nextsampleblock(SampleScanState *node) SamplerRandomState randstate; /* If relation is empty, there's nothing to scan */ - if (hscan->rs_nblocks == 0) + if (nblocks == 0) return InvalidBlockNumber; /* We only need an RNG during this setup step */ sampler_random_init_state(sampler->seed, randstate); /* Compute nblocks/firstblock/step only once per query */ - sampler->nblocks = hscan->rs_nblocks; + sampler->nblocks = nblocks; /* Choose random starting block within the relation */ /* (Actually this is the predecessor of the first block visited) */ @@ -246,7 +240,7 @@ system_rows_nextsampleblock(SampleScanState *node) /* If we've read all blocks or returned all needed tuples, we're done */ if (++sampler->doneblocks > sampler->nblocks || - sampler->donetuples >= sampler->ntuples) + node->donetuples >= sampler->ntuples) return InvalidBlockNumber; /* @@ -259,7 +253,7 @@ system_rows_nextsampleblock(SampleScanState *node) { /* Advance lb, using uint64 arithmetic to forestall overflow */ sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks; - } while (sampler->lb >= hscan->rs_nblocks); + } while (sampler->lb >= nblocks); return sampler->lb; } @@ -279,77 +273,27 @@ system_rows_nextsampletuple(SampleScanState *node, OffsetNumber maxoffset) { SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; OffsetNumber tupoffset = sampler->lt; /* Quit if we've returned all needed tuples */ - if (sampler->donetuples >= sampler->ntuples) + if (node->donetuples >= sampler->ntuples) return InvalidOffsetNumber; - /* - * Because we should only count visible tuples as being returned, we need - * to search for a visible tuple rather than just let the core code do it. - */ - - /* We rely on the data accumulated in pagemode access */ - Assert(scan->rs_pageatatime); - for (;;) - { - /* Advance to next possible offset on page */ - if (tupoffset == InvalidOffsetNumber) - tupoffset = FirstOffsetNumber; - else - tupoffset++; - - /* Done? */ - if (tupoffset > maxoffset) - { - tupoffset = InvalidOffsetNumber; - break; - } + /* Advance to next possible offset on page */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; - /* Found a candidate? */ - if (SampleOffsetVisible(tupoffset, hscan)) - { - sampler->donetuples++; - break; - } - } + /* Done? */ + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; sampler->lt = tupoffset; return tupoffset; } -/* - * Check if tuple offset is visible - * - * In pageatatime mode, heapgetpage() already did visibility checks, - * so just look at the info it left in rs_vistuples[]. - */ -static bool -SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan) -{ - int start = 0, - end = scan->rs_ntuples - 1; - - while (start <= end) - { - int mid = (start + end) / 2; - OffsetNumber curoffset = scan->rs_vistuples[mid]; - - if (tupoffset == curoffset) - return true; - else if (tupoffset < curoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; -} - /* * Compute greatest common divisor of two uint32's. */ diff --git a/contrib/tsm_system_time/tsm_system_time.c b/contrib/tsm_system_time/tsm_system_time.c index 1cc7264e08..ab9f685f0a 100644 --- a/contrib/tsm_system_time/tsm_system_time.c +++ b/contrib/tsm_system_time/tsm_system_time.c @@ -26,7 +26,6 @@ #include -#include "access/heapam.h" #include "access/relscan.h" #include "access/tsmapi.h" #include "catalog/pg_type.h" @@ -66,7 +65,7 @@ static void system_time_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed); -static BlockNumber system_time_nextsampleblock(SampleScanState *node); +static BlockNumber system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks); static OffsetNumber system_time_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset); @@ -213,11 +212,9 @@ system_time_beginsamplescan(SampleScanState *node, * Uses linear probing algorithm for picking next block. */ static BlockNumber -system_time_nextsampleblock(SampleScanState *node) +system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks) { SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; instr_time cur_time; /* First call within scan? */ @@ -230,14 +227,14 @@ system_time_nextsampleblock(SampleScanState *node) SamplerRandomState randstate; /* If relation is empty, there's nothing to scan */ - if (hscan->rs_nblocks == 0) + if (nblocks == 0) return InvalidBlockNumber; /* We only need an RNG during this setup step */ sampler_random_init_state(sampler->seed, randstate); /* Compute nblocks/firstblock/step only once per query */ - sampler->nblocks = hscan->rs_nblocks; + sampler->nblocks = nblocks; /* Choose random starting block within the relation */ /* (Actually this is the predecessor of the first block visited) */ @@ -273,7 +270,7 @@ system_time_nextsampleblock(SampleScanState *node) { /* Advance lb, using uint64 arithmetic to forestall overflow */ sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks; - } while (sampler->lb >= hscan->rs_nblocks); + } while (sampler->lb >= nblocks); return sampler->lb; } diff --git a/doc/src/sgml/tablesample-method.sgml b/doc/src/sgml/tablesample-method.sgml index b84b7ba885..880e42c950 100644 --- a/doc/src/sgml/tablesample-method.sgml +++ b/doc/src/sgml/tablesample-method.sgml @@ -227,7 +227,7 @@ BeginSampleScan (SampleScanState *node, BlockNumber -NextSampleBlock (SampleScanState *node); +NextSampleBlock (SampleScanState *node, BlockNumber nblocks); Returns the block number of the next page to be scanned, or @@ -262,10 +262,9 @@ NextSampleTuple (SampleScanState *node, numbers in the range 1 .. maxoffset actually contain valid tuples. This is not normally a problem since the core code ignores requests to sample missing or invisible tuples; that should not result in - any bias in the sample. However, if necessary, the function can - examine node->ss.ss_currentScanDesc->rs_vistuples[] - to identify which tuples are valid and visible. (This - requires node->use_pagemode to be true.) + any bias in the sample. However, if necessary, the function can use + node->donetuples to examine how many of the tuples + it returned were vlaid and visible. diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 314806bdcd..5e8bf0aa5c 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -28,6 +28,7 @@ #include "access/multixact.h" #include "access/rewriteheap.h" #include "access/tableam.h" +#include "access/tsmapi.h" #include "access/xact.h" #include "catalog/catalog.h" #include "catalog/index.h" @@ -52,6 +53,9 @@ static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); +static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset); static const TableAmRoutine heapam_methods; @@ -1943,6 +1947,181 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths, } +/* ------------------------------------------------------------------------ + * Executor related callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +static bool +heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno; + + /* return false immediately if relation is empty */ + if (hscan->rs_nblocks == 0) + return false; + + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks); + hscan->rs_cblock = blockno; + } + else + { + /* scanning table sequentially */ + + if (hscan->rs_cblock == InvalidBlockNumber) + { + Assert(!hscan->rs_inited); + blockno = hscan->rs_startblock; + } + else + { + Assert(hscan->rs_inited); + + blockno = hscan->rs_cblock + 1; + + if (blockno >= hscan->rs_nblocks) + { + /* wrap to begining of rel, might not have started at 0 */ + blockno = 0; + } + + /* + * Report our new scan position for synchronization purposes. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, blockno); + + if (blockno == hscan->rs_startblock) + { + blockno = InvalidBlockNumber; + } + } + } + + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(hscan->rs_cbuf)) + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + hscan->rs_cblock = InvalidBlockNumber; + hscan->rs_inited = false; + + return false; + } + + heapgetpage(scan, blockno); + hscan->rs_inited = true; + + return true; +} + +static bool +heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno = hscan->rs_cblock; + bool pagemode = scan->rs_pageatatime; + + Page page; + bool all_visible; + OffsetNumber maxoffset; + + /* + * When not using pagemode, we must lock the buffer during tuple + * visibility checks. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(hscan->rs_cbuf); + all_visible = PageIsAllVisible(page) && + !scan->rs_snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + OffsetNumber tupoffset; + + CHECK_FOR_INTERRUPTS(); + + /* Ask the tablesample method which tuples to check on this page. */ + tupoffset = tsm->NextSampleTuple(scanstate, + blockno, + maxoffset); + + if (OffsetNumberIsValid(tupoffset)) + { + ItemId itemid; + bool visible; + HeapTuple tuple = &(hscan->rs_ctup); + + /* Skip invalid tuple pointers. */ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + + if (all_visible) + visible = true; + else + visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf, + tuple, tupoffset); + + /* in pagemode, heapgetpage did this for us */ + if (!pagemode) + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + hscan->rs_cbuf, scan->rs_snapshot); + + /* Try next tuple from same page. */ + if (!visible) + continue; + + /* Found visible tuple, return it. */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf); + + /* Count successfully-fetched tuples as heap fetches */ + pgstat_count_heap_getnext(scan->rs_rd); + + return true; + } + else + { + /* + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecClearTuple(slot); + return false; + } + } + + Assert(0); +} + + /* ---------------------------------------------------------------------------- * Helper functions for the above. * ---------------------------------------------------------------------------- @@ -1991,6 +2170,53 @@ reform_and_rewrite_tuple(HeapTuple tuple, heap_freetuple(copiedTuple); } +/* + * Check visibility of the tuple. + */ +static bool +SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + if (scan->rs_pageatatime) + { + /* + * In pageatatime mode, heapgetpage() already did visibility checks, + * so just look at the info it left in rs_vistuples[]. + * + * We use a binary search over the known-sorted array. Note: we could + * save some effort if we insisted that NextSampleTuple select tuples + * in increasing order, but it's not clear that there would be enough + * gain to justify the restriction. + */ + int start = 0, + end = hscan->rs_ntuples - 1; + + while (start <= end) + { + int mid = (start + end) / 2; + OffsetNumber curoffset = hscan->rs_vistuples[mid]; + + if (tupoffset == curoffset) + return true; + else if (tupoffset < curoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* Otherwise, we have to check the tuple individually. */ + return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, + buffer); + } +} + /* ------------------------------------------------------------------------ * Definition of the heap table access method. @@ -2039,6 +2265,9 @@ static const TableAmRoutine heapam_methods = { .index_validate_scan = heapam_index_validate_scan, .relation_estimate_size = heapam_estimate_rel_size, + + .scan_sample_next_block = heapam_scan_sample_next_block, + .scan_sample_next_tuple = heapam_scan_sample_next_tuple }; diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 96660bf907..db937829d4 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -89,6 +89,9 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->index_validate_scan != NULL); Assert(routine->relation_estimate_size != NULL); + Assert(routine->scan_sample_next_block != NULL); + Assert(routine->scan_sample_next_tuple != NULL); + return routine; } diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index 26f7de3e45..b1fb9b3201 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -26,7 +26,6 @@ #include -#include "access/heapam.h" #include "access/relscan.h" #include "access/tsmapi.h" #include "catalog/pg_type.h" @@ -56,7 +55,7 @@ static void system_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed); -static BlockNumber system_nextsampleblock(SampleScanState *node); +static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks); static OffsetNumber system_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset); @@ -177,11 +176,9 @@ system_beginsamplescan(SampleScanState *node, * Select next block to sample. */ static BlockNumber -system_nextsampleblock(SampleScanState *node) +system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) { SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; BlockNumber nextblock = sampler->nextblock; uint32 hashinput[2]; @@ -200,7 +197,7 @@ system_nextsampleblock(SampleScanState *node) * Loop over block numbers until finding suitable block or reaching end of * relation. */ - for (; nextblock < hscan->rs_nblocks; nextblock++) + for (; nextblock < nblocks; nextblock++) { uint32 hash; @@ -212,7 +209,7 @@ system_nextsampleblock(SampleScanState *node) break; } - if (nextblock < hscan->rs_nblocks) + if (nextblock < nblocks) { /* Found a suitable block; remember where we should start next time */ sampler->nextblock = nextblock + 1; diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 817e4ca41f..14a0a6357d 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -14,7 +14,6 @@ */ #include "postgres.h" -#include "access/heapam.h" #include "access/relscan.h" #include "access/tableam.h" #include "access/tsmapi.h" @@ -29,9 +28,7 @@ static TupleTableSlot *SampleNext(SampleScanState *node); static void tablesample_init(SampleScanState *scanstate); -static HeapTuple tablesample_getnext(SampleScanState *scanstate); -static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, - HeapScanDesc scan); +static TupleTableSlot *tablesample_getnext(SampleScanState *scanstate); /* ---------------------------------------------------------------- * Scan Support @@ -47,10 +44,6 @@ static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, static TupleTableSlot * SampleNext(SampleScanState *node) { - HeapTuple tuple; - TupleTableSlot *slot; - HeapScanDesc hscan; - /* * if this is first call within a scan, initialize */ @@ -60,19 +53,7 @@ SampleNext(SampleScanState *node) /* * get the next tuple, and store it in our result slot */ - tuple = tablesample_getnext(node); - - slot = node->ss.ss_ScanTupleSlot; - hscan = (HeapScanDesc) node->ss.ss_currentScanDesc; - - if (tuple) - ExecStoreBufferHeapTuple(tuple, /* tuple to store */ - slot, /* slot to store in */ - hscan->rs_cbuf); /* tuple's buffer */ - else - ExecClearTuple(slot); - - return slot; + return tablesample_getnext(node); } /* @@ -237,6 +218,9 @@ ExecReScanSampleScan(SampleScanState *node) { /* Remember we need to do BeginSampleScan again (if we did it at all) */ node->begun = false; + node->done = false; + node->haveblock = false; + node->donetuples = 0; ExecScanReScan(&node->ss); } @@ -258,6 +242,7 @@ tablesample_init(SampleScanState *scanstate) int i; ListCell *arg; + scanstate->donetuples = 0; params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum)); i = 0; @@ -345,225 +330,49 @@ tablesample_init(SampleScanState *scanstate) /* * Get next tuple from TABLESAMPLE method. - * - * Note: an awful lot of this is copied-and-pasted from heapam.c. It would - * perhaps be better to refactor to share more code. */ -static HeapTuple +static TupleTableSlot * tablesample_getnext(SampleScanState *scanstate) { - TsmRoutine *tsm = scanstate->tsmroutine; TableScanDesc scan = scanstate->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; - HeapTuple tuple = &(hscan->rs_ctup); - Snapshot snapshot = scan->rs_snapshot; - bool pagemode = scan->rs_pageatatime; - BlockNumber blockno; - Page page; - bool all_visible; - OffsetNumber maxoffset; - - if (!hscan->rs_inited) - { - /* - * return null immediately if relation is empty - */ - if (hscan->rs_nblocks == 0) - { - Assert(!BufferIsValid(hscan->rs_cbuf)); - tuple->t_data = NULL; - return NULL; - } - if (tsm->NextSampleBlock) - { - blockno = tsm->NextSampleBlock(scanstate); - if (!BlockNumberIsValid(blockno)) - { - tuple->t_data = NULL; - return NULL; - } - } - else - blockno = hscan->rs_startblock; - Assert(blockno < hscan->rs_nblocks); - heapgetpage(scan, blockno); - hscan->rs_inited = true; - } - else - { - /* continue from previously returned page/tuple */ - blockno = hscan->rs_cblock; /* current page */ - } + TupleTableSlot *slot = scanstate->ss.ss_ScanTupleSlot; - /* - * When not using pagemode, we must lock the buffer during tuple - * visibility checks. - */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + ExecClearTuple(slot); - page = (Page) BufferGetPage(hscan->rs_cbuf); - all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; - maxoffset = PageGetMaxOffsetNumber(page); + if (scanstate->done) + return NULL; for (;;) { - OffsetNumber tupoffset; - bool finished; - - CHECK_FOR_INTERRUPTS(); - - /* Ask the tablesample method which tuples to check on this page. */ - tupoffset = tsm->NextSampleTuple(scanstate, - blockno, - maxoffset); - - if (OffsetNumberIsValid(tupoffset)) + if (!scanstate->haveblock) { - ItemId itemid; - bool visible; - - /* Skip invalid tuple pointers. */ - itemid = PageGetItemId(page, tupoffset); - if (!ItemIdIsNormal(itemid)) - continue; - - tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple->t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple->t_self), blockno, tupoffset); - - if (all_visible) - visible = true; - else - visible = SampleTupleVisible(tuple, tupoffset, hscan); - - /* in pagemode, heapgetpage did this for us */ - if (!pagemode) - CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, - hscan->rs_cbuf, snapshot); - - if (visible) - { - /* Found visible tuple, return it. */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - break; - } - else + if (!table_scan_sample_next_block(scan, scanstate)) { - /* Try next tuple from same page. */ - continue; - } - } + scanstate->haveblock = false; + scanstate->done = true; - /* - * if we get here, it means we've exhausted the items on this page and - * it's time to move to the next. - */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + /* exhausted relation */ + return NULL; + } - if (tsm->NextSampleBlock) - { - blockno = tsm->NextSampleBlock(scanstate); - Assert(!scan->rs_syncscan); - finished = !BlockNumberIsValid(blockno); + scanstate->haveblock = true; } - else - { - /* Without NextSampleBlock, just do a plain forward seqscan. */ - blockno++; - if (blockno >= hscan->rs_nblocks) - blockno = 0; + if (!table_scan_sample_next_tuple(scan, scanstate, slot)) + { /* - * Report our new scan position for synchronization purposes. - * - * Note: we do this before checking for end of scan so that the - * final state of the position hint is back at the start of the - * rel. That's not strictly necessary, but otherwise when you run - * the same query multiple times the starting position would shift - * a little bit backwards on every invocation, which is confusing. - * We don't guarantee any specific ordering in general, though. + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. */ - if (scan->rs_syncscan) - ss_report_location(scan->rs_rd, blockno); - - finished = (blockno == hscan->rs_startblock); + scanstate->haveblock = false; + continue; } - /* - * Reached end of scan? - */ - if (finished) - { - if (BufferIsValid(hscan->rs_cbuf)) - ReleaseBuffer(hscan->rs_cbuf); - hscan->rs_cbuf = InvalidBuffer; - hscan->rs_cblock = InvalidBlockNumber; - tuple->t_data = NULL; - hscan->rs_inited = false; - return NULL; - } - - Assert(blockno < hscan->rs_nblocks); - heapgetpage(scan, blockno); - - /* Re-establish state for new page */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(hscan->rs_cbuf); - all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; - maxoffset = PageGetMaxOffsetNumber(page); + /* Found visible tuple, return it. */ + break; } - /* Count successfully-fetched tuples as heap fetches */ - pgstat_count_heap_getnext(scan->rs_rd); - - return &(hscan->rs_ctup); -} + scanstate->donetuples++; -/* - * Check visibility of the tuple. - */ -static bool -SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) -{ - if (scan->rs_base.rs_pageatatime) - { - /* - * In pageatatime mode, heapgetpage() already did visibility checks, - * so just look at the info it left in rs_vistuples[]. - * - * We use a binary search over the known-sorted array. Note: we could - * save some effort if we insisted that NextSampleTuple select tuples - * in increasing order, but it's not clear that there would be enough - * gain to justify the restriction. - */ - int start = 0, - end = scan->rs_ntuples - 1; - - while (start <= end) - { - int mid = (start + end) / 2; - OffsetNumber curoffset = scan->rs_vistuples[mid]; - - if (tupoffset == curoffset) - return true; - else if (tupoffset < curoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; - } - else - { - /* Otherwise, we have to check the tuple individually. */ - return HeapTupleSatisfiesVisibility(tuple, - scan->rs_base.rs_snapshot, - scan->rs_cbuf); - } + return slot; } diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 0151f17af4..34813e4099 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -30,6 +30,7 @@ extern bool synchronize_seqscans; struct BulkInsertStateData; struct IndexInfo; struct IndexBuildCallback; +struct SampleScanState; struct VacuumParams; struct ValidateIndexState; @@ -519,6 +520,56 @@ typedef struct TableAmRoutine BlockNumber *pages, double *tuples, double *allvisfrac); + + /* ------------------------------------------------------------------------ + * Executor related functions. + * ------------------------------------------------------------------------ + */ + + /* + * Acquire the next block in a sample scan. Return false if the sample + * scan is finished, true otherwise. + * + * Typically this will first determine the target block by call the + * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively + * perform a sequential scan over all blocks. The determined block is + * then typically read and pinned. + * + * As the TsmRoutine interface is block based, a block needs to be passed + * to NextSampleBlock(). If that's not appropriate for an AM, it + * internally needs to perform mapping between the internal and a block + * based representation. + * + * Note that it's not acceptable to hold deadlock prone resources such as + * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the + * block - the tuple is likely to be returned to an upper query node, and + * the next call could be off a long while. Holding buffer pins etc is + * obviously OK. + * + * Currently it is required to implement this interface, as there's no + * alternative way (contrary e.g. to bitmap scans) to implement sample + * scans. If infeasible to implement the AM may raise an error. + */ + bool (*scan_sample_next_block) (TableScanDesc scan, + struct SampleScanState *scanstate); + + /* + * This callback, only called after scan_sample_next_block has returned + * true, should determine the next tuple to be returned from the selected + * block using the TsmRoutine's NextSampleTuple() callback. + * + * The callback needs to perform visibility checks, and only return + * visible tuples. That obviously can mean calling NextSampletuple() + * multiple times. + * + * The TsmRoutine interface assumes that there's a maximum offset on a + * given page, so if that doesn't apply to an AM, it needs to emulate that + * assumption somehow. + */ + bool (*scan_sample_next_tuple) (TableScanDesc scan, + struct SampleScanState *scanstate, + TupleTableSlot *slot); + } TableAmRoutine; @@ -1339,6 +1390,43 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths, } +/* ---------------------------------------------------------------------------- + * Executor related functionality + * ---------------------------------------------------------------------------- + */ + +/* + * Acquire the next block in a sample scan. Returns false if the sample scan + * is finished, true otherwise. + * + * This will call the TsmRoutine's NextSampleBlock() callback if necessary + * (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the + * underlying relation. + */ +static inline bool +table_scan_sample_next_block(TableScanDesc scan, + struct SampleScanState *scanstate) +{ + return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate); +} + +/* + * Fetch the next sample tuple into `slot` and return true if a visible tuple + * was found, false otherwise. table_scan_sample_next_block() needs to + * previously have selected a block (i.e. returned true). + * + * This will call the TsmRoutine's NextSampleTuple() callback. + */ +static inline bool +table_scan_sample_next_tuple(TableScanDesc scan, + struct SampleScanState *scanstate, + TupleTableSlot *slot) +{ + return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate, + slot); +} + + /* ---------------------------------------------------------------------------- * Functions to make modifications a bit simpler. * ---------------------------------------------------------------------------- diff --git a/src/include/access/tsmapi.h b/src/include/access/tsmapi.h index a5c0b4cafe..d3bdb754b5 100644 --- a/src/include/access/tsmapi.h +++ b/src/include/access/tsmapi.h @@ -34,7 +34,8 @@ typedef void (*BeginSampleScan_function) (SampleScanState *node, int nparams, uint32 seed); -typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node); +typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node, + BlockNumber nblocks); typedef OffsetNumber (*NextSampleTuple_function) (SampleScanState *node, BlockNumber blockno, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index dbd7ed0363..437a191b45 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1305,6 +1305,9 @@ typedef struct SampleScanState bool use_pagemode; /* use page-at-a-time visibility checking? */ bool begun; /* false means need to call BeginSampleScan */ uint32 seed; /* random seed */ + int64 donetuples; /* number of tuples already returned */ + bool haveblock; /* has a block for sampling been determined */ + bool done; /* exhausted all tuples? */ } SampleScanState; /* -- 2.40.0