]> granicus.if.org Git - postgresql/commitdiff
tableam: sample scan.
authorAndres Freund <andres@anarazel.de>
Sun, 31 Mar 2019 03:18:53 +0000 (20:18 -0700)
committerAndres Freund <andres@anarazel.de>
Mon, 1 Apr 2019 01:37:57 +0000 (18:37 -0700)
This moves sample scan support to below tableam. It's not optional as
there is, in contrast to e.g. bitmap heap scans, no alternative way to
perform tablesample queries. If an AM can't deal with the block based
API, it will have to throw an ERROR.

The tableam callbacks for this are block based, but given the current
TsmRoutine interface, that seems to be required.

The new interface doesn't require TsmRoutines to perform visibility
checks anymore - that requires the TsmRoutine to know details about
the AM, which we want to avoid.  To continue to allow taking the
returned number of tuples account SampleScanState now has a donetuples
field (which previously e.g. existed in SystemRowsSamplerData), which
is only incremented after the visibility check succeeds.

Author: Andres Freund
Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de

contrib/tsm_system_rows/tsm_system_rows.c
contrib/tsm_system_time/tsm_system_time.c
doc/src/sgml/tablesample-method.sgml
src/backend/access/heap/heapam_handler.c
src/backend/access/table/tableamapi.c
src/backend/access/tablesample/system.c
src/backend/executor/nodeSamplescan.c
src/include/access/tableam.h
src/include/access/tsmapi.h
src/include/nodes/execnodes.h

index 1d35ea3c53aabefda85fe353dda461ed266d25a3..3611d0583315eb983b2f9afac8e6c17f37f164a2 100644 (file)
@@ -28,7 +28,6 @@
 
 #include "postgres.h"
 
-#include "access/heapam.h"
 #include "access/relscan.h"
 #include "access/tsmapi.h"
 #include "catalog/pg_type.h"
@@ -46,7 +45,6 @@ typedef struct
 {
        uint32          seed;                   /* random seed */
        int64           ntuples;                /* number of tuples to return */
-       int64           donetuples;             /* number of tuples already returned */
        OffsetNumber lt;                        /* last tuple returned from current block */
        BlockNumber doneblocks;         /* number of already-scanned blocks */
        BlockNumber lb;                         /* last block visited */
@@ -67,11 +65,10 @@ static void system_rows_beginsamplescan(SampleScanState *node,
                                                        Datum *params,
                                                        int nparams,
                                                        uint32 seed);
-static BlockNumber system_rows_nextsampleblock(SampleScanState *node);
+static BlockNumber system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
 static OffsetNumber system_rows_nextsampletuple(SampleScanState *node,
                                                        BlockNumber blockno,
                                                        OffsetNumber maxoffset);
-static bool SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan);
 static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate);
 
 
@@ -187,7 +184,6 @@ system_rows_beginsamplescan(SampleScanState *node,
 
        sampler->seed = seed;
        sampler->ntuples = ntuples;
-       sampler->donetuples = 0;
        sampler->lt = InvalidOffsetNumber;
        sampler->doneblocks = 0;
        /* lb will be initialized during first NextSampleBlock call */
@@ -206,11 +202,9 @@ system_rows_beginsamplescan(SampleScanState *node,
  * Uses linear probing algorithm for picking next block.
  */
 static BlockNumber
-system_rows_nextsampleblock(SampleScanState *node)
+system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
 {
        SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
-       TableScanDesc scan = node->ss.ss_currentScanDesc;
-       HeapScanDesc hscan = (HeapScanDesc) scan;
 
        /* First call within scan? */
        if (sampler->doneblocks == 0)
@@ -222,14 +216,14 @@ system_rows_nextsampleblock(SampleScanState *node)
                        SamplerRandomState randstate;
 
                        /* If relation is empty, there's nothing to scan */
-                       if (hscan->rs_nblocks == 0)
+                       if (nblocks == 0)
                                return InvalidBlockNumber;
 
                        /* We only need an RNG during this setup step */
                        sampler_random_init_state(sampler->seed, randstate);
 
                        /* Compute nblocks/firstblock/step only once per query */
-                       sampler->nblocks = hscan->rs_nblocks;
+                       sampler->nblocks = nblocks;
 
                        /* Choose random starting block within the relation */
                        /* (Actually this is the predecessor of the first block visited) */
@@ -246,7 +240,7 @@ system_rows_nextsampleblock(SampleScanState *node)
 
        /* If we've read all blocks or returned all needed tuples, we're done */
        if (++sampler->doneblocks > sampler->nblocks ||
-               sampler->donetuples >= sampler->ntuples)
+               node->donetuples >= sampler->ntuples)
                return InvalidBlockNumber;
 
        /*
@@ -259,7 +253,7 @@ system_rows_nextsampleblock(SampleScanState *node)
        {
                /* Advance lb, using uint64 arithmetic to forestall overflow */
                sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks;
-       } while (sampler->lb >= hscan->rs_nblocks);
+       } while (sampler->lb >= nblocks);
 
        return sampler->lb;
 }
@@ -279,77 +273,27 @@ system_rows_nextsampletuple(SampleScanState *node,
                                                        OffsetNumber maxoffset)
 {
        SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
-       TableScanDesc scan = node->ss.ss_currentScanDesc;
-       HeapScanDesc hscan = (HeapScanDesc) scan;
        OffsetNumber tupoffset = sampler->lt;
 
        /* Quit if we've returned all needed tuples */
-       if (sampler->donetuples >= sampler->ntuples)
+       if (node->donetuples >= sampler->ntuples)
                return InvalidOffsetNumber;
 
-       /*
-        * Because we should only count visible tuples as being returned, we need
-        * to search for a visible tuple rather than just let the core code do it.
-        */
-
-       /* We rely on the data accumulated in pagemode access */
-       Assert(scan->rs_pageatatime);
-       for (;;)
-       {
-               /* Advance to next possible offset on page */
-               if (tupoffset == InvalidOffsetNumber)
-                       tupoffset = FirstOffsetNumber;
-               else
-                       tupoffset++;
-
-               /* Done? */
-               if (tupoffset > maxoffset)
-               {
-                       tupoffset = InvalidOffsetNumber;
-                       break;
-               }
+       /* Advance to next possible offset on page */
+       if (tupoffset == InvalidOffsetNumber)
+               tupoffset = FirstOffsetNumber;
+       else
+               tupoffset++;
 
-               /* Found a candidate? */
-               if (SampleOffsetVisible(tupoffset, hscan))
-               {
-                       sampler->donetuples++;
-                       break;
-               }
-       }
+       /* Done? */
+       if (tupoffset > maxoffset)
+               tupoffset = InvalidOffsetNumber;
 
        sampler->lt = tupoffset;
 
        return tupoffset;
 }
 
-/*
- * Check if tuple offset is visible
- *
- * In pageatatime mode, heapgetpage() already did visibility checks,
- * so just look at the info it left in rs_vistuples[].
- */
-static bool
-SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan)
-{
-       int                     start = 0,
-                               end = scan->rs_ntuples - 1;
-
-       while (start <= end)
-       {
-               int                     mid = (start + end) / 2;
-               OffsetNumber curoffset = scan->rs_vistuples[mid];
-
-               if (tupoffset == curoffset)
-                       return true;
-               else if (tupoffset < curoffset)
-                       end = mid - 1;
-               else
-                       start = mid + 1;
-       }
-
-       return false;
-}
-
 /*
  * Compute greatest common divisor of two uint32's.
  */
index 1cc7264e084da477822bd8b2e76f8efd36f95451..ab9f685f0afb4ae19c70fdfb22732f03667568f9 100644 (file)
@@ -26,7 +26,6 @@
 
 #include <math.h>
 
-#include "access/heapam.h"
 #include "access/relscan.h"
 #include "access/tsmapi.h"
 #include "catalog/pg_type.h"
@@ -66,7 +65,7 @@ static void system_time_beginsamplescan(SampleScanState *node,
                                                        Datum *params,
                                                        int nparams,
                                                        uint32 seed);
-static BlockNumber system_time_nextsampleblock(SampleScanState *node);
+static BlockNumber system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
 static OffsetNumber system_time_nextsampletuple(SampleScanState *node,
                                                        BlockNumber blockno,
                                                        OffsetNumber maxoffset);
@@ -213,11 +212,9 @@ system_time_beginsamplescan(SampleScanState *node,
  * Uses linear probing algorithm for picking next block.
  */
 static BlockNumber
-system_time_nextsampleblock(SampleScanState *node)
+system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
 {
        SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state;
-       TableScanDesc scan = node->ss.ss_currentScanDesc;
-       HeapScanDesc hscan = (HeapScanDesc) scan;
        instr_time      cur_time;
 
        /* First call within scan? */
@@ -230,14 +227,14 @@ system_time_nextsampleblock(SampleScanState *node)
                        SamplerRandomState randstate;
 
                        /* If relation is empty, there's nothing to scan */
-                       if (hscan->rs_nblocks == 0)
+                       if (nblocks == 0)
                                return InvalidBlockNumber;
 
                        /* We only need an RNG during this setup step */
                        sampler_random_init_state(sampler->seed, randstate);
 
                        /* Compute nblocks/firstblock/step only once per query */
-                       sampler->nblocks = hscan->rs_nblocks;
+                       sampler->nblocks = nblocks;
 
                        /* Choose random starting block within the relation */
                        /* (Actually this is the predecessor of the first block visited) */
@@ -273,7 +270,7 @@ system_time_nextsampleblock(SampleScanState *node)
        {
                /* Advance lb, using uint64 arithmetic to forestall overflow */
                sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks;
-       } while (sampler->lb >= hscan->rs_nblocks);
+       } while (sampler->lb >= nblocks);
 
        return sampler->lb;
 }
index b84b7ba885512150d40387b058d0dfd85f385561..880e42c9509f1dd5d3cbb0b57a0e4f08902814ff 100644 (file)
@@ -227,7 +227,7 @@ BeginSampleScan (SampleScanState *node,
   <para>
 <programlisting>
 BlockNumber
-NextSampleBlock (SampleScanState *node);
+NextSampleBlock (SampleScanState *node, BlockNumber nblocks);
 </programlisting>
 
    Returns the block number of the next page to be scanned, or
@@ -262,10 +262,9 @@ NextSampleTuple (SampleScanState *node,
     numbers in the range <literal>1 .. maxoffset</literal> actually contain valid
     tuples.  This is not normally a problem since the core code ignores
     requests to sample missing or invisible tuples; that should not result in
-    any bias in the sample.  However, if necessary, the function can
-    examine <literal>node-&gt;ss.ss_currentScanDesc-&gt;rs_vistuples[]</literal>
-    to identify which tuples are valid and visible.  (This
-    requires <literal>node-&gt;use_pagemode</literal> to be <literal>true</literal>.)
+    any bias in the sample.  However, if necessary, the function can use
+    <literal>node-&gt;donetuples</literal> to examine how many of the tuples
+    it returned were vlaid and visible.
    </para>
   </note>
 
index 314806bdcd7da70de5a4b96437049c310cc4f303..5e8bf0aa5ca2b0e20206a20304b4b4f58ed6552d 100644 (file)
@@ -28,6 +28,7 @@
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
 #include "access/tableam.h"
+#include "access/tsmapi.h"
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "catalog/index.h"
@@ -52,6 +53,9 @@ static void reform_and_rewrite_tuple(HeapTuple tuple,
                                                 Relation OldHeap, Relation NewHeap,
                                                 Datum *values, bool *isnull, RewriteState rwstate);
 
+static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
+                                          HeapTuple tuple,
+                                          OffsetNumber tupoffset);
 
 static const TableAmRoutine heapam_methods;
 
@@ -1943,6 +1947,181 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
 }
 
 
+/* ------------------------------------------------------------------------
+ * Executor related callbacks for the heap AM
+ * ------------------------------------------------------------------------
+ */
+
+static bool
+heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
+{
+       HeapScanDesc hscan = (HeapScanDesc) scan;
+       TsmRoutine *tsm = scanstate->tsmroutine;
+       BlockNumber blockno;
+
+       /* return false immediately if relation is empty */
+       if (hscan->rs_nblocks == 0)
+               return false;
+
+       if (tsm->NextSampleBlock)
+       {
+               blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
+               hscan->rs_cblock = blockno;
+       }
+       else
+       {
+               /* scanning table sequentially */
+
+               if (hscan->rs_cblock == InvalidBlockNumber)
+               {
+                       Assert(!hscan->rs_inited);
+                       blockno = hscan->rs_startblock;
+               }
+               else
+               {
+                       Assert(hscan->rs_inited);
+
+                       blockno = hscan->rs_cblock + 1;
+
+                       if (blockno >= hscan->rs_nblocks)
+                       {
+                               /* wrap to begining of rel, might not have started at 0 */
+                               blockno = 0;
+                       }
+
+                       /*
+                        * Report our new scan position for synchronization purposes.
+                        *
+                        * Note: we do this before checking for end of scan so that the
+                        * final state of the position hint is back at the start of the
+                        * rel.  That's not strictly necessary, but otherwise when you run
+                        * the same query multiple times the starting position would shift
+                        * a little bit backwards on every invocation, which is confusing.
+                        * We don't guarantee any specific ordering in general, though.
+                        */
+                       if (scan->rs_syncscan)
+                               ss_report_location(scan->rs_rd, blockno);
+
+                       if (blockno == hscan->rs_startblock)
+                       {
+                               blockno = InvalidBlockNumber;
+                       }
+               }
+       }
+
+       if (!BlockNumberIsValid(blockno))
+       {
+               if (BufferIsValid(hscan->rs_cbuf))
+                       ReleaseBuffer(hscan->rs_cbuf);
+               hscan->rs_cbuf = InvalidBuffer;
+               hscan->rs_cblock = InvalidBlockNumber;
+               hscan->rs_inited = false;
+
+               return false;
+       }
+
+       heapgetpage(scan, blockno);
+       hscan->rs_inited = true;
+
+       return true;
+}
+
+static bool
+heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
+                                                         TupleTableSlot *slot)
+{
+       HeapScanDesc hscan = (HeapScanDesc) scan;
+       TsmRoutine *tsm = scanstate->tsmroutine;
+       BlockNumber blockno = hscan->rs_cblock;
+       bool            pagemode = scan->rs_pageatatime;
+
+       Page            page;
+       bool            all_visible;
+       OffsetNumber maxoffset;
+
+       /*
+        * When not using pagemode, we must lock the buffer during tuple
+        * visibility checks.
+        */
+       if (!pagemode)
+               LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+       page = (Page) BufferGetPage(hscan->rs_cbuf);
+       all_visible = PageIsAllVisible(page) &&
+               !scan->rs_snapshot->takenDuringRecovery;
+       maxoffset = PageGetMaxOffsetNumber(page);
+
+       for (;;)
+       {
+               OffsetNumber tupoffset;
+
+               CHECK_FOR_INTERRUPTS();
+
+               /* Ask the tablesample method which tuples to check on this page. */
+               tupoffset = tsm->NextSampleTuple(scanstate,
+                                                                                blockno,
+                                                                                maxoffset);
+
+               if (OffsetNumberIsValid(tupoffset))
+               {
+                       ItemId          itemid;
+                       bool            visible;
+                       HeapTuple       tuple = &(hscan->rs_ctup);
+
+                       /* Skip invalid tuple pointers. */
+                       itemid = PageGetItemId(page, tupoffset);
+                       if (!ItemIdIsNormal(itemid))
+                               continue;
+
+                       tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+                       tuple->t_len = ItemIdGetLength(itemid);
+                       ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
+
+
+                       if (all_visible)
+                               visible = true;
+                       else
+                               visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
+                                                                                                tuple, tupoffset);
+
+                       /* in pagemode, heapgetpage did this for us */
+                       if (!pagemode)
+                               CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
+                                                                                               hscan->rs_cbuf, scan->rs_snapshot);
+
+                       /* Try next tuple from same page. */
+                       if (!visible)
+                               continue;
+
+                       /* Found visible tuple, return it. */
+                       if (!pagemode)
+                               LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+                       ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
+
+                       /* Count successfully-fetched tuples as heap fetches */
+                       pgstat_count_heap_getnext(scan->rs_rd);
+
+                       return true;
+               }
+               else
+               {
+                       /*
+                        * If we get here, it means we've exhausted the items on this page
+                        * and it's time to move to the next.
+                        */
+                       if (!pagemode)
+                               LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+                       ExecClearTuple(slot);
+                       return false;
+               }
+       }
+
+       Assert(0);
+}
+
+
 /* ----------------------------------------------------------------------------
  *  Helper functions for the above.
  * ----------------------------------------------------------------------------
@@ -1991,6 +2170,53 @@ reform_and_rewrite_tuple(HeapTuple tuple,
        heap_freetuple(copiedTuple);
 }
 
+/*
+ * Check visibility of the tuple.
+ */
+static bool
+SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
+                                          HeapTuple tuple,
+                                          OffsetNumber tupoffset)
+{
+       HeapScanDesc hscan = (HeapScanDesc) scan;
+
+       if (scan->rs_pageatatime)
+       {
+               /*
+                * In pageatatime mode, heapgetpage() already did visibility checks,
+                * so just look at the info it left in rs_vistuples[].
+                *
+                * We use a binary search over the known-sorted array.  Note: we could
+                * save some effort if we insisted that NextSampleTuple select tuples
+                * in increasing order, but it's not clear that there would be enough
+                * gain to justify the restriction.
+                */
+               int                     start = 0,
+                                       end = hscan->rs_ntuples - 1;
+
+               while (start <= end)
+               {
+                       int                     mid = (start + end) / 2;
+                       OffsetNumber curoffset = hscan->rs_vistuples[mid];
+
+                       if (tupoffset == curoffset)
+                               return true;
+                       else if (tupoffset < curoffset)
+                               end = mid - 1;
+                       else
+                               start = mid + 1;
+               }
+
+               return false;
+       }
+       else
+       {
+               /* Otherwise, we have to check the tuple individually. */
+               return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
+                                                                                       buffer);
+       }
+}
+
 
 /* ------------------------------------------------------------------------
  * Definition of the heap table access method.
@@ -2039,6 +2265,9 @@ static const TableAmRoutine heapam_methods = {
        .index_validate_scan = heapam_index_validate_scan,
 
        .relation_estimate_size = heapam_estimate_rel_size,
+
+       .scan_sample_next_block = heapam_scan_sample_next_block,
+       .scan_sample_next_tuple = heapam_scan_sample_next_tuple
 };
 
 
index 96660bf907f979a99a33639dfeb9d000ba279009..db937829d4caf6b0846f9eba6de66a10318da23f 100644 (file)
@@ -89,6 +89,9 @@ GetTableAmRoutine(Oid amhandler)
        Assert(routine->index_validate_scan != NULL);
        Assert(routine->relation_estimate_size != NULL);
 
+       Assert(routine->scan_sample_next_block != NULL);
+       Assert(routine->scan_sample_next_tuple != NULL);
+
        return routine;
 }
 
index 26f7de3e45d99a0089fd6bc639ea8e770162661d..b1fb9b320154e4669e4e3c60ebce64c5df044283 100644 (file)
@@ -26,7 +26,6 @@
 
 #include <math.h>
 
-#include "access/heapam.h"
 #include "access/relscan.h"
 #include "access/tsmapi.h"
 #include "catalog/pg_type.h"
@@ -56,7 +55,7 @@ static void system_beginsamplescan(SampleScanState *node,
                                           Datum *params,
                                           int nparams,
                                           uint32 seed);
-static BlockNumber system_nextsampleblock(SampleScanState *node);
+static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
 static OffsetNumber system_nextsampletuple(SampleScanState *node,
                                           BlockNumber blockno,
                                           OffsetNumber maxoffset);
@@ -177,11 +176,9 @@ system_beginsamplescan(SampleScanState *node,
  * Select next block to sample.
  */
 static BlockNumber
-system_nextsampleblock(SampleScanState *node)
+system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
 {
        SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
-       TableScanDesc scan = node->ss.ss_currentScanDesc;
-       HeapScanDesc hscan = (HeapScanDesc) scan;
        BlockNumber nextblock = sampler->nextblock;
        uint32          hashinput[2];
 
@@ -200,7 +197,7 @@ system_nextsampleblock(SampleScanState *node)
         * Loop over block numbers until finding suitable block or reaching end of
         * relation.
         */
-       for (; nextblock < hscan->rs_nblocks; nextblock++)
+       for (; nextblock < nblocks; nextblock++)
        {
                uint32          hash;
 
@@ -212,7 +209,7 @@ system_nextsampleblock(SampleScanState *node)
                        break;
        }
 
-       if (nextblock < hscan->rs_nblocks)
+       if (nextblock < nblocks)
        {
                /* Found a suitable block; remember where we should start next time */
                sampler->nextblock = nextblock + 1;
index 817e4ca41fc1fb0eed3a7fb1959b8fa3ccbbcc8b..14a0a6357d44f4bc1cd6e746f1ba00a29de77ba0 100644 (file)
@@ -14,7 +14,6 @@
  */
 #include "postgres.h"
 
-#include "access/heapam.h"
 #include "access/relscan.h"
 #include "access/tableam.h"
 #include "access/tsmapi.h"
@@ -29,9 +28,7 @@
 
 static TupleTableSlot *SampleNext(SampleScanState *node);
 static void tablesample_init(SampleScanState *scanstate);
-static HeapTuple tablesample_getnext(SampleScanState *scanstate);
-static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
-                                  HeapScanDesc scan);
+static TupleTableSlot *tablesample_getnext(SampleScanState *scanstate);
 
 /* ----------------------------------------------------------------
  *                                             Scan Support
@@ -47,10 +44,6 @@ static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
 static TupleTableSlot *
 SampleNext(SampleScanState *node)
 {
-       HeapTuple       tuple;
-       TupleTableSlot *slot;
-       HeapScanDesc hscan;
-
        /*
         * if this is first call within a scan, initialize
         */
@@ -60,19 +53,7 @@ SampleNext(SampleScanState *node)
        /*
         * get the next tuple, and store it in our result slot
         */
-       tuple = tablesample_getnext(node);
-
-       slot = node->ss.ss_ScanTupleSlot;
-       hscan = (HeapScanDesc) node->ss.ss_currentScanDesc;
-
-       if (tuple)
-               ExecStoreBufferHeapTuple(tuple, /* tuple to store */
-                                                                slot,  /* slot to store in */
-                                                                hscan->rs_cbuf); /* tuple's buffer */
-       else
-               ExecClearTuple(slot);
-
-       return slot;
+       return tablesample_getnext(node);
 }
 
 /*
@@ -237,6 +218,9 @@ ExecReScanSampleScan(SampleScanState *node)
 {
        /* Remember we need to do BeginSampleScan again (if we did it at all) */
        node->begun = false;
+       node->done = false;
+       node->haveblock = false;
+       node->donetuples = 0;
 
        ExecScanReScan(&node->ss);
 }
@@ -258,6 +242,7 @@ tablesample_init(SampleScanState *scanstate)
        int                     i;
        ListCell   *arg;
 
+       scanstate->donetuples = 0;
        params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
 
        i = 0;
@@ -345,225 +330,49 @@ tablesample_init(SampleScanState *scanstate)
 
 /*
  * Get next tuple from TABLESAMPLE method.
- *
- * Note: an awful lot of this is copied-and-pasted from heapam.c.  It would
- * perhaps be better to refactor to share more code.
  */
-static HeapTuple
+static TupleTableSlot *
 tablesample_getnext(SampleScanState *scanstate)
 {
-       TsmRoutine *tsm = scanstate->tsmroutine;
        TableScanDesc scan = scanstate->ss.ss_currentScanDesc;
-       HeapScanDesc hscan = (HeapScanDesc) scan;
-       HeapTuple       tuple = &(hscan->rs_ctup);
-       Snapshot        snapshot = scan->rs_snapshot;
-       bool            pagemode = scan->rs_pageatatime;
-       BlockNumber blockno;
-       Page            page;
-       bool            all_visible;
-       OffsetNumber maxoffset;
-
-       if (!hscan->rs_inited)
-       {
-               /*
-                * return null immediately if relation is empty
-                */
-               if (hscan->rs_nblocks == 0)
-               {
-                       Assert(!BufferIsValid(hscan->rs_cbuf));
-                       tuple->t_data = NULL;
-                       return NULL;
-               }
-               if (tsm->NextSampleBlock)
-               {
-                       blockno = tsm->NextSampleBlock(scanstate);
-                       if (!BlockNumberIsValid(blockno))
-                       {
-                               tuple->t_data = NULL;
-                               return NULL;
-                       }
-               }
-               else
-                       blockno = hscan->rs_startblock;
-               Assert(blockno < hscan->rs_nblocks);
-               heapgetpage(scan, blockno);
-               hscan->rs_inited = true;
-       }
-       else
-       {
-               /* continue from previously returned page/tuple */
-               blockno = hscan->rs_cblock;     /* current page */
-       }
+       TupleTableSlot *slot = scanstate->ss.ss_ScanTupleSlot;
 
-       /*
-        * When not using pagemode, we must lock the buffer during tuple
-        * visibility checks.
-        */
-       if (!pagemode)
-               LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+       ExecClearTuple(slot);
 
-       page = (Page) BufferGetPage(hscan->rs_cbuf);
-       all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
-       maxoffset = PageGetMaxOffsetNumber(page);
+       if (scanstate->done)
+               return NULL;
 
        for (;;)
        {
-               OffsetNumber tupoffset;
-               bool            finished;
-
-               CHECK_FOR_INTERRUPTS();
-
-               /* Ask the tablesample method which tuples to check on this page. */
-               tupoffset = tsm->NextSampleTuple(scanstate,
-                                                                                blockno,
-                                                                                maxoffset);
-
-               if (OffsetNumberIsValid(tupoffset))
+               if (!scanstate->haveblock)
                {
-                       ItemId          itemid;
-                       bool            visible;
-
-                       /* Skip invalid tuple pointers. */
-                       itemid = PageGetItemId(page, tupoffset);
-                       if (!ItemIdIsNormal(itemid))
-                               continue;
-
-                       tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
-                       tuple->t_len = ItemIdGetLength(itemid);
-                       ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
-
-                       if (all_visible)
-                               visible = true;
-                       else
-                               visible = SampleTupleVisible(tuple, tupoffset, hscan);
-
-                       /* in pagemode, heapgetpage did this for us */
-                       if (!pagemode)
-                               CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
-                                                                                               hscan->rs_cbuf, snapshot);
-
-                       if (visible)
-                       {
-                               /* Found visible tuple, return it. */
-                               if (!pagemode)
-                                       LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
-                               break;
-                       }
-                       else
+                       if (!table_scan_sample_next_block(scan, scanstate))
                        {
-                               /* Try next tuple from same page. */
-                               continue;
-                       }
-               }
+                               scanstate->haveblock = false;
+                               scanstate->done = true;
 
-               /*
-                * if we get here, it means we've exhausted the items on this page and
-                * it's time to move to the next.
-                */
-               if (!pagemode)
-                       LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+                               /* exhausted relation */
+                               return NULL;
+                       }
 
-               if (tsm->NextSampleBlock)
-               {
-                       blockno = tsm->NextSampleBlock(scanstate);
-                       Assert(!scan->rs_syncscan);
-                       finished = !BlockNumberIsValid(blockno);
+                       scanstate->haveblock = true;
                }
-               else
-               {
-                       /* Without NextSampleBlock, just do a plain forward seqscan. */
-                       blockno++;
-                       if (blockno >= hscan->rs_nblocks)
-                               blockno = 0;
 
+               if (!table_scan_sample_next_tuple(scan, scanstate, slot))
+               {
                        /*
-                        * Report our new scan position for synchronization purposes.
-                        *
-                        * Note: we do this before checking for end of scan so that the
-                        * final state of the position hint is back at the start of the
-                        * rel.  That's not strictly necessary, but otherwise when you run
-                        * the same query multiple times the starting position would shift
-                        * a little bit backwards on every invocation, which is confusing.
-                        * We don't guarantee any specific ordering in general, though.
+                        * If we get here, it means we've exhausted the items on this page
+                        * and it's time to move to the next.
                         */
-                       if (scan->rs_syncscan)
-                               ss_report_location(scan->rs_rd, blockno);
-
-                       finished = (blockno == hscan->rs_startblock);
+                       scanstate->haveblock = false;
+                       continue;
                }
 
-               /*
-                * Reached end of scan?
-                */
-               if (finished)
-               {
-                       if (BufferIsValid(hscan->rs_cbuf))
-                               ReleaseBuffer(hscan->rs_cbuf);
-                       hscan->rs_cbuf = InvalidBuffer;
-                       hscan->rs_cblock = InvalidBlockNumber;
-                       tuple->t_data = NULL;
-                       hscan->rs_inited = false;
-                       return NULL;
-               }
-
-               Assert(blockno < hscan->rs_nblocks);
-               heapgetpage(scan, blockno);
-
-               /* Re-establish state for new page */
-               if (!pagemode)
-                       LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
-
-               page = (Page) BufferGetPage(hscan->rs_cbuf);
-               all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
-               maxoffset = PageGetMaxOffsetNumber(page);
+               /* Found visible tuple, return it. */
+               break;
        }
 
-       /* Count successfully-fetched tuples as heap fetches */
-       pgstat_count_heap_getnext(scan->rs_rd);
-
-       return &(hscan->rs_ctup);
-}
+       scanstate->donetuples++;
 
-/*
- * Check visibility of the tuple.
- */
-static bool
-SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
-{
-       if (scan->rs_base.rs_pageatatime)
-       {
-               /*
-                * In pageatatime mode, heapgetpage() already did visibility checks,
-                * so just look at the info it left in rs_vistuples[].
-                *
-                * We use a binary search over the known-sorted array.  Note: we could
-                * save some effort if we insisted that NextSampleTuple select tuples
-                * in increasing order, but it's not clear that there would be enough
-                * gain to justify the restriction.
-                */
-               int                     start = 0,
-                                       end = scan->rs_ntuples - 1;
-
-               while (start <= end)
-               {
-                       int                     mid = (start + end) / 2;
-                       OffsetNumber curoffset = scan->rs_vistuples[mid];
-
-                       if (tupoffset == curoffset)
-                               return true;
-                       else if (tupoffset < curoffset)
-                               end = mid - 1;
-                       else
-                               start = mid + 1;
-               }
-
-               return false;
-       }
-       else
-       {
-               /* Otherwise, we have to check the tuple individually. */
-               return HeapTupleSatisfiesVisibility(tuple,
-                                                                                       scan->rs_base.rs_snapshot,
-                                                                                       scan->rs_cbuf);
-       }
+       return slot;
 }
index 0151f17af42e544f2133ffe9a8b84fd17603c9cd..34813e40991e915e2a7b79c92d9206a2f105435a 100644 (file)
@@ -30,6 +30,7 @@ extern bool synchronize_seqscans;
 struct BulkInsertStateData;
 struct IndexInfo;
 struct IndexBuildCallback;
+struct SampleScanState;
 struct VacuumParams;
 struct ValidateIndexState;
 
@@ -519,6 +520,56 @@ typedef struct TableAmRoutine
                                                                                   BlockNumber *pages, double *tuples,
                                                                                   double *allvisfrac);
 
+
+       /* ------------------------------------------------------------------------
+        * Executor related functions.
+        * ------------------------------------------------------------------------
+        */
+
+       /*
+        * Acquire the next block in a sample scan. Return false if the sample
+        * scan is finished, true otherwise.
+        *
+        * Typically this will first determine the target block by call the
+        * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
+        * perform a sequential scan over all blocks.  The determined block is
+        * then typically read and pinned.
+        *
+        * As the TsmRoutine interface is block based, a block needs to be passed
+        * to NextSampleBlock(). If that's not appropriate for an AM, it
+        * internally needs to perform mapping between the internal and a block
+        * based representation.
+        *
+        * Note that it's not acceptable to hold deadlock prone resources such as
+        * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
+        * block - the tuple is likely to be returned to an upper query node, and
+        * the next call could be off a long while. Holding buffer pins etc is
+        * obviously OK.
+        *
+        * Currently it is required to implement this interface, as there's no
+        * alternative way (contrary e.g. to bitmap scans) to implement sample
+        * scans. If infeasible to implement the AM may raise an error.
+        */
+       bool            (*scan_sample_next_block) (TableScanDesc scan,
+                                                                                  struct SampleScanState *scanstate);
+
+       /*
+        * This callback, only called after scan_sample_next_block has returned
+        * true, should determine the next tuple to be returned from the selected
+        * block using the TsmRoutine's NextSampleTuple() callback.
+        *
+        * The callback needs to perform visibility checks, and only return
+        * visible tuples. That obviously can mean calling NextSampletuple()
+        * multiple times.
+        *
+        * The TsmRoutine interface assumes that there's a maximum offset on a
+        * given page, so if that doesn't apply to an AM, it needs to emulate that
+        * assumption somehow.
+        */
+       bool            (*scan_sample_next_tuple) (TableScanDesc scan,
+                                                                                  struct SampleScanState *scanstate,
+                                                                                  TupleTableSlot *slot);
+
 } TableAmRoutine;
 
 
@@ -1339,6 +1390,43 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths,
 }
 
 
+/* ----------------------------------------------------------------------------
+ * Executor related functionality
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Acquire the next block in a sample scan. Returns false if the sample scan
+ * is finished, true otherwise.
+ *
+ * This will call the TsmRoutine's NextSampleBlock() callback if necessary
+ * (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
+ * underlying relation.
+ */
+static inline bool
+table_scan_sample_next_block(TableScanDesc scan,
+                                                        struct SampleScanState *scanstate)
+{
+       return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
+}
+
+/*
+ * Fetch the next sample tuple into `slot` and return true if a visible tuple
+ * was found, false otherwise. table_scan_sample_next_block() needs to
+ * previously have selected a block (i.e. returned true).
+ *
+ * This will call the TsmRoutine's NextSampleTuple() callback.
+ */
+static inline bool
+table_scan_sample_next_tuple(TableScanDesc scan,
+                                                        struct SampleScanState *scanstate,
+                                                        TupleTableSlot *slot)
+{
+       return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate,
+                                                                                                                  slot);
+}
+
+
 /* ----------------------------------------------------------------------------
  * Functions to make modifications a bit simpler.
  * ----------------------------------------------------------------------------
index a5c0b4cafecfac336864536b6dc0749d512f08bd..d3bdb754b5b8df71efe50fb95b9126fedf641b69 100644 (file)
@@ -34,7 +34,8 @@ typedef void (*BeginSampleScan_function) (SampleScanState *node,
                                                                                  int nparams,
                                                                                  uint32 seed);
 
-typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node);
+typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node,
+                                                                                                BlockNumber nblocks);
 
 typedef OffsetNumber (*NextSampleTuple_function) (SampleScanState *node,
                                                                                                  BlockNumber blockno,
index dbd7ed0363b9286f453c9a0745b70eb59981341d..437a191b4581757ca6ff682835240d4b5be02bb6 100644 (file)
@@ -1305,6 +1305,9 @@ typedef struct SampleScanState
        bool            use_pagemode;   /* use page-at-a-time visibility checking? */
        bool            begun;                  /* false means need to call BeginSampleScan */
        uint32          seed;                   /* random seed */
+       int64           donetuples;             /* number of tuples already returned */
+       bool            haveblock;              /* has a block for sampling been determined */
+       bool            done;                   /* exhausted all tuples? */
 } SampleScanState;
 
 /*