* ------------------------------------------------------------------------
*/
+static bool
+heapam_scan_bitmap_next_block(TableScanDesc scan,
+ TBMIterateResult *tbmres)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+ BlockNumber page = tbmres->blockno;
+ Buffer buffer;
+ Snapshot snapshot;
+ int ntup;
+
+ hscan->rs_cindex = 0;
+ hscan->rs_ntuples = 0;
+
+ /*
+ * Ignore any claimed entries past what we think is the end of the
+ * relation. It may have been extended after the start of our scan (we
+ * only hold an AccessShareLock, and it could be inserts from this
+ * backend).
+ */
+ if (page >= hscan->rs_nblocks)
+ return false;
+
+ /*
+ * Acquire pin on the target heap page, trading in any pin we held before.
+ */
+ hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
+ scan->rs_rd,
+ page);
+ hscan->rs_cblock = page;
+ buffer = hscan->rs_cbuf;
+ snapshot = scan->rs_snapshot;
+
+ ntup = 0;
+
+ /*
+ * Prune and repair fragmentation for the whole page, if possible.
+ */
+ heap_page_prune_opt(scan->rs_rd, buffer);
+
+ /*
+ * We must hold share lock on the buffer content while examining tuple
+ * visibility. Afterwards, however, the tuples we have found to be
+ * visible are guaranteed good as long as we hold the buffer pin.
+ */
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+ /*
+ * We need two separate strategies for lossy and non-lossy cases.
+ */
+ if (tbmres->ntuples >= 0)
+ {
+ /*
+ * Bitmap is non-lossy, so we just look through the offsets listed in
+ * tbmres; but we have to follow any HOT chain starting at each such
+ * offset.
+ */
+ int curslot;
+
+ for (curslot = 0; curslot < tbmres->ntuples; curslot++)
+ {
+ OffsetNumber offnum = tbmres->offsets[curslot];
+ ItemPointerData tid;
+ HeapTupleData heapTuple;
+
+ ItemPointerSet(&tid, page, offnum);
+ if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
+ &heapTuple, NULL, true))
+ hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
+ }
+ }
+ else
+ {
+ /*
+ * Bitmap is lossy, so we must examine each item pointer on the page.
+ * But we can ignore HOT chains, since we'll check each tuple anyway.
+ */
+ Page dp = (Page) BufferGetPage(buffer);
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
+ OffsetNumber offnum;
+
+ for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId lp;
+ HeapTupleData loctup;
+ bool valid;
+
+ lp = PageGetItemId(dp, offnum);
+ if (!ItemIdIsNormal(lp))
+ continue;
+ loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+ loctup.t_len = ItemIdGetLength(lp);
+ loctup.t_tableOid = scan->rs_rd->rd_id;
+ ItemPointerSet(&loctup.t_self, page, offnum);
+ valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
+ if (valid)
+ {
+ hscan->rs_vistuples[ntup++] = offnum;
+ PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
+ }
+ CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
+ buffer, snapshot);
+ }
+ }
+
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ Assert(ntup <= MaxHeapTuplesPerPage);
+ hscan->rs_ntuples = ntup;
+
+ return ntup > 0;
+}
+
+static bool
+heapam_scan_bitmap_next_tuple(TableScanDesc scan,
+ TBMIterateResult *tbmres,
+ TupleTableSlot *slot)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+ OffsetNumber targoffset;
+ Page dp;
+ ItemId lp;
+
+ /*
+ * Out of range? If so, nothing more to look at on this page
+ */
+ if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
+ return false;
+
+ targoffset = hscan->rs_vistuples[hscan->rs_cindex];
+ dp = (Page) BufferGetPage(hscan->rs_cbuf);
+ lp = PageGetItemId(dp, targoffset);
+ Assert(ItemIdIsNormal(lp));
+
+ hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+ hscan->rs_ctup.t_len = ItemIdGetLength(lp);
+ hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
+ ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
+
+ pgstat_count_heap_fetch(scan->rs_rd);
+
+ /*
+ * Set up the result slot to point to this tuple. Note that the slot
+ * acquires a pin on the buffer.
+ */
+ ExecStoreBufferHeapTuple(&hscan->rs_ctup,
+ slot,
+ hscan->rs_cbuf);
+
+ hscan->rs_cindex++;
+
+ return true;
+}
+
static bool
heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
{
.relation_estimate_size = heapam_estimate_rel_size,
+ .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
+ .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
.scan_sample_next_block = heapam_scan_sample_next_block,
.scan_sample_next_tuple = heapam_scan_sample_next_tuple
};
Assert(routine->index_validate_scan != NULL);
Assert(routine->relation_estimate_size != NULL);
+ /* optional, but one callback implies presence of the other */
+ Assert((routine->scan_bitmap_next_block == NULL) ==
+ (routine->scan_bitmap_next_tuple == NULL));
Assert(routine->scan_sample_next_block != NULL);
Assert(routine->scan_sample_next_tuple != NULL);
#include <math.h>
-#include "access/heapam.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
-static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
static inline void BitmapDoneInitializingSharedState(
ParallelBitmapHeapState *pstate);
static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
{
ExprContext *econtext;
TableScanDesc scan;
- HeapScanDesc hscan;
TIDBitmap *tbm;
TBMIterator *tbmiterator = NULL;
TBMSharedIterator *shared_tbmiterator = NULL;
TBMIterateResult *tbmres;
- OffsetNumber targoffset;
TupleTableSlot *slot;
ParallelBitmapHeapState *pstate = node->pstate;
dsa_area *dsa = node->ss.ps.state->es_query_dsa;
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
scan = node->ss.ss_currentScanDesc;
- hscan = (HeapScanDesc) scan;
tbm = node->tbm;
if (pstate == NULL)
tbmiterator = node->tbmiterator;
for (;;)
{
- Page dp;
- ItemId lp;
+ bool skip_fetch;
CHECK_FOR_INTERRUPTS();
BitmapAdjustPrefetchIterator(node, tbmres);
- /*
- * Ignore any claimed entries past what we think is the end of the
- * relation. (This is probably not necessary given that we got at
- * least AccessShareLock on the table before performing any of the
- * indexscans, but let's be safe.)
- */
- if (tbmres->blockno >= hscan->rs_nblocks)
- {
- node->tbmres = tbmres = NULL;
- continue;
- }
-
/*
* We can skip fetching the heap page if we don't need any fields
* from the heap, and the bitmap entries don't need rechecking,
* and all tuples on the page are visible to our transaction.
+ *
+ * XXX: It's a layering violation that we do these checks above
+ * tableam; they should probably be moved below it at some point.
*/
- node->skip_fetch = (node->can_skip_fetch &&
- !tbmres->recheck &&
- VM_ALL_VISIBLE(node->ss.ss_currentRelation,
- tbmres->blockno,
- &node->vmbuffer));
+ skip_fetch = (node->can_skip_fetch &&
+ !tbmres->recheck &&
+ VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+ tbmres->blockno,
+ &node->vmbuffer));
- if (node->skip_fetch)
+ if (skip_fetch)
{
+ /* can't be lossy in the skip_fetch case */
+ Assert(tbmres->ntuples >= 0);
+
/*
* The number of tuples on this page is put into
- * scan->rs_ntuples; note we don't fill scan->rs_vistuples.
+ * node->return_empty_tuples.
*/
- hscan->rs_ntuples = tbmres->ntuples;
+ node->return_empty_tuples = tbmres->ntuples;
}
- else
+ else if (!table_scan_bitmap_next_block(scan, tbmres))
{
- /*
- * Fetch the current heap page and identify candidate tuples.
- */
- bitgetpage(hscan, tbmres);
+ /* AM doesn't think this block is valid, skip */
+ continue;
}
if (tbmres->ntuples >= 0)
else
node->lossy_pages++;
- /*
- * Set rs_cindex to first slot to examine
- */
- hscan->rs_cindex = 0;
-
/* Adjust the prefetch target */
BitmapAdjustPrefetchTarget(node);
}
else
{
/*
- * Continuing in previously obtained page; advance rs_cindex
+ * Continuing in previously obtained page.
*/
- hscan->rs_cindex++;
#ifdef USE_PREFETCH
#endif /* USE_PREFETCH */
}
- /*
- * Out of range? If so, nothing more to look at on this page
- */
- if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
- {
- node->tbmres = tbmres = NULL;
- continue;
- }
-
/*
* We issue prefetch requests *after* fetching the current page to try
* to avoid having prefetching interfere with the main I/O. Also, this
* should happen only when we have determined there is still something
* to do on the current page, else we may uselessly prefetch the same
* page we are just about to request for real.
+ *
+ * XXX: It's a layering violation that we do these checks above
+ * tableam; they should probably be moved below it at some point.
*/
BitmapPrefetch(node, scan);
- if (node->skip_fetch)
+ if (node->return_empty_tuples > 0)
{
/*
* If we don't have to fetch the tuple, just return nulls.
*/
ExecStoreAllNullTuple(slot);
+
+ if (--node->return_empty_tuples == 0)
+ {
+ /* no more tuples to return in the next round */
+ node->tbmres = tbmres = NULL;
+ }
}
else
{
/*
- * Okay to fetch the tuple.
- */
- targoffset = hscan->rs_vistuples[hscan->rs_cindex];
- dp = (Page) BufferGetPage(hscan->rs_cbuf);
- lp = PageGetItemId(dp, targoffset);
- Assert(ItemIdIsNormal(lp));
-
- hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
- hscan->rs_ctup.t_len = ItemIdGetLength(lp);
- hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
- ItemPointerSet(&hscan->rs_ctup.t_self, tbmres->blockno, targoffset);
-
- pgstat_count_heap_fetch(scan->rs_rd);
-
- /*
- * Set up the result slot to point to this tuple. Note that the
- * slot acquires a pin on the buffer.
+ * Attempt to fetch the tuple from the AM.
*/
- ExecStoreBufferHeapTuple(&hscan->rs_ctup,
- slot,
- hscan->rs_cbuf);
+ if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
+ {
+ /* nothing more to look at on this page */
+ node->tbmres = tbmres = NULL;
+ continue;
+ }
/*
* If we are using lossy info, we have to recheck the qual
return ExecClearTuple(slot);
}
-/*
- * bitgetpage - subroutine for BitmapHeapNext()
- *
- * This routine reads and pins the specified page of the relation, then
- * builds an array indicating which tuples on the page are both potentially
- * interesting according to the bitmap, and visible according to the snapshot.
- */
-static void
-bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
-{
- BlockNumber page = tbmres->blockno;
- Buffer buffer;
- Snapshot snapshot;
- int ntup;
-
- /*
- * Acquire pin on the target heap page, trading in any pin we held before.
- */
- Assert(page < scan->rs_nblocks);
-
- scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
- scan->rs_base.rs_rd,
- page);
- buffer = scan->rs_cbuf;
- snapshot = scan->rs_base.rs_snapshot;
-
- ntup = 0;
-
- /*
- * Prune and repair fragmentation for the whole page, if possible.
- */
- heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
-
- /*
- * We must hold share lock on the buffer content while examining tuple
- * visibility. Afterwards, however, the tuples we have found to be
- * visible are guaranteed good as long as we hold the buffer pin.
- */
- LockBuffer(buffer, BUFFER_LOCK_SHARE);
-
- /*
- * We need two separate strategies for lossy and non-lossy cases.
- */
- if (tbmres->ntuples >= 0)
- {
- /*
- * Bitmap is non-lossy, so we just look through the offsets listed in
- * tbmres; but we have to follow any HOT chain starting at each such
- * offset.
- */
- int curslot;
-
- for (curslot = 0; curslot < tbmres->ntuples; curslot++)
- {
- OffsetNumber offnum = tbmres->offsets[curslot];
- ItemPointerData tid;
- HeapTupleData heapTuple;
-
- ItemPointerSet(&tid, page, offnum);
- if (heap_hot_search_buffer(&tid, scan->rs_base.rs_rd, buffer,
- snapshot, &heapTuple, NULL, true))
- scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
- }
- }
- else
- {
- /*
- * Bitmap is lossy, so we must examine each item pointer on the page.
- * But we can ignore HOT chains, since we'll check each tuple anyway.
- */
- Page dp = (Page) BufferGetPage(buffer);
- OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
- OffsetNumber offnum;
-
- for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
- {
- ItemId lp;
- HeapTupleData loctup;
- bool valid;
-
- lp = PageGetItemId(dp, offnum);
- if (!ItemIdIsNormal(lp))
- continue;
- loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
- loctup.t_len = ItemIdGetLength(lp);
- loctup.t_tableOid = scan->rs_base.rs_rd->rd_id;
- ItemPointerSet(&loctup.t_self, page, offnum);
- valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
- if (valid)
- {
- scan->rs_vistuples[ntup++] = offnum;
- PredicateLockTuple(scan->rs_base.rs_rd, &loctup, snapshot);
- }
- CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
- &loctup, buffer, snapshot);
- }
- }
-
- LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-
- Assert(ntup <= MaxHeapTuplesPerPage);
- scan->rs_ntuples = ntup;
-}
-
/*
* BitmapDoneInitializingSharedState - Shared state is initialized
*
scanstate->tbm = NULL;
scanstate->tbmiterator = NULL;
scanstate->tbmres = NULL;
- scanstate->skip_fetch = false;
+ scanstate->return_empty_tuples = 0;
scanstate->vmbuffer = InvalidBuffer;
scanstate->pvmbuffer = InvalidBuffer;
scanstate->exact_pages = 0;
scanstate->ss.ss_currentRelation = currentRelation;
- /*
- * Even though we aren't going to do a conventional seqscan, it is useful
- * to create a HeapScanDesc --- most of the fields in it are usable.
- */
scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation,
estate->es_snapshot,
0,
info->amsearchnulls = amroutine->amsearchnulls;
info->amcanparallel = amroutine->amcanparallel;
info->amhasgettuple = (amroutine->amgettuple != NULL);
- info->amhasgetbitmap = (amroutine->amgetbitmap != NULL);
+ info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+ relation->rd_tableam->scan_bitmap_next_block != NULL;
info->amcostestimate = amroutine->amcostestimate;
Assert(info->amcostestimate != NULL);
struct IndexInfo;
struct IndexBuildCallback;
struct SampleScanState;
+struct TBMIterateResult;
struct VacuumParams;
struct ValidateIndexState;
*/
/*
- * Acquire the next block in a sample scan. Return false if the sample
- * scan is finished, true otherwise.
+ * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
+ * of a bitmap table scan. `scan` was started via table_beginscan_bm().
+ * Return false if there are no tuples to be found on the page, true
+ * otherwise.
+ *
+ * This will typically read and pin the target block, and do the necessary
+ * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
+ * make sense to perform tuple visibility checks at this time). For some
+ * AMs it will make more sense to do all the work referencing `tbmres`
+ * contents here; for others it might be better to defer more work to
+ * scan_bitmap_next_tuple.
+ *
+ * If `tbmres->ntuples` is -1, the bitmap is lossy for this block and all
+ * visible tuples on the page have to be returned, otherwise only the tuples
+ * at the offsets in `tbmres->offsets` need to be returned.
+ *
+ * XXX: Currently this may only be implemented if the AM uses md.c as its
+ * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
+ * blockids directly to the underlying storage. nodeBitmapHeapscan.c
+ * performs prefetching directly using that interface. This probably
+ * needs to be rectified at a later point.
+ *
+ * XXX: Currently this may only be implemented if the AM uses the
+ * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
+ * perform prefetching. This probably needs to be rectified at a later
+ * point.
+ *
+ * Optional callback, but either both scan_bitmap_next_block and
+ * scan_bitmap_next_tuple need to exist, or neither.
+ */
+ bool (*scan_bitmap_next_block) (TableScanDesc scan,
+ struct TBMIterateResult *tbmres);
+
+ /*
+ * Fetch the next tuple of a bitmap table scan into `slot` and return true
+ * if a visible tuple was found, false otherwise.
+ *
+ * For some AMs it will make more sense to do all the work referencing
+ * `tbmres` contents in scan_bitmap_next_block; for others it might be
+ * better to defer more work to this callback.
+ *
+ * Optional callback, but either both scan_bitmap_next_block and
+ * scan_bitmap_next_tuple need to exist, or neither.
+ */
+ bool (*scan_bitmap_next_tuple) (TableScanDesc scan,
+ struct TBMIterateResult *tbmres,
+ TupleTableSlot *slot);
+
+ /*
+ * Prepare to fetch tuples from the next block in a sample scan. Return
+ * false if the sample scan is finished, true otherwise. `scan` was
+ * started via table_beginscan_sampling().
*
* Typically this will first determine the target block by calling the
* TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
*/
/*
- * Acquire the next block in a sample scan. Returns false if the sample scan
- * is finished, true otherwise.
+ * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
+ * a bitmap table scan. `scan` needs to have been started via
+ * table_beginscan_bm(). Returns false if there are no tuples to be found on
+ * the page, true otherwise.
+ *
+ * Note that this is an optional callback; it should only be used after
+ * verifying its presence (e.g. at plan time).
+ */
+static inline bool
+table_scan_bitmap_next_block(TableScanDesc scan,
+ struct TBMIterateResult *tbmres)
+{
+ return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
+ tbmres);
+}
+
+/*
+ * Fetch the next tuple of a bitmap table scan into `slot` and return true if
+ * a visible tuple was found, false otherwise.
+ * table_scan_bitmap_next_block() needs to previously have selected a
+ * block (i.e. returned true), and no previous
+ * table_scan_bitmap_next_tuple() for the same block may have
+ * returned false.
+ */
+static inline bool
+table_scan_bitmap_next_tuple(TableScanDesc scan,
+ struct TBMIterateResult *tbmres,
+ TupleTableSlot *slot)
+{
+ return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
+ tbmres,
+ slot);
+}
+
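+/*
+ * Illustrative sketch, not part of this patch: the calling pattern the two
+ * wrappers above expect, loosely modeled on BitmapHeapNext(). The helper
+ * name is made up, and prefetching, qual rechecking and the skip-fetch
+ * optimization are omitted. The scan is assumed to have been started with
+ * table_beginscan_bm() and the iterator with tbm_begin_iterate() (both
+ * TBMIterator and tbm_iterate() are declared in access/tidbitmap.h).
+ */
+#ifdef NOT_USED
+static void
+example_consume_bitmap_scan(TableScanDesc scan, TBMIterator *tbmiterator,
+                            TupleTableSlot *slot)
+{
+    TBMIterateResult *tbmres;
+
+    while ((tbmres = tbm_iterate(tbmiterator)) != NULL)
+    {
+        /* Let the AM pin the block and collect the candidate tuples. */
+        if (!table_scan_bitmap_next_block(scan, tbmres))
+            continue;           /* no visible tuples on this block */
+
+        /* Drain the block, one visible tuple per call. */
+        while (table_scan_bitmap_next_tuple(scan, tbmres, slot))
+        {
+            /* process the tuple in `slot`; recheck quals if tbmres->recheck */
+        }
+    }
+}
+#endif                          /* NOT_USED */
+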
+/*
+ * Prepare to fetch tuples from the next block in a sample scan. Returns false
+ * if the sample scan is finished, true otherwise. `scan` needs to have been
+ * started via table_beginscan_sampling().
*
* This will call the TsmRoutine's NextSampleBlock() callback if necessary
* (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
/*
* Fetch the next sample tuple into `slot` and return true if a visible tuple
* was found, false otherwise. table_scan_sample_next_block() needs to
- * previously have selected a block (i.e. returned true).
+ * previously have selected a block (i.e. returned true), and no previous
+ * table_scan_sample_next_tuple() for the same block may have returned false.
*
* This will call the TsmRoutine's NextSampleTuple() callback.
*/
* tbmiterator iterator for scanning current pages
* tbmres current-page data
* can_skip_fetch can we potentially skip tuple fetches in this scan?
- * skip_fetch are we skipping tuple fetches on this page?
+ * return_empty_tuples number of empty tuples to return
* vmbuffer buffer for visibility-map lookups
* pvmbuffer ditto, for prefetched pages
* exact_pages total number of exact pages retrieved
TBMIterator *tbmiterator;
TBMIterateResult *tbmres;
bool can_skip_fetch;
- bool skip_fetch;
+ int return_empty_tuples;
Buffer vmbuffer;
Buffer pvmbuffer;
long exact_pages;
typedef struct TBMSharedIterator TBMSharedIterator;
/* Result structure for tbm_iterate */
-typedef struct
+typedef struct TBMIterateResult
{
BlockNumber blockno; /* page number containing tuples */
int ntuples; /* -1 indicates lossy result */