Extend index AM API for parallel index scans.
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 88f73e8241e5c8a521e9f9209c49f8041f9336d4..ba27c1e86d9f64de25aa2db6cd8da9b31eb6453b 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -3,7 +3,7 @@
  * indexam.c
  *       general index access method routines
  *
- * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  *             index_insert    - insert an index tuple into a relation
  *             index_markpos   - mark a scan position
  *             index_restrpos  - restore a scan position
- *             index_getnext   - get the next tuple from a scan
+ *             index_parallelscan_estimate - estimate shared memory for parallel scan
+ *             index_parallelscan_initialize - initialize parallel scan
+ *             index_parallelrescan  - (re)start a parallel scan of an index
+ *             index_beginscan_parallel - join parallel index scan
+ *             index_getnext_tid       - get the next TID from a scan
+ *             index_fetch_heap                - get the scan's next heap tuple
+ *             index_getnext   - get the next heap tuple from a scan
  *             index_getbitmap - get all tuples from a scan
  *             index_bulk_delete       - bulk deletion of index tuples
  *             index_vacuum_cleanup    - post-deletion cleanup of an index
+ *             index_can_return        - does index support index-only scans?
  *             index_getprocid - get a support procedure OID
  *             index_getprocinfo - get a support procedure's lookup info
  *
 
 #include "postgres.h"
 
+#include "access/amapi.h"
 #include "access/relscan.h"
 #include "access/transam.h"
-#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
+#include "catalog/index.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
-#include "utils/relcache.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"
 
 
 /* ----------------------------------------------------------------
  *                                     macros used in index_ routines
+ *
+ * Note: the ReindexIsProcessingIndex() check in RELATION_CHECKS is there
+ * to check that we don't try to scan or do retail insertions into an index
+ * that is currently being rebuilt or pending rebuild.  This helps to catch
+ * things that don't work when reindexing system catalogs.  The assertion
+ * doesn't prevent the actual rebuild because we don't use RELATION_CHECKS
+ * when calling the index AM's ambuild routine, and there is no reason for
+ * ambuild to call its subsidiary routines through this file.
  * ----------------------------------------------------------------
  */
 #define RELATION_CHECKS \
 ( \
        AssertMacro(RelationIsValid(indexRelation)), \
-       AssertMacro(PointerIsValid(indexRelation->rd_am)) \
+       AssertMacro(PointerIsValid(indexRelation->rd_amroutine)), \
+       AssertMacro(!ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) \
 )
 
 #define SCAN_CHECKS \
 ( \
        AssertMacro(IndexScanIsValid(scan)), \
        AssertMacro(RelationIsValid(scan->indexRelation)), \
-       AssertMacro(PointerIsValid(scan->indexRelation->rd_am)) \
+       AssertMacro(PointerIsValid(scan->indexRelation->rd_amroutine)) \
 )
 
-#define GET_REL_PROCEDURE(pname) \
+#define CHECK_REL_PROCEDURE(pname) \
 do { \
-       procedure = &indexRelation->rd_aminfo->pname; \
-       if (!OidIsValid(procedure->fn_oid)) \
-       { \
-               RegProcedure    procOid = indexRelation->rd_am->pname; \
-               if (!RegProcedureIsValid(procOid)) \
-                       elog(ERROR, "invalid %s regproc", CppAsString(pname)); \
-               fmgr_info_cxt(procOid, procedure, indexRelation->rd_indexcxt); \
-       } \
+       if (indexRelation->rd_amroutine->pname == NULL) \
+               elog(ERROR, "function %s is not defined for index %s", \
+                        CppAsString(pname), RelationGetRelationName(indexRelation)); \
 } while(0)
 
-#define GET_SCAN_PROCEDURE(pname) \
+#define CHECK_SCAN_PROCEDURE(pname) \
 do { \
-       procedure = &scan->indexRelation->rd_aminfo->pname; \
-       if (!OidIsValid(procedure->fn_oid)) \
-       { \
-               RegProcedure    procOid = scan->indexRelation->rd_am->pname; \
-               if (!RegProcedureIsValid(procOid)) \
-                       elog(ERROR, "invalid %s regproc", CppAsString(pname)); \
-               fmgr_info_cxt(procOid, procedure, scan->indexRelation->rd_indexcxt); \
-       } \
+       if (scan->indexRelation->rd_amroutine->pname == NULL) \
+               elog(ERROR, "function %s is not defined for index %s", \
+                        CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \
 } while(0)
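
For context on what these macros now rely on: rd_amroutine is filled in by the relcache from the AM's handler function, which under the amapi scheme returns a palloc'd IndexAmRoutine holding plain C function pointers. A minimal, illustrative handler might look like the sketch below; the "myam" names are invented and only a subset of the fields is shown.

    PG_FUNCTION_INFO_V1(myamhandler);

    Datum
    myamhandler(PG_FUNCTION_ARGS)
    {
        IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

        amroutine->amsupport = 1;
        amroutine->ampredlocks = false;

        /* myambuild, myaminsert, etc. are the AM's own functions (not shown) */
        amroutine->ambuild = myambuild;
        amroutine->aminsert = myaminsert;
        amroutine->ambeginscan = myambeginscan;
        amroutine->amrescan = myamrescan;
        amroutine->amgettuple = myamgettuple;
        amroutine->amendscan = myamendscan;

        /* optional entry points may be left NULL, as the checks in this file allow */
        amroutine->amcanreturn = NULL;
        amroutine->amestimateparallelscan = NULL;
        amroutine->aminitparallelscan = NULL;
        amroutine->amparallelrescan = NULL;

        PG_RETURN_POINTER(amroutine);
    }
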
 
 static IndexScanDesc index_beginscan_internal(Relation indexRelation,
-                                                int nkeys, int norderbys);
+                                                int nkeys, int norderbys, Snapshot snapshot,
+                                                ParallelIndexScanDesc pscan, bool temp_snap);
 
 
 /* ----------------------------------------------------------------
@@ -128,7 +137,7 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation,
  *             index_open - open an index relation by relation OID
  *
  *             If lockmode is not "NoLock", the specified kind of lock is
- *             obtained on the index.  (Generally, NoLock should only be
+ *             obtained on the index.  (Generally, NoLock should only be
  *             used if the caller knows it has some appropriate lock on the
  *             index already.)
  *
@@ -189,26 +198,17 @@ index_insert(Relation indexRelation,
                         Relation heapRelation,
                         IndexUniqueCheck checkUnique)
 {
-       FmgrInfo   *procedure;
-
        RELATION_CHECKS;
-       GET_REL_PROCEDURE(aminsert);
+       CHECK_REL_PROCEDURE(aminsert);
 
-       if (!(indexRelation->rd_am->ampredlocks))
+       if (!(indexRelation->rd_amroutine->ampredlocks))
                CheckForSerializableConflictIn(indexRelation,
                                                                           (HeapTuple) NULL,
                                                                           InvalidBuffer);
 
-       /*
-        * have the am's insert proc do all the work.
-        */
-       return DatumGetBool(FunctionCall6(procedure,
-                                                                         PointerGetDatum(indexRelation),
-                                                                         PointerGetDatum(values),
-                                                                         PointerGetDatum(isnull),
-                                                                         PointerGetDatum(heap_t_ctid),
-                                                                         PointerGetDatum(heapRelation),
-                                                                         Int32GetDatum((int32) checkUnique)));
+       return indexRelation->rd_amroutine->aminsert(indexRelation, values, isnull,
+                                                                                                heap_t_ctid, heapRelation,
+                                                                                                checkUnique);
 }
 
 /*
@@ -224,7 +224,7 @@ index_beginscan(Relation heapRelation,
 {
        IndexScanDesc scan;
 
-       scan = index_beginscan_internal(indexRelation, nkeys, norderbys);
+       scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false);
 
        /*
         * Save additional parameters into the scandesc.  Everything else was set
@@ -249,7 +249,7 @@ index_beginscan_bitmap(Relation indexRelation,
 {
        IndexScanDesc scan;
 
-       scan = index_beginscan_internal(indexRelation, nkeys, 0);
+       scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false);
 
        /*
         * Save additional parameters into the scandesc.  Everything else was set
@@ -265,16 +265,16 @@ index_beginscan_bitmap(Relation indexRelation,
  */
 static IndexScanDesc
 index_beginscan_internal(Relation indexRelation,
-                                                int nkeys, int norderbys)
+                                                int nkeys, int norderbys, Snapshot snapshot,
+                                                ParallelIndexScanDesc pscan, bool temp_snap)
 {
        IndexScanDesc scan;
-       FmgrInfo   *procedure;
 
        RELATION_CHECKS;
-       GET_REL_PROCEDURE(ambeginscan);
+       CHECK_REL_PROCEDURE(ambeginscan);
 
-       if (!(indexRelation->rd_am->ampredlocks))
-               PredicateLockRelation(indexRelation);
+       if (!(indexRelation->rd_amroutine->ampredlocks))
+               PredicateLockRelation(indexRelation, snapshot);
 
        /*
         * We hold a reference count to the relcache entry throughout the scan.
@@ -284,11 +284,11 @@ index_beginscan_internal(Relation indexRelation,
        /*
         * Tell the AM to open a scan.
         */
-       scan = (IndexScanDesc)
-               DatumGetPointer(FunctionCall3(procedure,
-                                                                         PointerGetDatum(indexRelation),
-                                                                         Int32GetDatum(nkeys),
-                                                                         Int32GetDatum(norderbys)));
+       scan = indexRelation->rd_amroutine->ambeginscan(indexRelation, nkeys,
+                                                                                                       norderbys);
+       /* Initialize information for parallel scan. */
+       scan->parallel_scan = pscan;
+       scan->xs_temp_snap = temp_snap;
 
        return scan;
 }
@@ -310,10 +310,8 @@ index_rescan(IndexScanDesc scan,
                         ScanKey keys, int nkeys,
                         ScanKey orderbys, int norderbys)
 {
-       FmgrInfo   *procedure;
-
        SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amrescan);
+       CHECK_SCAN_PROCEDURE(amrescan);
 
        Assert(nkeys == scan->numberOfKeys);
        Assert(norderbys == scan->numberOfOrderBys);
@@ -325,16 +323,12 @@ index_rescan(IndexScanDesc scan,
                scan->xs_cbuf = InvalidBuffer;
        }
 
-       scan->xs_next_hot = InvalidOffsetNumber;
+       scan->xs_continue_hot = false;
 
        scan->kill_prior_tuple = false;         /* for safety */
 
-       FunctionCall5(procedure,
-                                 PointerGetDatum(scan),
-                                 PointerGetDatum(keys),
-                                 Int32GetDatum(nkeys),
-                                 PointerGetDatum(orderbys),
-                                 Int32GetDatum(norderbys));
+       scan->indexRelation->rd_amroutine->amrescan(scan, keys, nkeys,
+                                                                                               orderbys, norderbys);
 }
 
 /* ----------------
@@ -344,10 +338,8 @@ index_rescan(IndexScanDesc scan,
 void
 index_endscan(IndexScanDesc scan)
 {
-       FmgrInfo   *procedure;
-
        SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amendscan);
+       CHECK_SCAN_PROCEDURE(amendscan);
 
        /* Release any held pin on a heap page */
        if (BufferIsValid(scan->xs_cbuf))
@@ -357,11 +349,14 @@ index_endscan(IndexScanDesc scan)
        }
 
        /* End the AM's scan */
-       FunctionCall1(procedure, PointerGetDatum(scan));
+       scan->indexRelation->rd_amroutine->amendscan(scan);
 
        /* Release index refcount acquired by index_beginscan */
        RelationDecrementReferenceCount(scan->indexRelation);
 
+       if (scan->xs_temp_snap)
+               UnregisterSnapshot(scan->xs_snapshot);
+
        /* Release the scan data structure itself */
        IndexScanEnd(scan);
 }
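
Putting the routines above together, a caller drives a plain (non-parallel) scan roughly as follows; the variable names and the recheck_quals() helper are illustrative, and scan-key construction is omitted.

    Relation        indexRel = index_open(indexOid, AccessShareLock);
    IndexScanDesc   scan;
    HeapTuple       tup;

    scan = index_beginscan(heapRel, indexRel, GetActiveSnapshot(), nkeys, 0);
    index_rescan(scan, scankeys, nkeys, NULL, 0);

    while ((tup = index_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* the caller, not this file, rechecks quals when the AM is lossy */
        if (scan->xs_recheck && !recheck_quals(tup))
            continue;
        /* ... process tup; its buffer pin is held until the next call ... */
    }

    index_endscan(scan);
    index_close(indexRel, AccessShareLock);
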
@@ -373,12 +368,10 @@ index_endscan(IndexScanDesc scan)
 void
 index_markpos(IndexScanDesc scan)
 {
-       FmgrInfo   *procedure;
-
        SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(ammarkpos);
+       CHECK_SCAN_PROCEDURE(ammarkpos);
 
-       FunctionCall1(procedure, PointerGetDatum(scan));
+       scan->indexRelation->rd_amroutine->ammarkpos(scan);
 }
 
 /* ----------------
@@ -393,276 +386,310 @@ index_markpos(IndexScanDesc scan)
  * returnable tuple in each HOT chain, and so restoring the prior state at the
  * granularity of the index AM is sufficient.  Since the only current user
  * of mark/restore functionality is nodeMergejoin.c, this effectively means
- * that merge-join plans only work for MVCC snapshots. This could be fixed
+ * that merge-join plans only work for MVCC snapshots.  This could be fixed
  * if necessary, but for now it seems unimportant.
  * ----------------
  */
 void
 index_restrpos(IndexScanDesc scan)
 {
-       FmgrInfo   *procedure;
-
        Assert(IsMVCCSnapshot(scan->xs_snapshot));
 
        SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amrestrpos);
+       CHECK_SCAN_PROCEDURE(amrestrpos);
 
-       scan->xs_next_hot = InvalidOffsetNumber;
+       scan->xs_continue_hot = false;
 
        scan->kill_prior_tuple = false;         /* for safety */
 
-       FunctionCall1(procedure, PointerGetDatum(scan));
+       scan->indexRelation->rd_amroutine->amrestrpos(scan);
 }
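
The mark/restore pair is used in the style of its one current caller, nodeMergejoin.c: remember a position, read ahead, and rewind when the join must rescan the inner side. Sketch only:

    index_markpos(scan);            /* remember the current position */

    /* ... continue fetching with index_getnext() ... */

    index_restrpos(scan);           /* rewind; the next fetch resumes at the mark */
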
 
-/* ----------------
- *             index_getnext - get the next heap tuple from a scan
- *
- * The result is the next heap tuple satisfying the scan keys and the
- * snapshot, or NULL if no more matching tuples exist. On success,
- * the buffer containing the heap tuple is pinned (the pin will be dropped
- * at the next index_getnext or index_endscan).
+/*
+ * index_parallelscan_estimate - estimate shared memory for parallel scan
  *
- * Note: caller must check scan->xs_recheck, and perform rechecking of the
- * scan keys if required.  We do not do that here because we don't have
- * enough information to do it efficiently in the general case.
- * ----------------
+ * Currently, we don't pass any information to the AM-specific estimator,
+ * so it can probably only return a constant.  In the future, we might need
+ * to pass more information.
  */
-HeapTuple
-index_getnext(IndexScanDesc scan, ScanDirection direction)
+Size
+index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
 {
-       HeapTuple       heapTuple = &scan->xs_ctup;
-       ItemPointer tid = &heapTuple->t_self;
-       FmgrInfo   *procedure;
+       Size            nbytes;
 
-       SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amgettuple);
+       RELATION_CHECKS;
 
-       Assert(TransactionIdIsValid(RecentGlobalXmin));
+       nbytes = offsetof(ParallelIndexScanDescData, ps_snapshot_data);
+       nbytes = add_size(nbytes, EstimateSnapshotSpace(snapshot));
+       nbytes = MAXALIGN(nbytes);
 
        /*
-        * We always reset xs_hot_dead; if we are here then either we are just
-        * starting the scan, or we previously returned a visible tuple, and in
-        * either case it's inappropriate to kill the prior index entry.
+        * If amestimateparallelscan is not provided, assume there is no
+        * AM-specific data needed.  (It's hard to believe that could work, but
+        * it's easy enough to cater to it here.)
         */
-       scan->xs_hot_dead = false;
+       if (indexRelation->rd_amroutine->amestimateparallelscan != NULL)
+               nbytes = add_size(nbytes,
+                                         indexRelation->rd_amroutine->amestimateparallelscan());
 
-       for (;;)
+       return nbytes;
+}
+
+/*
+ * index_parallelscan_initialize - initialize parallel scan
+ *
+ * We initialize both the ParallelIndexScanDesc proper and the AM-specific
+ * information which follows it.
+ *
+ * This function calls the access-method-specific initialization routine to
+ * initialize the AM-specific information.  Call this just once in the leader
+ * process; then, individual workers attach via index_beginscan_parallel.
+ */
+void
+index_parallelscan_initialize(Relation heapRelation, Relation indexRelation,
+                                                         Snapshot snapshot, ParallelIndexScanDesc target)
+{
+       Size            offset;
+
+       RELATION_CHECKS;
+
+       offset = add_size(offsetof(ParallelIndexScanDescData, ps_snapshot_data),
+                                         EstimateSnapshotSpace(snapshot));
+       offset = MAXALIGN(offset);
+
+       target->ps_relid = RelationGetRelid(heapRelation);
+       target->ps_indexid = RelationGetRelid(indexRelation);
+       target->ps_offset = offset;
+       SerializeSnapshot(snapshot, target->ps_snapshot_data);
+
+       /* aminitparallelscan is optional; assume no-op if not provided by AM */
+       if (indexRelation->rd_amroutine->aminitparallelscan != NULL)
        {
-               OffsetNumber offnum;
-               bool            at_chain_start;
-               Page            dp;
+               void       *amtarget;
 
-               if (scan->xs_next_hot != InvalidOffsetNumber)
-               {
-                       /*
-                        * We are resuming scan of a HOT chain after having returned an
-                        * earlier member.      Must still hold pin on current heap page.
-                        */
-                       Assert(BufferIsValid(scan->xs_cbuf));
-                       Assert(ItemPointerGetBlockNumber(tid) ==
-                                  BufferGetBlockNumber(scan->xs_cbuf));
-                       Assert(TransactionIdIsValid(scan->xs_prev_xmax));
-                       offnum = scan->xs_next_hot;
-                       at_chain_start = false;
-                       scan->xs_next_hot = InvalidOffsetNumber;
-               }
-               else
-               {
-                       bool            found;
-                       Buffer          prev_buf;
+               amtarget = OffsetToPointer(target, offset);
+               indexRelation->rd_amroutine->aminitparallelscan(amtarget);
+       }
+}
 
-                       /*
-                        * If we scanned a whole HOT chain and found only dead tuples,
-                        * tell index AM to kill its entry for that TID. We do not do this
-                        * when in recovery because it may violate MVCC to do so. see
-                        * comments in RelationGetIndexScan().
-                        */
-                       if (!scan->xactStartedInRecovery)
-                               scan->kill_prior_tuple = scan->xs_hot_dead;
+/* ----------------
+ *             index_parallelrescan  - (re)start a parallel scan of an index
+ * ----------------
+ */
+void
+index_parallelrescan(IndexScanDesc scan)
+{
+       SCAN_CHECKS;
 
-                       /*
-                        * The AM's gettuple proc finds the next index entry matching the
-                        * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It
-                        * should also set scan->xs_recheck, though we pay no attention to
-                        * that here.
-                        */
-                       found = DatumGetBool(FunctionCall2(procedure,
-                                                                                          PointerGetDatum(scan),
-                                                                                          Int32GetDatum(direction)));
+       /* amparallelrescan is optional; assume no-op if not provided by AM */
+       if (scan->indexRelation->rd_amroutine->amparallelrescan != NULL)
+               scan->indexRelation->rd_amroutine->amparallelrescan(scan);
+}
+
+/*
+ * index_beginscan_parallel - join parallel index scan
+ *
+ * Caller must be holding suitable locks on the heap and the index.
+ */
+IndexScanDesc
+index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
+                                                int norderbys, ParallelIndexScanDesc pscan)
+{
+       Snapshot        snapshot;
+       IndexScanDesc scan;
 
-                       /* Reset kill flag immediately for safety */
-                       scan->kill_prior_tuple = false;
+       Assert(RelationGetRelid(heaprel) == pscan->ps_relid);
+       snapshot = RestoreSnapshot(pscan->ps_snapshot_data);
+       RegisterSnapshot(snapshot);
+       scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot,
+                                                                       pscan, true);
 
-                       /* If we're out of index entries, break out of outer loop */
-                       if (!found)
-                               break;
+       /*
+        * Save additional parameters into the scandesc.  Everything else was set
+        * up by index_beginscan_internal.
+        */
+       scan->heapRelation = heaprel;
+       scan->xs_snapshot = snapshot;
 
-                       pgstat_count_index_tuples(scan->indexRelation, 1);
+       return scan;
+}
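
The parallel-scan entry points are meant to be driven roughly as sketched below, loosely following what an executor node does. The shm_toc key, the pcxt handle, and the variable names are illustrative assumptions, not definitions from this file.

    /* leader: size and fill the shared state in dynamic shared memory */
    Size        size = index_parallelscan_estimate(indexRel, snapshot);
    ParallelIndexScanDesc piscan;

    piscan = (ParallelIndexScanDesc) shm_toc_allocate(pcxt->toc, size);
    index_parallelscan_initialize(heapRel, indexRel, snapshot, piscan);
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_INDEX_SCAN, piscan);

    /*
     * leader and each worker: attach and scan; the participants collectively
     * return every matching tuple exactly once
     */
    scan = index_beginscan_parallel(heapRel, indexRel, nkeys, norderbys, piscan);
    index_rescan(scan, scankeys, nkeys, orderbys, norderbys);
    while ((tup = index_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* ... */
    }

    /*
     * leader only: if the plan node is rescanned, reset the shared state
     * before workers rejoin; each participant ends its own scan at shutdown
     */
    index_parallelrescan(scan);
    index_endscan(scan);
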
 
-                       /* Switch to correct buffer if we don't have it already */
-                       prev_buf = scan->xs_cbuf;
-                       scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
-                                                                                                scan->heapRelation,
-                                                                                        ItemPointerGetBlockNumber(tid));
+/* ----------------
+ * index_getnext_tid - get the next TID from a scan
+ *
+ * The result is the next TID satisfying the scan keys,
+ * or NULL if no more matching tuples exist.
+ * ----------------
+ */
+ItemPointer
+index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+       bool            found;
 
-                       /*
-                        * Prune page, but only if we weren't already on this page
-                        */
-                       if (prev_buf != scan->xs_cbuf)
-                               heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
-                                                                       RecentGlobalXmin);
+       SCAN_CHECKS;
+       CHECK_SCAN_PROCEDURE(amgettuple);
 
-                       /* Prepare to scan HOT chain starting at index-referenced offnum */
-                       offnum = ItemPointerGetOffsetNumber(tid);
-                       at_chain_start = true;
+       Assert(TransactionIdIsValid(RecentGlobalXmin));
 
-                       /* We don't know what the first tuple's xmin should be */
-                       scan->xs_prev_xmax = InvalidTransactionId;
+       /*
+        * The AM's amgettuple proc finds the next index entry matching the scan
+        * keys, and puts the TID into scan->xs_ctup.t_self.  It should also set
+        * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention
+        * to those fields here.
+        */
+       found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction);
+
+       /* Reset kill flag immediately for safety */
+       scan->kill_prior_tuple = false;
 
-                       /* Initialize flag to detect if all entries are dead */
-                       scan->xs_hot_dead = true;
+       /* If we're out of index entries, we're done */
+       if (!found)
+       {
+               /* ... but first, release any held pin on a heap page */
+               if (BufferIsValid(scan->xs_cbuf))
+               {
+                       ReleaseBuffer(scan->xs_cbuf);
+                       scan->xs_cbuf = InvalidBuffer;
                }
+               return NULL;
+       }
 
-               /* Obtain share-lock on the buffer so we can examine visibility */
-               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
+       pgstat_count_index_tuples(scan->indexRelation, 1);
 
-               dp = (Page) BufferGetPage(scan->xs_cbuf);
+       /* Return the TID of the tuple we found. */
+       return &scan->xs_ctup.t_self;
+}
 
-               /* Scan through possible multiple members of HOT-chain */
-               for (;;)
-               {
-                       ItemId          lp;
-                       ItemPointer ctid;
-                       bool            valid;
+/* ----------------
+ *             index_fetch_heap - get the scan's next heap tuple
+ *
+ * The result is a visible heap tuple associated with the index TID most
+ * recently fetched by index_getnext_tid, or NULL if no more matching tuples
+ * exist.  (There can be more than one matching tuple because of HOT chains,
+ * although when using an MVCC snapshot it should be impossible for more than
+ * one such tuple to exist.)
+ *
+ * On success, the buffer containing the heap tuple is pinned (the pin will be
+ * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan
+ * call).
+ *
+ * Note: caller must check scan->xs_recheck, and perform rechecking of the
+ * scan keys if required.  We do not do that here because we don't have
+ * enough information to do it efficiently in the general case.
+ * ----------------
+ */
+HeapTuple
+index_fetch_heap(IndexScanDesc scan)
+{
+       ItemPointer tid = &scan->xs_ctup.t_self;
+       bool            all_dead = false;
+       bool            got_heap_tuple;
 
-                       /* check for bogus TID */
-                       if (offnum < FirstOffsetNumber ||
-                               offnum > PageGetMaxOffsetNumber(dp))
-                               break;
+       /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
+       if (!scan->xs_continue_hot)
+       {
+               /* Switch to correct buffer if we don't have it already */
+               Buffer          prev_buf = scan->xs_cbuf;
 
-                       lp = PageGetItemId(dp, offnum);
-
-                       /* check for unused, dead, or redirected items */
-                       if (!ItemIdIsNormal(lp))
-                       {
-                               /* We should only see a redirect at start of chain */
-                               if (ItemIdIsRedirected(lp) && at_chain_start)
-                               {
-                                       /* Follow the redirect */
-                                       offnum = ItemIdGetRedirect(lp);
-                                       at_chain_start = false;
-                                       continue;
-                               }
-                               /* else must be end of chain */
-                               break;
-                       }
+               scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
+                                                                                        scan->heapRelation,
+                                                                                        ItemPointerGetBlockNumber(tid));
 
-                       /*
-                        * We must initialize all of *heapTuple (ie, scan->xs_ctup) since
-                        * it is returned to the executor on success.
-                        */
-                       heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
-                       heapTuple->t_len = ItemIdGetLength(lp);
-                       ItemPointerSetOffsetNumber(tid, offnum);
-                       heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation);
-                       ctid = &heapTuple->t_data->t_ctid;
+               /*
+                * Prune page, but only if we weren't already on this page
+                */
+               if (prev_buf != scan->xs_cbuf)
+                       heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf);
+       }
 
-                       /*
-                        * Shouldn't see a HEAP_ONLY tuple at chain start.  (This test
-                        * should be unnecessary, since the chain root can't be removed
-                        * while we have pin on the index entry, but let's make it
-                        * anyway.)
-                        */
-                       if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
-                               break;
+       /* Obtain share-lock on the buffer so we can examine visibility */
+       LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
+       got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation,
+                                                                                       scan->xs_cbuf,
+                                                                                       scan->xs_snapshot,
+                                                                                       &scan->xs_ctup,
+                                                                                       &all_dead,
+                                                                                       !scan->xs_continue_hot);
+       LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+
+       if (got_heap_tuple)
+       {
+               /*
+                * Only in a non-MVCC snapshot can more than one member of the HOT
+                * chain be visible.
+                */
+               scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
+               pgstat_count_heap_fetch(scan->indexRelation);
+               return &scan->xs_ctup;
+       }
 
-                       /*
-                        * The xmin should match the previous xmax value, else chain is
-                        * broken.      (Note: this test is not optional because it protects
-                        * us against the case where the prior chain member's xmax aborted
-                        * since we looked at it.)
-                        */
-                       if (TransactionIdIsValid(scan->xs_prev_xmax) &&
-                               !TransactionIdEquals(scan->xs_prev_xmax,
-                                                                 HeapTupleHeaderGetXmin(heapTuple->t_data)))
-                               break;
+       /* We've reached the end of the HOT chain. */
+       scan->xs_continue_hot = false;
 
-                       /* If it's visible per the snapshot, we must return it */
-                       valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
-                                                                                                scan->xs_cbuf);
-
-                       CheckForSerializableConflictOut(valid, scan->heapRelation,
-                                                                                       heapTuple, scan->xs_cbuf);
-
-                       if (valid)
-                       {
-                               /*
-                                * If the snapshot is MVCC, we know that it could accept at
-                                * most one member of the HOT chain, so we can skip examining
-                                * any more members.  Otherwise, check for continuation of the
-                                * HOT-chain, and set state for next time.
-                                */
-                               if (IsMVCCSnapshot(scan->xs_snapshot)
-                                       && !IsolationIsSerializable())
-                                       scan->xs_next_hot = InvalidOffsetNumber;
-                               else if (HeapTupleIsHotUpdated(heapTuple))
-                               {
-                                       Assert(ItemPointerGetBlockNumber(ctid) ==
-                                                  ItemPointerGetBlockNumber(tid));
-                                       scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid);
-                                       scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
-                               }
-                               else
-                                       scan->xs_next_hot = InvalidOffsetNumber;
-
-                               PredicateLockTuple(scan->heapRelation, heapTuple);
-
-                               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
-
-                               pgstat_count_heap_fetch(scan->indexRelation);
-
-                               return heapTuple;
-                       }
+       /*
+        * If we scanned a whole HOT chain and found only dead tuples, tell index
+        * AM to kill its entry for that TID (this will take effect in the next
+        * amgettuple call, in index_getnext_tid).  We do not do this when in
+        * recovery because it may violate MVCC to do so.  See comments in
+        * RelationGetIndexScan().
+        */
+       if (!scan->xactStartedInRecovery)
+               scan->kill_prior_tuple = all_dead;
 
-                       /*
-                        * If we can't see it, maybe no one else can either.  Check to see
-                        * if the tuple is dead to all transactions.  If we find that all
-                        * the tuples in the HOT chain are dead, we'll signal the index AM
-                        * to not return that TID on future indexscans.
-                        */
-                       if (scan->xs_hot_dead &&
-                               HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
-                                                                                scan->xs_cbuf) != HEAPTUPLE_DEAD)
-                               scan->xs_hot_dead = false;
+       return NULL;
+}
+
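
The point of splitting the old index_getnext into index_getnext_tid plus index_fetch_heap is that some callers do not want the heap tuple for every TID. An index-only-scan-style caller, for example, consults the visibility map first and fetches the heap only when it must; the sketch below borrows VM_ALL_VISIBLE from access/visibilitymap.h for illustration and assumes xs_want_itup was set at beginscan time.

    ItemPointer tid;
    Buffer      vmbuf = InvalidBuffer;

    while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
    {
        if (!VM_ALL_VISIBLE(scan->heapRelation,
                            ItemPointerGetBlockNumber(tid), &vmbuf))
        {
            /* page not all-visible: this TID must be checked against the heap */
            if (index_fetch_heap(scan) == NULL)
                continue;       /* no visible member of its HOT chain */
        }
        /* ... return columns from scan->xs_itup or from the fetched tuple ... */
    }

    if (BufferIsValid(vmbuf))
        ReleaseBuffer(vmbuf);
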
+/* ----------------
+ *             index_getnext - get the next heap tuple from a scan
+ *
+ * The result is the next heap tuple satisfying the scan keys and the
+ * snapshot, or NULL if no more matching tuples exist.
+ *
+ * On success, the buffer containing the heap tuple is pinned (the pin will be
+ * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan
+ * call).
+ *
+ * Note: caller must check scan->xs_recheck, and perform rechecking of the
+ * scan keys if required.  We do not do that here because we don't have
+ * enough information to do it efficiently in the general case.
+ * ----------------
+ */
+HeapTuple
+index_getnext(IndexScanDesc scan, ScanDirection direction)
+{
+       HeapTuple       heapTuple;
+       ItemPointer tid;
 
+       for (;;)
+       {
+               if (scan->xs_continue_hot)
+               {
                        /*
-                        * Check to see if HOT chain continues past this tuple; if so
-                        * fetch the next offnum (we don't bother storing it into
-                        * xs_next_hot, but must store xs_prev_xmax), and loop around.
+                        * We are resuming scan of a HOT chain after having returned an
+                        * earlier member.  Must still hold pin on current heap page.
                         */
-                       if (HeapTupleIsHotUpdated(heapTuple))
-                       {
-                               Assert(ItemPointerGetBlockNumber(ctid) ==
-                                          ItemPointerGetBlockNumber(tid));
-                               offnum = ItemPointerGetOffsetNumber(ctid);
-                               at_chain_start = false;
-                               scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
-                       }
-                       else
-                               break;                  /* end of chain */
-               }                                               /* loop over a single HOT chain */
-
-               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
-
-               /* Loop around to ask index AM for another TID */
-               scan->xs_next_hot = InvalidOffsetNumber;
-       }
+                       Assert(BufferIsValid(scan->xs_cbuf));
+                       Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) ==
+                                  BufferGetBlockNumber(scan->xs_cbuf));
+               }
+               else
+               {
+                       /* Time to fetch the next TID from the index */
+                       tid = index_getnext_tid(scan, direction);
 
-       /* Release any held pin on a heap page */
-       if (BufferIsValid(scan->xs_cbuf))
-       {
-               ReleaseBuffer(scan->xs_cbuf);
-               scan->xs_cbuf = InvalidBuffer;
+                       /* If we're out of index entries, we're done */
+                       if (tid == NULL)
+                               break;
+               }
+
+               /*
+                * Fetch the next (or only) visible heap tuple for this index entry.
+                * If we don't find anything, loop around and grab the next TID from
+                * the index.
+                */
+               heapTuple = index_fetch_heap(scan);
+               if (heapTuple != NULL)
+                       return heapTuple;
        }
 
        return NULL;                            /* failure exit */
@@ -684,12 +711,10 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
 int64
 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
 {
-       FmgrInfo   *procedure;
        int64           ntids;
-       Datum           d;
 
        SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amgetbitmap);
+       CHECK_SCAN_PROCEDURE(amgetbitmap);
 
        /* just make sure this is false... */
        scan->kill_prior_tuple = false;
@@ -697,16 +722,7 @@ index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
        /*
         * have the am's getbitmap proc do all the work.
         */
-       d = FunctionCall2(procedure,
-                                         PointerGetDatum(scan),
-                                         PointerGetDatum(bitmap));
-
-       ntids = DatumGetInt64(d);
-
-       /* If int8 is pass-by-ref, must free the result to avoid memory leak */
-#ifndef USE_FLOAT8_BYVAL
-       pfree(DatumGetPointer(d));
-#endif
+       ntids = scan->indexRelation->rd_amroutine->amgetbitmap(scan, bitmap);
 
        pgstat_count_index_tuples(scan->indexRelation, ntids);
 
@@ -729,20 +745,12 @@ index_bulk_delete(IndexVacuumInfo *info,
                                  void *callback_state)
 {
        Relation        indexRelation = info->index;
-       FmgrInfo   *procedure;
-       IndexBulkDeleteResult *result;
 
        RELATION_CHECKS;
-       GET_REL_PROCEDURE(ambulkdelete);
+       CHECK_REL_PROCEDURE(ambulkdelete);
 
-       result = (IndexBulkDeleteResult *)
-               DatumGetPointer(FunctionCall4(procedure,
-                                                                         PointerGetDatum(info),
-                                                                         PointerGetDatum(stats),
-                                                                         PointerGetDatum((Pointer) callback),
-                                                                         PointerGetDatum(callback_state)));
-
-       return result;
+       return indexRelation->rd_amroutine->ambulkdelete(info, stats,
+                                                                                                  callback, callback_state);
 }
 
 /* ----------------
@@ -756,18 +764,30 @@ index_vacuum_cleanup(IndexVacuumInfo *info,
                                         IndexBulkDeleteResult *stats)
 {
        Relation        indexRelation = info->index;
-       FmgrInfo   *procedure;
-       IndexBulkDeleteResult *result;
 
        RELATION_CHECKS;
-       GET_REL_PROCEDURE(amvacuumcleanup);
+       CHECK_REL_PROCEDURE(amvacuumcleanup);
+
+       return indexRelation->rd_amroutine->amvacuumcleanup(info, stats);
+}
+
+/* ----------------
+ *             index_can_return
+ *
+ *             Does the index access method support index-only scans for the given
+ *             column?
+ * ----------------
+ */
+bool
+index_can_return(Relation indexRelation, int attno)
+{
+       RELATION_CHECKS;
 
-       result = (IndexBulkDeleteResult *)
-               DatumGetPointer(FunctionCall2(procedure,
-                                                                         PointerGetDatum(info),
-                                                                         PointerGetDatum(stats)));
+       /* amcanreturn is optional; assume FALSE if not provided by AM */
+       if (indexRelation->rd_amroutine->amcanreturn == NULL)
+               return false;
 
-       return result;
+       return indexRelation->rd_amroutine->amcanreturn(indexRelation, attno);
 }
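
A caller deciding whether an index-only scan is possible would apply index_can_return to each column, along these lines (the loop and variable names are illustrative):

    bool        all_returnable = true;
    int         attno;

    for (attno = 1; attno <= RelationGetNumberOfAttributes(indexRelation); attno++)
    {
        if (!index_can_return(indexRelation, attno))
        {
            all_returnable = false; /* this column can't be reconstructed from the index */
            break;
        }
    }
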
 
 /* ----------------
@@ -788,7 +808,7 @@ index_vacuum_cleanup(IndexVacuumInfo *info,
  *             particular indexed attribute are those with both types equal to
  *             the index opclass' opcintype (note that this is subtly different
  *             from the indexed attribute's own type: it may be a binary-compatible
- *             type instead).  Only the default functions are stored in relcache
+ *             type instead).  Only the default functions are stored in relcache
  *             entries --- access methods can use the syscache to look up non-default
  *             functions.
  *
@@ -805,7 +825,7 @@ index_getprocid(Relation irel,
        int                     nproc;
        int                     procindex;
 
-       nproc = irel->rd_am->amsupport;
+       nproc = irel->rd_amroutine->amsupport;
 
        Assert(procnum > 0 && procnum <= (uint16) nproc);
 
@@ -822,7 +842,7 @@ index_getprocid(Relation irel,
  *             index_getprocinfo
  *
  *             This routine allows index AMs to keep fmgr lookup info for
- *             support procs in the relcache.  As above, only the "default"
+ *             support procs in the relcache.  As above, only the "default"
  *             functions for any particular indexed attribute are cached.
  *
  * Note: the return value points into cached data that will be lost during
@@ -839,7 +859,7 @@ index_getprocinfo(Relation irel,
        int                     nproc;
        int                     procindex;
 
-       nproc = irel->rd_am->amsupport;
+       nproc = irel->rd_amroutine->amsupport;
 
        Assert(procnum > 0 && procnum <= (uint16) nproc);
 
@@ -872,7 +892,6 @@ index_getprocinfo(Relation irel,
                                 procnum, attnum, RelationGetRelationName(irel));
 
                fmgr_info_cxt(procId, locinfo, irel->rd_indexcxt);
-               fmgr_info_set_collation(irel->rd_indcollation[attnum-1], locinfo);
        }
 
        return locinfo;
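
Typical AM-side use of the cached lookup info looks like nbtree comparing two keys with its ordering support function; note that with the removal of fmgr_info_set_collation above, the collation is now passed at call time. BTORDER_PROC (from nbtree.h) and the datum variables are shown purely for illustration.

    FmgrInfo   *procinfo = index_getprocinfo(indexRel, attno, BTORDER_PROC);
    int32       cmp;

    cmp = DatumGetInt32(FunctionCall2Coll(procinfo,
                                          indexRel->rd_indcollation[attno - 1],
                                          datum1,
                                          datum2));
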