Extend index AM API for parallel index scans.

[postgresql] / src / backend / access / index / indexam.c
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c

index 81c2114976e93f5fe8f74e17c13402bc06e38762..ba27c1e86d9f64de25aa2db6cd8da9b31eb6453b 100644 (file)
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -3,29 +3,34 @@
   * indexam.c
   *       general index access method routines
   *
- * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.82 2005/05/27 23:31:20 tgl Exp $
+ *       src/backend/access/index/indexam.c
   *
   * INTERFACE ROUTINES
   *             index_open              - open an index relation by relation OID
- *             index_openrv    - open an index relation specified by a RangeVar
   *             index_close             - close an index relation
   *             index_beginscan - start a scan of an index with amgettuple
- *             index_beginscan_multi - start a scan of an index with amgetmulti
+ *             index_beginscan_bitmap - start a scan of an index with amgetbitmap
   *             index_rescan    - restart a scan of an index
   *             index_endscan   - end a scan
   *             index_insert    - insert an index tuple into a relation
   *             index_markpos   - mark a scan position
   *             index_restrpos  - restore a scan position
- *             index_getnext   - get the next tuple from a scan
- *             index_getmulti  - get multiple tuples from a scan
+ *             index_parallelscan_estimate - estimate shared memory for parallel scan
+ *             index_parallelscan_initialize - initialize parallel scan
+ *             index_parallelrescan  - (re)start a parallel scan of an index
+ *             index_beginscan_parallel - join parallel index scan
+ *             index_getnext_tid       - get the next TID from a scan
+ *             index_fetch_heap                - get the scan's next heap tuple
+ *             index_getnext   - get the next heap tuple from a scan
+ *             index_getbitmap - get all tuples from a scan
   *             index_bulk_delete       - bulk deletion of index tuples
   *             index_vacuum_cleanup    - post-deletion cleanup of an index
- *             index_cost_estimator    - fetch amcostestimate procedure OID
+ *             index_can_return        - does index support index-only scans?
   *             index_getprocid - get a support procedure OID
   *             index_getprocinfo - get a support procedure's lookup info
   *
@@ -64,55 +69,63 @@
  
  #include "postgres.h"
  
-#include "access/genam.h"
-#include "access/heapam.h"
-#include "utils/relcache.h"
-
+#include "access/amapi.h"
+#include "access/relscan.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
+#include "catalog/index.h"
  #include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/snapmgr.h"
+#include "utils/tqual.h"
+
  
  /* ----------------------------------------------------------------
   *                                     macros used in index_ routines
+ *
+ * Note: the ReindexIsProcessingIndex() check in RELATION_CHECKS is there
+ * to check that we don't try to scan or do retail insertions into an index
+ * that is currently being rebuilt or pending rebuild.  This helps to catch
+ * things that don't work when reindexing system catalogs.  The assertion
+ * doesn't prevent the actual rebuild because we don't use RELATION_CHECKS
+ * when calling the index AM's ambuild routine, and there is no reason for
+ * ambuild to call its subsidiary routines through this file.
   * ----------------------------------------------------------------
   */
  #define RELATION_CHECKS \
  ( \
         AssertMacro(RelationIsValid(indexRelation)), \
-       AssertMacro(PointerIsValid(indexRelation->rd_am)) \
+       AssertMacro(PointerIsValid(indexRelation->rd_amroutine)), \
+       AssertMacro(!ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) \
  )
  
  #define SCAN_CHECKS \
  ( \
         AssertMacro(IndexScanIsValid(scan)), \
         AssertMacro(RelationIsValid(scan->indexRelation)), \
-       AssertMacro(PointerIsValid(scan->indexRelation->rd_am)) \
+       AssertMacro(PointerIsValid(scan->indexRelation->rd_amroutine)) \
  )
  
-#define GET_REL_PROCEDURE(pname) \
+#define CHECK_REL_PROCEDURE(pname) \
  do { \
-       procedure = &indexRelation->rd_aminfo->pname; \
-       if (!OidIsValid(procedure->fn_oid)) \
-       { \
-               RegProcedure    procOid = indexRelation->rd_am->pname; \
-               if (!RegProcedureIsValid(procOid)) \
-                       elog(ERROR, "invalid %s regproc", CppAsString(pname)); \
-               fmgr_info_cxt(procOid, procedure, indexRelation->rd_indexcxt); \
-       } \
+       if (indexRelation->rd_amroutine->pname == NULL) \
+               elog(ERROR, "function %s is not defined for index %s", \
+                        CppAsString(pname), RelationGetRelationName(indexRelation)); \
  } while(0)
  
-#define GET_SCAN_PROCEDURE(pname) \
+#define CHECK_SCAN_PROCEDURE(pname) \
  do { \
-       procedure = &scan->indexRelation->rd_aminfo->pname; \
-       if (!OidIsValid(procedure->fn_oid)) \
-       { \
-               RegProcedure    procOid = scan->indexRelation->rd_am->pname; \
-               if (!RegProcedureIsValid(procOid)) \
-                       elog(ERROR, "invalid %s regproc", CppAsString(pname)); \
-               fmgr_info_cxt(procOid, procedure, scan->indexRelation->rd_indexcxt); \
-       } \
+       if (scan->indexRelation->rd_amroutine->pname == NULL) \
+               elog(ERROR, "function %s is not defined for index %s", \
+                        CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \
  } while(0)
  
  static IndexScanDesc index_beginscan_internal(Relation indexRelation,
-                                                                                         int nkeys, ScanKey key);
+                                                int nkeys, int norderbys, Snapshot snapshot,
+                                                ParallelIndexScanDesc pscan, bool temp_snap);
  
  
  /* ----------------------------------------------------------------
@@ -123,26 +136,23 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation,
  /* ----------------
   *             index_open - open an index relation by relation OID
   *
- *             Note: we acquire no lock on the index.  A lock is not needed when
- *             simply examining the index reldesc; the index's schema information
- *             is considered to be protected by the lock that the caller had better
- *             be holding on the parent relation.  Some type of lock should be
- *             obtained on the index before physically accessing it, however.
- *             This is handled automatically for most uses by index_beginscan
- *             and index_endscan for scan cases, or by ExecOpenIndices and
- *             ExecCloseIndices for update cases.  Other callers will need to
- *             obtain their own locks.
+ *             If lockmode is not "NoLock", the specified kind of lock is
+ *             obtained on the index.  (Generally, NoLock should only be
+ *             used if the caller knows it has some appropriate lock on the
+ *             index already.)
+ *
+ *             An error is raised if the index does not exist.
   *
   *             This is a convenience routine adapted for indexscan use.
   *             Some callers may prefer to use relation_open directly.
   * ----------------
   */
  Relation
-index_open(Oid relationId)
+index_open(Oid relationId, LOCKMODE lockmode)
  {
         Relation        r;
  
-       r = relation_open(relationId, NoLock);
+       r = relation_open(relationId, lockmode);
  
         if (r->rd_rel->relkind != RELKIND_INDEX)
                 ereport(ERROR,
@@ -150,47 +160,30 @@ index_open(Oid relationId)
                                  errmsg("\"%s\" is not an index",
                                                 RelationGetRelationName(r))));
  
-       pgstat_initstats(&r->pgstat_info, r);
-
         return r;
  }
  
  /* ----------------
- *             index_openrv - open an index relation specified
- *             by a RangeVar node
+ *             index_close - close an index relation
   *
- *             As above, but relation is specified by a RangeVar.
- * ----------------
- */
-Relation
-index_openrv(const RangeVar *relation)
-{
-       Relation        r;
-
-       r = relation_openrv(relation, NoLock);
-
-       if (r->rd_rel->relkind != RELKIND_INDEX)
-               ereport(ERROR,
-                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
-                                errmsg("\"%s\" is not an index",
-                                               RelationGetRelationName(r))));
-
-       pgstat_initstats(&r->pgstat_info, r);
-
-       return r;
-}
-
-/* ----------------
- *             index_close - close a index relation
+ *             If lockmode is not "NoLock", we then release the specified lock.
   *
- *             presently the relcache routines do all the work we need
- *             to open/close index relations.
+ *             Note that it is often sensible to hold a lock beyond index_close;
+ *             in that case, the lock is released automatically at xact end.
   * ----------------
   */
  void
-index_close(Relation relation)
+index_close(Relation relation, LOCKMODE lockmode)
  {
+       LockRelId       relid = relation->rd_lockInfo.lockRelId;
+
+       Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
+
+       /* The relcache does the real work... */
         RelationClose(relation);
+
+       if (lockmode != NoLock)
+               UnlockRelationId(&relid, lockmode);
  }
  
  /* ----------------
@@ -203,49 +196,40 @@ index_insert(Relation indexRelation,
                          bool *isnull,
                          ItemPointer heap_t_ctid,
                          Relation heapRelation,
-                        bool check_uniqueness)
+                        IndexUniqueCheck checkUnique)
  {
-       FmgrInfo   *procedure;
-
         RELATION_CHECKS;
-       GET_REL_PROCEDURE(aminsert);
+       CHECK_REL_PROCEDURE(aminsert);
  
-       /*
-        * have the am's insert proc do all the work.
-        */
-       return DatumGetBool(FunctionCall6(procedure,
-                                                                         PointerGetDatum(indexRelation),
-                                                                         PointerGetDatum(values),
-                                                                         PointerGetDatum(isnull),
-                                                                         PointerGetDatum(heap_t_ctid),
-                                                                         PointerGetDatum(heapRelation),
-                                                                         BoolGetDatum(check_uniqueness)));
+       if (!(indexRelation->rd_amroutine->ampredlocks))
+               CheckForSerializableConflictIn(indexRelation,
+                                                                          (HeapTuple) NULL,
+                                                                          InvalidBuffer);
+
+       return indexRelation->rd_amroutine->aminsert(indexRelation, values, isnull,
+                                                                                                heap_t_ctid, heapRelation,
+                                                                                                checkUnique);
  }
  
  /*
   * index_beginscan - start a scan of an index with amgettuple
   *
- * Note: heapRelation may be NULL if there is no intention of calling
- * index_getnext on this scan; index_getnext_indexitem will not use the
- * heapRelation link (nor the snapshot).  However, the caller had better
- * be holding some kind of lock on the heap relation in any case, to ensure
- * no one deletes it (or the index) out from under us.
+ * Caller must be holding suitable locks on the heap and the index.
   */
  IndexScanDesc
  index_beginscan(Relation heapRelation,
                                 Relation indexRelation,
                                 Snapshot snapshot,
-                               int nkeys, ScanKey key)
+                               int nkeys, int norderbys)
  {
         IndexScanDesc scan;
  
-       scan = index_beginscan_internal(indexRelation, nkeys, key);
+       scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false);
  
         /*
-        * Save additional parameters into the scandesc.  Everything else was
-        * set up by RelationGetIndexScan.
+        * Save additional parameters into the scandesc.  Everything else was set
+        * up by RelationGetIndexScan.
          */
-       scan->is_multiscan = false;
         scan->heapRelation = heapRelation;
         scan->xs_snapshot = snapshot;
  
@@ -253,25 +237,24 @@ index_beginscan(Relation heapRelation,
  }
  
  /*
- * index_beginscan_multi - start a scan of an index with amgetmulti
+ * index_beginscan_bitmap - start a scan of an index with amgetbitmap
   *
   * As above, caller had better be holding some lock on the parent heap
   * relation, even though it's not explicitly mentioned here.
   */
  IndexScanDesc
-index_beginscan_multi(Relation indexRelation,
-                                         Snapshot snapshot,
-                                         int nkeys, ScanKey key)
+index_beginscan_bitmap(Relation indexRelation,
+                                          Snapshot snapshot,
+                                          int nkeys)
  {
         IndexScanDesc scan;
  
-       scan = index_beginscan_internal(indexRelation, nkeys, key);
+       scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false);
  
         /*
-        * Save additional parameters into the scandesc.  Everything else was
-        * set up by RelationGetIndexScan.
+        * Save additional parameters into the scandesc.  Everything else was set
+        * up by RelationGetIndexScan.
          */
-       scan->is_multiscan = true;
         scan->xs_snapshot = snapshot;
  
         return scan;
@@ -282,33 +265,30 @@ index_beginscan_multi(Relation indexRelation,
   */
  static IndexScanDesc
  index_beginscan_internal(Relation indexRelation,
-                                                int nkeys, ScanKey key)
+                                                int nkeys, int norderbys, Snapshot snapshot,
+                                                ParallelIndexScanDesc pscan, bool temp_snap)
  {
         IndexScanDesc scan;
-       FmgrInfo   *procedure;
  
         RELATION_CHECKS;
-       GET_REL_PROCEDURE(ambeginscan);
+       CHECK_REL_PROCEDURE(ambeginscan);
  
-       RelationIncrementReferenceCount(indexRelation);
+       if (!(indexRelation->rd_amroutine->ampredlocks))
+               PredicateLockRelation(indexRelation, snapshot);
  
         /*
-        * Acquire AccessShareLock for the duration of the scan
-        *
-        * Note: we could get an SI inval message here and consequently have to
-        * rebuild the relcache entry.  The refcount increment above ensures
-        * that we will rebuild it and not just flush it...
+        * We hold a reference count to the relcache entry throughout the scan.
          */
-       LockRelation(indexRelation, AccessShareLock);
+       RelationIncrementReferenceCount(indexRelation);
  
         /*
          * Tell the AM to open a scan.
          */
-       scan = (IndexScanDesc)
-               DatumGetPointer(FunctionCall3(procedure,
-                                                                         PointerGetDatum(indexRelation),
-                                                                         Int32GetDatum(nkeys),
-                                                                         PointerGetDatum(key)));
+       scan = indexRelation->rd_amroutine->ambeginscan(indexRelation, nkeys,
+                                                                                                       norderbys);
+       /* Initialize information for parallel scan. */
+       scan->parallel_scan = pscan;
+       scan->xs_temp_snap = temp_snap;
  
         return scan;
  }
@@ -316,22 +296,25 @@ index_beginscan_internal(Relation indexRelation,
  /* ----------------
   *             index_rescan  - (re)start a scan of an index
   *
- * The caller may specify a new set of scankeys (but the number of keys
- * cannot change).     To restart the scan without changing keys, pass NULL
- * for the key array.
- *
- * Note that this is also called when first starting an indexscan;
- * see RelationGetIndexScan.  Keys *must* be passed in that case,
- * unless scan->numberOfKeys is zero.
+ * During a restart, the caller may specify a new set of scankeys and/or
+ * orderbykeys; but the number of keys cannot differ from what index_beginscan
+ * was told.  (Later we might relax that to "must not exceed", but currently
+ * the index AMs tend to assume that scan->numberOfKeys is what to believe.)
+ * To restart the scan without changing keys, pass NULL for the key arrays.
+ * (Of course, keys *must* be passed on the first call, unless
+ * scan->numberOfKeys is zero.)
   * ----------------
   */
  void
-index_rescan(IndexScanDesc scan, ScanKey key)
+index_rescan(IndexScanDesc scan,
+                        ScanKey keys, int nkeys,
+                        ScanKey orderbys, int norderbys)
  {
-       FmgrInfo   *procedure;
-
         SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amrescan);
+       CHECK_SCAN_PROCEDURE(amrescan);
+
+       Assert(nkeys == scan->numberOfKeys);
+       Assert(norderbys == scan->numberOfOrderBys);
  
         /* Release any held pin on a heap page */
         if (BufferIsValid(scan->xs_cbuf))
@@ -340,17 +323,12 @@ index_rescan(IndexScanDesc scan, ScanKey key)
                 scan->xs_cbuf = InvalidBuffer;
         }
  
-       scan->kill_prior_tuple = false;         /* for safety */
-       scan->keys_are_unique = false;          /* may be set by index AM */
-       scan->got_tuple = false;
-       scan->unique_tuple_pos = 0;
-       scan->unique_tuple_mark = 0;
+       scan->xs_continue_hot = false;
  
-       FunctionCall2(procedure,
-                                 PointerGetDatum(scan),
-                                 PointerGetDatum(key));
+       scan->kill_prior_tuple = false;         /* for safety */
  
-       pgstat_reset_index_scan(&scan->xs_pgstat_info);
+       scan->indexRelation->rd_amroutine->amrescan(scan, keys, nkeys,
+                                                                                               orderbys, norderbys);
  }
  
  /* ----------------
@@ -360,10 +338,8 @@ index_rescan(IndexScanDesc scan, ScanKey key)
  void
  index_endscan(IndexScanDesc scan)
  {
-       FmgrInfo   *procedure;
-
         SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amendscan);
+       CHECK_SCAN_PROCEDURE(amendscan);
  
         /* Release any held pin on a heap page */
         if (BufferIsValid(scan->xs_cbuf))
@@ -373,14 +349,14 @@ index_endscan(IndexScanDesc scan)
         }
  
         /* End the AM's scan */
-       FunctionCall1(procedure, PointerGetDatum(scan));
-
-       /* Release index lock and refcount acquired by index_beginscan */
-
-       UnlockRelation(scan->indexRelation, AccessShareLock);
+       scan->indexRelation->rd_amroutine->amendscan(scan);
  
+       /* Release index refcount acquired by index_beginscan */
         RelationDecrementReferenceCount(scan->indexRelation);
  
+       if (scan->xs_temp_snap)
+               UnregisterSnapshot(scan->xs_snapshot);
+
         /* Release the scan data structure itself */
         IndexScanEnd(scan);
  }
@@ -392,14 +368,10 @@ index_endscan(IndexScanDesc scan)
  void
  index_markpos(IndexScanDesc scan)
  {
-       FmgrInfo   *procedure;
-
         SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(ammarkpos);
-
-       scan->unique_tuple_mark = scan->unique_tuple_pos;
+       CHECK_SCAN_PROCEDURE(ammarkpos);
  
-       FunctionCall1(procedure, PointerGetDatum(scan));
+       scan->indexRelation->rd_amroutine->ammarkpos(scan);
  }
  
  /* ----------------
@@ -408,254 +380,353 @@ index_markpos(IndexScanDesc scan)
   * NOTE: this only restores the internal scan state of the index AM.
   * The current result tuple (scan->xs_ctup) doesn't change.  See comments
   * for ExecRestrPos().
+ *
+ * NOTE: in the presence of HOT chains, mark/restore only works correctly
+ * if the scan's snapshot is MVCC-safe; that ensures that there's at most one
+ * returnable tuple in each HOT chain, and so restoring the prior state at the
+ * granularity of the index AM is sufficient.  Since the only current user
+ * of mark/restore functionality is nodeMergejoin.c, this effectively means
+ * that merge-join plans only work for MVCC snapshots.  This could be fixed
+ * if necessary, but for now it seems unimportant.
   * ----------------
   */
  void
  index_restrpos(IndexScanDesc scan)
  {
-       FmgrInfo   *procedure;
+       Assert(IsMVCCSnapshot(scan->xs_snapshot));
  
         SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amrestrpos);
+       CHECK_SCAN_PROCEDURE(amrestrpos);
+
+       scan->xs_continue_hot = false;
  
         scan->kill_prior_tuple = false;         /* for safety */
  
+       scan->indexRelation->rd_amroutine->amrestrpos(scan);
+}
+
+/*
+ * index_parallelscan_estimate - estimate shared memory for parallel scan
+ *
+ * Currently, we don't pass any information to the AM-specific estimator,
+ * so it can probably only return a constant.  In the future, we might need
+ * to pass more information.
+ */
+Size
+index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
+{
+       Size            nbytes;
+
+       RELATION_CHECKS;
+
+       nbytes = offsetof(ParallelIndexScanDescData, ps_snapshot_data);
+       nbytes = add_size(nbytes, EstimateSnapshotSpace(snapshot));
+       nbytes = MAXALIGN(nbytes);
+
         /*
-        * We do not reset got_tuple; so if the scan is actually being
-        * short-circuited by index_getnext, the effective position
-        * restoration is done by restoring unique_tuple_pos.
+        * If amestimateparallelscan is not provided, assume there is no
+        * AM-specific data needed.  (It's hard to believe that could work, but
+        * it's easy enough to cater to it here.)
          */
-       scan->unique_tuple_pos = scan->unique_tuple_mark;
+       if (indexRelation->rd_amroutine->amestimateparallelscan != NULL)
+               nbytes = add_size(nbytes,
+                                         indexRelation->rd_amroutine->amestimateparallelscan());
  
-       FunctionCall1(procedure, PointerGetDatum(scan));
+       return nbytes;
  }
  
-/* ----------------
- *             index_getnext - get the next heap tuple from a scan
+/*
+ * index_parallelscan_initialize - initialize parallel scan
   *
- * The result is the next heap tuple satisfying the scan keys and the
- * snapshot, or NULL if no more matching tuples exist. On success,
- * the buffer containing the heap tuple is pinned (the pin will be dropped
- * at the next index_getnext or index_endscan).  The index TID corresponding
- * to the heap tuple can be obtained if needed from scan->currentItemData.
- * ----------------
+ * We initialize both the ParallelIndexScanDesc proper and the AM-specific
+ * information which follows it.
+ *
+ * This function calls the access-method-specific initialization routine to
+ * initialize the AM-specific information.  Call this just once in the leader
+ * process; then, individual workers attach via index_beginscan_parallel.
   */
-HeapTuple
-index_getnext(IndexScanDesc scan, ScanDirection direction)
+void
+index_parallelscan_initialize(Relation heapRelation, Relation indexRelation,
+                                                         Snapshot snapshot, ParallelIndexScanDesc target)
  {
-       HeapTuple       heapTuple = &scan->xs_ctup;
-       FmgrInfo   *procedure;
+       Size            offset;
+
+       RELATION_CHECKS;
+
+       offset = add_size(offsetof(ParallelIndexScanDescData, ps_snapshot_data),
+                                         EstimateSnapshotSpace(snapshot));
+       offset = MAXALIGN(offset);
+
+       target->ps_relid = RelationGetRelid(heapRelation);
+       target->ps_indexid = RelationGetRelid(indexRelation);
+       target->ps_offset = offset;
+       SerializeSnapshot(snapshot, target->ps_snapshot_data);
  
+       /* aminitparallelscan is optional; assume no-op if not provided by AM */
+       if (indexRelation->rd_amroutine->aminitparallelscan != NULL)
+       {
+               void       *amtarget;
+
+               amtarget = OffsetToPointer(target, offset);
+               indexRelation->rd_amroutine->aminitparallelscan(amtarget);
+       }
+}
+
+/* ----------------
+ *             index_parallelrescan  - (re)start a parallel scan of an index
+ * ----------------
+ */
+void
+index_parallelrescan(IndexScanDesc scan)
+{
         SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amgettuple);
+
+       /* amparallelrescan is optional; assume no-op if not provided by AM */
+       if (scan->indexRelation->rd_amroutine->amparallelrescan != NULL)
+               scan->indexRelation->rd_amroutine->amparallelrescan(scan);
+}
+
+/*
+ * index_beginscan_parallel - join parallel index scan
+ *
+ * Caller must be holding suitable locks on the heap and the index.
+ */
+IndexScanDesc
+index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
+                                                int norderbys, ParallelIndexScanDesc pscan)
+{
+       Snapshot        snapshot;
+       IndexScanDesc scan;
+
+       Assert(RelationGetRelid(heaprel) == pscan->ps_relid);
+       snapshot = RestoreSnapshot(pscan->ps_snapshot_data);
+       RegisterSnapshot(snapshot);
+       scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot,
+                                                                       pscan, true);
  
         /*
-        * If we already got a tuple and it must be unique, there's no need to
-        * make the index AM look through any additional tuples.  (This can
-        * save a useful amount of work in scenarios where there are many dead
-        * tuples due to heavy update activity.)
-        *
-        * To do this we must keep track of the logical scan position
-        * (before/on/after tuple).  Also, we have to be sure to release scan
-        * resources before returning NULL; if we fail to do so then a
-        * multi-index scan can easily run the system out of free buffers.      We
-        * can release index-level resources fairly cheaply by calling
-        * index_rescan.  This means there are two persistent states as far as
-        * the index AM is concerned: on-tuple and rescanned.  If we are
-        * actually asked to re-fetch the single tuple, we have to go through
-        * a fresh indexscan startup, which penalizes that (infrequent) case.
+        * Save additional parameters into the scandesc.  Everything else was set
+        * up by index_beginscan_internal.
          */
-       if (scan->keys_are_unique && scan->got_tuple)
-       {
-               int                     new_tuple_pos = scan->unique_tuple_pos;
+       scan->heapRelation = heaprel;
+       scan->xs_snapshot = snapshot;
  
-               if (ScanDirectionIsForward(direction))
-               {
-                       if (new_tuple_pos <= 0)
-                               new_tuple_pos++;
-               }
-               else
-               {
-                       if (new_tuple_pos >= 0)
-                               new_tuple_pos--;
-               }
-               if (new_tuple_pos == 0)
-               {
-                       /*
-                        * We are moving onto the unique tuple from having been off
-                        * it. We just fall through and let the index AM do the work.
-                        * Note we should get the right answer regardless of scan
-                        * direction.
-                        */
-                       scan->unique_tuple_pos = 0; /* need to update position */
-               }
-               else
-               {
-                       /*
-                        * Moving off the tuple; must do amrescan to release
-                        * index-level pins before we return NULL.      Since index_rescan
-                        * will reset my state, must save and restore...
-                        */
-                       int                     unique_tuple_mark = scan->unique_tuple_mark;
+       return scan;
+}
  
-                       index_rescan(scan, NULL /* no change to key */ );
+/* ----------------
+ * index_getnext_tid - get the next TID from a scan
+ *
+ * The result is the next TID satisfying the scan keys,
+ * or NULL if no more matching tuples exist.
+ * ----------------
+ */
+ItemPointer
+index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+       bool            found;
  
-                       scan->keys_are_unique = true;
-                       scan->got_tuple = true;
-                       scan->unique_tuple_pos = new_tuple_pos;
-                       scan->unique_tuple_mark = unique_tuple_mark;
+       SCAN_CHECKS;
+       CHECK_SCAN_PROCEDURE(amgettuple);
  
-                       return NULL;
-               }
-       }
+       Assert(TransactionIdIsValid(RecentGlobalXmin));
  
-       /* just make sure this is false... */
+       /*
+        * The AM's amgettuple proc finds the next index entry matching the scan
+        * keys, and puts the TID into scan->xs_ctup.t_self.  It should also set
+        * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention
+        * to those fields here.
+        */
+       found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction);
+
+       /* Reset kill flag immediately for safety */
         scan->kill_prior_tuple = false;
  
-       for (;;)
+       /* If we're out of index entries, we're done */
+       if (!found)
         {
-               bool            found;
+               /* ... but first, release any held pin on a heap page */
+               if (BufferIsValid(scan->xs_cbuf))
+               {
+                       ReleaseBuffer(scan->xs_cbuf);
+                       scan->xs_cbuf = InvalidBuffer;
+               }
+               return NULL;
+       }
  
-               pgstat_count_index_scan(&scan->xs_pgstat_info);
+       pgstat_count_index_tuples(scan->indexRelation, 1);
  
-               /*
-                * The AM's gettuple proc finds the next tuple matching the scan
-                * keys.
-                */
-               found = DatumGetBool(FunctionCall2(procedure,
-                                                                                  PointerGetDatum(scan),
-                                                                                  Int32GetDatum(direction)));
+       /* Return the TID of the tuple we found. */
+       return &scan->xs_ctup.t_self;
+}
  
-               /* Reset kill flag immediately for safety */
-               scan->kill_prior_tuple = false;
+/* ----------------
+ *             index_fetch_heap - get the scan's next heap tuple
+ *
+ * The result is a visible heap tuple associated with the index TID most
+ * recently fetched by index_getnext_tid, or NULL if that TID has no (more)
+ * visible tuples.  (There can be more than one matching tuple because of HOT
+ * chains, although when using an MVCC snapshot it should be impossible for
+ * more than one such tuple to exist.)
+ *
+ * On success, the buffer containing the heap tuple is pinned (the pin will
+ * be dropped in a future index_getnext_tid, index_fetch_heap or
+ * index_endscan call).
+ *
+ * Note: caller must check scan->xs_recheck, and perform rechecking of the
+ * scan keys if required.  We do not do that here because we don't have
+ * enough information to do it efficiently in the general case.
+ * ----------------
+ */
+HeapTuple
+index_fetch_heap(IndexScanDesc scan)
+{
+       ItemPointer tid = &scan->xs_ctup.t_self;
+       bool            all_dead = false;       /* handed to heap_hot_search_buffer by address */
+       bool            got_heap_tuple;
  
-               if (!found)
-               {
-                       /* Release any held pin on a heap page */
-                       if (BufferIsValid(scan->xs_cbuf))
-                       {
-                               ReleaseBuffer(scan->xs_cbuf);
-                               scan->xs_cbuf = InvalidBuffer;
-                       }
-                       return NULL;            /* failure exit */
-               }
+       /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
+       if (!scan->xs_continue_hot)
+       {
+               /* Switch to correct buffer if we don't have it already */
+               Buffer          prev_buf = scan->xs_cbuf;
+
+               scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
+                                                                                        scan->heapRelation,
+                                                                                        ItemPointerGetBlockNumber(tid));
  
                 /*
-                * Fetch the heap tuple and see if it matches the snapshot.
+                * Prune page, but only if we weren't already on this page
                  */
-               if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot,
-                                                          heapTuple, &scan->xs_cbuf, true,
-                                                          &scan->xs_pgstat_info))
-                       break;
-
-               /* Skip if no undeleted tuple at this location */
-               if (heapTuple->t_data == NULL)
-                       continue;
+               if (prev_buf != scan->xs_cbuf)
+                       heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf);
+       }
  
+       /* Obtain share-lock on the buffer so we can examine visibility */
+       LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
+       got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation,
+                                                                                       scan->xs_cbuf,
+                                                                                       scan->xs_snapshot,
+                                                                                       &scan->xs_ctup,
+                                                                                       &all_dead,
+                                                                                       !scan->xs_continue_hot);
+       LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+
+       if (got_heap_tuple)
+       {
                 /*
-                * If we can't see it, maybe no one else can either.  Check to see
-                * if the tuple is dead to all transactions.  If so, signal the
-                * index AM to not return it on future indexscans.
-                *
-                * We told heap_release_fetch to keep a pin on the buffer, so we can
-                * re-access the tuple here.  But we must re-lock the buffer first.
+                * Only in a non-MVCC snapshot can more than one member of the HOT
+                * chain be visible.
                  */
-               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
-
-               if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
-                                                                        scan->xs_cbuf) == HEAPTUPLE_DEAD)
-                       scan->kill_prior_tuple = true;
-
-               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+               scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
+               pgstat_count_heap_fetch(scan->indexRelation);
+               return &scan->xs_ctup;
         }
  
-       /* Success exit */
-       scan->got_tuple = true;
+       /* We've reached the end of the HOT chain. */
+       scan->xs_continue_hot = false;
  
         /*
-        * If we just fetched a known-unique tuple, then subsequent calls will
-        * go through the short-circuit code above.  unique_tuple_pos has been
-        * initialized to 0, which is the correct state ("on row").
+        * If we scanned a whole HOT chain and found only dead tuples, tell index
+        * AM to kill its entry for that TID (this will take effect in the next
+        * amgettuple call, in index_getnext_tid).  We do not do this when in
+        * recovery because it may violate MVCC to do so.  See comments in
+        * RelationGetIndexScan().
          */
+       if (!scan->xactStartedInRecovery)
+               scan->kill_prior_tuple = all_dead;
  
-       pgstat_count_index_getnext(&scan->xs_pgstat_info);
-
-       return heapTuple;
+       return NULL;
  }
  
  /* ----------------
- *             index_getnext_indexitem - get the next index tuple from a scan
+ *             index_getnext - get the next heap tuple from a scan
+ *
+ * The result is the next heap tuple satisfying the scan keys and the
+ * snapshot, or NULL if no more matching tuples exist.
   *
- * Finds the next index tuple satisfying the scan keys.  Note that the
- * corresponding heap tuple is not accessed, and thus no time qual (snapshot)
- * check is done, other than the index AM's internal check for killed tuples
- * (which most callers of this routine will probably want to suppress by
- * setting scan->ignore_killed_tuples = false).
+ * On success, the buffer containing the heap tuple is pinned (the pin will
+ * be dropped in a future index_getnext_tid, index_fetch_heap or
+ * index_endscan call).
   *
- * On success (TRUE return), the found index TID is in scan->currentItemData,
- * and its heap TID is in scan->xs_ctup.t_self.  scan->xs_cbuf is untouched.
+ * Note: caller must check scan->xs_recheck, and perform rechecking of the
+ * scan keys if required.  We do not do that here because we don't have
+ * enough information to do it efficiently in the general case.
   * ----------------
   */
-bool
-index_getnext_indexitem(IndexScanDesc scan,
-                                               ScanDirection direction)
+HeapTuple
+index_getnext(IndexScanDesc scan, ScanDirection direction)
  {
-       FmgrInfo   *procedure;
-       bool            found;
+       HeapTuple       heapTuple;
+       ItemPointer tid;
  
-       SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amgettuple);
+       for (;;)                /* until a heap tuple is returned or the index runs out of TIDs */
+       {
+               if (scan->xs_continue_hot)
+               {
+                       /*
+                        * We are resuming scan of a HOT chain after having returned an
+                        * earlier member.  Must still hold pin on current heap page.
+                        */
+                       Assert(BufferIsValid(scan->xs_cbuf));
+                       Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) ==
+                                  BufferGetBlockNumber(scan->xs_cbuf));
+               }
+               else
+               {
+                       /* Time to fetch the next TID from the index */
+                       tid = index_getnext_tid(scan, direction);
  
-       /* just make sure this is false... */
-       scan->kill_prior_tuple = false;
+                       /* If we're out of index entries, we're done */
+                       if (tid == NULL)
+                               break;
+               }
  
-       /*
-        * have the am's gettuple proc do all the work.
-        */
-       found = DatumGetBool(FunctionCall2(procedure,
-                                                                          PointerGetDatum(scan),
-                                                                          Int32GetDatum(direction)));
+               /*
+                * Fetch the next (or only) visible heap tuple for this index entry.
+                * If we don't find anything, loop around and grab the next TID from
+                * the index.
+                */
+               heapTuple = index_fetch_heap(scan);
+               if (heapTuple != NULL)
+                       return heapTuple;
+       }
  
-       return found;
+       return NULL;                            /* index_getnext_tid found no more entries */
  }
  
  /* ----------------
- *             index_getmulti - get multiple tuples from an index scan
+ *             index_getbitmap - get all tuples at once from an index scan
   *
- * Collects the TIDs of multiple heap tuples satisfying the scan keys.
+ * Adds the TIDs of all heap tuples satisfying the scan keys to a bitmap.
   * Since there's no interlock between the index scan and the eventual heap
   * access, this is only safe to use with MVCC-based snapshots: the heap
   * item slot could have been replaced by a newer tuple by the time we get
   * to it.
   *
- * A TRUE result indicates more calls should occur; a FALSE result says the
- * scan is done.  *returned_tids could be zero or nonzero in either case.
+ * Returns the number of matching tuples as reported by the AM.  (Note: this
+ * might be only approximate, so use it only for statistical purposes.)
   * ----------------
   */
-bool
-index_getmulti(IndexScanDesc scan,
-                          ItemPointer tids, int32 max_tids,
-                          int32 *returned_tids)
+int64
+index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
  {
-       FmgrInfo   *procedure;
-       bool            found;
+       int64           ntids;
  
         SCAN_CHECKS;
-       GET_SCAN_PROCEDURE(amgetmulti);
+       CHECK_SCAN_PROCEDURE(amgetbitmap);
  
         /* just make sure this is false... */
         scan->kill_prior_tuple = false;
  
         /*
-        * have the am's getmulti proc do all the work.
+        * Have the AM's amgetbitmap proc do all the work.
          */
-       found = DatumGetBool(FunctionCall4(procedure,
-                                                                          PointerGetDatum(scan),
-                                                                          PointerGetDatum(tids),
-                                                                          Int32GetDatum(max_tids),
-                                                                          PointerGetDatum(returned_tids)));
+       ntids = scan->indexRelation->rd_amroutine->amgetbitmap(scan, bitmap);
+
+       pgstat_count_index_tuples(scan->indexRelation, ntids);
  
-       return found;
+       return ntids;
  }
  
  /* ----------------
@@ -668,23 +739,18 @@ index_getmulti(IndexScanDesc scan,
   * ----------------
   */
  IndexBulkDeleteResult *
-index_bulk_delete(Relation indexRelation,
+index_bulk_delete(IndexVacuumInfo *info,
+                                 IndexBulkDeleteResult *stats,
                                   IndexBulkDeleteCallback callback,
                                   void *callback_state)
  {
-       FmgrInfo   *procedure;
-       IndexBulkDeleteResult *result;
+       Relation        indexRelation = info->index;    /* the index to work on is carried in info */
  
         RELATION_CHECKS;
-       GET_REL_PROCEDURE(ambulkdelete);
-
-       result = (IndexBulkDeleteResult *)
-               DatumGetPointer(FunctionCall3(procedure,
-                                                                         PointerGetDatum(indexRelation),
-                                                                         PointerGetDatum((Pointer) callback),
-                                                                         PointerGetDatum(callback_state)));
+       CHECK_REL_PROCEDURE(ambulkdelete);
  
-       return result;
+       return indexRelation->rd_amroutine->ambulkdelete(info, stats,
+                                                                                                  callback, callback_state);    /* AM does all the work; its result is returned as-is */
  }
  
  /* ----------------
@@ -694,65 +760,60 @@ index_bulk_delete(Relation indexRelation,
   * ----------------
   */
  IndexBulkDeleteResult *
-index_vacuum_cleanup(Relation indexRelation,
-                                        IndexVacuumCleanupInfo *info,
+index_vacuum_cleanup(IndexVacuumInfo *info,
                                          IndexBulkDeleteResult *stats)
  {
-       FmgrInfo   *procedure;
-       IndexBulkDeleteResult *result;
+       Relation        indexRelation = info->index;    /* the index to work on is carried in info */
  
         RELATION_CHECKS;
+       CHECK_REL_PROCEDURE(amvacuumcleanup);   /* NOTE(review): presumably rejects an AM lacking amvacuumcleanup -- confirm */
  
-       /* It's okay for an index AM not to have a vacuumcleanup procedure */
-       if (!RegProcedureIsValid(indexRelation->rd_am->amvacuumcleanup))
-               return stats;
-
-       GET_REL_PROCEDURE(amvacuumcleanup);
-
-       result = (IndexBulkDeleteResult *)
-               DatumGetPointer(FunctionCall3(procedure,
-                                                                         PointerGetDatum(indexRelation),
-                                                                         PointerGetDatum((Pointer) info),
-                                                                         PointerGetDatum((Pointer) stats)));
-
-       return result;
+       return indexRelation->rd_amroutine->amvacuumcleanup(info, stats);
  }
  
  /* ----------------
- *             index_cost_estimator
+ *             index_can_return
   *
- *             Fetch the amcostestimate procedure OID for an index.
- *
- *             We could combine fetching and calling the procedure,
- *             as index_insert does for example; but that would require
- *             importing a bunch of planner/optimizer stuff into this file.
+ *             Does the index access method support index-only scans for the given
+ *             column (attno)?  The answer comes from the AM's amcanreturn callback.
   * ----------------
   */
-RegProcedure
-index_cost_estimator(Relation indexRelation)
+bool
+index_can_return(Relation indexRelation, int attno)
  {
-       FmgrInfo   *procedure;
-
         RELATION_CHECKS;
-       GET_REL_PROCEDURE(amcostestimate);
  
-       return procedure->fn_oid;
+       /* amcanreturn is optional; report false if the AM does not provide it */
+       if (indexRelation->rd_amroutine->amcanreturn == NULL)
+               return false;
+
+       return indexRelation->rd_amroutine->amcanreturn(indexRelation, attno);
  }
  
  /* ----------------
   *             index_getprocid
   *
- *             Some indexed access methods may require support routines that are
- *             not in the operator class/operator model imposed by pg_am.      These
- *             access methods may store the OIDs of registered procedures they
- *             need in pg_amproc.      These registered procedure OIDs are ordered in
- *             a way that makes sense to the access method, and used only by the
- *             access method.  The general index code doesn't know anything about
- *             the routines involved; it just builds an ordered list of them for
+ *             Index access methods typically require support routines that are
+ *             not directly the implementation of any WHERE-clause query operator
+ *             and so cannot be kept in pg_amop.  Instead, such routines are kept
+ *             in pg_amproc.  These registered procedure OIDs are assigned numbers
+ *             according to a convention established by the access method.
+ *             The general index code doesn't know anything about the routines
+ *             involved; it just builds an ordered list of them for
   *             each attribute on which an index is defined.
   *
- *             This routine returns the requested procedure OID for a particular
- *             indexed attribute.
+ *             As of Postgres 8.3, support routines within an operator family
+ *             are further subdivided by the "left type" and "right type" of the
+ *             query operator(s) that they support.  The "default" functions for a
+ *             particular indexed attribute are those with both types equal to
+ *             the index opclass' opcintype (note that this is subtly different
+ *             from the indexed attribute's own type: it may be a binary-compatible
+ *             type instead).  Only the default functions are stored in relcache
+ *             entries --- access methods can use the syscache to look up non-default
+ *             functions.
+ *
+ *             This routine returns the requested default procedure OID for a
+ *             particular indexed attribute.
   * ----------------
   */
  RegProcedure
@@ -764,7 +825,7 @@ index_getprocid(Relation irel,
         int                     nproc;
         int                     procindex;
  
-       nproc = irel->rd_am->amsupport;
+       nproc = irel->rd_amroutine->amsupport;
  
         Assert(procnum > 0 && procnum <= (uint16) nproc);
  
@@ -781,7 +842,8 @@ index_getprocid(Relation irel,
   *             index_getprocinfo
   *
   *             This routine allows index AMs to keep fmgr lookup info for
- *             support procs in the relcache.
+ *             support procs in the relcache.  As above, only the "default"
+ *             functions for any particular indexed attribute are cached.
   *
   * Note: the return value points into cached data that will be lost during
   * any relcache rebuild!  Therefore, either use the callinfo right away,
@@ -797,7 +859,7 @@ index_getprocinfo(Relation irel,
         int                     nproc;
         int                     procindex;
  
-       nproc = irel->rd_am->amsupport;
+       nproc = irel->rd_amroutine->amsupport;
  
         Assert(procnum > 0 && procnum <= (uint16) nproc);
  
@@ -820,11 +882,10 @@ index_getprocinfo(Relation irel,
                 procId = loc[procindex];
  
                 /*
-                * Complain if function was not found during
-                * IndexSupportInitialize. This should not happen unless the
-                * system tables contain bogus entries for the index opclass.  (If
-                * an AM wants to allow a support function to be optional, it can
-                * use index_getprocid.)
+                * Complain if function was not found during IndexSupportInitialize.
+                * This should not happen unless the system tables contain bogus
+                * entries for the index opclass.  (If an AM wants to allow a support
+                * function to be optional, it can use index_getprocid.)
                  */
                 if (!RegProcedureIsValid(procId))
                         elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",