granicus.if.org Git - postgresql/commitdiff
BRIN: Block Range Indexes
author Alvaro Herrera <alvherre@alvh.no-ip.org>
Fri, 7 Nov 2014 19:38:14 +0000 (16:38 -0300)
committer Alvaro Herrera <alvherre@alvh.no-ip.org>
Fri, 7 Nov 2014 19:38:14 +0000 (16:38 -0300)
BRIN is a new index access method intended to accelerate scans of very
large tables, without the maintenance overhead of btrees or other
traditional indexes.  BRIN indexes work by maintaining "summary" data
about block ranges.  Bitmap index scans work by reading each summary
tuple and comparing it with the query quals; all pages in the range are
returned in a lossy TID bitmap if the quals are consistent with the
values in the summary tuple, otherwise not.  Normal index scans are not
supported because these indexes do not store TIDs.

As new tuples are added into the index, the summary information is
updated (if the block range in which the tuple is added is already
summarized) or not; in the latter case, a subsequent pass of VACUUM or
the brin_summarize_new_values() function will create the summary
information.

For data types with natural 1-D sort orders, the summary info consists
of the maximum and the minimum values of each indexed column within each
page range.  This type of operator class we call "Minmax", and we
supply a bunch of them for most data types with B-tree opclasses.
Since the BRIN code is generalized, other approaches are possible for
things such as arrays, geometric types, ranges, etc; even for things
such as enum types we could do something different than minmax with
better results.  In this commit I only include minmax.

Catalog version bumped due to new builtin catalog entries.

There's more that could be done here, but this is a good step forwards.

Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera,
with contribution by Heikki Linnakangas.

Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas.
Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo.

PS:
  The research leading to these results has received funding from the
  European Union's Seventh Framework Programme (FP7/2007-2013) under
  grant agreement n° 318633.

57 files changed:
contrib/pageinspect/Makefile
contrib/pageinspect/brinfuncs.c [new file with mode: 0644]
contrib/pageinspect/pageinspect--1.2--1.3.sql [new file with mode: 0644]
contrib/pageinspect/pageinspect--1.3.sql [moved from contrib/pageinspect/pageinspect--1.2.sql with 71% similarity]
contrib/pageinspect/pageinspect.control
contrib/pg_xlogdump/rmgrdesc.c
doc/src/sgml/brin.sgml [new file with mode: 0644]
doc/src/sgml/filelist.sgml
doc/src/sgml/indices.sgml
doc/src/sgml/pageinspect.sgml
doc/src/sgml/postgres.sgml
src/backend/access/Makefile
src/backend/access/brin/Makefile [new file with mode: 0644]
src/backend/access/brin/README [new file with mode: 0644]
src/backend/access/brin/brin.c [new file with mode: 0644]
src/backend/access/brin/brin_minmax.c [new file with mode: 0644]
src/backend/access/brin/brin_pageops.c [new file with mode: 0644]
src/backend/access/brin/brin_revmap.c [new file with mode: 0644]
src/backend/access/brin/brin_tuple.c [new file with mode: 0644]
src/backend/access/brin/brin_xlog.c [new file with mode: 0644]
src/backend/access/common/reloptions.c
src/backend/access/heap/heapam.c
src/backend/access/rmgrdesc/Makefile
src/backend/access/rmgrdesc/brindesc.c [new file with mode: 0644]
src/backend/access/transam/rmgr.c
src/backend/catalog/index.c
src/backend/replication/logical/decode.c
src/backend/storage/page/bufpage.c
src/backend/utils/adt/selfuncs.c
src/include/access/brin.h [new file with mode: 0644]
src/include/access/brin_internal.h [new file with mode: 0644]
src/include/access/brin_page.h [new file with mode: 0644]
src/include/access/brin_pageops.h [new file with mode: 0644]
src/include/access/brin_revmap.h [new file with mode: 0644]
src/include/access/brin_tuple.h [new file with mode: 0644]
src/include/access/brin_xlog.h [new file with mode: 0644]
src/include/access/heapam.h
src/include/access/reloptions.h
src/include/access/relscan.h
src/include/access/rmgrlist.h
src/include/catalog/catversion.h
src/include/catalog/index.h
src/include/catalog/pg_am.h
src/include/catalog/pg_amop.h
src/include/catalog/pg_amproc.h
src/include/catalog/pg_opclass.h
src/include/catalog/pg_opfamily.h
src/include/catalog/pg_proc.h
src/include/storage/bufpage.h
src/include/utils/selfuncs.h
src/test/regress/expected/brin.out [new file with mode: 0644]
src/test/regress/expected/opr_sanity.out
src/test/regress/output/misc.source
src/test/regress/parallel_schedule
src/test/regress/serial_schedule
src/test/regress/sql/brin.sql [new file with mode: 0644]
src/test/regress/sql/opr_sanity.sql

index f10229db4820c6622aaf59271a7768571e7d2259..a59de8aba9e11a94138dbacae336b924b45156f0 100644 (file)
@@ -1,10 +1,11 @@
 # contrib/pageinspect/Makefile
 
 MODULE_big     = pageinspect
-OBJS           = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o $(WIN32RES)
+OBJS           = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o brinfuncs.o $(WIN32RES)
 
 EXTENSION = pageinspect
-DATA = pageinspect--1.2.sql pageinspect--1.0--1.1.sql \
+DATA = pageinspect--1.3.sql pageinspect--1.0--1.1.sql \
+       pageinspect--1.2--1.3.sql \
        pageinspect--1.1--1.2.sql pageinspect--unpackaged--1.0.sql
 PGFILEDESC = "pageinspect - functions to inspect contents of database pages"
 
diff --git a/contrib/pageinspect/brinfuncs.c b/contrib/pageinspect/brinfuncs.c
new file mode 100644 (file)
index 0000000..359fc1d
--- /dev/null
@@ -0,0 +1,414 @@
+/*
+ * brinfuncs.c
+ *             Functions to investigate BRIN indexes
+ *
+ * Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *             contrib/pageinspect/brinfuncs.c
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_page.h"
+#include "access/brin_revmap.h"
+#include "access/brin_tuple.h"
+#include "catalog/index.h"
+#include "catalog/pg_type.h"
+#include "funcapi.h"
+#include "lib/stringinfo.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "miscadmin.h"
+
+
+PG_FUNCTION_INFO_V1(brin_page_type);
+PG_FUNCTION_INFO_V1(brin_page_items);
+PG_FUNCTION_INFO_V1(brin_metapage_info);
+PG_FUNCTION_INFO_V1(brin_revmap_data);
+
+/* Per-indexed-column output state used by brin_page_items() */
+typedef struct brin_column_state
+{
+       int                     nstored;        /* number of entries in outputFn[] */
+       FmgrInfo        outputFn[FLEXIBLE_ARRAY_MEMBER];        /* one per stored value */
+} brin_column_state;
+
+/* Cross-call state of brin_page_items(), kept in fctx->user_fctx */
+typedef struct brin_page_state
+{
+       BrinDesc   *bdesc;      /* built by brin_build_desc() for the index */
+       Page            page;   /* the verified raw page being decoded */
+       OffsetNumber offset;    /* line pointer currently being reported */
+       bool            unusedItem;     /* current line pointer has no tuple */
+       bool            done;   /* offset ran past the last line pointer */
+       AttrNumber      attno;  /* 1-based attribute being reported */
+       BrinMemTuple *dtup;     /* deformed current tuple; NULL = fetch next */
+       brin_column_state *columns[FLEXIBLE_ARRAY_MEMBER];      /* indexed by attno - 1 */
+} brin_page_state;
+
+
+static Page verify_brin_page(bytea *raw_page, uint16 type,
+                                const char *strtype);
+
+/*
+ * Return the BRIN page type of the given raw page as text: "meta", "revmap",
+ * "regular", or "unknown (xx)" when the special-space type is not recognized.
+ *
+ * The input must be at least BLCKSZ bytes long; shorter input is rejected
+ * before the special space is looked at.  Restricted to superusers, matching
+ * brin_page_items() and brin_revmap_data().
+ */
+Datum
+brin_page_type(PG_FUNCTION_ARGS)
+{
+       bytea      *raw_page = PG_GETARG_BYTEA_P(0);
+       Page            page = VARDATA(raw_page);
+       int                     raw_page_size;
+       BrinSpecialSpace *special;
+       char *type;
+
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                (errmsg("must be superuser to use raw page functions"))));
+
+       /* reject short input before looking for the special space inside it */
+       raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+       if (raw_page_size < BLCKSZ)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("input page too small"),
+                                errdetail("Expected size %d, got %d", BLCKSZ, raw_page_size)));
+
+       special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+
+       switch (special->type)
+       {
+               case BRIN_PAGETYPE_META:
+                       type = "meta";
+                       break;
+               case BRIN_PAGETYPE_REVMAP:
+                       type = "revmap";
+                       break;
+               case BRIN_PAGETYPE_REGULAR:
+                       type = "regular";
+                       break;
+               default:
+                       /* not a known BRIN page type: report the raw value instead */
+                       type = psprintf("unknown (%02x)", special->type);
+                       break;
+       }
+
+       PG_RETURN_TEXT_P(cstring_to_text(type));
+}
+
+/*
+ * Verify that the given bytea holds a complete (BLCKSZ-sized) BRIN page of
+ * the indicated page type, or ereport(ERROR).  A pointer to the page is
+ * returned.
+ *
+ * "type" is the expected BrinSpecialSpace type value; "strtype" is its
+ * human-readable name, used only in the error message.
+ */
+static Page
+verify_brin_page(bytea *raw_page, uint16 type, const char *strtype)
+{
+       Page    page;
+       int             raw_page_size;
+       BrinSpecialSpace *special;
+
+       raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+
+       /* reject short input before looking for the special space inside it */
+       if (raw_page_size < BLCKSZ)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("input page too small"),
+                                errdetail("Expected size %d, got %d", BLCKSZ, raw_page_size)));
+
+       page = VARDATA(raw_page);
+
+       /* verify the special space says this page is what we want */
+       special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+       if (special->type != type)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("page is not a BRIN page of type \"%s\"", strtype),
+                                errdetail("Expected special type %08x, got %08x.",
+                                                  type, special->type)));
+
+       return page;
+}
+
+
+/*
+ * Extract all item values from a BRIN index page
+ *
+ * Set-returning function: emits one row per attribute of every tuple found
+ * on the page, plus one all-NULL row (except for the item offset) for every
+ * unused line pointer.  The second argument is the index the page belongs
+ * to; it supplies the BrinDesc needed to deform tuples and the output
+ * functions for the stored values.
+ *
+ * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass);
+ */
+Datum
+brin_page_items(PG_FUNCTION_ARGS)
+{
+       brin_page_state *state;
+       FuncCallContext *fctx;
+
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                (errmsg("must be superuser to use raw page functions"))));
+
+       if (SRF_IS_FIRSTCALL())
+       {
+               bytea      *raw_page = PG_GETARG_BYTEA_P(0);
+               Oid                     indexRelid = PG_GETARG_OID(1);
+               Page            page;
+               TupleDesc       tupdesc;
+               MemoryContext mctx;
+               Relation        indexRel;
+               AttrNumber      attno;
+
+               /* minimally verify the page we got */
+               page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular");
+
+               /* create a function context for cross-call persistence */
+               fctx = SRF_FIRSTCALL_INIT();
+
+               /* switch to memory context appropriate for multiple function calls */
+               mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+
+               /* Build a tuple descriptor for our result type */
+               if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+                       elog(ERROR, "return type must be a row type");
+
+               indexRel = index_open(indexRelid, AccessShareLock);
+
+               /*
+                * columns[] is an array of pointers, one per index attribute, so
+                * size the trailing array by pointer rather than by struct.
+                */
+               state = palloc(offsetof(brin_page_state, columns) +
+                                          sizeof(brin_column_state *) * RelationGetDescr(indexRel)->natts);
+
+               state->bdesc = brin_build_desc(indexRel);
+               state->page = page;
+               state->offset = FirstOffsetNumber;
+               state->unusedItem = false;
+               state->done = false;
+               state->dtup = NULL;
+
+               /*
+                * Initialize output functions for all indexed datatypes; simplifies
+                * calling them later.
+                */
+               for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++)
+               {
+                       Oid             output;
+                       bool    isVarlena;
+                       BrinOpcInfo *opcinfo;
+                       int             i;
+                       brin_column_state *column;
+
+                       opcinfo = state->bdesc->bd_info[attno - 1];
+                       column = palloc(offsetof(brin_column_state, outputFn) +
+                                                       sizeof(FmgrInfo) * opcinfo->oi_nstored);
+
+                       column->nstored = opcinfo->oi_nstored;
+                       for (i = 0; i < opcinfo->oi_nstored; i++)
+                       {
+                               getTypeOutputInfo(opcinfo->oi_typids[i], &output, &isVarlena);
+                               fmgr_info(output, &column->outputFn[i]);
+                       }
+
+                       state->columns[attno - 1] = column;
+               }
+
+               index_close(indexRel, AccessShareLock);
+
+               fctx->user_fctx = state;
+               fctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+               MemoryContextSwitchTo(mctx);
+       }
+
+       fctx = SRF_PERCALL_SETUP();
+       state = fctx->user_fctx;
+
+       if (!state->done)
+       {
+               HeapTuple       result;
+               Datum           values[7];
+               bool            nulls[7];
+
+               /*
+                * This loop is called once for every attribute of every tuple in the
+                * page.  At the start of a tuple, we get a NULL dtup; that's our
+                * signal for obtaining and decoding the next one.  If that's not the
+                * case, we output the next attribute.
+                */
+               if (state->dtup == NULL)
+               {
+                       BrinTuple          *tup;
+                       MemoryContext mctx;
+                       ItemId          itemId;
+
+                       /* deformed tuple must live across calls */
+                       mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+
+                       /* verify item status: if there's no data, we can't decode */
+                       itemId = PageGetItemId(state->page, state->offset);
+                       if (ItemIdIsUsed(itemId))
+                       {
+                               tup = (BrinTuple *) PageGetItem(state->page, itemId);
+                               state->dtup = brin_deform_tuple(state->bdesc, tup);
+                               state->attno = 1;
+                               state->unusedItem = false;
+                       }
+                       else
+                               state->unusedItem = true;
+
+                       MemoryContextSwitchTo(mctx);
+               }
+               else
+                       state->attno++;
+
+               MemSet(nulls, 0, sizeof(nulls));
+
+               if (state->unusedItem)
+               {
+                       /* only the item offset is known for an unused line pointer */
+                       values[0] = UInt16GetDatum(state->offset);
+                       nulls[1] = true;
+                       nulls[2] = true;
+                       nulls[3] = true;
+                       nulls[4] = true;
+                       nulls[5] = true;
+                       nulls[6] = true;
+               }
+               else
+               {
+                       int             att = state->attno - 1;
+
+                       values[0] = UInt16GetDatum(state->offset);
+                       values[1] = UInt32GetDatum(state->dtup->bt_blkno);
+                       values[2] = UInt16GetDatum(state->attno);
+                       values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls);
+                       values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls);
+                       values[5] = BoolGetDatum(state->dtup->bt_placeholder);
+                       if (!state->dtup->bt_columns[att].bv_allnulls)
+                       {
+                               BrinValues   *bvalues = &state->dtup->bt_columns[att];
+                               StringInfoData  s;
+                               bool            first;
+                               int                     i;
+
+                               /* render the stored values as "{v1 .. v2 ...}" */
+                               initStringInfo(&s);
+                               appendStringInfoChar(&s, '{');
+
+                               first = true;
+                               for (i = 0; i < state->columns[att]->nstored; i++)
+                               {
+                                       char   *val;
+
+                                       if (!first)
+                                               appendStringInfoString(&s, " .. ");
+                                       first = false;
+                                       val = OutputFunctionCall(&state->columns[att]->outputFn[i],
+                                                                                        bvalues->bv_values[i]);
+                                       appendStringInfoString(&s, val);
+                                       pfree(val);
+                               }
+                               appendStringInfoChar(&s, '}');
+
+                               values[6] = CStringGetTextDatum(s.data);
+                               pfree(s.data);
+                       }
+                       else
+                       {
+                               nulls[6] = true;
+                       }
+               }
+
+               result = heap_form_tuple(fctx->tuple_desc, values, nulls);
+
+               /*
+                * If the item was unused, jump straight to the next one; otherwise,
+                * the only cleanup needed here is to set our signal to go to the next
+                * tuple in the following iteration, by freeing the current one.
+                */
+               if (state->unusedItem)
+                       state->offset = OffsetNumberNext(state->offset);
+               else if (state->attno >= state->bdesc->bd_tupdesc->natts)
+               {
+                       pfree(state->dtup);
+                       state->dtup = NULL;
+                       state->offset = OffsetNumberNext(state->offset);
+               }
+
+               /*
+                * If we're beyond the end of the page, set flag to end the function in
+                * the following iteration.
+                */
+               if (state->offset > PageGetMaxOffsetNumber(state->page))
+                       state->done = true;
+
+               SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result));
+       }
+
+       brin_free_desc(state->bdesc);
+
+       SRF_RETURN_DONE(fctx);
+}
+
+/*
+ * Return the contents of a BRIN metapage as a single row:
+ * (magic text, version integer, pagesperrange integer, lastrevmappage bigint).
+ *
+ * Errors out if the input is not a BRIN metapage.  Restricted to superusers,
+ * matching brin_page_items() and brin_revmap_data().
+ */
+Datum
+brin_metapage_info(PG_FUNCTION_ARGS)
+{
+       bytea      *raw_page = PG_GETARG_BYTEA_P(0);
+       Page            page;
+       BrinMetaPageData *meta;
+       TupleDesc       tupdesc;
+       Datum           values[4];
+       bool            nulls[4];
+       HeapTuple       htup;
+
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                (errmsg("must be superuser to use raw page functions"))));
+
+       page = verify_brin_page(raw_page, BRIN_PAGETYPE_META, "metapage");
+
+       /* Build a tuple descriptor for our result type */
+       if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+               elog(ERROR, "return type must be a row type");
+       tupdesc = BlessTupleDesc(tupdesc);
+
+       /* Extract values from the metapage */
+       meta = (BrinMetaPageData *) PageGetContents(page);
+       MemSet(nulls, 0, sizeof(nulls));
+       values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->brinMagic));
+       values[1] = Int32GetDatum(meta->brinVersion);
+       values[2] = Int32GetDatum(meta->pagesPerRange);
+       values[3] = Int64GetDatum(meta->lastRevmapPage);
+
+       htup = heap_form_tuple(tupdesc, values, nulls);
+
+       PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+}
+
+/*
+ * Return the TID array stored in a BRIN revmap page
+ *
+ * Set-returning function: emits rm_tids[0 .. REVMAP_PAGE_MAXITEMS - 1] of the
+ * given page, one TID per call.  Errors out unless the caller is a superuser
+ * and the input verifies as a BRIN revmap page.
+ */
+Datum
+brin_revmap_data(PG_FUNCTION_ARGS)
+{
+       /* cross-call state, kept in fctx->user_fctx */
+       struct
+       {
+               ItemPointerData *tids;  /* rm_tids array inside the input page */
+               int             idx;            /* next array slot to return */
+       } *state;
+       FuncCallContext *fctx;
+
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                (errmsg("must be superuser to use raw page functions"))));
+
+       if (SRF_IS_FIRSTCALL())
+       {
+               bytea      *raw_page = PG_GETARG_BYTEA_P(0);
+               MemoryContext mctx;
+               Page            page;
+
+               /* minimally verify the page we got */
+               page = verify_brin_page(raw_page, BRIN_PAGETYPE_REVMAP, "revmap");
+
+               /* create a function context for cross-call persistence */
+               fctx = SRF_FIRSTCALL_INIT();
+
+               /* switch to memory context appropriate for multiple function calls */
+               mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+
+               /*
+                * Only the state struct is allocated here; tids points into the
+                * caller-supplied page rather than a copy.
+                * NOTE(review): confirm the argument stays valid across calls.
+                */
+               state = palloc(sizeof(*state));
+               state->tids = ((RevmapContents *) PageGetContents(page))->rm_tids;
+               state->idx = 0;
+
+               fctx->user_fctx = state;
+
+               MemoryContextSwitchTo(mctx);
+       }
+
+       fctx = SRF_PERCALL_SETUP();
+       state = fctx->user_fctx;
+
+       /* hand back the next slot and advance, until every slot has been seen */
+       if (state->idx < REVMAP_PAGE_MAXITEMS)
+               SRF_RETURN_NEXT(fctx, PointerGetDatum(&state->tids[state->idx++]));
+
+       SRF_RETURN_DONE(fctx);
+}
diff --git a/contrib/pageinspect/pageinspect--1.2--1.3.sql b/contrib/pageinspect/pageinspect--1.2--1.3.sql
new file mode 100644 (file)
index 0000000..9bc4dde
--- /dev/null
@@ -0,0 +1,43 @@
+/* contrib/pageinspect/pageinspect--1.2--1.3.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.3'" to load this file. \quit
+
+--
+-- brin_page_type()
+--
+-- Returns the BRIN page type of a raw page as text.
+--
+CREATE FUNCTION brin_page_type(IN page bytea)
+RETURNS text
+AS 'MODULE_PATHNAME', 'brin_page_type'
+LANGUAGE C STRICT;
+
+--
+-- brin_metapage_info()
+--
+-- Returns the fields of a BRIN metapage as a single row.
+--
+CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
+       OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
+AS 'MODULE_PATHNAME', 'brin_metapage_info'
+LANGUAGE C STRICT;
+
+--
+-- brin_revmap_data()
+--
+-- Returns the TIDs stored in a BRIN revmap page, one per row.
+--
+CREATE FUNCTION brin_revmap_data(IN page bytea,
+       OUT pages tid)
+RETURNS SETOF tid
+AS 'MODULE_PATHNAME', 'brin_revmap_data'
+LANGUAGE C STRICT;
+
+--
+-- brin_page_items()
+--
+-- Returns one row per attribute of every tuple in a regular BRIN page;
+-- index_oid names the index the page was read from.
+--
+CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
+       OUT itemoffset int,
+       OUT blknum int,
+       OUT attnum int,
+       OUT allnulls bool,
+       OUT hasnulls bool,
+       OUT placeholder bool,
+       OUT value text)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'brin_page_items'
+LANGUAGE C STRICT;
similarity index 71%
rename from contrib/pageinspect/pageinspect--1.2.sql
rename to contrib/pageinspect/pageinspect--1.3.sql
index 15e8e1e3811c7a5b7b21dd7e1f3620ea50fbc7d2..856dcdfb592914deea02f7a7a1e8dc544a475a33 100644 (file)
@@ -1,4 +1,4 @@
-/* contrib/pageinspect/pageinspect--1.2.sql */
+/* contrib/pageinspect/pageinspect--1.3.sql */
 
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION pageinspect" to load this file. \quit
@@ -98,6 +98,45 @@ RETURNS SETOF record
 AS 'MODULE_PATHNAME', 'bt_page_items'
 LANGUAGE C STRICT;
 
+--
+-- brin_page_type()
+--
+-- Returns the BRIN page type of a raw page as text.
+--
+CREATE FUNCTION brin_page_type(IN page bytea)
+RETURNS text
+AS 'MODULE_PATHNAME', 'brin_page_type'
+LANGUAGE C STRICT;
+
+--
+-- brin_metapage_info()
+--
+-- Returns the fields of a BRIN metapage as a single row.
+--
+CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
+       OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
+AS 'MODULE_PATHNAME', 'brin_metapage_info'
+LANGUAGE C STRICT;
+
+--
+-- brin_revmap_data()
+--
+-- Returns the TIDs stored in a BRIN revmap page, one per row.
+--
+CREATE FUNCTION brin_revmap_data(IN page bytea,
+       OUT pages tid)
+RETURNS SETOF tid
+AS 'MODULE_PATHNAME', 'brin_revmap_data'
+LANGUAGE C STRICT;
+
+--
+-- brin_page_items()
+--
+-- Returns one row per attribute of every tuple in a regular BRIN page;
+-- index_oid names the index the page was read from.
+--
+CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
+       OUT itemoffset int,
+       OUT blknum int,
+       OUT attnum int,
+       OUT allnulls bool,
+       OUT hasnulls bool,
+       OUT placeholder bool,
+       OUT value text)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'brin_page_items'
+LANGUAGE C STRICT;
+
 --
 -- fsm_page_contents()
 --
index aecd91a711b8cd5aee1e1b1bd83c54bbd47a5d7b..a9dab3327c9623ef4d4d8e834aa6b6eda92be0da 100644 (file)
@@ -1,5 +1,5 @@
 # pageinspect extension
 comment = 'inspect the contents of database pages at a low level'
-default_version = '1.2'
+default_version = '1.3'
 module_pathname = '$libdir/pageinspect'
 relocatable = true
index bfb3573878911f4caa3d3e32648dcb740fd28c64..93971982390967aac3a30d3075ae8a105861eda2 100644 (file)
@@ -8,6 +8,7 @@
 #define FRONTEND 1
 #include "postgres.h"
 
+#include "access/brin_xlog.h"
 #include "access/clog.h"
 #include "access/gin.h"
 #include "access/gist_private.h"
diff --git a/doc/src/sgml/brin.sgml b/doc/src/sgml/brin.sgml
new file mode 100644 (file)
index 0000000..03d1fd6
--- /dev/null
@@ -0,0 +1,490 @@
+<!-- doc/src/sgml/brin.sgml -->
+
+<chapter id="BRIN">
+<title>BRIN Indexes</title>
+
+   <indexterm>
+    <primary>index</primary>
+    <secondary>BRIN</secondary>
+   </indexterm>
+
+<sect1 id="brin-intro">
+ <title>Introduction</title>
+
+ <para>
+  <acronym>BRIN</acronym> stands for Block Range Index.
+  <acronym>BRIN</acronym> is designed for handling very large tables
+  in which certain columns have some natural correlation with their
+  physical location within the table.
+  A <firstterm>block range</> is a group of pages that are physically
+  adjacent in the table; for each block range, some summary info is stored
+  by the index.
+  For example, a table storing a store's sale orders might have
+  a column with the date on which each order was placed, and most of the time
+  the entries for earlier orders will appear earlier in the table as well;
+  a table storing a ZIP code column might have all codes for a city
+  grouped together naturally.
+ </para>
+
+ <para>
+  <acronym>BRIN</acronym> indexes can satisfy queries via regular bitmap
+  index scans, and will return all tuples in all pages within each range if
+  the summary info stored by the index is <firstterm>consistent</> with the
+  query conditions.
+  The query executor is in charge of rechecking these tuples and discarding
+  those that do not match the query conditions &mdash; in other words, these
+  indexes are lossy.
+  Because a <acronym>BRIN</acronym> index is very small, scanning the index
+  adds little overhead compared to a sequential scan, but may avoid scanning
+  large parts of the table that are known not to contain matching tuples.
+ </para>
+
+ <para>
+  The specific data that a <acronym>BRIN</acronym> index will store,
+  as well as the specific queries that the index will be able to satisfy,
+  depend on the operator class selected for each column of the index.
+  Data types having a linear sort order can have operator classes that
+  store the minimum and maximum value within each block range, for instance;
+  geometrical types might store the bounding box for all the objects
+  in the block range.
+ </para>
+  
+ <para>
+  The size of the block range is determined at index creation time by
+  the <literal>pages_per_range</> storage parameter.  The number of index
+  entries will be equal to the size of the relation in pages divided by
+  the selected value for <literal>pages_per_range</>.  Therefore, the smaller
+  the number, the larger the index becomes (because of the need to
+  store more index entries), but at the same time the summary data stored can
+  be more precise and more data blocks can be skipped during an index scan.
+ </para>
+</sect1>
+
+<sect1 id="brin-builtin-opclasses">
+ <title>Built-in Operator Classes</title>
+
+ <para>
+  The core <productname>PostgreSQL</productname> distribution
+  includes the <acronym>BRIN</acronym> operator classes shown in
+  <xref linkend="brin-builtin-opclasses-table">.
+ </para>
+
+ <para>
+  The <firstterm>minmax</>
+  operator classes store the minimum and the maximum values appearing
+  in the indexed column within the range.
+ </para>
+
+ <table id="brin-builtin-opclasses-table">
+  <title>Built-in <acronym>BRIN</acronym> Operator Classes</title>
+  <tgroup cols="3">
+   <thead>
+    <row>
+     <entry>Name</entry>
+     <entry>Indexed Data Type</entry>
+     <entry>Indexable Operators</entry>
+    </row>
+   </thead>
+   <tbody>
+    <row>
+     <entry><literal>bytea_minmax_ops</literal></entry>
+     <entry><type>bytea</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>char_minmax_ops</literal></entry>
+     <entry><type>"char"</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>name_minmax_ops</literal></entry>
+     <entry><type>name</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>int8_minmax_ops</literal></entry>
+     <entry><type>bigint</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>int2_minmax_ops</literal></entry>
+     <entry><type>smallint</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>int4_minmax_ops</literal></entry>
+     <entry><type>integer</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>text_minmax_ops</literal></entry>
+     <entry><type>text</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>oid_minmax_ops</literal></entry>
+     <entry><type>oid</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>tid_minmax_ops</literal></entry>
+     <entry><type>tid</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>float4_minmax_ops</literal></entry>
+     <entry><type>real</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>float8_minmax_ops</literal></entry>
+     <entry><type>double precision</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>abstime_minmax_ops</literal></entry>
+     <entry><type>abstime</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>reltime_minmax_ops</literal></entry>
+     <entry><type>reltime</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>macaddr_minmax_ops</literal></entry>
+     <entry><type>macaddr</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>inet_minmax_ops</literal></entry>
+     <entry><type>inet</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>bpchar_minmax_ops</literal></entry>
+     <entry><type>character</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>date_minmax_ops</literal></entry>
+     <entry><type>date</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>time_minmax_ops</literal></entry>
+     <entry><type>time without time zone</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>timestamp_minmax_ops</literal></entry>
+     <entry><type>timestamp without time zone</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>timestamptz_minmax_ops</literal></entry>
+     <entry><type>timestamp with time zone</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>interval_minmax_ops</literal></entry>
+     <entry><type>interval</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>timetz_minmax_ops</literal></entry>
+     <entry><type>time with time zone</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>bit_minmax_ops</literal></entry>
+     <entry><type>bit</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>varbit_minmax_ops</literal></entry>
+     <entry><type>bit varying</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>numeric_minmax_ops</literal></entry>
+     <entry><type>numeric</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>uuid_minmax_ops</literal></entry>
+     <entry><type>uuid</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+    <row>
+     <entry><literal>pg_lsn_minmax_ops</literal></entry>
+     <entry><type>pg_lsn</type></entry>
+     <entry>
+      <literal>&lt;</literal>
+      <literal>&lt;=</literal>
+      <literal>=</literal>
+      <literal>&gt;=</literal>
+      <literal>&gt;</literal>
+     </entry>
+    </row>
+   </tbody>
+  </tgroup>
+ </table>
+</sect1>
+
+<sect1 id="brin-extensibility">
+ <title>Extensibility</title>
+
+ <para>
+  The <acronym>BRIN</acronym> interface has a high level of abstraction,
+  requiring the access method implementer only to implement the semantics
+  of the data type being accessed.  The <acronym>BRIN</acronym> layer
+  itself takes care of concurrency, logging and searching the index structure.
+ </para>
+
+ <para>
+  All it takes to get a <acronym>BRIN</acronym> access method working is to
+  implement a few user-defined methods, which define the behavior of
+  summary values stored in the index and the way they interact with
+  scan keys.
+  In short, <acronym>BRIN</acronym> combines
+  extensibility with generality, code reuse, and a clean interface.
+ </para>
+
+ <para>
+  There are four methods that an operator class for <acronym>BRIN</acronym>
+  must provide:
+
+  <variablelist>
+   <varlistentry>
+    <term><function>BrinOpcInfo *opcInfo(Oid type_oid)</></term>
+    <listitem>
+     <para>
+      Returns internal information about the indexed columns' summary data.
+      The return value must point to a palloc'd <structname>BrinOpcInfo</>,
+      which has this definition:
+<programlisting>
+typedef struct BrinOpcInfo
+{
+    /* Number of columns stored in an index column of this opclass */
+    uint16      oi_nstored;
+
+    /* Opaque pointer for the opclass' private use */
+    void       *oi_opaque;
+
+    /* Type IDs of the stored columns */
+    Oid         oi_typids[FLEXIBLE_ARRAY_MEMBER];
+} BrinOpcInfo;
+</programlisting>
+      <structname>BrinOpcInfo</>.<structfield>oi_opaque</> can be used by the
+      operator class routines to pass information between support procedures
+      during an index scan.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><function>bool consistent(BrinDesc *bdesc, BrinValues *column,
+       ScanKey key)</function></term>
+    <listitem>
+     <para>
+      Returns whether the ScanKey is consistent with the given indexed
+      values for a range.
+      The attribute number to use is passed as part of the scan key.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><function>bool addValue(BrinDesc *bdesc, BrinValues *column,
+       Datum newval, bool isnull)</function></term>
+    <listitem>
+     <para>
+      Given an index tuple and an indexed value, modifies the indicated
+      attribute of the tuple so that it additionally represents the new value.
+      If any modification was done to the tuple, <literal>true</literal> is
+      returned.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><function>bool unionTuples(BrinDesc *bdesc, BrinValues *a,
+       BrinValues *b)</function></term>
+    <listitem>
+     <para>
+      Consolidates two index tuples. Given two index tuples, modifies the
+      indicated attribute of the first of them so that it represents both tuples.
+      The second tuple is not modified.
+     </para>
+    </listitem>
+   </varlistentry>
+  </variablelist>
+
+  To implement these methods in a generic way, the operator class
+  defines its own internal support functions.
+  (For instance, <quote>min/max</> operator classes implement
+  support functions for the four inequality operators for the data type.)
+  Additionally, the operator class must supply appropriate
+  operator entries,
+  to enable the optimizer to use the index when those operators are
+  used in queries.
+ </para>
+</sect1>
+</chapter>
index 5902f979c89823e00b903a4be5c8449bbe2f73f9..f03b72ab1dbc4cd4880caf336685bbd2124cdae1 100644 (file)
@@ -87,6 +87,7 @@
 <!ENTITY gist       SYSTEM "gist.sgml">
 <!ENTITY spgist     SYSTEM "spgist.sgml">
 <!ENTITY gin        SYSTEM "gin.sgml">
+<!ENTITY brin       SYSTEM "brin.sgml">
 <!ENTITY planstats    SYSTEM "planstats.sgml">
 <!ENTITY indexam    SYSTEM "indexam.sgml">
 <!ENTITY nls        SYSTEM "nls.sgml">
index 64530a11c86f7cf12f94ed162e9124d0065b04ca..b73463a323872444b3b28bcdbfd6a20bcf586815 100644 (file)
@@ -116,7 +116,8 @@ CREATE INDEX test1_id_index ON test1 (id);
 
   <para>
    <productname>PostgreSQL</productname> provides several index types:
-   B-tree, Hash, GiST, SP-GiST and GIN.  Each index type uses a different
+   B-tree, Hash, GiST, SP-GiST, GIN and BRIN.
+   Each index type uses a different
    algorithm that is best suited to different types of queries.
    By default, the <command>CREATE INDEX</command> command creates
    B-tree indexes, which fit the most common situations.
@@ -326,6 +327,39 @@ SELECT * FROM places ORDER BY location <-> point '(101,456)' LIMIT 10;
    classes are available in the <literal>contrib</> collection or as separate
    projects.  For more information see <xref linkend="GIN">.
   </para>
+
+  <para>
+   <indexterm>
+    <primary>index</primary>
+    <secondary>BRIN</secondary>
+   </indexterm>
+   <indexterm>
+    <primary>BRIN</primary>
+    <see>index</see>
+   </indexterm>
+   BRIN indexes (a shorthand for Block Range indexes)
+   store summaries about the values stored in consecutive physical block ranges of a table.
+   Like GiST, SP-GiST and GIN,
+   BRIN can support many different indexing strategies,
+   and the particular operators with which a BRIN index can be used
+   vary depending on the indexing strategy.
+   For data types that have a linear sort order, the indexed data
+   corresponds to the minimum and maximum of the
+   values in the column for each block range.
+   This supports indexed queries using these operators:
+
+   <simplelist>
+    <member><literal>&lt;</literal></member>
+    <member><literal>&lt;=</literal></member>
+    <member><literal>=</literal></member>
+    <member><literal>&gt;=</literal></member>
+    <member><literal>&gt;</literal></member>
+   </simplelist>
+
+   The BRIN operator classes included in the standard distribution are
+   documented in <xref linkend="brin-builtin-opclasses-table">.
+   For more information see <xref linkend="BRIN">.
+  </para>
  </sect1>
 
 
index 191fb156c13bb2b7ec9b7b119860f48c1c20e904..70517ac4e174be60a725db92775032fbf9b9b5c3 100644 (file)
@@ -196,6 +196,110 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term>
+     <function>brin_page_type(page bytea) returns text</function>
+     <indexterm>
+      <primary>brin_page_type</primary>
+     </indexterm>
+    </term>
+
+    <listitem>
+     <para>
+      <function>brin_page_type</function> returns the page type of the given
+      <acronym>BRIN</acronym> index page, or throws an error if the page is
+      not a valid <acronym>BRIN</acronym> page.  For example:
+<screen>
+brintest=# select brin_page_type(get_raw_page('brinidx', 0));
+ brin_page_type 
+----------------
+ meta
+</screen>
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
+     <function>brin_metapage_info(page bytea) returns record</function>
+     <indexterm>
+      <primary>brin_metapage_info</primary>
+     </indexterm>
+    </term>
+
+    <listitem>
+     <para>
+      <function>brin_metapage_info</function> returns assorted information
+      about a <acronym>BRIN</acronym> index metapage.  For example:
+<screen>
+brintest=# select * from brin_metapage_info(get_raw_page('brinidx', 0));
+   magic    | version | pagesperrange | lastrevmappage 
+------------+---------+---------------+----------------
+ 0xA8109CFA |       1 |             4 |              2
+</screen>
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
+     <function>brin_revmap_data(page bytea) returns setof tid</function>
+     <indexterm>
+      <primary>brin_revmap_data</primary>
+     </indexterm>
+    </term>
+
+    <listitem>
+     <para>
+      <function>brin_revmap_data</function> returns the list of tuple
+      identifiers in a <acronym>BRIN</acronym> index range map page.
+      For example:
+<screen>
+brintest=# select * from brin_revmap_data(get_raw_page('brinidx', 2)) limit 5;
+  pages  
+---------
+ (6,137)
+ (6,138)
+ (6,139)
+ (6,140)
+ (6,141)
+</screen>
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
+     <function>brin_page_items(page bytea, index oid) returns setof record</function>
+     <indexterm>
+      <primary>brin_page_items</primary>
+     </indexterm>
+    </term>
+
+    <listitem>
+     <para>
+      <function>brin_page_items</function> returns the data stored in the
+      <acronym>BRIN</acronym> data page.  For example:
+<screen>
+brintest=# select * from brin_page_items(get_raw_page('brinidx', 5),
+brintest(#                               'brinidx')
+brintest-# order by blknum, attnum limit 6;
+ itemoffset | blknum | attnum | allnulls | hasnulls | placeholder |    value     
+------------+--------+--------+----------+----------+-------------+--------------
+        137 |      0 |      1 | t        | f        | f           | 
+        137 |      0 |      2 | f        | f        | f           | {1 .. 88}
+        138 |      4 |      1 | t        | f        | f           | 
+        138 |      4 |      2 | f        | f        | f           | {89 .. 176}
+        139 |      8 |      1 | t        | f        | f           | 
+        139 |      8 |      2 | f        | f        | f           | {177 .. 264}
+</screen>
+      The returned columns correspond to the fields in the
+      <structname>BrinMemTuple</> and <structname>BrinValues</> structs.
+      See <filename>src/include/access/brin_tuple.h</> for details.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry>
     <term>
      <function>fsm_page_contents(page bytea) returns text</function>
index 9bde1085e9b1221d76881367108ef85a82e1d511..a648a4c5f6473dada26dd990bf52800977b33b3e 100644 (file)
   &gist;
   &spgist;
   &gin;
+  &brin;
   &storage;
   &bki;
   &planstats;
index c32088f81dfc7120795b69a9afb0f8706f084d43..21721b48f0445727d10d33c8322a4e3ca522ae46 100644 (file)
@@ -8,6 +8,6 @@ subdir = src/backend/access
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-SUBDIRS            = common gin gist hash heap index nbtree rmgrdesc spgist transam
+SUBDIRS            = brin common gin gist hash heap index nbtree rmgrdesc spgist transam
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/brin/Makefile b/src/backend/access/brin/Makefile
new file mode 100644 (file)
index 0000000..ac44fcd
--- /dev/null
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/brin
+#
+# IDENTIFICATION
+#    src/backend/access/brin/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/brin
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = brin.o brin_pageops.o brin_revmap.o brin_tuple.o brin_xlog.o \
+          brin_minmax.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/brin/README b/src/backend/access/brin/README
new file mode 100644 (file)
index 0000000..2619be8
--- /dev/null
@@ -0,0 +1,189 @@
+Block Range Indexes (BRIN)
+==========================
+
+BRIN indexes intend to enable very fast scanning of extremely large tables.
+
+The essential idea of a BRIN index is to keep track of summarizing values in
+consecutive groups of heap pages (page ranges); for example, the minimum and
+maximum values for datatypes with a btree opclass, or the bounding box for
+geometric types.  These values can be used to avoid scanning such pages
+during a table scan, depending on query quals.
+
+The cost of this is having to update the stored summary values of each page
+range as tuples are inserted into them.
+
+
+Access Method Design
+--------------------
+
+Since item pointers are not stored inside indexes of this type, it is not
+possible to support the amgettuple interface.  Instead, we only provide
+amgetbitmap support.  The amgetbitmap routine returns a lossy TIDBitmap
+comprising all pages in those page ranges that match the query
+qualifications.  The recheck step in the BitmapHeapScan node prunes tuples
+that do not satisfy the query qualifications.
+
+An operator class must have the following entries:
+
+- generic support procedures (pg_amproc), identical for all opclasses:
+  * "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index
+    creation or scanning
+  * "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item,
+    and possibly changes the index tuple so that it includes the heap item
+    values
+  * "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query
+    quals, and returns whether the index tuple values match the query quals.
+  * "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first
+    one so that it represents the union of the two.
+Procedure numbers up to 10 are reserved for future expansion.
+
+Additionally, each opclass needs additional support functions:
+- Minmax-style operator classes:
+  * Proc numbers 11-14 are used for the functions implementing inequality
+    operators for the type, in this order: less than, less or equal,
+    greater or equal, greater than.
+
+Opclasses using a different design will require different additional procedure
+numbers.
+
+Operator classes also need to have operator (pg_amop) entries so that the
+optimizer can choose the index to execute queries.
+- Minmax-style operator classes:
+  * The same operators as btree (<, <=, =, >=, >)
+
+Each index tuple stores some NULL bits and some opclass-specified values.  The
+NULL bits are stored in a single null bitmask of length twice the number of
+columns.  The generic NULL bits indicate, for each column:
+  * bt_hasnulls: Whether there's any NULL value at all in the page range
+  * bt_allnulls: Whether all values are NULLs in the page range
+
+The opclass-specified values are:
+- Minmax-style operator classes
+  * minimum value across all tuples in the range
+  * maximum value across all tuples in the range
+
+Note that the addValue and union support procedures must be careful to
+datumCopy() the values they want to store in the in-memory BRIN tuple, and
+must pfree() the old copies when replacing older ones.  Since some values
+referenced from the tuple persist and others go away, there is no
+well-defined lifetime for a memory context that would make this automatic.
+
+
+The Range Map
+-------------
+
+To find the index tuple for a particular page range, we have an internal
+structure we call the range map, or "revmap" for short.  This stores one TID
+per page range, which is the address of the index tuple summarizing that
+range.  Since the map entries are fixed size, it is possible to compute the
+address of the range map entry for any given heap page by simple arithmetic.
+
+When a new heap tuple is inserted in a summarized page range, we compare the
+existing index tuple with the new heap tuple.  If the heap tuple is outside
+the summarization data given by the index tuple for any indexed column (or
+if the new heap tuple contains null values but the index tuple indicates
+there are no nulls), the index is updated with the new values.  In many
+cases it is possible to update the index tuple in-place, but if the new
+index tuple is larger than the old one and there's not enough space in the
+page, it is necessary to create a new index tuple with the new values.  The
+range map can be updated quickly to point to it; the old index tuple is
+removed.
+
+If the range map points to an invalid TID, the corresponding page range is
+considered to be not summarized.  When tuples are added to unsummarized
+pages, nothing needs to happen.
+
+To scan a table following a BRIN index, we scan the range map sequentially.
+This yields index tuples in ascending page range order.  Query quals are
+matched to each index tuple; if they match, each page within the page range
+is returned as part of the output TID bitmap.  If there's no match, they are
+skipped.  Range map entries returning invalid index TIDs, that is
+unsummarized page ranges, are also returned in the TID bitmap.
+
+The revmap is stored in the first few blocks of the index main fork,
+immediately following the metapage.  Whenever the revmap needs to be
+extended by another page, existing tuples in that page are moved to some
+other page.
+
+Heap tuples can be removed from anywhere without restriction.  It might be
+useful to mark the corresponding index tuple somehow, if the heap tuple is
+one of the constraining values of the summary data (i.e. either min or max
+in the case of a btree-opclass-bearing datatype), so that in the future we
+are aware of the need to re-execute summarization on that range, leading to
+a possible tightening of the summary values.
+
+Summarization
+-------------
+
+At index creation time, the whole table is scanned; for each page range the
+summarizing values of each indexed column and nulls bitmap are collected and
+stored in the index.  The partially-filled page range at the end of the
+table is also summarized.
+
+As new tuples get inserted at the end of the table, they may update the
+index tuple that summarizes the partial page range at the end.  Eventually
+that page range is complete and new tuples belong in a new page range that
+hasn't yet been summarized.  Those insertions do not create a new index
+entry; instead, the page range remains unsummarized until later.
+
+When VACUUM is run on the table, all unsummarized page ranges are
+summarized.  This action can also be invoked by the user via
+brin_summarize_new_values().  Both these procedures scan all the
+unsummarized ranges, and create a summary tuple.  Again, this includes the
+partially-filled page range at the end of the table.
+
+Vacuuming
+---------
+
+Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the
+index when heap tuples are removed.  It might be that some summary values can
+be tightened if heap tuples have been deleted; but this would represent an
+optimization opportunity only, not a correctness issue.  It's simpler to
+represent this as the need to re-run summarization on the affected page range
+rather than "subtracting" values from the existing one.  This is not
+currently implemented.
+
+Note that if there are no indexes on the table other than the BRIN index,
+usage of maintenance_work_mem by vacuum can be decreased significantly, because
+no detailed index scan needs to take place (and thus it's not necessary for
+vacuum to save TIDs to remove).  It's unlikely that BRIN indexes would be the
+only indexes on a table, though, because primary keys can be btrees only, and
+so we don't implement this optimization.
+
+
+Optimizer
+---------
+
+The optimizer selects the index based on the operator class' pg_amop
+entries for the column.
+
+
+Future improvements
+-------------------
+
+* Different-size page ranges?
+  In the current design, each "index entry" in a BRIN index covers the same
+  number of pages.  There's no hard reason for this; it might make sense to
+  allow the index to self-tune so that some index entries cover smaller page
+  ranges, if this allows the summary values to be more compact.  This would incur
+  larger BRIN overhead for the index itself, but might allow better pruning of
+  page ranges during scan.  In the limit of one index tuple per page, the index
+  itself would occupy too much space, even though we would be able to skip
+  reading the most heap pages, because the summary values are tight; in the
+  opposite limit of a single tuple that summarizes the whole table, we wouldn't
+  be able to prune anything even though the index is very small.  This can
+  probably be made to work by using the range map as an index in itself.
+
+* More compact representation for TIDBitmap?
+  TIDBitmap is the structure used to represent bitmap scans.  The
+  representation of lossy page ranges is not optimal for our purposes, because
+  it uses a Bitmapset to represent pages in the range; since we're going to return
+  all pages in a large range, it might be more convenient to allow for a
+  struct that uses start and end page numbers to represent the range, instead.
+
+* Better vacuuming?
+  It might be useful to enable passing more useful info to BRIN indexes during
+  vacuuming about tuples that are deleted, i.e. do not require the callback to
+  pass each tuple's TID.  For instance we might need a callback that passes a
+  block number instead of a TID.  That would help determine when to re-run
+  summarization on blocks that have seen lots of tuple deletions.
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
new file mode 100644 (file)
index 0000000..76cc36c
--- /dev/null
@@ -0,0 +1,1228 @@
+/*
+ * brin.c
+ *             Implementation of BRIN indexes for Postgres
+ *
+ * See src/backend/access/brin/README for details.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/brin/brin.c
+ *
+ * TODO
+ *             * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
+ */
+#include "postgres.h"
+
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/reloptions.h"
+#include "access/relscan.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+
+/*
+ * We use a BrinBuildState during initial construction of a BRIN index.
+ * The running state is kept in a BrinMemTuple.
+ */
+typedef struct BrinBuildState
+{
+       Relation        bs_irel;        /* presumably the index relation being built -- confirm */
+       int                     bs_numtuples;   /* presumably a count of index tuples formed so far -- confirm */
+       Buffer          bs_currentInsertBuf;    /* presumably the buffer currently targeted for inserts -- confirm */
+       BlockNumber bs_pagesPerRange;   /* number of heap pages covered by each page range */
+       BlockNumber bs_currRangeStart;  /* presumably first heap block of the range being summarized -- confirm */
+       BrinRevmap *bs_rmAccess;        /* range map (revmap) access handle */
+       BrinDesc   *bs_bdesc;   /* BRIN descriptor for the index */
+       BrinMemTuple *bs_dtuple;        /* in-memory tuple holding the running summary state */
+} BrinBuildState;
+
+/*
+ * Struct used as "opaque" during index scans
+ */
+typedef struct BrinOpaque
+{
+       BlockNumber bo_pagesPerRange;   /* number of heap pages covered by each page range */
+       BrinRevmap *bo_rmAccess;        /* range map (revmap) access handle */
+       BrinDesc   *bo_bdesc;   /* BRIN descriptor for the index */
+} BrinOpaque;
+
+PG_FUNCTION_INFO_V1(brin_summarize_new_values);
+
+static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
+                                                  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static void terminate_brin_buildstate(BrinBuildState *state);
+static void brinsummarize(Relation index, Relation heapRel,
+                         double *numSummarized, double *numExisting);
+static void form_and_insert_tuple(BrinBuildState *state);
+static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
+                        BrinTuple *b);
+
+
+/*
+ * A tuple in the heap is being inserted.  To keep a brin index up to date,
+ * we need to obtain the relevant index tuple and compare its stored values
+ * with those of the new tuple.  If the tuple values are not consistent with
+ * the summary tuple, we need to update the index tuple.
+ *
+ * If the range is not currently summarized (i.e. the revmap returns NULL for
+ * it), there's nothing to do.
+ */
+Datum
+brininsert(PG_FUNCTION_ARGS)
+{
+       Relation        idxRel = (Relation) PG_GETARG_POINTER(0);
+       Datum      *values = (Datum *) PG_GETARG_POINTER(1);
+       bool       *nulls = (bool *) PG_GETARG_POINTER(2);
+       ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3);
+
+       /* we ignore the rest of our arguments */
+       BlockNumber pagesPerRange;
+       BrinDesc   *bdesc = NULL;
+       BrinRevmap *revmap;
+       Buffer          buf = InvalidBuffer;
+       MemoryContext tupcxt = NULL;
+       MemoryContext oldcxt = NULL;
+
+       revmap = brinRevmapInitialize(idxRel, &pagesPerRange);
+
+       for (;;)
+       {
+               bool            need_insert = false;
+               OffsetNumber off;
+               BrinTuple  *brtup;
+               BrinMemTuple *dtup;
+               BlockNumber heapBlk;
+               int                     keyno;
+               BrinTuple  *tmptup PG_USED_FOR_ASSERTS_ONLY;
+               BrinMemTuple *tmpdtup PG_USED_FOR_ASSERTS_ONLY;
+               Size tmpsiz PG_USED_FOR_ASSERTS_ONLY;
+
+               CHECK_FOR_INTERRUPTS();
+
+               heapBlk = ItemPointerGetBlockNumber(heaptid);
+               /* normalize the block number to be the first block in the range */
+               heapBlk = (heapBlk / pagesPerRange) * pagesPerRange;
+               brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
+                                                                                BUFFER_LOCK_SHARE);
+
+               /* if range is unsummarized, there's nothing to do */
+               if (!brtup)
+                       break;
+
+               /* First time through? */
+               if (bdesc == NULL)
+               {
+                       bdesc = brin_build_desc(idxRel);
+                       tupcxt = AllocSetContextCreate(CurrentMemoryContext,
+                                                                                  "brininsert cxt",
+                                                                                  ALLOCSET_DEFAULT_MINSIZE,
+                                                                                  ALLOCSET_DEFAULT_INITSIZE,
+                                                                                  ALLOCSET_DEFAULT_MAXSIZE);
+                       oldcxt = MemoryContextSwitchTo(tupcxt);
+               }
+
+               dtup = brin_deform_tuple(bdesc, brtup);
+
+#ifdef USE_ASSERT_CHECKING
+               {
+                       /*
+                        * When assertions are enabled, we use this as an opportunity to
+                        * test the "union" method, which would otherwise be used very
+                        * rarely: first create a placeholder tuple, and addValue the
+                        * value we just got into it.  Then union the existing index tuple
+                        * with the updated placeholder tuple.  The tuple resulting from
+                        * that union should be identical to the one resulting from the
+                        * regular operation (straight addValue) below.
+                        *
+                        * Here we create the tuple to compare with; the actual comparison
+                        * is below.
+                        */
+                       tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz);
+                       tmpdtup = brin_deform_tuple(bdesc, tmptup);
+                       for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+                       {
+                               BrinValues *bval;
+                               FmgrInfo   *addValue;
+
+                               bval = &tmpdtup->bt_columns[keyno];
+                               addValue = index_getprocinfo(idxRel, keyno + 1,
+                                                                                        BRIN_PROCNUM_ADDVALUE);
+                               FunctionCall4Coll(addValue,
+                                                                 idxRel->rd_indcollation[keyno],
+                                                                 PointerGetDatum(bdesc),
+                                                                 PointerGetDatum(bval),
+                                                                 values[keyno],
+                                                                 nulls[keyno]);
+                       }
+
+                       union_tuples(bdesc, tmpdtup, brtup);
+
+                       tmpdtup->bt_placeholder = dtup->bt_placeholder;
+                       tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz);
+               }
+#endif
+
+               /*
+                * Compare the key values of the new tuple to the stored index values;
+                * our deformed tuple will get updated if the new tuple doesn't fit
+                * the original range (note this means we can't break out of the loop
+                * early). Make a note of whether this happens, so that we know to
+                * insert the modified tuple later.
+                */
+               for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+               {
+                       Datum           result;
+                       BrinValues *bval;
+                       FmgrInfo   *addValue;
+
+                       bval = &dtup->bt_columns[keyno];
+                       addValue = index_getprocinfo(idxRel, keyno + 1,
+                                                                                BRIN_PROCNUM_ADDVALUE);
+                       result = FunctionCall4Coll(addValue,
+                                                                          idxRel->rd_indcollation[keyno],
+                                                                          PointerGetDatum(bdesc),
+                                                                          PointerGetDatum(bval),
+                                                                          values[keyno],
+                                                                          nulls[keyno]);
+                       /* if that returned true, we need to insert the updated tuple */
+                       need_insert |= DatumGetBool(result);
+               }
+
+#ifdef USE_ASSERT_CHECKING
+               {
+                       /*
+                        * Now we can compare the tuple produced by the union function
+                        * with the one from plain addValue.
+                        */
+                       BrinTuple  *cmptup;
+                       Size            cmpsz;
+
+                       cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz);
+                       Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz));
+               }
+#endif
+
+               if (!need_insert)
+               {
+                       /*
+                        * The tuple is consistent with the new values, so there's nothing
+                        * to do.
+                        */
+                       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+               }
+               else
+               {
+                       Page            page = BufferGetPage(buf);
+                       ItemId          lp = PageGetItemId(page, off);
+                       Size            origsz;
+                       BrinTuple  *origtup;
+                       Size            newsz;
+                       BrinTuple  *newtup;
+                       bool            samepage;
+
+                       /*
+                        * Make a copy of the old tuple, so that we can compare it after
+                        * re-acquiring the lock.
+                        */
+                       origsz = ItemIdGetLength(lp);
+                       origtup = brin_copy_tuple(brtup, origsz);
+
+                       /*
+                        * Before releasing the lock, check if we can attempt a same-page
+                        * update.  Another process could insert a tuple concurrently in
+                        * the same page though, so downstream we must be prepared to cope
+                        * if this turns out to not be possible after all.
+                        */
+                       samepage = brin_can_do_samepage_update(buf, origsz, newsz);
+
+                       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+                       newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
+
+                       /*
+                        * Try to update the tuple.  If this doesn't work for whatever
+                        * reason, we need to restart from the top; the revmap might be
+                        * pointing at a different tuple for this block now, so we need to
+                        * recompute to ensure both our new heap tuple and the other
+                        * inserter's are covered by the combined tuple.  It might be that
+                        * we don't need to update at all.
+                        */
+                       if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
+                                                          buf, off, origtup, origsz, newtup, newsz,
+                                                          samepage))
+                       {
+                               /* no luck; start over */
+                               MemoryContextResetAndDeleteChildren(tupcxt);
+                               continue;
+                       }
+               }
+
+               /* success! */
+               break;
+       }
+
+       brinRevmapTerminate(revmap);
+       if (BufferIsValid(buf))
+               ReleaseBuffer(buf);
+       if (bdesc != NULL)
+       {
+               brin_free_desc(bdesc);
+               MemoryContextSwitchTo(oldcxt);
+               MemoryContextDelete(tupcxt);
+       }
+
+       return BoolGetDatum(false);
+}
+
+/*
+ * Set up the state needed for a BRIN index scan.
+ *
+ * The metapage is read here to learn the pages-per-range value the index was
+ * built with.  That value cannot change while we hold a lock on the index, so
+ * brinrescan has no need to fetch it again.
+ *
+ * Arguments are the index relation, the number of scan keys and the number
+ * of order-by keys; the result is the new IndexScanDesc, whose opaque field
+ * points to a freshly allocated BrinOpaque.
+ */
+Datum
+brinbeginscan(PG_FUNCTION_ARGS)
+{
+       Relation        idxRel = (Relation) PG_GETARG_POINTER(0);
+       int                     nkeys = PG_GETARG_INT32(1);
+       int                     norderbys = PG_GETARG_INT32(2);
+       IndexScanDesc scan = RelationGetIndexScan(idxRel, nkeys, norderbys);
+       BrinOpaque *so = (BrinOpaque *) palloc(sizeof(BrinOpaque));
+
+       /* revmap access object; this also reports the pages-per-range value */
+       so->bo_rmAccess = brinRevmapInitialize(idxRel, &so->bo_pagesPerRange);
+
+       /* descriptor holding the per-column opclass information */
+       so->bo_bdesc = brin_build_desc(idxRel);
+
+       scan->opaque = so;
+
+       PG_RETURN_POINTER(scan);
+}
+
+/*
+ * Execute the index scan.
+ *
+ * This works by reading index TIDs from the revmap, and obtaining the index
+ * tuples pointed to by them; the summary values in the index tuples are
+ * compared to the scan keys.  We return into the TID bitmap all the pages in
+ * ranges corresponding to index tuples that match the scan keys.
+ *
+ * If a TID from the revmap is read as InvalidTID, we know that range is
+ * unsummarized.  Pages in those ranges need to be returned regardless of scan
+ * keys.
+ *
+ * Arguments are the IndexScanDesc and the TIDBitmap to fill.  The result is
+ * an int64 estimate of the number of matching heap tuples, computed as ten
+ * times the number of pages added to the bitmap.
+ *
+ * XXX see _bt_first on what to do about sk_subtype.
+ */
+Datum
+bringetbitmap(PG_FUNCTION_ARGS)
+{
+       IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+       TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
+       Relation        idxRel = scan->indexRelation;
+       Buffer          buf = InvalidBuffer;
+       BrinDesc   *bdesc;
+       Oid                     heapOid;
+       Relation        heapRel;
+       BrinOpaque *opaque;
+       BlockNumber nblocks;
+       BlockNumber heapBlk;
+       int64           totalpages = 0; /* 64 bits, so "* 10" below can't overflow */
+       int                     keyno;
+       FmgrInfo   *consistentFn;
+       MemoryContext oldcxt;
+       MemoryContext perRangeCxt;
+
+       opaque = (BrinOpaque *) scan->opaque;
+       bdesc = opaque->bo_bdesc;
+       pgstat_count_index_scan(idxRel);
+
+       /*
+        * We need to know the size of the table so that we know how long to
+        * iterate on the revmap.
+        */
+       heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
+       heapRel = heap_open(heapOid, AccessShareLock);
+       nblocks = RelationGetNumberOfBlocks(heapRel);
+       heap_close(heapRel, AccessShareLock);
+
+       /*
+        * Obtain consistent functions for all indexed columns.  Maybe it'd be
+        * possible to do this lazily only the first time we see a scan key that
+        * involves each particular attribute.
+        */
+       consistentFn = palloc(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts);
+       for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+       {
+               FmgrInfo   *tmp;
+
+               tmp = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_CONSISTENT);
+               fmgr_info_copy(&consistentFn[keyno], tmp, CurrentMemoryContext);
+       }
+
+       /*
+        * Setup and use a per-range memory context, which is reset every time we
+        * loop below.  This avoids having to free the tuples within the loop.
+        */
+       perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
+                                                                               "bringetbitmap cxt",
+                                                                               ALLOCSET_DEFAULT_MINSIZE,
+                                                                               ALLOCSET_DEFAULT_INITSIZE,
+                                                                               ALLOCSET_DEFAULT_MAXSIZE);
+       oldcxt = MemoryContextSwitchTo(perRangeCxt);
+
+       /*
+        * Now scan the revmap.  We start by querying for heap page 0,
+        * incrementing by the number of pages per range; this gives us a full
+        * view of the table.
+        */
+       for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
+       {
+               bool            addrange;
+               BrinTuple  *tup;
+               OffsetNumber off;
+               Size            size;
+
+               CHECK_FOR_INTERRUPTS();
+
+               MemoryContextResetAndDeleteChildren(perRangeCxt);
+
+               tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
+                                                                          &off, &size, BUFFER_LOCK_SHARE);
+               if (tup)
+               {
+                       /* work on a private copy so the buffer lock can be dropped */
+                       tup = brin_copy_tuple(tup, size);
+                       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+               }
+
+               /*
+                * For page ranges with no indexed tuple, we must return the whole
+                * range; otherwise, compare it to the scan keys.
+                */
+               if (tup == NULL)
+               {
+                       addrange = true;
+               }
+               else
+               {
+                       BrinMemTuple *dtup;
+
+                       dtup = brin_deform_tuple(bdesc, tup);
+                       if (dtup->bt_placeholder)
+                       {
+                               /*
+                                * Placeholder tuples are always returned, regardless of the
+                                * values stored in them.
+                                */
+                               addrange = true;
+                       }
+                       else
+                       {
+                               /*
+                                * Compare scan keys with summary values stored for the range.
+                                * If scan keys are matched, the page range must be added to
+                                * the bitmap.  We initially assume the range needs to be
+                                * added; in particular this serves the case where there are
+                                * no keys.
+                                */
+                               addrange = true;
+                               for (keyno = 0; keyno < scan->numberOfKeys; keyno++)
+                               {
+                                       ScanKey         key = &scan->keyData[keyno];
+                                       AttrNumber      keyattno = key->sk_attno;
+                                       BrinValues *bval = &dtup->bt_columns[keyattno - 1];
+                                       Datum           add;
+
+                                       /*
+                                        * The collation of the scan key must match the collation
+                                        * used in the index column (but only if the search is not
+                                        * IS NULL/ IS NOT NULL).  Otherwise we shouldn't be using
+                                        * this index ...
+                                        */
+                                       Assert((key->sk_flags & SK_ISNULL) ||
+                                                  (key->sk_collation ==
+                                          bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation));
+
+                                       /*
+                                        * Check whether the scan key is consistent with the page
+                                        * range values; if so, have the pages in the range added
+                                        * to the output bitmap.
+                                        *
+                                        * When there are multiple scan keys, failure to meet the
+                                        * criteria for a single one of them is enough to discard
+                                        * the range as a whole, so break out of the loop as soon
+                                        * as a false return value is obtained.
+                                        */
+                                       add = FunctionCall3Coll(&consistentFn[keyattno - 1],
+                                                                                       key->sk_collation,
+                                                                                       PointerGetDatum(bdesc),
+                                                                                       PointerGetDatum(bval),
+                                                                                       PointerGetDatum(key));
+                                       addrange = DatumGetBool(add);
+                                       if (!addrange)
+                                               break;
+                               }
+                       }
+               }
+
+               /* add the pages in the range to the output bitmap, if needed */
+               if (addrange)
+               {
+                       BlockNumber pageno;
+                       BlockNumber lastpage;
+
+                       /*
+                        * The final range of the table may be only partially populated;
+                        * never report pages past the end of the heap.  heapBlk < nblocks
+                        * holds here, so the subtraction cannot underflow, and in the
+                        * else branch the addition cannot overflow.
+                        */
+                       if (nblocks - heapBlk < opaque->bo_pagesPerRange)
+                               lastpage = nblocks - 1;
+                       else
+                               lastpage = heapBlk + opaque->bo_pagesPerRange - 1;
+
+                       for (pageno = heapBlk; pageno <= lastpage; pageno++)
+                       {
+                               /* the bitmap must not be touched in the per-range context */
+                               MemoryContextSwitchTo(oldcxt);
+                               tbm_add_page(tbm, pageno);
+                               totalpages++;
+                               MemoryContextSwitchTo(perRangeCxt);
+                       }
+               }
+       }
+
+       MemoryContextSwitchTo(oldcxt);
+       MemoryContextDelete(perRangeCxt);
+
+       if (buf != InvalidBuffer)
+               ReleaseBuffer(buf);
+
+       /*
+        * XXX We have an approximation of the number of *pages* that our scan
+        * returns, but we don't have a precise idea of the number of heap tuples
+        * involved.
+        */
+       PG_RETURN_INT64(totalpages * 10);
+}
+
+/*
+ * Restart a BRIN index scan, installing the scan keys passed by the caller.
+ */
+Datum
+brinrescan(PG_FUNCTION_ARGS)
+{
+       IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+       ScanKey         newkeys = (ScanKey) PG_GETARG_POINTER(1);
+
+       /* the remaining arguments are not looked at */
+
+       /* nothing to copy when no keys were passed or the scan has none */
+       if (newkeys == NULL || scan->numberOfKeys <= 0)
+               PG_RETURN_VOID();
+
+       memmove(scan->keyData, newkeys,
+                       scan->numberOfKeys * sizeof(ScanKeyData));
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Finish a BRIN index scan, releasing everything brinbeginscan set up.
+ */
+Datum
+brinendscan(PG_FUNCTION_ARGS)
+{
+       IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+       BrinOpaque *so = (BrinOpaque *) scan->opaque;
+
+       /* revmap access object first, then the descriptor, then the struct */
+       brinRevmapTerminate(so->bo_rmAccess);
+       brin_free_desc(so->bo_bdesc);
+       pfree(so);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Mark the current scan position: not supported by BRIN, so this always
+ * reports an error.
+ */
+Datum
+brinmarkpos(PG_FUNCTION_ARGS)
+{
+       elog(ERROR, "BRIN does not support mark/restore");
+       /* presumably never reached after elog(ERROR); gives the function a return */
+       PG_RETURN_VOID();
+}
+
+/*
+ * Restore a previously marked scan position: not supported by BRIN, so this
+ * always reports an error.
+ */
+Datum
+brinrestrpos(PG_FUNCTION_ARGS)
+{
+       elog(ERROR, "BRIN does not support mark/restore");
+       /* presumably never reached after elog(ERROR); gives the function a return */
+       PG_RETURN_VOID();
+}
+
+/*
+ * Callback invoked once per heap tuple by IndexBuildHeapScan.
+ *
+ * The tuple's values are folded into the summary being accumulated in the
+ * build state.  Whenever the tuple lives in a block beyond the current page
+ * range, the accumulated summary is inserted into the index first and the
+ * state is reset for the following range.
+ *
+ * The range at the very end of the table is not handled here: after the last
+ * call its summary is still sitting in the build state, not in the index.
+ * The caller is responsible for inserting it, if appropriate.
+ */
+static void
+brinbuildCallback(Relation index,
+                                 HeapTuple htup,
+                                 Datum *values,
+                                 bool *isnull,
+                                 bool tupleIsAlive,
+                                 void *brstate)
+{
+       BrinBuildState *state = (BrinBuildState *) brstate;
+       BlockNumber thisblock = ItemPointerGetBlockNumber(&htup->t_self);
+       int                     attno;
+
+       /*
+        * Close out every range that ends before this tuple's block.  The heap
+        * scan may have skipped many pages that held no live tuples, so more
+        * than one range may need an index tuple here.
+        */
+       while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
+       {
+               BRIN_elog(DEBUG2, "brinbuildCallback: completed a range: %u--%u",
+                                 state->bs_currRangeStart,
+                                 state->bs_currRangeStart + state->bs_pagesPerRange);
+
+               /* turn the accumulated summary into an index tuple and store it */
+               form_and_insert_tuple(state);
+
+               /* advance to the next range ... */
+               state->bs_currRangeStart += state->bs_pagesPerRange;
+
+               /* ... and start its summary from scratch */
+               brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+       }
+
+       /* Fold each indexed column of this tuple into the running summary */
+       for (attno = 0; attno < state->bs_bdesc->bd_tupdesc->natts; attno++)
+       {
+               BrinValues *col = &state->bs_dtuple->bt_columns[attno];
+               FmgrInfo   *addValue = index_getprocinfo(index, attno + 1,
+                                                                                                BRIN_PROCNUM_ADDVALUE);
+
+               /* the opclass function updates the column summary as necessary */
+               FunctionCall4Coll(addValue,
+                                                 state->bs_bdesc->bd_tupdesc->attrs[attno]->attcollation,
+                                                 PointerGetDatum(state->bs_bdesc),
+                                                 PointerGetDatum(col),
+                                                 values[attno], isnull[attno]);
+       }
+}
+
+/*
+ * brinbuild() -- build a new BRIN index.
+ *
+ * Creates the metapage (WAL-logging it when the relation needs WAL), scans
+ * the heap in physical order feeding every tuple to brinbuildCallback, and
+ * finally inserts the index tuple for the last page range.  The result is a
+ * palloc'd IndexBuildResult carrying the heap and index tuple counts.
+ */
+Datum
+brinbuild(PG_FUNCTION_ARGS)
+{
+       Relation        heap = (Relation) PG_GETARG_POINTER(0);
+       Relation        index = (Relation) PG_GETARG_POINTER(1);
+       IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+       IndexBuildResult *result;
+       BrinBuildState *state;
+       BrinRevmap *revmap;
+       BlockNumber pagesPerRange;
+       Buffer          meta;
+       double          reltuples;
+       double          idxtuples;
+
+       /*
+        * A given index relation is only ever built once, so it must be empty.
+        */
+       if (RelationGetNumberOfBlocks(index) != 0)
+               elog(ERROR, "index \"%s\" already contains data",
+                        RelationGetRelationName(index));
+
+       /*
+        * No critical section is needed for the metapage: should anything fail,
+        * the creation of the whole relation is rolled back.
+        */
+       meta = ReadBuffer(index, P_NEW);
+       Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
+       LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
+
+       brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
+                                          BRIN_CURRENT_VERSION);
+       MarkBufferDirty(meta);
+
+       if (RelationNeedsWAL(index))
+       {
+               xl_brin_createidx xlrec;
+               XLogRecData rdata;
+               XLogRecPtr      recptr;
+               Page            page;
+
+               /* payload of the index-creation record */
+               xlrec.node = index->rd_node;
+               xlrec.version = BRIN_CURRENT_VERSION;
+               xlrec.pagesPerRange = BrinGetPagesPerRange(index);
+
+               /* a single data chunk, not attached to any buffer */
+               rdata.data = (char *) &xlrec;
+               rdata.len = SizeOfBrinCreateIdx;
+               rdata.buffer = InvalidBuffer;
+               rdata.next = NULL;
+
+               recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX, &rdata);
+
+               /* stamp the metapage with the LSN of the record just inserted */
+               page = BufferGetPage(meta);
+               PageSetLSN(page, recptr);
+       }
+
+       UnlockReleaseBuffer(meta);
+
+       /*
+        * Set up the build state; this includes the deformed tuple in which the
+        * summary of the current range is accumulated.
+        */
+       revmap = brinRevmapInitialize(index, &pagesPerRange);
+       state = initialize_brin_buildstate(index, revmap, pagesPerRange);
+
+       /*
+        * Scan the heap.  Synchronized scans are disallowed because the heap
+        * blocks must be visited in physical order.
+        */
+       reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
+                                                                  brinbuildCallback, (void *) state);
+
+       /* the callback leaves the last range pending; insert it now */
+       form_and_insert_tuple(state);
+
+       /* remember the tuple count, then let go of the build resources */
+       idxtuples = state->bs_numtuples;
+       brinRevmapTerminate(state->bs_rmAccess);
+       terminate_brin_buildstate(state);
+
+       /*
+        * Hand the statistics back to the caller.
+        */
+       result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+       result->heap_tuples = reltuples;
+       result->index_tuples = idxtuples;
+
+       PG_RETURN_POINTER(result);
+}
+
+/*
+ * Build an empty BRIN index in the init fork of the given index relation.
+ */
+Datum
+brinbuildempty(PG_FUNCTION_ARGS)
+{
+       Relation        index = (Relation) PG_GETARG_POINTER(0);
+       Buffer          initmeta;
+
+       /* The metapage is all an empty BRIN index consists of. */
+       initmeta = ReadBufferExtended(index, INIT_FORKNUM, P_NEW,
+                                                                 RBM_NORMAL, NULL);
+       LockBuffer(initmeta, BUFFER_LOCK_EXCLUSIVE);
+
+       /* Fill in the metapage and WAL-log it, all in one critical section. */
+       START_CRIT_SECTION();
+       brin_metapage_init(BufferGetPage(initmeta), BrinGetPagesPerRange(index),
+                                          BRIN_CURRENT_VERSION);
+       MarkBufferDirty(initmeta);
+       log_newpage_buffer(initmeta, false);
+       END_CRIT_SECTION();
+
+       UnlockReleaseBuffer(initmeta);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * brinbulkdelete
+ *             BRIN indexes hold no per-heap-tuple index tuples, so there is
+ *             not much to be done here beyond returning a stats struct.
+ *
+ * XXX we could mark index tuples as "dirty" (when a minimum or maximum heap
+ * tuple is deleted), meaning that summarization needs to be re-run on the
+ * affected range.  We would need an extra flag in mmtuples for that.
+ */
+Datum
+brinbulkdelete(PG_FUNCTION_ARGS)
+{
+       /* the other arguments are currently unused */
+       IndexBulkDeleteResult *result =
+               (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+
+       /* re-use the caller's struct if there is one, else make a zeroed one */
+       if (!result)
+               result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+       PG_RETURN_POINTER(result);
+}
+
+/*
+ * "Vacuum" a BRIN index.  All this amounts to is summarizing the page ranges
+ * that are not summarized yet.
+ *
+ * Returns the stats struct, allocating a zeroed one if the caller passed
+ * none.  In ANALYZE-only mode the incoming pointer is returned untouched.
+ */
+Datum
+brinvacuumcleanup(PG_FUNCTION_ARGS)
+{
+       IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+       IndexBulkDeleteResult *stats =
+               (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+       Oid                     heapOid;
+       Relation        heapRel;
+
+       /* Nothing to do in ANALYZE ONLY mode */
+       if (info->analyze_only)
+               PG_RETURN_POINTER(stats);
+
+       if (stats == NULL)
+               stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+       /* every other field of the stats keeps its zero-initialized value */
+       stats->num_pages = RelationGetNumberOfBlocks(info->index);
+
+       heapOid = IndexGetRelation(RelationGetRelid(info->index), false);
+       heapRel = heap_open(heapOid, AccessShareLock);
+
+       /*
+        * NOTE(review): the same counter is passed for both output arguments,
+        * presumably so that both counts accumulate into num_index_tuples;
+        * confirm against brinsummarize.
+        */
+       brinsummarize(info->index, heapRel,
+                                 &stats->num_index_tuples, &stats->num_index_tuples);
+
+       heap_close(heapRel, AccessShareLock);
+
+       PG_RETURN_POINTER(stats);
+}
+
+/*
+ * Parse the reloptions of a BRIN index.
+ *
+ * Takes the raw reloptions datum and a flag saying whether to validate them.
+ * Returns NULL when no option is set, otherwise a BrinOptions struct filled
+ * in from the parsed values.
+ */
+Datum
+brinoptions(PG_FUNCTION_ARGS)
+{
+       /* maps each known option name to its slot in BrinOptions */
+       static const relopt_parse_elt brin_relopt_tab[] = {
+               {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}
+       };
+       Datum           reloptions = PG_GETARG_DATUM(0);
+       bool            validate = PG_GETARG_BOOL(1);
+       int                     nparsed;
+       relopt_value *parsed;
+       BrinOptions *result;
+
+       parsed = parseRelOptions(reloptions, validate, RELOPT_KIND_BRIN,
+                                                        &nparsed);
+
+       /* with no options set there is nothing to build */
+       if (nparsed == 0)
+               PG_RETURN_NULL();
+
+       result = allocateReloptStruct(sizeof(BrinOptions), parsed, nparsed);
+
+       fillRelOptions((void *) result, sizeof(BrinOptions), parsed, nparsed,
+                                  validate, brin_relopt_tab, lengthof(brin_relopt_tab));
+
+       pfree(parsed);
+
+       PG_RETURN_BYTEA_P(result);
+}
+
+/*
+ * SQL-callable function: walk the given index and summarize every page range
+ * that has no summary yet.  Returns the number of ranges summarized, as an
+ * int32.
+ */
+Datum
+brin_summarize_new_values(PG_FUNCTION_ARGS)
+{
+       Oid                     indexoid = PG_GETARG_OID(0);
+       Oid                     heapoid;
+       Relation        heapRel;
+       Relation        indexRel;
+       double          numSummarized = 0;
+
+       /* open the heap first, then the index, both with the same lock level */
+       heapoid = IndexGetRelation(indexoid, false);
+       heapRel = heap_open(heapoid, ShareUpdateExclusiveLock);
+       indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
+
+       /* only the count of newly summarized ranges is of interest here */
+       brinsummarize(indexRel, heapRel, &numSummarized, NULL);
+
+       /* close in the reverse order of opening */
+       relation_close(indexRel, ShareUpdateExclusiveLock);
+       relation_close(heapRel, ShareUpdateExclusiveLock);
+
+       PG_RETURN_INT32((int32) numSummarized);
+}
+
+/*
+ * Build a BrinDesc used to create or scan a BRIN index
+ *
+ * The descriptor and everything hanging off it are allocated in a dedicated
+ * memory context created as a child of CurrentMemoryContext; that context is
+ * recorded in bd_context so that brin_free_desc can release it all at once.
+ *
+ * rel is the index relation.  Its tuple descriptor is referenced by the
+ * result, not copied.
+ */
+BrinDesc *
+brin_build_desc(Relation rel)
+{
+       BrinOpcInfo **opcinfo;
+       BrinDesc   *bdesc;
+       TupleDesc       tupdesc;
+       int                     totalstored = 0;
+       int                     keyno;
+       long            totalsize;
+       MemoryContext cxt;
+       MemoryContext oldcxt;
+
+       /*
+        * The size arguments of AllocSetContextCreate are, in order, the minimum
+        * context size, the initial block size and the maximum block size.
+        */
+       cxt = AllocSetContextCreate(CurrentMemoryContext,
+                                                               "brin desc cxt",
+                                                               ALLOCSET_SMALL_MINSIZE,
+                                                               ALLOCSET_SMALL_INITSIZE,
+                                                               ALLOCSET_SMALL_MAXSIZE);
+       oldcxt = MemoryContextSwitchTo(cxt);
+       tupdesc = RelationGetDescr(rel);
+
+       /*
+        * Obtain BrinOpcInfo for each indexed column.  While at it, accumulate
+        * the number of columns stored, since the number is opclass-defined.
+        */
+       opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts);
+       for (keyno = 0; keyno < tupdesc->natts; keyno++)
+       {
+               FmgrInfo   *opcInfoFn;
+
+               opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
+
+               /* the opclass's opcinfo function is called with the column's type */
+               opcinfo[keyno] = (BrinOpcInfo *)
+                       DatumGetPointer(FunctionCall1(opcInfoFn,
+                                                                                 tupdesc->attrs[keyno]->atttypid));
+               totalstored += opcinfo[keyno]->oi_nstored;
+       }
+
+       /* Allocate our result struct and fill it in */
+       totalsize = offsetof(BrinDesc, bd_info) +
+               sizeof(BrinOpcInfo *) * tupdesc->natts;
+
+       bdesc = palloc(totalsize);
+       bdesc->bd_context = cxt;
+       bdesc->bd_index = rel;
+       bdesc->bd_tupdesc = tupdesc;
+       bdesc->bd_disktdesc = NULL; /* generated lazily */
+       bdesc->bd_totalstored = totalstored;
+
+       for (keyno = 0; keyno < tupdesc->natts; keyno++)
+               bdesc->bd_info[keyno] = opcinfo[keyno];
+
+       /* the scratch array is no longer needed; its entries live on in bd_info */
+       pfree(opcinfo);
+
+       MemoryContextSwitchTo(oldcxt);
+
+       return bdesc;
+}
+
+/*
+ * Release a BrinDesc by deleting the memory context recorded in it.
+ */
+void
+brin_free_desc(BrinDesc *bdesc)
+{
+       /* the tuple descriptor must still hold at least one reference */
+       Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
+
+       /* dropping the context makes individual pfree calls unnecessary */
+       MemoryContextDelete(bdesc->bd_context);
+}
+
+/*
+ * Create a BrinBuildState suitable for building tuples for the given index.
+ *
+ * The state starts at heap block 0 with no tuples counted and no insertion
+ * buffer; its in-memory summary tuple is allocated and initialized.
+ */
+static BrinBuildState *
+initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
+                                                  BlockNumber pagesPerRange)
+{
+       BrinBuildState *bstate = palloc(sizeof(BrinBuildState));
+
+       /* what we are building, and how */
+       bstate->bs_irel = idxRel;
+       bstate->bs_rmAccess = revmap;
+       bstate->bs_pagesPerRange = pagesPerRange;
+
+       /* progress so far: nothing */
+       bstate->bs_numtuples = 0;
+       bstate->bs_currRangeStart = 0;
+       bstate->bs_currentInsertBuf = InvalidBuffer;
+
+       /* descriptor, plus the in-memory tuple the summary is accumulated in */
+       bstate->bs_bdesc = brin_build_desc(idxRel);
+       bstate->bs_dtuple = brin_new_memtuple(bstate->bs_bdesc);
+       brin_memtuple_initialize(bstate->bs_dtuple, bstate->bs_bdesc);
+
+       return bstate;
+}
+
+/*
+ * Free a BrinBuildState and the resources it holds.
+ *
+ * If an insertion buffer is still held, its page's free space is recorded
+ * before the buffer is released.
+ */
+static void
+terminate_brin_buildstate(BrinBuildState *state)
+{
+       Buffer          lastbuf = state->bs_currentInsertBuf;
+
+       /* let go of the last index buffer that was used, if any */
+       if (!BufferIsInvalid(lastbuf))
+       {
+               Page            page = BufferGetPage(lastbuf);
+
+               RecordPageWithFreeSpace(state->bs_irel,
+                                                               BufferGetBlockNumber(lastbuf),
+                                                               PageGetFreeSpace(page));
+               ReleaseBuffer(lastbuf);
+       }
+
+       brin_free_desc(state->bs_bdesc);
+       pfree(state->bs_dtuple);
+       pfree(state);
+}
+
+/*
+ * Summarize the given page range of the given index.
+ *
+ * indexInfo is handed to the heap scan; state supplies the index relation,
+ * the revmap access object, the BRIN descriptor and the in-memory summary
+ * tuple (bs_dtuple) from which the on-disk summary is formed; heapRel is the
+ * table to scan; heapBlk is the first heap block of the range to summarize.
+ *
+ * This routine can run in parallel with insertions into the heap.  To avoid
+ * missing those values from the summary tuple, we first insert a placeholder
+ * index tuple into the index, then execute the heap scan; transactions
+ * concurrent with the scan update the placeholder tuple.  After the scan, we
+ * union the placeholder tuple with the one computed by this routine.  The
+ * update of the index value happens in a loop, so that if somebody updates
+ * the placeholder tuple after we read it, we detect the case and try again.
+ * This ensures that the concurrently inserted tuples are not lost.
+ *
+ * This routine does not reset state->bs_dtuple; a caller that summarizes
+ * several ranges with the same state must reinitialize it between calls.
+ */
+static void
+summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
+                               BlockNumber heapBlk)
+{
+       Buffer          phbuf;
+       BrinTuple  *phtup;
+       Size            phsz;
+       OffsetNumber offset;
+
+       /*
+        * Insert the placeholder tuple
+        */
+       phbuf = InvalidBuffer;
+       phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
+       offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
+                                                  state->bs_rmAccess, &phbuf,
+                                                  heapBlk, phtup, phsz);
+
+       /*
+        * Execute the partial heap scan covering the heap blocks in the specified
+        * page range, summarizing the heap tuples in it.  This scan stops just
+        * short of brinbuildCallback creating the new index entry.
+        */
+       state->bs_currRangeStart = heapBlk;
+       IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false,
+                                                       heapBlk, state->bs_pagesPerRange,
+                                                       brinbuildCallback, (void *) state);
+
+       /*
+        * Now we update the values obtained by the scan with the placeholder
+        * tuple.  We do this in a loop which only terminates if we're able to
+        * update the placeholder tuple successfully; if we are not, this means
+        * somebody else modified the placeholder tuple after we read it.
+        */
+       for (;;)
+       {
+               BrinTuple  *newtup;
+               Size            newsize;
+               bool            didupdate;
+               bool            samepage;
+
+               CHECK_FOR_INTERRUPTS();
+
+               /*
+                * Form the on-disk summary tuple from the values accumulated so far
+                * and try to replace the placeholder (as we last saw it) with it.
+                * Both tuples are freed afterwards whether or not the update worked;
+                * on failure a fresh copy of the placeholder is fetched below.
+                */
+               newtup = brin_form_tuple(state->bs_bdesc,
+                                                                heapBlk, state->bs_dtuple, &newsize);
+               samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
+               didupdate =
+                       brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
+                                                 state->bs_rmAccess, heapBlk, phbuf, offset,
+                                                 phtup, phsz, newtup, newsize, samepage);
+               brin_free_tuple(phtup);
+               brin_free_tuple(newtup);
+
+               /* If the update succeeded, we're done. */
+               if (didupdate)
+                       break;
+
+               /*
+                * If the update didn't work, it might be because somebody updated the
+                * placeholder tuple concurrently.  Extract the new version, union it
+                * with the values we have from the scan, and start over.  (There are
+                * other reasons for the update to fail, but it's simple to treat them
+                * the same.)
+                */
+               phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
+                                                                                &offset, &phsz, BUFFER_LOCK_SHARE);
+               /* the placeholder tuple must exist */
+               if (phtup == NULL)
+                       elog(ERROR, "missing placeholder tuple");
+               /* copy the tuple out of the buffer before giving up the lock on it */
+               phtup = brin_copy_tuple(phtup, phsz);
+               LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
+
+               /* merge it into the tuple from the heap scan */
+               union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
+       }
+
+       ReleaseBuffer(phbuf);
+}
+
+/*
+ * Scan a complete BRIN index, and summarize each page range that's not already
+ * summarized.  The index and heap must have been locked by caller in at
+ * least ShareUpdateExclusiveLock mode.
+ *
+ * For each new index tuple inserted, *numSummarized (if not NULL) is
+ * incremented; for each existing tuple, *numExisting (if not NULL) is
+ * incremented.
+ *
+ * The build state and the IndexInfo are created lazily, the first time an
+ * unsummarized range is found, so neither is set up when every range already
+ * has a summary tuple.  An error is raised if an unsummarized range is found
+ * while running under a transaction-snapshot isolation level after a snapshot
+ * has already been taken.
+ */
+static void
+brinsummarize(Relation index, Relation heapRel, double *numSummarized,
+                         double *numExisting)
+{
+       BrinRevmap *revmap;
+       BrinBuildState *state = NULL;
+       IndexInfo  *indexInfo = NULL;
+       BlockNumber heapNumBlocks;
+       BlockNumber heapBlk;
+       BlockNumber pagesPerRange;
+       Buffer          buf;
+
+       revmap = brinRevmapInitialize(index, &pagesPerRange);
+
+       /*
+        * Scan the revmap to find unsummarized items.  The heap size is sampled
+        * once, before the loop; heapBlk advances one page range at a time.
+        */
+       buf = InvalidBuffer;
+       heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
+       for (heapBlk = 0; heapBlk < heapNumBlocks; heapBlk += pagesPerRange)
+       {
+               BrinTuple  *tup;
+               OffsetNumber off;
+
+               CHECK_FOR_INTERRUPTS();
+
+               tup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
+                                                                          BUFFER_LOCK_SHARE);
+               if (tup == NULL)
+               {
+                       /* no revmap entry for this heap range. Summarize it. */
+                       if (state == NULL)
+                       {
+                               /* first time through */
+                               Assert(!indexInfo);
+                               state = initialize_brin_buildstate(index, revmap,
+                                                                                                  pagesPerRange);
+                               indexInfo = BuildIndexInfo(index);
+
+                               /*
+                                * We only have ShareUpdateExclusiveLock on the table, and
+                                * therefore other sessions may insert tuples into the range
+                                * we're going to scan.  This is okay, because we take
+                                * additional precautions to avoid losing the additional
+                                * tuples; see comments in summarize_range.  Set the
+                                * concurrent flag, which causes IndexBuildHeapRangeScan to
+                                * use a snapshot other than SnapshotAny, and silences
+                                * warnings emitted there.
+                                */
+                               indexInfo->ii_Concurrent = true;
+
+                               /*
+                                * If using transaction-snapshot mode, it would be possible
+                                * for another transaction to insert a tuple that's not
+                                * visible to our snapshot if we have already acquired one,
+                                * when in snapshot-isolation mode; therefore, disallow this
+                                * from running in such a transaction unless a snapshot hasn't
+                                * been acquired yet.
+                                *
+                                * This code is called by VACUUM and
+                                * brin_summarize_new_values. Have the error message mention
+                                * the latter because VACUUM cannot run in a transaction and
+                                * thus cannot cause this issue.
+                                */
+                               if (IsolationUsesXactSnapshot() && FirstSnapshotSet)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                                        errmsg("brin_summarize_new_values() cannot run in a transaction that has already obtained a snapshot")));
+                       }
+                       summarize_range(indexInfo, state, heapRel, heapBlk);
+
+                       /* and re-initialize state for the next range */
+                       brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+
+                       if (numSummarized)
+                               *numSummarized += 1.0;
+               }
+               else
+               {
+                       /* range already summarized: count it and drop the share lock */
+                       if (numExisting)
+                               *numExisting += 1.0;
+                       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+               }
+       }
+
+       /* the buffer is reused across iterations; release it once at the end */
+       if (BufferIsValid(buf))
+               ReleaseBuffer(buf);
+
+       /* free resources */
+       brinRevmapTerminate(revmap);
+       if (state)
+               terminate_brin_buildstate(state);
+}
+
+/*
+ * Turn the deformed summary tuple held in the build state into its on-disk
+ * representation for the current page range, insert it into the index (which
+ * also makes the revmap point to it), and count it in bs_numtuples.
+ */
+static void
+form_and_insert_tuple(BrinBuildState *state)
+{
+       Size            tuplen;
+       BrinTuple  *ondisk = brin_form_tuple(state->bs_bdesc,
+                                                                                state->bs_currRangeStart,
+                                                                                state->bs_dtuple,
+                                                                                &tuplen);
+
+       brin_doinsert(state->bs_irel,
+                                 state->bs_pagesPerRange,
+                                 state->bs_rmAccess,
+                                 &state->bs_currentInsertBuf,
+                                 state->bs_currRangeStart,
+                                 ondisk,
+                                 tuplen);
+
+       /* the on-disk image is no longer needed once it has been inserted */
+       pfree(ondisk);
+
+       state->bs_numtuples++;
+}
+
+/*
+ * Merge the summary in on-disk tuple 'b' into deformed tuple 'a', so that 'a'
+ * ends up consistent with the summary values of both.  'b' is deformed into a
+ * short-lived memory context, and the opclass "union" support procedure is
+ * invoked once per indexed column.
+ */
+static void
+union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
+{
+       MemoryContext scratch;
+       MemoryContext caller;
+       BrinMemTuple *deformed_b;
+       int                     natts = bdesc->bd_tupdesc->natts;
+       int                     attidx;
+
+       /* Deform 'b' in a private context, so it can be thrown away wholesale */
+       scratch = AllocSetContextCreate(CurrentMemoryContext,
+                                                                       "brin union",
+                                                                       ALLOCSET_DEFAULT_MINSIZE,
+                                                                       ALLOCSET_DEFAULT_INITSIZE,
+                                                                       ALLOCSET_DEFAULT_MAXSIZE);
+       caller = MemoryContextSwitchTo(scratch);
+       deformed_b = brin_deform_tuple(bdesc, b);
+       MemoryContextSwitchTo(caller);
+
+       /* Let the opclass fold each column of 'b' into the same column of 'a' */
+       for (attidx = 0; attidx < natts; attidx++)
+       {
+               FmgrInfo   *opcUnion = index_getprocinfo(bdesc->bd_index,
+                                                                                                attidx + 1,
+                                                                                                BRIN_PROCNUM_UNION);
+
+               FunctionCall3Coll(opcUnion,
+                                                 bdesc->bd_index->rd_indcollation[attidx],
+                                                 PointerGetDatum(bdesc),
+                                                 PointerGetDatum(&a->bt_columns[attidx]),
+                                                 PointerGetDatum(&deformed_b->bt_columns[attidx]));
+       }
+
+       MemoryContextDelete(scratch);
+}
diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c
new file mode 100644 (file)
index 0000000..3a2bee2
--- /dev/null
@@ -0,0 +1,341 @@
+/*
+ * brin_minmax.c
+ *             Implementation of Min/Max opclass for BRIN
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/brin/brin_minmax.c
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/brin_internal.h"
+#include "access/brin_tuple.h"
+#include "access/skey.h"
+#include "catalog/pg_type.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+
+
+/*
+ * Procedure numbers must not collide with BRIN_PROCNUM defines in
+ * brin_internal.h.  Note we only need inequality functions.
+ *
+ * MINMAX_NUM_PROCNUMS is also the length of the per-column cache arrays in
+ * MinmaxOpaque, so it must cover every PROCNUM_* value defined here.
+ */
+#define                MINMAX_NUM_PROCNUMS             4       /* # support procs we need */
+#define                PROCNUM_LESS                    11
+#define                PROCNUM_LESSEQUAL               12
+#define                PROCNUM_GREATEREQUAL    13
+#define                PROCNUM_GREATER                 14
+
+/*
+ * Subtract this from procnum to obtain index in MinmaxOpaque arrays
+ * (Must be equal to minimum of private procnums)
+ *
+ * With the values above, PROCNUM_LESS maps to slot 0 and PROCNUM_GREATER to
+ * slot 3.
+ */
+#define                PROCNUM_BASE                    11
+
+static FmgrInfo *minmax_get_procinfo(BrinDesc *bdesc, uint16 attno,
+                                       uint16 procnum);
+
+PG_FUNCTION_INFO_V1(minmaxOpcInfo);
+PG_FUNCTION_INFO_V1(minmaxAddValue);
+PG_FUNCTION_INFO_V1(minmaxConsistent);
+PG_FUNCTION_INFO_V1(minmaxUnion);
+
+
+typedef struct MinmaxOpaque
+{
+       FmgrInfo        operators[MINMAX_NUM_PROCNUMS]; /* cached procs, by procnum - PROCNUM_BASE */
+       bool            inited[MINMAX_NUM_PROCNUMS];    /* is operators[i] filled in yet? */
+} MinmaxOpaque;
+
+/*
+ * Build the BrinOpcInfo for a minmax opclass on the type whose OID is given
+ * as argument 0.  Two values of that type are stored per column, and a
+ * zero-filled MinmaxOpaque is placed right after the (max-aligned) header in
+ * the same allocation.
+ */
+Datum
+minmaxOpcInfo(PG_FUNCTION_ARGS)
+{
+       Oid                     typoid = PG_GETARG_OID(0);
+       Size            allocsz;
+       BrinOpcInfo *info;
+
+       /*
+        * A single zeroed chunk holds both structs.  Because palloc0 clears it,
+        * every 'inited' flag of the opaque part starts out false, which is what
+        * allows its 'operators' entries to be filled in lazily later on.
+        */
+       allocsz = MAXALIGN(SizeofBrinOpcInfo(2)) + sizeof(MinmaxOpaque);
+       info = palloc0(allocsz);
+
+       info->oi_nstored = 2;
+       info->oi_typids[0] = typoid;
+       info->oi_typids[1] = typoid;
+       info->oi_opaque = (MinmaxOpaque *)
+               MAXALIGN((char *) info + SizeofBrinOpcInfo(2));
+
+       PG_RETURN_POINTER(info);
+}
+
+/*
+ * Examine the given index tuple (which contains partial status of a certain
+ * page range) by comparing it to the given value that comes from another heap
+ * tuple.  If the new value is outside the min/max range specified by the
+ * existing tuple values, update the index tuple and return true.  Otherwise,
+ * return false and leave the index tuple unmodified.
+ *
+ * Arguments (fmgr calling convention):
+ *   0: BrinDesc *    descriptor of the index
+ *   1: BrinValues *  summary values of the column; updated in place
+ *   2: Datum         the new value
+ *   3: bool          whether the new value is NULL
+ *
+ * bv_values[0] holds the minimum of the range and bv_values[1] the maximum.
+ */
+Datum
+minmaxAddValue(PG_FUNCTION_ARGS)
+{
+       BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+       BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+       Datum           newval = PG_GETARG_DATUM(2);
+       /* fetch the flag as a bool; a raw Datum must not be narrowed implicitly */
+       bool            isnull = PG_GETARG_BOOL(3);
+       Oid                     colloid = PG_GET_COLLATION();
+       FmgrInfo   *cmpFn;
+       Datum           compar;
+       bool            updated = false;
+       Form_pg_attribute attr;
+       AttrNumber      attno;
+
+       /*
+        * If the new value is null, we record that we saw it if it's the first
+        * one; otherwise, there's nothing to do.
+        */
+       if (isnull)
+       {
+               if (column->bv_hasnulls)
+                       PG_RETURN_BOOL(false);
+
+               column->bv_hasnulls = true;
+               PG_RETURN_BOOL(true);
+       }
+
+       attno = column->bv_attno;
+       attr = bdesc->bd_tupdesc->attrs[attno - 1];
+
+       /*
+        * If the recorded value is null, store the new value (which we know to be
+        * not null) as both minimum and maximum, and we're done.
+        */
+       if (column->bv_allnulls)
+       {
+               column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+               column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+               column->bv_allnulls = false;
+               PG_RETURN_BOOL(true);
+       }
+
+       /*
+        * Otherwise, need to compare the new value with the existing boundaries
+        * and update them accordingly.  First check if it's less than the
+        * existing minimum.
+        */
+       cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_LESS);
+       compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]);
+       if (DatumGetBool(compar))
+       {
+               /* a pass-by-reference boundary owns its copy; free it before replacing */
+               if (!attr->attbyval)
+                       pfree(DatumGetPointer(column->bv_values[0]));
+               column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+               updated = true;
+       }
+
+       /*
+        * And now compare it to the existing maximum.
+        */
+       cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_GREATER);
+       compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]);
+       if (DatumGetBool(compar))
+       {
+               if (!attr->attbyval)
+                       pfree(DatumGetPointer(column->bv_values[1]));
+               column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+               updated = true;
+       }
+
+       PG_RETURN_BOOL(updated);
+}
+
+/*
+ * Decide whether a page range, described by its min/max summary, could
+ * contain rows satisfying the given scan key.  The result is true when the
+ * scan key is consistent with the summary (the range has to be visited) and
+ * false otherwise.
+ */
+Datum
+minmaxConsistent(PG_FUNCTION_ARGS)
+{
+       BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+       BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+       ScanKey         key = (ScanKey) PG_GETARG_POINTER(2);
+       Oid                     colloid = PG_GET_COLLATION();
+       AttrNumber      attno;
+       Datum           scanval;
+       Datum           bound;
+       Datum           result;
+       uint16          procnum;
+
+       Assert(key->sk_attno == column->bv_attno);
+
+       /* IS NULL and IS NOT NULL are answered from the null flags alone */
+       if (key->sk_flags & SK_ISNULL)
+       {
+               if (key->sk_flags & SK_SEARCHNULL)
+                       PG_RETURN_BOOL(column->bv_allnulls || column->bv_hasnulls);
+
+               /* IS NOT NULL: only a range known to hold nothing but nulls is skipped */
+               Assert(key->sk_flags & SK_SEARCHNOTNULL);
+               PG_RETURN_BOOL(!column->bv_allnulls);
+       }
+
+       /* a range with no non-null value cannot satisfy an ordinary operator */
+       if (column->bv_allnulls)
+               PG_RETURN_BOOL(false);
+
+       attno = key->sk_attno;
+       scanval = key->sk_argument;
+
+       /*
+        * Pick the comparison procedure and the boundary (minimum in
+        * bv_values[0], maximum in bv_values[1]) that decide the outcome for
+        * this strategy; the comparison itself is run once, after the switch.
+        */
+       switch (key->sk_strategy)
+       {
+               case BTLessStrategyNumber:
+                       procnum = PROCNUM_LESS;
+                       bound = column->bv_values[0];
+                       break;
+
+               case BTLessEqualStrategyNumber:
+                       procnum = PROCNUM_LESSEQUAL;
+                       bound = column->bv_values[0];
+                       break;
+
+               case BTEqualStrategyNumber:
+
+                       /*
+                        * Equality (WHERE col = someval) matches when the minimum is <=
+                        * the scan key and the maximum is >= the scan key.  Test the
+                        * minimum here; if that already fails, its result is the answer.
+                        */
+                       result = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                                                                                  PROCNUM_LESSEQUAL),
+                                                                          colloid, column->bv_values[0], scanval);
+                       if (!DatumGetBool(result))
+                               PG_RETURN_DATUM(result);
+
+                       /* max() >= scankey is checked below */
+                       procnum = PROCNUM_GREATEREQUAL;
+                       bound = column->bv_values[1];
+                       break;
+
+               case BTGreaterEqualStrategyNumber:
+                       procnum = PROCNUM_GREATEREQUAL;
+                       bound = column->bv_values[1];
+                       break;
+
+               case BTGreaterStrategyNumber:
+                       procnum = PROCNUM_GREATER;
+                       bound = column->bv_values[1];
+                       break;
+
+               default:
+                       /* shouldn't happen */
+                       elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+                       PG_RETURN_DATUM(0);             /* keep compiler quiet */
+       }
+
+       result = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, procnum),
+                                                          colloid, bound, scanval);
+
+       PG_RETURN_DATUM(result);
+}
+
+/*
+ * Given two BrinValues, update the first of them as a union of the summary
+ * values contained in both.  The second one is untouched.
+ *
+ * Arguments (fmgr calling convention):
+ *   0: BrinDesc *    descriptor of the index
+ *   1: BrinValues *  column summary A; modified in place
+ *   2: BrinValues *  column summary B; read only
+ *
+ * bv_values[0] holds the minimum of a range and bv_values[1] the maximum.
+ */
+Datum
+minmaxUnion(PG_FUNCTION_ARGS)
+{
+       BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+       BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+       BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+       Oid                     colloid = PG_GET_COLLATION();
+       AttrNumber      attno;
+       Form_pg_attribute attr;
+       bool            needsadj;
+
+       Assert(col_a->bv_attno == col_b->bv_attno);
+
+       /*
+        * If there are no values in B, there's nothing to do.
+        *
+        * NOTE(review): this returns before B's "hasnulls" flag is looked at, so
+        * an all-nulls B never sets it in A -- confirm that is intended.
+        */
+       if (col_b->bv_allnulls)
+               PG_RETURN_VOID();
+
+       attno = col_a->bv_attno;
+       attr = bdesc->bd_tupdesc->attrs[attno - 1];
+
+       /* Adjust "hasnulls" */
+       if (col_b->bv_hasnulls && !col_a->bv_hasnulls)
+               col_a->bv_hasnulls = true;
+
+       /*
+        * Adjust "allnulls".  If B has values but A doesn't, just copy the values
+        * from B into A, and we're done.  (We cannot run the operators in this
+        * case, because values in A might contain garbage.)  B is already known
+        * to have values at this point, so only A needs to be tested.
+        */
+       if (col_a->bv_allnulls)
+       {
+               col_a->bv_allnulls = false;
+               col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
+                                                                               attr->attbyval, attr->attlen);
+               col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
+                                                                               attr->attbyval, attr->attlen);
+               PG_RETURN_VOID();
+       }
+
+       /*
+        * Adjust minimum, if B's min is less than A's min.  The comparison
+        * procedure returns a Datum, which must be converted explicitly rather
+        * than narrowed to bool by assignment.
+        */
+       needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                                                                                                  PROCNUM_LESS),
+                                                                                         colloid,
+                                                                                         col_b->bv_values[0],
+                                                                                         col_a->bv_values[0]));
+       if (needsadj)
+       {
+               /* a pass-by-reference boundary owns its copy; free it before replacing */
+               if (!attr->attbyval)
+                       pfree(DatumGetPointer(col_a->bv_values[0]));
+               col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
+                                                                               attr->attbyval, attr->attlen);
+       }
+
+       /* Adjust maximum, if B's max is greater than A's max */
+       needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                                                                                                  PROCNUM_GREATER),
+                                                                                         colloid,
+                                                                                         col_b->bv_values[1],
+                                                                                         col_a->bv_values[1]));
+       if (needsadj)
+       {
+               if (!attr->attbyval)
+                       pfree(DatumGetPointer(col_a->bv_values[1]));
+               col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
+                                                                               attr->attbyval, attr->attlen);
+       }
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Look up the FmgrInfo of the support procedure 'procnum' for index column
+ * 'attno' (1-based).  The per-column opaque struct serves as a cache: the
+ * procedure info is copied into it on first use and handed out from there
+ * afterwards, which avoids repetitive syscache lookups.
+ */
+static FmgrInfo *
+minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
+{
+       uint16          slot = procnum - PROCNUM_BASE;
+       MinmaxOpaque *cache = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+       FmgrInfo   *entry = &cache->operators[slot];
+
+       /* already cached? */
+       if (cache->inited[slot])
+               return entry;
+
+       /* first request for this procedure: copy it into the cache slot */
+       fmgr_info_copy(entry,
+                                  index_getprocinfo(bdesc->bd_index, attno, procnum),
+                                  bdesc->bd_context);
+       cache->inited[slot] = true;
+
+       return entry;
+}
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
new file mode 100644 (file)
index 0000000..c34b86c
--- /dev/null
@@ -0,0 +1,723 @@
+/*
+ * brin_pageops.c
+ *             Page-handling routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/brin/brin_pageops.c
+ */
+#include "postgres.h"
+
+#include "access/brin_pageops.h"
+#include "access/brin_page.h"
+#include "access/brin_revmap.h"
+#include "access/brin_xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+
+
+static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+                                        bool *was_extended);
+static Size br_page_get_freespace(Page page);
+
+
+/*
+ * Update tuple origtup (size origsz), located in offset oldoff of buffer
+ * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
+ * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
+ *
+ * If samepage is true, attempt to put the new tuple in the same page, but if
+ * there's no room, use some other one.
+ *
+ * If the update is successful, return true; the revmap is updated to point to
+ * the new tuple.  If the update is not done for whatever reason, return false.
+ * Caller may retry the update if this happens.
+ */
+bool
+brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+                         BrinRevmap *revmap, BlockNumber heapBlk,
+                         Buffer oldbuf, OffsetNumber oldoff,
+                         const BrinTuple *origtup, Size origsz,
+                         const BrinTuple *newtup, Size newsz,
+                         bool samepage)
+{
+       Page            oldpage;
+       ItemId          oldlp;
+       BrinTuple  *oldtup;
+       Size            oldsz;
+       Buffer          newbuf;
+       BrinSpecialSpace *special;
+       bool            extended = false;
+
+       newsz = MAXALIGN(newsz);
+
+       /* make sure the revmap is long enough to contain the entry we need */
+       brinRevmapExtend(revmap, heapBlk);
+
+       if (!samepage)
+       {
+               /* need a page on which to put the item */
+               newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
+               /* XXX delay vacuuming FSM until locks are released? */
+               if (extended)
+                       FreeSpaceMapVacuum(idxrel);
+               if (!BufferIsValid(newbuf))
+                       return false;
+
+               /*
+                * Note: it's possible (though unlikely) that the returned newbuf is
+                * the same as oldbuf, if brin_getinsertbuffer determined that the old
+                * buffer does in fact have enough space.
+                */
+               if (newbuf == oldbuf)
+                       newbuf = InvalidBuffer;
+       }
+       else
+       {
+               LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+               newbuf = InvalidBuffer;
+       }
+       oldpage = BufferGetPage(oldbuf);
+       oldlp = PageGetItemId(oldpage, oldoff);
+
+       /*
+        * Check that the old tuple wasn't updated concurrently: it might have
+        * moved someplace else entirely ...
+        */
+       if (!ItemIdIsNormal(oldlp))
+       {
+               LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+               if (BufferIsValid(newbuf))
+                       UnlockReleaseBuffer(newbuf);
+               return false;
+       }
+
+       oldsz = ItemIdGetLength(oldlp);
+       oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
+
+       /*
+        * ... or it might have been updated in place to different contents.
+        */
+       if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
+       {
+               LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+               if (BufferIsValid(newbuf))
+                       UnlockReleaseBuffer(newbuf);
+               return false;
+       }
+
+       special = (BrinSpecialSpace *) PageGetSpecialPointer(oldpage);
+
+       /*
+        * Great, the old tuple is intact.  We can proceed with the update.
+        *
+        * If there's enough room in the old page for the new tuple, replace it.
+        *
+        * Note that there might now be enough space on the page even though the
+        * caller told us there isn't, if a concurrent update moved another tuple
+        * elsewhere or replaced a tuple with a smaller one.
+        */
+       if (((special->flags & BRIN_EVACUATE_PAGE) == 0) &&
+               brin_can_do_samepage_update(oldbuf, origsz, newsz))
+       {
+               if (BufferIsValid(newbuf))
+                       UnlockReleaseBuffer(newbuf);
+
+               START_CRIT_SECTION();
+               PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
+               if (PageAddItem(oldpage, (Item) newtup, newsz, oldoff, true,
+                                               false) == InvalidOffsetNumber)
+                       elog(ERROR, "failed to add BRIN tuple");
+               MarkBufferDirty(oldbuf);
+
+               /* XLOG stuff */
+               if (RelationNeedsWAL(idxrel))
+               {
+                       BlockNumber blk = BufferGetBlockNumber(oldbuf);
+                       xl_brin_samepage_update xlrec;
+                       XLogRecPtr      recptr;
+                       XLogRecData rdata[2];
+                       uint8           info = XLOG_BRIN_SAMEPAGE_UPDATE;
+
+                       xlrec.node = idxrel->rd_node;
+                       ItemPointerSetBlockNumber(&xlrec.tid, blk);
+                       ItemPointerSetOffsetNumber(&xlrec.tid, oldoff);
+                       rdata[0].data = (char *) &xlrec;
+                       rdata[0].len = SizeOfBrinSamepageUpdate;
+                       rdata[0].buffer = InvalidBuffer;
+                       rdata[0].next = &(rdata[1]);
+
+                       rdata[1].data = (char *) newtup;
+                       rdata[1].len = newsz;
+                       rdata[1].buffer = oldbuf;
+                       rdata[1].buffer_std = true;
+                       rdata[1].next = NULL;
+
+                       recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+
+                       PageSetLSN(oldpage, recptr);
+               }
+
+               END_CRIT_SECTION();
+
+               LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+               return true;
+       }
+       else if (newbuf == InvalidBuffer)
+       {
+               /*
+                * Not enough space, but caller said that there was. Tell them to
+                * start over.
+                */
+               LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+               return false;
+       }
+       else
+       {
+               /*
+                * Not enough free space on the oldpage. Put the new tuple on the new
+                * page, and update the revmap.
+                */
+               Page            newpage = BufferGetPage(newbuf);
+               Buffer          revmapbuf;
+               ItemPointerData newtid;
+               OffsetNumber newoff;
+
+               revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+               START_CRIT_SECTION();
+
+               PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
+               newoff = PageAddItem(newpage, (Item) newtup, newsz,
+                                                        InvalidOffsetNumber, false, false);
+               if (newoff == InvalidOffsetNumber)
+                       elog(ERROR, "failed to add BRIN tuple to new page");
+               MarkBufferDirty(oldbuf);
+               MarkBufferDirty(newbuf);
+
+               ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
+               brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
+               MarkBufferDirty(revmapbuf);
+
+               /* XLOG stuff */
+               if (RelationNeedsWAL(idxrel))
+               {
+                       xl_brin_update xlrec;
+                       XLogRecPtr      recptr;
+                       XLogRecData rdata[4];
+                       uint8           info;
+
+                       info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+
+                       xlrec.new.node = idxrel->rd_node;
+                       ItemPointerSet(&xlrec.new.tid, BufferGetBlockNumber(newbuf), newoff);
+                       xlrec.new.heapBlk = heapBlk;
+                       xlrec.new.tuplen = newsz;
+                       xlrec.new.revmapBlk = BufferGetBlockNumber(revmapbuf);
+                       xlrec.new.pagesPerRange = pagesPerRange;
+                       ItemPointerSet(&xlrec.oldtid, BufferGetBlockNumber(oldbuf), oldoff);
+
+                       rdata[0].data = (char *) &xlrec;
+                       rdata[0].len = SizeOfBrinUpdate;
+                       rdata[0].buffer = InvalidBuffer;
+                       rdata[0].next = &(rdata[1]);
+
+                       rdata[1].data = (char *) newtup;
+                       rdata[1].len = newsz;
+                       rdata[1].buffer = extended ? InvalidBuffer : newbuf;
+                       rdata[1].buffer_std = true;
+                       rdata[1].next = &(rdata[2]);
+
+                       rdata[2].data = (char *) NULL;
+                       rdata[2].len = 0;
+                       rdata[2].buffer = revmapbuf;
+                       rdata[2].buffer_std = true;
+                       rdata[2].next = &(rdata[3]);
+
+                       rdata[3].data = (char *) NULL;
+                       rdata[3].len = 0;
+                       rdata[3].buffer = oldbuf;
+                       rdata[3].buffer_std = true;
+                       rdata[3].next = NULL;
+
+                       recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+
+                       PageSetLSN(oldpage, recptr);
+                       PageSetLSN(newpage, recptr);
+                       PageSetLSN(BufferGetPage(revmapbuf), recptr);
+               }
+
+               END_CRIT_SECTION();
+
+               LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+               LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+               UnlockReleaseBuffer(newbuf);
+               return true;
+       }
+}
+
+/*
+ * Decide whether a tuple of size origsz stored on the page in the given
+ * buffer can be replaced in place by a tuple of size newsz.  That is the case
+ * when the tuple does not grow, or when the page's exact free space is large
+ * enough to absorb the growth.
+ */
+bool
+brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
+{
+       /* A replacement that is no larger always fits where the old one was. */
+       if (newsz <= origsz)
+               return true;
+
+       /* Otherwise the page has to be able to take the size difference. */
+       return PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz);
+}
+
+/*
+ * Insert an index tuple into the index relation.  The revmap is updated to
+ * mark the range containing the given page as pointing to the inserted entry.
+ * A WAL record is written if the index relation needs WAL.
+ *
+ * The buffer, if valid, is first checked for free space to insert the new
+ * entry; if there isn't enough, a new buffer is obtained and pinned.  No
+ * buffer lock must be held on entry, no buffer lock is held on exit.
+ *
+ * Parameters:
+ *     idxrel                  the BRIN index relation
+ *     pagesPerRange   number of heap pages per range, used to locate the
+ *                                     revmap entry for heapBlk
+ *     revmap                  revmap access object; it is extended here if it does
+ *                                     not yet cover heapBlk
+ *     buffer                  in/out: candidate buffer on entry (may be
+ *                                     InvalidBuffer); on exit, the buffer the tuple was
+ *                                     placed in, unlocked but not released here
+ *     heapBlk                 heap block whose range the tuple summarizes
+ *     tup, itemsz             the tuple and its size; itemsz is rounded up with
+ *                                     MAXALIGN before use
+ *
+ * Return value is the offset number where the tuple was inserted.
+ */
+OffsetNumber
+brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+                         BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+                         BrinTuple *tup, Size itemsz)
+{
+       Page            page;
+       BlockNumber blk;
+       OffsetNumber off;
+       Buffer          revmapbuf;
+       ItemPointerData tid;
+       bool            extended = false;
+
+       /*
+        * From here on the aligned size is what is checked against free space,
+        * added to the page, and recorded in the WAL record.
+        */
+       itemsz = MAXALIGN(itemsz);
+
+       /* Make sure the revmap is long enough to contain the entry we need */
+       brinRevmapExtend(revmap, heapBlk);
+
+       /*
+        * Obtain a locked buffer to insert the new tuple.  Note
+        * brin_getinsertbuffer ensures there's enough space in the returned
+        * buffer.
+        */
+       if (BufferIsValid(*buffer))
+       {
+               /*
+                * It's possible that another backend (or ourselves!) extended the
+                * revmap over the page we held a pin on, so we cannot assume that
+                * it's still a regular page.
+                *
+                * br_page_get_freespace returns 0 for a page that is not regular or
+                * that is being evacuated, so such a page is dropped here and a
+                * fresh one is obtained below.
+                */
+               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+               if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
+               {
+                       UnlockReleaseBuffer(*buffer);
+                       *buffer = InvalidBuffer;
+               }
+       }
+
+       if (!BufferIsValid(*buffer))
+       {
+               /* No old buffer is passed, so only the new page gets locked */
+               *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
+               Assert(BufferIsValid(*buffer));
+               Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz);
+       }
+
+       /* Now obtain lock on revmap buffer */
+       revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+       page = BufferGetPage(*buffer);
+       blk = BufferGetBlockNumber(*buffer);
+
+       /*
+        * Both pages are exclusively locked; modify them and log the change as
+        * one critical section.
+        *
+        * NOTE(review): the elog(ERROR) below is raised between
+        * START_CRIT_SECTION and END_CRIT_SECTION -- confirm that the resulting
+        * severity is what is intended.
+        */
+       START_CRIT_SECTION();
+       off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
+                                         false, false);
+       if (off == InvalidOffsetNumber)
+               elog(ERROR, "could not insert new index tuple to page");
+       MarkBufferDirty(*buffer);
+
+       BRIN_elog(DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+                         blk, off, heapBlk);
+
+       /* Point the revmap entry for this range at the tuple just added */
+       ItemPointerSet(&tid, blk, off);
+       brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
+       MarkBufferDirty(revmapbuf);
+
+       /* XLOG stuff */
+       if (RelationNeedsWAL(idxrel))
+       {
+               xl_brin_insert xlrec;
+               XLogRecPtr      recptr;
+               XLogRecData rdata[3];
+               uint8           info;
+
+               /* Flag the record when the target page was freshly initialized */
+               info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+               xlrec.node = idxrel->rd_node;
+               xlrec.heapBlk = heapBlk;
+               xlrec.pagesPerRange = pagesPerRange;
+               xlrec.revmapBlk = BufferGetBlockNumber(revmapbuf);
+               xlrec.tuplen = itemsz;
+               ItemPointerSet(&xlrec.tid, blk, off);
+
+               /* rdata[0]: the fixed-size record header, not tied to any buffer */
+               rdata[0].data = (char *) &xlrec;
+               rdata[0].len = SizeOfBrinInsert;
+               rdata[0].buffer = InvalidBuffer;
+               rdata[0].buffer_std = false;
+               rdata[0].next = &(rdata[1]);
+
+               /*
+                * rdata[1]: the tuple itself.  For a freshly initialized page the
+                * data is not associated with the buffer -- presumably so that it is
+                * always present in the record for replay to re-create the page;
+                * TODO confirm against the redo routine.
+                */
+               rdata[1].data = (char *) tup;
+               rdata[1].len = itemsz;
+               rdata[1].buffer = extended ? InvalidBuffer : *buffer;
+               rdata[1].buffer_std = true;
+               rdata[1].next = &(rdata[2]);
+
+               /* rdata[2]: no payload, only registers the revmap buffer */
+               rdata[2].data = (char *) NULL;
+               rdata[2].len = 0;
+               rdata[2].buffer = revmapbuf;
+               rdata[2].buffer_std = false;
+               rdata[2].next = NULL;
+
+               recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+
+               PageSetLSN(page, recptr);
+               PageSetLSN(BufferGetPage(revmapbuf), recptr);
+       }
+
+       END_CRIT_SECTION();
+
+       /* Tuple is firmly on buffer; we can release our locks */
+       LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+       LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+
+       /*
+        * The index was extended to obtain the page; presumably this makes the
+        * new page's free space visible through the FSM -- TODO confirm.
+        */
+       if (extended)
+               FreeSpaceMapVacuum(idxrel);
+
+       return off;
+}
+
+/*
+ * Set up an empty BRIN page of the given type.
+ *
+ * The page is only formatted in memory; marking the containing buffer dirty
+ * is left to the caller, as appropriate.
+ */
+void
+brin_page_init(Page page, uint16 type)
+{
+       /* Format an empty page, reserving room for the BRIN special space. */
+       PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
+
+       /* Record the requested page type in the special space. */
+       ((BrinSpecialSpace *) PageGetSpecialPointer(page))->type = type;
+}
+
+/*
+ * Format the given page as the metapage of a new BRIN index, recording the
+ * range size and on-disk version it was created with.
+ */
+void
+brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
+{
+       BrinMetaPageData *meta;
+
+       brin_page_init(page, BRIN_PAGETYPE_META);
+       meta = (BrinMetaPageData *) PageGetContents(page);
+
+       /*
+        * Block 0 is the metapage itself, so it is not a valid revmap block
+        * number.  Storing it here is a small cheat that enables the first
+        * revmap page to be created when the index is.
+        */
+       meta->lastRevmapPage = 0;
+
+       meta->pagesPerRange = pagesPerRange;
+       meta->brinVersion = version;
+       meta->brinMagic = BRIN_META_MAGIC;
+}
+
+/*
+ * Begin evacuating a page so that it can be taken over by the revmap.
+ *
+ * The caller must hold an exclusive lock on the page.
+ *
+ * Returns false, leaving the page untouched, when the page is new or holds
+ * no used line pointers: it can be used for revmap as it is.  Otherwise the
+ * page is flagged with BRIN_EVACUATE_PAGE and true is returned.
+ */
+bool
+brin_start_evacuating_page(Relation idxRel, Buffer buf)
+{
+       Page            page = BufferGetPage(buf);
+       OffsetNumber curoff;
+       OffsetNumber lastoff;
+       bool            has_tuples = false;
+
+       if (PageIsNew(page))
+               return false;
+
+       /* Look for at least one line pointer that is in use. */
+       lastoff = PageGetMaxOffsetNumber(page);
+       for (curoff = FirstOffsetNumber; curoff <= lastoff; curoff++)
+       {
+               if (ItemIdIsUsed(PageGetItemId(page, curoff)))
+               {
+                       has_tuples = true;
+                       break;
+               }
+       }
+
+       if (!has_tuples)
+               return false;
+
+       /* prevent other backends from adding more stuff to this page */
+       ((BrinSpecialSpace *) PageGetSpecialPointer(page))->flags |=
+               BRIN_EVACUATE_PAGE;
+       MarkBufferDirtyHint(buf, true);
+
+       return true;
+}
+
+/*
+ * Move all tuples out of a page.
+ *
+ * The caller must hold lock on the page. The lock and pin are released.
+ *
+ * The page must already carry the BRIN_EVACUATE_PAGE flag.  Every used item
+ * is copied to local memory and handed to brin_doupdate as both old and new
+ * tuple with samepage = false; when brin_doupdate returns false the same
+ * offset is tried again.
+ */
+void
+brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+                                  BrinRevmap *revmap, Buffer buf)
+{
+       OffsetNumber off;
+       OffsetNumber maxoff;
+       Page            page;
+
+       page = BufferGetPage(buf);
+
+       Assert(((BrinSpecialSpace *)
+                       PageGetSpecialPointer(page))->flags & BRIN_EVACUATE_PAGE);
+
+       maxoff = PageGetMaxOffsetNumber(page);
+       for (off = FirstOffsetNumber; off <= maxoff; off++)
+       {
+               BrinTuple  *tup;
+               Size            sz;
+               ItemId          lp;
+
+               CHECK_FOR_INTERRUPTS();
+
+               lp = PageGetItemId(page, off);
+               if (ItemIdIsUsed(lp))
+               {
+                       /* Work on a private copy; the page is unlocked next. */
+                       sz = ItemIdGetLength(lp);
+                       tup = (BrinTuple *) PageGetItem(page, lp);
+                       tup = brin_copy_tuple(tup, sz);
+
+                       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+                       if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
+                                                          buf, off, tup, sz, tup, sz, false))
+                               off--;                  /* retry */
+
+                       /*
+                        * The copy is not needed once brin_doupdate has returned; free
+                        * it so that copies do not pile up across tuples and retries.
+                        * (Assumes brin_copy_tuple returns palloc'd memory -- confirm.)
+                        */
+                       pfree(tup);
+
+                       LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+                       /* It's possible that someone extended the revmap over this page */
+                       if (!BRIN_IS_REGULAR_PAGE(page))
+                               break;
+               }
+       }
+
+       UnlockReleaseBuffer(buf);
+}
+
+/*
+ * Return a pinned and exclusively locked buffer which can be used to insert an
+ * index item of size itemsz.  If oldbuf is a valid buffer, it is also locked
+ * (in an order determined to avoid deadlocks.)
+ *
+ * If there's no existing page with enough free space to accommodate the new
+ * item, the relation is extended.  If this happens, *was_extended is set to
+ * true.  *was_extended is never set to false here, so the caller must
+ * initialize it.
+ *
+ * If we find that the old page is no longer a regular index page (because
+ * of a revmap extension), the old buffer is unlocked and we return
+ * InvalidBuffer.
+ *
+ * An ERROR is raised if even a freshly initialized page cannot hold itemsz.
+ */
+static Buffer
+brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+                                        bool *was_extended)
+{
+       BlockNumber oldblk;
+       BlockNumber newblk;
+       Page            page;
+       Size            freespace;
+
+       if (BufferIsValid(oldbuf))
+               oldblk = BufferGetBlockNumber(oldbuf);
+       else
+               oldblk = InvalidBlockNumber;
+
+       /*
+        * Loop until we find a page with sufficient free space.  By the time we
+        * return to caller out of this loop, both buffers are valid and locked;
+        * if we have to restart here, neither buffer is locked and buf is not a
+        * pinned buffer.
+        */
+       newblk = RelationGetTargetBlock(irel);
+       if (newblk == InvalidBlockNumber)
+               newblk = GetPageWithFreeSpace(irel, itemsz);
+       for (;;)
+       {
+               Buffer          buf;
+               bool            extensionLockHeld = false;
+               bool            extended = false;
+
+               CHECK_FOR_INTERRUPTS();
+
+               if (newblk == InvalidBlockNumber)
+               {
+                       /*
+                        * There's not enough free space in any existing index page,
+                        * according to the FSM: extend the relation to obtain a shiny new
+                        * page.
+                        */
+                       if (!RELATION_IS_LOCAL(irel))
+                       {
+                               LockRelationForExtension(irel, ExclusiveLock);
+                               extensionLockHeld = true;
+                       }
+                       buf = ReadBuffer(irel, P_NEW);
+                       newblk = BufferGetBlockNumber(buf);
+                       *was_extended = extended = true;
+
+                       BRIN_elog(DEBUG2, "brin_getinsertbuffer: extending to page %u",
+                                         BufferGetBlockNumber(buf));
+               }
+               else if (newblk == oldblk)
+               {
+                       /*
+                        * There's an odd corner-case here where the FSM is out-of-date,
+                        * and gave us the old page.
+                        */
+                       buf = oldbuf;
+               }
+               else
+               {
+                       buf = ReadBuffer(irel, newblk);
+               }
+
+               /*
+                * We lock the old buffer first, if it's earlier than the new one; but
+                * before we do, we need to check that it hasn't been turned into a
+                * revmap page concurrently; if we detect that it happened, give up
+                * and tell caller to start over.
+                */
+               if (BufferIsValid(oldbuf) && oldblk < newblk)
+               {
+                       LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+                       if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
+                       {
+                               LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+                               /*
+                                * The new page may have come from extending the relation,
+                                * in which case we still hold the extension lock.  Release
+                                * it before bailing out, as the normal path below does, so
+                                * that it is not left held after we return.
+                                */
+                               if (extensionLockHeld)
+                                       UnlockRelationForExtension(irel, ExclusiveLock);
+
+                               ReleaseBuffer(buf);
+                               return InvalidBuffer;
+                       }
+               }
+
+               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+               if (extensionLockHeld)
+                       UnlockRelationForExtension(irel, ExclusiveLock);
+
+               page = BufferGetPage(buf);
+
+               if (extended)
+                       brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+
+               /*
+                * We have a new buffer to insert into.  Check that the new page has
+                * enough free space, and return it if it does; otherwise start over.
+                * Note that we allow for the FSM to be out of date here, and in that
+                * case we update it and move on.
+                *
+                * (br_page_get_freespace also checks that the FSM didn't hand us a
+                * page that has since been repurposed for the revmap.)
+                */
+               freespace = br_page_get_freespace(page);
+               if (freespace >= itemsz)
+               {
+                       RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
+
+                       /*
+                        * Since the target block specification can get lost on cache
+                        * invalidations, make sure we update the more permanent FSM with
+                        * data about it before going away.
+                        */
+                       if (extended)
+                               RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
+                                                                               freespace);
+
+                       /*
+                        * Lock the old buffer if not locked already.  Note that in this
+                        * case we know for sure it's a regular page: it's later than the
+                        * new page we just got, which is not a revmap page, and revmap
+                        * pages are always consecutive.
+                        */
+                       if (BufferIsValid(oldbuf) && oldblk > newblk)
+                       {
+                               LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+                               Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
+                       }
+
+                       return buf;
+               }
+
+               /* This page is no good. */
+
+               /*
+                * If an entirely new page does not contain enough free space for the
+                * new item, then surely that item is oversized.  Complain loudly; but
+                * first make sure we record the page as free, for next time.
+                */
+               if (extended)
+               {
+                       RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
+                                                                       freespace);
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                       errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
+                                  (unsigned long) itemsz,
+                                  (unsigned long) freespace,
+                                  RelationGetRelationName(irel))));
+                       return InvalidBuffer;           /* keep compiler quiet */
+               }
+
+               /*
+                * Undo this iteration's locks and pin before asking the FSM for
+                * another candidate: the new buffer unless it is the old one, and
+                * the old buffer if it was locked above.
+                */
+               if (newblk != oldblk)
+                       UnlockReleaseBuffer(buf);
+               if (BufferIsValid(oldbuf) && oldblk <= newblk)
+                       LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+               newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
+       }
+}
+
+/*
+ * Report how much room a BRIN index page offers for new index tuples.
+ *
+ * Zero is reported for any page that is not a regular page, and for a regular
+ * page carrying the BRIN_EVACUATE_PAGE flag; otherwise the result is the
+ * page's free space as given by PageGetFreeSpace.
+ */
+static Size
+br_page_get_freespace(Page page)
+{
+       BrinSpecialSpace *spc = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+
+       /* Anything other than a regular page offers no room for tuples. */
+       if (!BRIN_IS_REGULAR_PAGE(page))
+               return 0;
+
+       /* Neither does a page that is in the middle of being evacuated. */
+       if ((spc->flags & BRIN_EVACUATE_PAGE) != 0)
+               return 0;
+
+       return PageGetFreeSpace(page);
+}
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
new file mode 100644 (file)
index 0000000..b08a94b
--- /dev/null
@@ -0,0 +1,510 @@
+/*
+ * brin_revmap.c
+ *             Range map for BRIN indexes
+ *
+ * The range map (revmap) is a translation structure for BRIN indexes: for each
+ * page range there is one summary tuple, and its location is tracked by the
+ * revmap.  Whenever a new tuple is inserted into a table that violates the
+ * previously recorded summary values, a new tuple is inserted into the index
+ * and the revmap is updated to point to it.
+ *
+ * The revmap is stored in the first pages of the index, immediately following
+ * the metapage.  When the revmap needs to be expanded, all tuples on the
+ * regular BRIN page at that block (if any) are moved out of the way.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/brin/brin_revmap.c
+ */
+#include "postgres.h"
+
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_revmap.h"
+#include "access/brin_tuple.h"
+#include "access/brin_xlog.h"
+#include "access/rmgr.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "utils/rel.h"
+
+
+/*
+ * In revmap pages, each item stores an ItemPointerData.  These defines let one
+ * find the logical revmap page number and index number of the revmap item for
+ * the given heap block number.
+ *
+ * The arguments are parenthesized so that expressions such as "blk + 1" can
+ * be passed safely.
+ */
+#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
+       (((heapBlk) / (pagesPerRange)) / REVMAP_PAGE_MAXITEMS)
+#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
+       (((heapBlk) / (pagesPerRange)) % REVMAP_PAGE_MAXITEMS)
+
+
+/*
+ * Access object for the range map of one BRIN index.  It is created by
+ * brinRevmapInitialize and destroyed by brinRevmapTerminate.
+ */
+struct BrinRevmap
+{
+       /* the index relation the revmap belongs to */
+       Relation        rm_irel;
+       /* pages per range, copied from the metapage at initialization */
+       BlockNumber rm_pagesPerRange;
+       BlockNumber rm_lastRevmapPage; /* cached from the metapage */
+       /* metapage buffer; pinned at initialization, released at termination */
+       Buffer          rm_metaBuf;
+       /* most recently read revmap page buffer, or InvalidBuffer if none */
+       Buffer          rm_currBuf;
+};
+
+/* typedef appears in brin_revmap.h */
+
+
+static BlockNumber revmap_get_blkno(BrinRevmap *revmap,
+                                 BlockNumber heapBlk);
+static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk);
+static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap,
+                                                       BlockNumber heapBlk);
+static void revmap_physical_extend(BrinRevmap *revmap);
+
+/*
+ * Build an access object for the range map of the given index.  The index's
+ * pages-per-range setting is also handed back through *pagesPerRange.
+ *
+ * The result keeps a pin on the metapage and must be freed with
+ * brinRevmapTerminate once the caller is done with it.
+ */
+BrinRevmap *
+brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
+{
+       BrinRevmap *result;
+       Buffer          metabuf;
+       BrinMetaPageData *md;
+
+       metabuf = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
+
+       result = palloc(sizeof(BrinRevmap));
+       result->rm_irel = idxrel;
+       result->rm_metaBuf = metabuf;
+       result->rm_currBuf = InvalidBuffer;
+
+       /* Copy what we need out of the metapage while holding a share lock. */
+       LockBuffer(metabuf, BUFFER_LOCK_SHARE);
+       md = (BrinMetaPageData *) PageGetContents(BufferGetPage(metabuf));
+       result->rm_pagesPerRange = md->pagesPerRange;
+       result->rm_lastRevmapPage = md->lastRevmapPage;
+       *pagesPerRange = md->pagesPerRange;
+       LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+       return result;
+}
+
+/*
+ * Dispose of a revmap access object: drop the buffer pins it holds and free
+ * the object itself.
+ */
+void
+brinRevmapTerminate(BrinRevmap *revmap)
+{
+       Buffer          cur = revmap->rm_currBuf;
+
+       /* Drop the pin on the revmap page we last visited, if there is one. */
+       if (cur != InvalidBuffer)
+               ReleaseBuffer(cur);
+
+       /* The metapage has been pinned since the object was initialized. */
+       ReleaseBuffer(revmap->rm_metaBuf);
+
+       pfree(revmap);
+}
+
+/*
+ * Extend the revmap to cover the given heap block number.
+ *
+ * The block number obtained from revmap_extend_and_get_blkno is only
+ * sanity-checked here; nothing is returned to the caller.
+ */
+void
+brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+       /* only read inside Assert(), hence the annotation */
+       BlockNumber mapBlk PG_USED_FOR_ASSERTS_ONLY;
+
+       mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk);
+
+       /* Ensure the buffer we got is in the expected range */
+       Assert(mapBlk != InvalidBlockNumber &&
+                  mapBlk != BRIN_METAPAGE_BLKNO &&
+                  mapBlk <= revmap->rm_lastRevmapPage);
+}
+
+/*
+ * Get ready to store an entry in the revmap: the revmap buffer that holds the
+ * entry for heapBlk is exclusively locked and returned.  This routine does
+ * not extend the revmap if it is too short, so most callers should call
+ * brinRevmapExtend first.
+ *
+ * The returned buffer is also recorded in the revmap struct; finishing that
+ * releases the buffer, therefore the caller needn't do it explicitly.
+ */
+Buffer
+brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+       Buffer          revmapbuf = revmap_get_buffer(revmap, heapBlk);
+
+       LockBuffer(revmapbuf, BUFFER_LOCK_EXCLUSIVE);
+       return revmapbuf;
+}
+
+/*
+ * Store the given TID in the revmap element for heap block number heapBlk.
+ * buf is the revmap buffer holding that element (locked appropriately by the
+ * caller), in a BRIN index of pagesPerRange pages per range.
+ *
+ * Once the operation is complete, the caller must update the LSN on the
+ * given buffer.
+ *
+ * This is used both in regular operation and during WAL replay.
+ */
+void
+brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
+                                               BlockNumber heapBlk, ItemPointerData tid)
+{
+       RevmapContents *contents;
+       ItemPointerData *slot;
+
+       /* The correct page should already be pinned and locked */
+       contents = (RevmapContents *) PageGetContents(BufferGetPage(buf));
+
+       /* Locate this range's element within the page's TID array */
+       slot = (ItemPointerData *) contents->rm_tids +
+               HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);
+
+       ItemPointerSet(slot,
+                                  ItemPointerGetBlockNumber(&tid),
+                                  ItemPointerGetOffsetNumber(&tid));
+}
+
+/*
+ * Fetch the BrinTuple for a given heap block.
+ *
+ * The buffer containing the tuple is locked, and returned in *buf. As an
+ * optimization, the caller can pass a pinned buffer *buf on entry, which will
+ * avoid a pin-unpin cycle when the next tuple is on the same page as a
+ * previous one.
+ *
+ * If no tuple is found for the given heap range, returns NULL. In that case,
+ * *buf might still be updated, but it's not locked.
+ *
+ * The output tuple offset within the buffer is returned in *off, and its size
+ * is returned in *size.  size may be NULL if the caller does not want it.
+ *
+ * mode is the lock mode passed to LockBuffer for the buffer holding the
+ * tuple.  The returned pointer points into that buffer's page, not at a copy.
+ *
+ * Note that *off is set to InvalidOffsetNumber only when the revmap does not
+ * cover the heap block; when the revmap entry itself is invalid, *off is
+ * left as it was.
+ */
+BrinTuple *
+brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
+                                                Buffer *buf, OffsetNumber *off, Size *size, int mode)
+{
+       Relation        idxRel = revmap->rm_irel;
+       BlockNumber mapBlk;
+       RevmapContents *contents;
+       ItemPointerData *iptr;
+       BlockNumber blk;
+       Page            page;
+       ItemId          lp;
+       BrinTuple  *tup;
+       ItemPointerData previptr;
+
+       /* normalize the heap block number to be the first page in the range */
+       heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
+
+       /* Compute the revmap page number we need */
+       mapBlk = revmap_get_blkno(revmap, heapBlk);
+       if (mapBlk == InvalidBlockNumber)
+       {
+               /* the revmap, as cached in our access struct, doesn't reach that far */
+               *off = InvalidOffsetNumber;
+               return NULL;
+       }
+
+       ItemPointerSetInvalid(&previptr);
+       for (;;)
+       {
+               CHECK_FOR_INTERRUPTS();
+
+               /* Make rm_currBuf hold the revmap page we need, reusing the pin if possible */
+               if (revmap->rm_currBuf == InvalidBuffer ||
+                       BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
+               {
+                       if (revmap->rm_currBuf != InvalidBuffer)
+                               ReleaseBuffer(revmap->rm_currBuf);
+
+                       Assert(mapBlk != InvalidBlockNumber);
+                       revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
+               }
+
+               LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);
+
+               /* Find the revmap element for this range */
+               contents = (RevmapContents *)
+                       PageGetContents(BufferGetPage(revmap->rm_currBuf));
+               iptr = contents->rm_tids;
+               iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
+
+               /* An invalid TID means the revmap has no tuple recorded for this range */
+               if (!ItemPointerIsValid(iptr))
+               {
+                       LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+                       return NULL;
+               }
+
+               /*
+                * Check the TID we got in a previous iteration, if any, and save the
+                * current TID we got from the revmap; if we loop, we can sanity-check
+                * that the next one we get is different.  Otherwise we might be stuck
+                * looping forever if the revmap is somehow badly broken.
+                */
+               if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INDEX_CORRUPTED),
+                                        errmsg_internal("corrupted BRIN index: inconsistent range map")));
+               previptr = *iptr;
+
+               /* Copy the TID out before the revmap page is unlocked */
+               blk = ItemPointerGetBlockNumber(iptr);
+               *off = ItemPointerGetOffsetNumber(iptr);
+
+               LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+
+               /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
+               if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
+               {
+                       if (BufferIsValid(*buf))
+                               ReleaseBuffer(*buf);
+                       *buf = ReadBuffer(idxRel, blk);
+               }
+               LockBuffer(*buf, mode);
+               page = BufferGetPage(*buf);
+
+               /*
+                * Only look at the item if this is a regular page.  If we landed on
+                * a page that is not regular (e.g. one taken over by the revmap), or
+                * the item is unused or belongs to a different range, fall through
+                * and start over.
+                */
+               if (BRIN_IS_REGULAR_PAGE(page))
+               {
+                       lp = PageGetItemId(page, *off);
+                       if (ItemIdIsUsed(lp))
+                       {
+                               tup = (BrinTuple *) PageGetItem(page, lp);
+
+                               if (tup->bt_blkno == heapBlk)
+                               {
+                                       if (size)
+                                               *size = ItemIdGetLength(lp);
+                                       /* found it! */
+                                       return tup;
+                               }
+                       }
+               }
+
+               /*
+                * No luck. Assume that the revmap was updated concurrently.
+                */
+               LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+       }
+       /* not reached, but keep compiler quiet */
+       return NULL;
+}
+
+/*
+ * Translate a heap block number into the physical block number of the revmap
+ * page that holds its entry.  InvalidBlockNumber is returned when that revmap
+ * page lies beyond the last revmap page known to this access object, i.e. it
+ * has not been allocated yet.
+ */
+static BlockNumber
+revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+       BlockNumber physblk;
+
+       /* logical revmap page number, plus one to step over the metapage */
+       physblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+
+       /* Past the end of the revmap: the page doesn't exist yet */
+       if (physblk > revmap->rm_lastRevmapPage)
+               return InvalidBlockNumber;
+
+       /* Normal case: the revmap page is already allocated */
+       return physblk;
+}
+
+/*
+ * Return a buffer containing the revmap page that covers the given heap
+ * block.  The revmap must already have been extended far enough to cover
+ * that block; otherwise an error is raised.  The buffer is also remembered in
+ * the revmap struct; finishing that releases the buffer, therefore the caller
+ * needn't do it explicitly.
+ */
+static Buffer
+revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+       /* Physical index location of the revmap page for this heap block */
+       BlockNumber physBlk = revmap_get_blkno(revmap, heapBlk);
+
+       if (physBlk == InvalidBlockNumber)
+               elog(ERROR, "revmap does not cover heap block %u", heapBlk);
+
+       /* Sanity: never the metapage, never past the end of the revmap */
+       Assert(physBlk != BRIN_METAPAGE_BLKNO);
+       Assert(physBlk <= revmap->rm_lastRevmapPage);
+
+       BRIN_elog(DEBUG2, "getting revmap page for logical page %lu (physical %u) for heap %u",
+                         HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk),
+                         physBlk, heapBlk);
+
+       /* Fast path: the buffer we are already holding is the right one */
+       if (revmap->rm_currBuf != InvalidBuffer &&
+               BufferGetBlockNumber(revmap->rm_currBuf) == physBlk)
+               return revmap->rm_currBuf;
+
+       /* Otherwise drop whatever we hold (if anything) and read the page */
+       if (revmap->rm_currBuf != InvalidBuffer)
+               ReleaseBuffer(revmap->rm_currBuf);
+       revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, physBlk);
+
+       return revmap->rm_currBuf;
+}
+
+/*
+ * Like revmap_get_blkno, but instead of giving up when the revmap page for
+ * the given heap block has not been allocated, keep extending the revmap
+ * until it has.  Returns the physical revmap block number.
+ */
+static BlockNumber
+revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+       /* logical revmap page number, plus one to step over the metapage */
+       BlockNumber physBlk =
+               HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+
+       /*
+        * A single call to revmap_physical_extend may not make progress, so
+        * loop until the cached last-revmap-page value reaches our target.
+        */
+       for (;;)
+       {
+               if (physBlk <= revmap->rm_lastRevmapPage)
+                       break;
+
+               CHECK_FOR_INTERRUPTS();
+               revmap_physical_extend(revmap);
+       }
+
+       return physBlk;
+}
+
+/*
+ * Try to extend the revmap by one page.  This might not happen for a number of
+ * reasons; caller is expected to retry until the expected outcome is obtained.
+ *
+ * The function returns without having extended the revmap when:
+ *   - the cached rm_lastRevmapPage was stale (the cache is refreshed);
+ *   - the relation was extended concurrently, so the new page we obtained is
+ *     not the one immediately following the revmap;
+ *   - the target page holds regular tuples and had to be evacuated first.
+ *
+ * On every return path the metapage lock is released and no pin or lock on
+ * the target page taken here is left behind by this function's own code.
+ */
+static void
+revmap_physical_extend(BrinRevmap *revmap)
+{
+       Buffer          buf;
+       Page            page;
+       Page            metapage;
+       BrinMetaPageData *metadata;
+       BlockNumber mapBlk;
+       BlockNumber nblocks;
+       Relation        irel = revmap->rm_irel;
+       bool            needLock = !RELATION_IS_LOCAL(irel);
+
+       /*
+        * Lock the metapage. This locks out concurrent extensions of the revmap,
+        * but note that we still need to grab the relation extension lock because
+        * another backend can extend the index with regular BRIN pages.
+        */
+       LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
+       metapage = BufferGetPage(revmap->rm_metaBuf);
+       metadata = (BrinMetaPageData *) PageGetContents(metapage);
+
+       /*
+        * Check that our cached lastRevmapPage value was up-to-date; if it
+        * wasn't, update the cached copy and have caller start over.
+        */
+       if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage)
+       {
+               revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
+               LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+               return;
+       }
+
+       /* The new revmap page must be the one right after the current last */
+       mapBlk = metadata->lastRevmapPage + 1;
+
+       nblocks = RelationGetNumberOfBlocks(irel);
+       if (mapBlk < nblocks)
+       {
+               /* The block already exists in the relation; just lock it */
+               buf = ReadBuffer(irel, mapBlk);
+               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+               page = BufferGetPage(buf);
+       }
+       else
+       {
+               /* The block does not exist yet; extend the relation */
+               if (needLock)
+                       LockRelationForExtension(irel, ExclusiveLock);
+
+               buf = ReadBuffer(irel, P_NEW);
+               if (BufferGetBlockNumber(buf) != mapBlk)
+               {
+                       /*
+                        * Very rare corner case: somebody extended the relation
+                        * concurrently after we read its length.  If this happens, give
+                        * up and have caller start over.  We will have to evacuate that
+                        * page from under whoever is using it.
+                        */
+                       if (needLock)
+                               UnlockRelationForExtension(irel, ExclusiveLock);
+                       LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+
+                       /*
+                        * Drop the pin on the page we just added; we never locked it
+                        * and will not use it, and returning without this would leak
+                        * the buffer pin.
+                        */
+                       ReleaseBuffer(buf);
+                       return;
+               }
+               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+               page = BufferGetPage(buf);
+
+               if (needLock)
+                       UnlockRelationForExtension(irel, ExclusiveLock);
+       }
+
+       /* Check that it's a regular block (or an empty page) */
+       if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page))
+               ereport(ERROR,
+                               (errcode(ERRCODE_INDEX_CORRUPTED),
+                                errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u",
+                                               BRIN_PAGE_TYPE(page),
+                                               RelationGetRelationName(irel),
+                                               BufferGetBlockNumber(buf))));
+
+       /* If the page is in use, evacuate it and restart */
+       if (brin_start_evacuating_page(irel, buf))
+       {
+               /* release the metapage before the (possibly long) evacuation */
+               LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+               brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf);
+
+               /* have caller start over */
+               return;
+       }
+
+       /*
+        * Ok, we have now locked the metapage and the target block. Re-initialize
+        * it as a revmap page.
+        */
+       START_CRIT_SECTION();
+
+       /* the rm_tids array is initialized to all invalid by PageInit */
+       brin_page_init(page, BRIN_PAGETYPE_REVMAP);
+       MarkBufferDirty(buf);
+
+       metadata->lastRevmapPage = mapBlk;
+       MarkBufferDirty(revmap->rm_metaBuf);
+
+       if (RelationNeedsWAL(revmap->rm_irel))
+       {
+               xl_brin_revmap_extend xlrec;
+               XLogRecPtr      recptr;
+               XLogRecData rdata[2];
+
+               xlrec.node = revmap->rm_irel->rd_node;
+               xlrec.targetBlk = mapBlk;
+               rdata[0].data = (char *) &xlrec;
+               rdata[0].len = SizeOfBrinRevmapExtend;
+               rdata[0].buffer = InvalidBuffer;
+               rdata[0].buffer_std = false;
+               rdata[0].next = &(rdata[1]);
+
+               /* second element carries no data, only the metapage buffer */
+               rdata[1].data = (char *) NULL;
+               rdata[1].len = 0;
+               rdata[1].buffer = revmap->rm_metaBuf;
+               rdata[1].buffer_std = false;
+               rdata[1].next = NULL;
+
+               recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata);
+               PageSetLSN(metapage, recptr);
+               PageSetLSN(page, recptr);
+       }
+
+       END_CRIT_SECTION();
+
+       LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+
+       UnlockReleaseBuffer(buf);
+}
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c
new file mode 100644 (file)
index 0000000..d895cb7
--- /dev/null
@@ -0,0 +1,554 @@
+/*
+ * brin_tuple.c
+ *             Method implementations for tuples in BRIN indexes.
+ *
+ * Intended usage is that code outside this file only deals with
+ * BrinMemTuples, and convert to and from the on-disk representation through
+ * functions in this file.
+ *
+ * NOTES
+ *
+ * A BRIN tuple is similar to a heap tuple, with a few key differences.  The
+ * first interesting difference is that the tuple header is much simpler, only
+ * containing its total length and a small area for flags.  Also, the stored
+ * data does not match the relation tuple descriptor exactly: for each
+ * attribute in the descriptor, the index tuple carries an arbitrary number
+ * of values, depending on the opclass.
+ *
+ * Also, for each column of the index relation there are two null bits: one
+ * (hasnulls) stores whether any tuple within the page range has that column
+ * set to null; the other one (allnulls) stores whether the column values are
+ * all null.  If allnulls is true, then the tuple data area does not contain
+ * values for that column at all; whereas it does if the hasnulls is set.
+ * Note the size of the null bitmask may not be the same as that of the
+ * datum array.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/brin/brin_tuple.c
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/brin_tuple.h"
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+
+
+static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
+                                          char *tp, bits8 *nullbits, bool nulls,
+                                          Datum *values, bool *allnulls, bool *hasnulls);
+
+
+/*
+ * Return the tuple descriptor describing the on-disk data area of BRIN
+ * tuples: one attribute per stored datum, for every indexed column.
+ *
+ * The descriptor is built on first use, in the BrinDesc's memory context, and
+ * cached in the BrinDesc for subsequent calls.
+ */
+static TupleDesc
+brtuple_disk_tupdesc(BrinDesc *brdesc)
+{
+       TupleDesc       diskdesc;
+       MemoryContext callercxt;
+       AttrNumber      nextatt;
+       int                     col;
+
+       /* Already built?  Hand back the cached copy. */
+       if (brdesc->bd_disktdesc != NULL)
+               return brdesc->bd_disktdesc;
+
+       /* Build it in the BrinDesc's own context, so it lives as long */
+       callercxt = MemoryContextSwitchTo(brdesc->bd_context);
+
+       diskdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored, false);
+
+       nextatt = 1;
+       for (col = 0; col < brdesc->bd_tupdesc->natts; col++)
+       {
+               int                     k;
+
+               for (k = 0; k < brdesc->bd_info[col]->oi_nstored; k++)
+               {
+                       TupleDescInitEntry(diskdesc, nextatt, NULL,
+                                                          brdesc->bd_info[col]->oi_typids[k],
+                                                          -1, 0);
+                       nextatt++;
+               }
+       }
+
+       MemoryContextSwitchTo(callercxt);
+
+       brdesc->bd_disktdesc = diskdesc;
+       return diskdesc;
+}
+
+/*
+ * Generate a new on-disk tuple to be inserted in a BRIN index.
+ *
+ *     brdesc          BRIN descriptor of the index
+ *     blkno           heap block number stored in the tuple header
+ *     tuple           in-memory tuple to convert
+ *     size            output: total length of the returned tuple
+ *
+ * The result is palloc'd in the current memory context.
+ *
+ * See brin_form_placeholder_tuple if you touch this.
+ */
+BrinTuple *
+brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
+                               Size *size)
+{
+       Datum      *values;
+       bool       *nulls;
+       bool            anynulls = false;
+       BrinTuple  *rettuple;
+       int                     keyno;
+       int                     idxattno;
+       /* initialized because heap_fill_tuple updates the mask in place */
+       uint16          phony_infomask = 0;
+       bits8      *phony_nullbitmap;
+       Size            len,
+                               hoff,
+                               data_len;
+
+       Assert(brdesc->bd_totalstored > 0);
+
+       values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
+       nulls = palloc0(sizeof(bool) * brdesc->bd_totalstored);
+       phony_nullbitmap = palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
+
+       /*
+        * Set up the values/nulls arrays for heap_fill_tuple
+        */
+       idxattno = 0;
+       for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+       {
+               int                     datumno;
+
+               /*
+                * "allnulls" is set when there's no nonnull value in any row in the
+                * column; when this happens, there is no data to store.  Thus set the
+                * nullable bits for all data elements of this column and we're done.
+                */
+               if (tuple->bt_columns[keyno].bv_allnulls)
+               {
+                       for (datumno = 0;
+                                datumno < brdesc->bd_info[keyno]->oi_nstored;
+                                datumno++)
+                               nulls[idxattno++] = true;
+                       anynulls = true;
+                       continue;
+               }
+
+               /*
+                * The "hasnulls" bit is set when there are some null values in the
+                * data.  We still need to store a real value, but the presence of
+                * this means we need a null bitmap.
+                */
+               if (tuple->bt_columns[keyno].bv_hasnulls)
+                       anynulls = true;
+
+               for (datumno = 0;
+                        datumno < brdesc->bd_info[keyno]->oi_nstored;
+                        datumno++)
+                       values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno];
+       }
+
+       /* compute total space needed */
+       len = SizeOfBrinTuple;
+       if (anynulls)
+       {
+               /*
+                * We need a double-length bitmap on an on-disk BRIN index tuple; the
+                * first half stores the "allnulls" bits, the second stores
+                * "hasnulls".
+                */
+               len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
+       }
+
+       /* the data area starts at the next MAXALIGN boundary after the header */
+       len = hoff = MAXALIGN(len);
+
+       data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc),
+                                                                         values, nulls);
+
+       len += data_len;
+
+       rettuple = palloc0(len);
+       rettuple->bt_blkno = blkno;
+       rettuple->bt_info = hoff;
+       Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff);
+
+       /*
+        * The infomask and null bitmap as computed by heap_fill_tuple are useless
+        * to us.  However, that function will not accept a null infomask; and we
+        * need to pass a valid null bitmap so that it will correctly skip
+        * outputting null attributes in the data area.
+        */
+       heap_fill_tuple(brtuple_disk_tupdesc(brdesc),
+                                       values,
+                                       nulls,
+                                       (char *) rettuple + hoff,
+                                       data_len,
+                                       &phony_infomask,
+                                       phony_nullbitmap);
+
+       /* done with these */
+       pfree(values);
+       pfree(nulls);
+       pfree(phony_nullbitmap);
+
+       /*
+        * Now fill in the real null bitmasks.  allnulls first.
+        */
+       if (anynulls)
+       {
+               bits8      *bitP;
+               int                     bitmask;
+
+               rettuple->bt_info |= BRIN_NULLS_MASK;
+
+               /*
+                * Note that we reverse the sense of null bits in this module: we
+                * store a 1 for a null attribute rather than a 0.  So we must reverse
+                * the sense of the att_isnull test in brin_deconstruct_tuple as well.
+                */
+               bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
+               bitmask = HIGHBIT;
+               for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+               {
+                       /* advance to the next bit, moving to a fresh byte if needed */
+                       if (bitmask != HIGHBIT)
+                               bitmask <<= 1;
+                       else
+                       {
+                               bitP += 1;
+                               *bitP = 0x0;
+                               bitmask = 1;
+                       }
+
+                       if (!tuple->bt_columns[keyno].bv_allnulls)
+                               continue;
+
+                       *bitP |= bitmask;
+               }
+               /* hasnulls bits follow, continuing from the same bit position */
+               for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+               {
+                       if (bitmask != HIGHBIT)
+                               bitmask <<= 1;
+                       else
+                       {
+                               bitP += 1;
+                               *bitP = 0x0;
+                               bitmask = 1;
+                       }
+
+                       if (!tuple->bt_columns[keyno].bv_hasnulls)
+                               continue;
+
+                       *bitP |= bitmask;
+               }
+       }
+
+       if (tuple->bt_placeholder)
+               rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
+
+       *size = len;
+       return rettuple;
+}
+
+/*
+ * Generate a new on-disk tuple with no data values, marked as placeholder.
+ *
+ * The tuple consists only of the header and the null bitmask, in which every
+ * column has its "allnulls" bit set and no "hasnulls" bit set.  The total
+ * length is returned in *size.
+ *
+ * This is a cut-down version of brin_form_tuple.
+ */
+BrinTuple *
+brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
+{
+       int                     natts = brdesc->bd_tupdesc->natts;
+       Size            tuplen;
+       BrinTuple  *placeholder;
+       bits8      *nullmask;
+       int                     attno;
+
+       /* header plus the double-length null bitmask, always present here */
+       tuplen = MAXALIGN(SizeOfBrinTuple + BITMAPLEN(natts * 2));
+
+       placeholder = palloc0(tuplen);
+       placeholder->bt_blkno = blkno;
+       placeholder->bt_info = tuplen;
+       placeholder->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK;
+
+       /*
+        * Set allnulls true for all attributes.  The tuple was zero-filled by
+        * palloc0, so only the bits to be set need touching; the hasnulls bits
+        * that follow stay zero.
+        */
+       nullmask = (bits8 *) ((char *) placeholder + SizeOfBrinTuple);
+       for (attno = 0; attno < natts; attno++)
+               nullmask[attno / 8] |= (bits8) (1 << (attno % 8));
+
+       *size = tuplen;
+       return placeholder;
+}
+
+/*
+ * Release the memory of an on-disk-format tuple, such as one returned by
+ * brin_form_tuple.
+ */
+void
+brin_free_tuple(BrinTuple *tuple)
+{
+       pfree(tuple);
+}
+
+/*
+ * Create a palloc'd copy of a BrinTuple.  "len" is the total length of the
+ * tuple in bytes.
+ */
+BrinTuple *
+brin_copy_tuple(BrinTuple *tuple, Size len)
+{
+       BrinTuple  *dup = palloc(len);
+
+       /* memcpy hands back its destination pointer */
+       return (BrinTuple *) memcpy(dup, tuple, len);
+}
+
+/*
+ * Return whether two BrinTuples are bitwise identical: same length and same
+ * bytes.
+ */
+bool
+brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen)
+{
+       /* the length test short-circuits, so memcmp only sees equal sizes */
+       return alen == blen && memcmp(a, b, alen) == 0;
+}
+
+/*
+ * Allocate a fresh BrinMemTuple and set it up in the empty state: every
+ * column is marked allnulls, with no hasnulls.
+ *
+ * The struct, its per-column entries and the per-column Datum arrays are
+ * carved out of one allocation.  A child memory context of the current
+ * context is also created and stored in bt_context.
+ *
+ * Note: we don't provide any means to free a deformed tuple, so make sure to
+ * use a temporary memory context.
+ */
+BrinMemTuple *
+brin_new_memtuple(BrinDesc *brdesc)
+{
+       int                     natts = brdesc->bd_tupdesc->natts;
+       long            hdrsize;
+       BrinMemTuple *mtup;
+       Datum      *nextslot;
+       int                     attno;
+
+       /* size of the struct plus column array, padded for Datum storage */
+       hdrsize = MAXALIGN(sizeof(BrinMemTuple) + sizeof(BrinValues) * natts);
+       mtup = palloc0(hdrsize + sizeof(Datum) * brdesc->bd_totalstored);
+
+       /* hand each column its slice of the trailing Datum area */
+       nextslot = (Datum *) ((char *) mtup + hdrsize);
+       for (attno = 0; attno < natts; attno++)
+       {
+               mtup->bt_columns[attno].bv_attno = attno + 1;
+               mtup->bt_columns[attno].bv_allnulls = true;
+               mtup->bt_columns[attno].bv_hasnulls = false;
+               mtup->bt_columns[attno].bv_values = nextslot;
+               nextslot += brdesc->bd_info[attno]->oi_nstored;
+       }
+
+       mtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
+                                                                                        "brin dtuple",
+                                                                                        ALLOCSET_DEFAULT_MINSIZE,
+                                                                                        ALLOCSET_DEFAULT_INITSIZE,
+                                                                                        ALLOCSET_DEFAULT_MAXSIZE);
+       return mtup;
+}
+
+/*
+ * Bring a BrinMemTuple back to its initial, empty state: reset its private
+ * memory context and mark every column allnulls with no hasnulls.
+ */
+void
+brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
+{
+       int                     attno;
+
+       MemoryContextReset(dtuple->bt_context);
+
+       /* columns are independent, so the iteration order is immaterial */
+       attno = brdesc->bd_tupdesc->natts;
+       while (--attno >= 0)
+       {
+               dtuple->bt_columns[attno].bv_hasnulls = false;
+               dtuple->bt_columns[attno].bv_allnulls = true;
+       }
+}
+
+/*
+ * Convert a BrinTuple back to a BrinMemTuple.  This is the reverse of
+ * brin_form_tuple.
+ *
+ * The result is a newly created BrinMemTuple (see brin_new_memtuple); the
+ * stored values are copied into its private memory context.
+ *
+ * Note that the datums are copied according to the type of each *stored*
+ * value, as described by the on-disk tuple descriptor, not according to the
+ * type of the indexed column: an opclass is free to store values whose type
+ * differs from that of the column.
+ */
+BrinMemTuple *
+brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple)
+{
+       BrinMemTuple *dtup;
+       Datum      *values;
+       bool       *allnulls;
+       bool       *hasnulls;
+       char       *tp;
+       bits8      *nullbits;
+       int                     keyno;
+       int                     valueno;
+       TupleDesc       diskdsc;
+       MemoryContext oldcxt;
+
+       dtup = brin_new_memtuple(brdesc);
+
+       if (BrinTupleIsPlaceholder(tuple))
+               dtup->bt_placeholder = true;
+       dtup->bt_blkno = tuple->bt_blkno;
+
+       values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
+       allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
+       hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
+
+       tp = (char *) tuple + BrinTupleDataOffset(tuple);
+
+       /* the null bitmask, when present, immediately follows the header */
+       if (BrinTupleHasNulls(tuple))
+               nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple);
+       else
+               nullbits = NULL;
+       brin_deconstruct_tuple(brdesc,
+                                                  tp, nullbits, BrinTupleHasNulls(tuple),
+                                                  values, allnulls, hasnulls);
+
+       /* describes the type of each entry in values[] */
+       diskdsc = brtuple_disk_tupdesc(brdesc);
+
+       /*
+        * Iterate to assign each of the values to the corresponding item in the
+        * values array of each column.  The copies occur in the tuple's context.
+        */
+       oldcxt = MemoryContextSwitchTo(dtup->bt_context);
+       for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+       {
+               int                     i;
+
+               /* nothing stored for this column; it stays marked allnulls */
+               if (allnulls[keyno])
+               {
+                       valueno += brdesc->bd_info[keyno]->oi_nstored;
+                       continue;
+               }
+
+               /*
+                * We would like to skip datumCopy'ing the values datum in some cases,
+                * caller permitting ...
+                */
+               for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++)
+               {
+                       Form_pg_attribute storedatt = diskdsc->attrs[valueno];
+
+                       dtup->bt_columns[keyno].bv_values[i] =
+                               datumCopy(values[valueno],
+                                                 storedatt->attbyval,
+                                                 storedatt->attlen);
+                       valueno++;
+               }
+
+               dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
+               dtup->bt_columns[keyno].bv_allnulls = false;
+       }
+
+       MemoryContextSwitchTo(oldcxt);
+
+       pfree(values);
+       pfree(allnulls);
+       pfree(hasnulls);
+
+       return dtup;
+}
+
+/*
+ * brin_deconstruct_tuple
+ *             Extract null flags and stored values from an on-disk BRIN tuple.
+ *
+ * Arguments:
+ *     brdesc          BRIN descriptor for the stored tuple
+ *     tp                      pointer to the tuple data area
+ *     nullbits        pointer to the tuple nulls bitmask
+ *     nulls           "has nulls" bit in tuple infomask
+ *     values          output values, array of size brdesc->bd_totalstored
+ *     allnulls        output "allnulls", size brdesc->bd_tupdesc->natts
+ *     hasnulls        output "hasnulls", size brdesc->bd_tupdesc->natts
+ *
+ * The caller must have allocated the output arrays.  Entries of values[]
+ * belonging to a column whose allnulls flag is set are left untouched.
+ */
+static inline void
+brin_deconstruct_tuple(BrinDesc *brdesc,
+                                          char *tp, bits8 *nullbits, bool nulls,
+                                          Datum *values, bool *allnulls, bool *hasnulls)
+{
+       int                     natts = brdesc->bd_tupdesc->natts;
+       TupleDesc       diskdsc;
+       int                     col;
+       int                     slot = 0;
+       long            dataoff = 0;
+
+       /*
+        * Pass one: fetch both null flags of every column.  This module stores a
+        * 1 bit for "null", the opposite of the att_isnull convention used
+        * elsewhere, hence the negations; see brin_form_tuple.  The hasnulls
+        * bits follow the allnulls bits in the same bitmask.  Without a bitmask
+        * in the tuple, no flag is set.
+        */
+       for (col = 0; col < natts; col++)
+       {
+               if (nulls)
+               {
+                       /* allnulls: no value at all stored for this column */
+                       allnulls[col] = !att_isnull(col, nullbits);
+                       /* hasnulls: some nulls seen, but values are stored */
+                       hasnulls[col] = !att_isnull(natts + col, nullbits);
+               }
+               else
+               {
+                       allnulls[col] = false;
+                       hasnulls[col] = false;
+               }
+       }
+
+       /*
+        * Pass two: walk the data area and fetch every stored value.  Since
+        * attribute entries may be reused for more than one column, offsets
+        * cannot be cached here.
+        */
+       diskdsc = brtuple_disk_tupdesc(brdesc);
+       for (col = 0; col < natts; col++)
+       {
+               int                     nstored = brdesc->bd_info[col]->oi_nstored;
+               int                     stop;
+
+               /* nothing in the data area for an all-nulls column */
+               if (allnulls[col])
+               {
+                       slot += nstored;
+                       continue;
+               }
+
+               for (stop = slot + nstored; slot < stop; slot++)
+               {
+                       Form_pg_attribute att = diskdsc->attrs[slot];
+
+                       /* varlena alignment depends on the data itself */
+                       if (att->attlen == -1)
+                               dataoff = att_align_pointer(dataoff, att->attalign, -1,
+                                                                                       tp + dataoff);
+                       else
+                               dataoff = att_align_nominal(dataoff, att->attalign);
+
+                       values[slot] = fetchatt(att, tp + dataoff);
+
+                       dataoff = att_addlength_pointer(dataoff, att->attlen, tp + dataoff);
+               }
+       }
+}
diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c
new file mode 100644 (file)
index 0000000..8dc80ad
--- /dev/null
@@ -0,0 +1,291 @@
+/*
+ * brin_xlog.c
+ *             XLog replay routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/brin/brin_xlog.c
+ */
+#include "postgres.h"
+
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/xlogutils.h"
+
+
+/*
+ * xlog replay routines
+ */
+/*
+ * Replay the creation of a BRIN index.
+ *
+ * The metapage (block BRIN_METAPAGE_BLKNO) is rebuilt from the
+ * pagesPerRange and version values carried in the WAL record, and stamped
+ * with the record's LSN.
+ */
+static void
+brin_xlog_createidx(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record);
+       Buffer          buf;
+       Page            page;
+
+       /* Backup blocks are not used in create_index records */
+       Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
+       /* create the index' metapage */
+       buf = XLogReadBuffer(xlrec->node, BRIN_METAPAGE_BLKNO, true);
+       Assert(BufferIsValid(buf));
+       page = (Page) BufferGetPage(buf);
+       /* the page is initialized unconditionally; there is no LSN check here */
+       brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version);
+       PageSetLSN(page, lsn);
+       MarkBufferDirty(buf);
+       UnlockReleaseBuffer(buf);
+}
+
+/*
+ * Common part of an insert or update. Inserts the new tuple and updates the
+ * revmap.
+ *
+ * xlrec describes the insertion: xlrec->tid gives the index block and offset
+ * at which the tuple is placed, xlrec->heapBlk the heap block the tuple
+ * belongs to, and xlrec->revmapBlk the revmap page to update.  tuple is the
+ * new index tuple, of length xlrec->tuplen.
+ *
+ * The regular page is block reference 0 of the record and the revmap page is
+ * block reference 1.  Each buffer is released before returning.
+ */
+static void
+brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record,
+                                               xl_brin_insert *xlrec, BrinTuple *tuple)
+{
+       BlockNumber blkno;
+       Buffer          buffer;
+       Page            page;
+       XLogRedoAction action;
+
+       blkno = ItemPointerGetBlockNumber(&xlrec->tid);
+
+       /*
+        * If we inserted the first and only tuple on the page, re-initialize the
+        * page from scratch.
+        */
+       if (record->xl_info & XLOG_BRIN_INIT_PAGE)
+       {
+               /*
+                * The result of the read is deliberately ignored: the page is
+                * reinitialized below and the insertion is always redone.
+                */
+               XLogReadBufferForRedoExtended(lsn, record, 0,
+                                                                         xlrec->node, MAIN_FORKNUM, blkno,
+                                                                         RBM_ZERO, false, &buffer);
+               page = BufferGetPage(buffer);
+               brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+               action = BLK_NEEDS_REDO;
+       }
+       else
+       {
+               action = XLogReadBufferForRedo(lsn, record, 0,
+                                                                          xlrec->node, blkno, &buffer);
+       }
+
+       /* insert the index item into the page */
+       if (action == BLK_NEEDS_REDO)
+       {
+               OffsetNumber offnum;
+
+               Assert(tuple->bt_blkno == xlrec->heapBlk);
+
+               page = (Page) BufferGetPage(buffer);
+               offnum = ItemPointerGetOffsetNumber(&(xlrec->tid));
+               /* the target may be at most one past the page's current last item */
+               if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+                       elog(PANIC, "brin_xlog_insert_update: invalid max offset number");
+
+               offnum = PageAddItem(page, (Item) tuple, xlrec->tuplen, offnum, true,
+                                                        false);
+               if (offnum == InvalidOffsetNumber)
+                       elog(PANIC, "brin_xlog_insert_update: failed to add tuple");
+
+               PageSetLSN(page, lsn);
+               MarkBufferDirty(buffer);
+       }
+       if (BufferIsValid(buffer))
+               UnlockReleaseBuffer(buffer);
+
+       /* update the revmap */
+       action = XLogReadBufferForRedo(lsn, record, 1, xlrec->node,
+                                                                  xlrec->revmapBlk, &buffer);
+       if (action == BLK_NEEDS_REDO)
+       {
+               page = (Page) BufferGetPage(buffer);
+
+               /* record xlrec->tid as the revmap entry for heapBlk's range */
+               brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk,
+                                                               xlrec->tid);
+               PageSetLSN(page, lsn);
+               MarkBufferDirty(buffer);
+       }
+       if (BufferIsValid(buffer))
+               UnlockReleaseBuffer(buffer);
+
+       /* XXX no FSM updates here ... */
+}
+
+/*
+ * Replay a BRIN index insertion.
+ *
+ * The new index tuple is located SizeOfBrinInsert bytes past the start of
+ * the record data; all the actual work is done by brin_xlog_insert_update.
+ */
+static void
+brin_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record);
+       char       *payload = (char *) xlrec + SizeOfBrinInsert;
+
+       brin_xlog_insert_update(lsn, record, xlrec, (BrinTuple *) payload);
+}
+
+/*
+ * replay a BRIN index update
+ *
+ * The old tuple, at xlrec->oldtid, is deleted from its page (block reference
+ * 2 of the record); the new tuple, stored SizeOfBrinUpdate bytes past the
+ * start of the record data, is then inserted and the revmap updated exactly
+ * as for a plain insertion.  The old page's buffer is not released until
+ * after the insertion and revmap update have been replayed.
+ */
+static void
+brin_xlog_update(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record);
+       BlockNumber blkno;
+       Buffer          buffer;
+       BrinTuple  *newtup;
+       XLogRedoAction action;
+
+       newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinUpdate);
+
+       /* First remove the old tuple */
+       blkno = ItemPointerGetBlockNumber(&(xlrec->oldtid));
+       action = XLogReadBufferForRedo(lsn, record, 2, xlrec->new.node,
+                                                                  blkno, &buffer);
+       if (action == BLK_NEEDS_REDO)
+       {
+               Page            page;
+               OffsetNumber offnum;
+
+               page = (Page) BufferGetPage(buffer);
+
+               offnum = ItemPointerGetOffsetNumber(&(xlrec->oldtid));
+               if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+                       elog(PANIC, "brin_xlog_update: invalid max offset number");
+
+               /* delete without renumbering the other items on the page */
+               PageIndexDeleteNoCompact(page, &offnum, 1);
+
+               PageSetLSN(page, lsn);
+               MarkBufferDirty(buffer);
+       }
+
+       /* Then insert the new tuple and update revmap, like in an insertion. */
+       brin_xlog_insert_update(lsn, record, &xlrec->new, newtup);
+
+       /* only now let go of the old tuple's page */
+       if (BufferIsValid(buffer))
+               UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Update a tuple on a single page.
+ *
+ * The tuple at xlrec->tid is deleted and the replacement tuple is added at
+ * the same offset number.  The replacement tuple occupies the rest of the
+ * record data after the first SizeOfBrinSamepageUpdate bytes, so its length
+ * is derived from the record length.  The revmap is not touched here.
+ */
+static void
+brin_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_brin_samepage_update *xlrec;
+       BlockNumber blkno;
+       Buffer          buffer;
+       XLogRedoAction action;
+
+       xlrec = (xl_brin_samepage_update *) XLogRecGetData(record);
+       blkno = ItemPointerGetBlockNumber(&(xlrec->tid));
+       action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, blkno,
+                                                                  &buffer);
+       if (action == BLK_NEEDS_REDO)
+       {
+               int                     tuplen;
+               BrinTuple  *mmtuple;
+               Page            page;
+               OffsetNumber offnum;
+
+               /* everything after the fixed-size part of the record is the tuple */
+               tuplen = record->xl_len - SizeOfBrinSamepageUpdate;
+               mmtuple = (BrinTuple *) ((char *) xlrec + SizeOfBrinSamepageUpdate);
+
+               page = (Page) BufferGetPage(buffer);
+
+               offnum = ItemPointerGetOffsetNumber(&(xlrec->tid));
+               if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+                       elog(PANIC, "brin_xlog_samepage_update: invalid max offset number");
+
+               /* drop the old version, then put the new one at the same offset */
+               PageIndexDeleteNoCompact(page, &offnum, 1);
+               offnum = PageAddItem(page, (Item) mmtuple, tuplen, offnum, true, false);
+               if (offnum == InvalidOffsetNumber)
+                       elog(PANIC, "brin_xlog_samepage_update: failed to add tuple");
+
+               PageSetLSN(page, lsn);
+               MarkBufferDirty(buffer);
+       }
+       if (BufferIsValid(buffer))
+               UnlockReleaseBuffer(buffer);
+
+       /* XXX no FSM updates here ... */
+}
+
+/*
+ * Replay a revmap page extension
+ *
+ * The metapage's lastRevmapPage is advanced to xlrec->targetBlk, and the
+ * target block is initialized as a revmap page.  The metapage buffer is kept
+ * until the target page has been initialized.
+ */
+static void
+brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_brin_revmap_extend *xlrec;
+       Buffer          metabuf;
+       Buffer          buf;
+       Page            page;
+       XLogRedoAction action;
+
+       xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record);
+       /* Update the metapage */
+       action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node,
+                                                                  BRIN_METAPAGE_BLKNO, &metabuf);
+       if (action == BLK_NEEDS_REDO)
+       {
+               Page            metapg;
+               BrinMetaPageData *metadata;
+
+               metapg = BufferGetPage(metabuf);
+               metadata = (BrinMetaPageData *) PageGetContents(metapg);
+
+               /* the revmap is expected to grow by exactly one block at a time */
+               Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
+               metadata->lastRevmapPage = xlrec->targetBlk;
+
+               PageSetLSN(metapg, lsn);
+               MarkBufferDirty(metabuf);
+       }
+
+       /*
+        * Re-init the target block as a revmap page.  There's never a full-page
+        * image here.
+        */
+
+       buf = XLogReadBuffer(xlrec->node, xlrec->targetBlk, true);
+       page = (Page) BufferGetPage(buf);
+       brin_page_init(page, BRIN_PAGETYPE_REVMAP);
+
+       PageSetLSN(page, lsn);
+       MarkBufferDirty(buf);
+
+       UnlockReleaseBuffer(buf);
+       if (BufferIsValid(metabuf))
+               UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * brin_redo
+ *             Entry point for replaying a BRIN WAL record: hand the record to
+ *             the replay routine that matches its operation code.
+ *
+ * An unrecognized operation code is reported with PANIC.
+ */
+void
+brin_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+       uint8           info = record->xl_info & ~XLR_INFO_MASK;
+       uint8           opcode = info & XLOG_BRIN_OPMASK;
+
+       if (opcode == XLOG_BRIN_CREATE_INDEX)
+               brin_xlog_createidx(lsn, record);
+       else if (opcode == XLOG_BRIN_INSERT)
+               brin_xlog_insert(lsn, record);
+       else if (opcode == XLOG_BRIN_UPDATE)
+               brin_xlog_update(lsn, record);
+       else if (opcode == XLOG_BRIN_SAMEPAGE_UPDATE)
+               brin_xlog_samepage_update(lsn, record);
+       else if (opcode == XLOG_BRIN_REVMAP_EXTEND)
+               brin_xlog_revmap_extend(lsn, record);
+       else
+       {
+               /* the message shows info before XLOG_BRIN_OPMASK is applied */
+               elog(PANIC, "brin_redo: unknown op code %u", info);
+       }
+}
index e0b81b9eb5139e8db3c4de25ca3bec7ffaab21bb..c55a7758273cab58a0f97e336b7f562d4d21617b 100644 (file)
@@ -209,6 +209,13 @@ static relopt_int intRelOpts[] =
                        RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
                }, -1, 0, 2000000000
        },
+       {
+               {
+                       "pages_per_range",
+                       "Number of pages that each page range covers in a BRIN index",
+                       RELOPT_KIND_BRIN
+               }, 128, 1, 131072
+       },
 
        /* list terminator */
        {{NULL}}
index 8f671ac4342818623fe10d86fe4819d7531c7ddb..43098f444224a087d72543db9dce8da43fe28ef2 100644 (file)
@@ -272,6 +272,8 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
                scan->rs_startblock = 0;
        }
 
+       scan->rs_initblock = 0;
+       scan->rs_numblocks = InvalidBlockNumber;
        scan->rs_inited = false;
        scan->rs_ctup.t_data = NULL;
        ItemPointerSetInvalid(&scan->rs_ctup.t_self);
@@ -297,6 +299,14 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
                pgstat_count_heap_scan(scan->rs_rd);
 }
 
+/*
+ * heap_setscanlimits - restrict a heap scan to a range of blocks
+ *
+ * startBlk is stored both as the scan's start block and as its initial
+ * block; numBlks is stored as the number of blocks to scan.  initscan()
+ * resets these limits to block 0 and InvalidBlockNumber, the latter
+ * meaning that the number of blocks is not limited.
+ */
+void
+heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
+{
+       scan->rs_numblocks = numBlks;
+       scan->rs_initblock = startBlk;
+       scan->rs_startblock = startBlk;
+}
+
 /*
  * heapgetpage - subroutine for heapgettup()
  *
@@ -637,7 +647,8 @@ heapgettup(HeapScanDesc scan,
                 */
                if (backward)
                {
-                       finished = (page == scan->rs_startblock);
+                       finished = (page == scan->rs_startblock) ||
+                               (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
                        if (page == 0)
                                page = scan->rs_nblocks;
                        page--;
@@ -647,7 +658,8 @@ heapgettup(HeapScanDesc scan,
                        page++;
                        if (page >= scan->rs_nblocks)
                                page = 0;
-                       finished = (page == scan->rs_startblock);
+                       finished = (page == scan->rs_startblock) ||
+                               (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
 
                        /*
                         * Report our new scan position for synchronization purposes. We
@@ -898,7 +910,8 @@ heapgettup_pagemode(HeapScanDesc scan,
                 */
                if (backward)
                {
-                       finished = (page == scan->rs_startblock);
+                       finished = (page == scan->rs_startblock) ||
+                               (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
                        if (page == 0)
                                page = scan->rs_nblocks;
                        page--;
@@ -908,7 +921,8 @@ heapgettup_pagemode(HeapScanDesc scan,
                        page++;
                        if (page >= scan->rs_nblocks)
                                page = 0;
-                       finished = (page == scan->rs_startblock);
+                       finished = (page == scan->rs_startblock) ||
+                               (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
 
                        /*
                         * Report our new scan position for synchronization purposes. We
index 7d092d205d6083404a50a09c4a4e3e7aae268469..32cb985036c2d2124b750abfdb97f8c25becc843 100644 (file)
@@ -8,7 +8,8 @@ subdir = src/backend/access/rmgrdesc
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \
+OBJS = brindesc.o clogdesc.o dbasedesc.o gindesc.o gistdesc.o \
+          hashdesc.o heapdesc.o \
           mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \
           standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
 
diff --git a/src/backend/access/rmgrdesc/brindesc.c b/src/backend/access/rmgrdesc/brindesc.c
new file mode 100644 (file)
index 0000000..39135bf
--- /dev/null
@@ -0,0 +1,112 @@
+/*-------------------------------------------------------------------------
+ *
+ * brindesc.c
+ *       rmgr descriptor routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       src/backend/access/rmgrdesc/brindesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/brin_xlog.h"
+
+/*
+ * brin_desc
+ *             Append a textual description of a BRIN WAL record to buf.
+ *
+ * Nothing is appended for operation codes not handled below.
+ */
+void
+brin_desc(StringInfo buf, XLogRecord *record)
+{
+       char       *data = XLogRecGetData(record);
+       uint8           opcode;
+
+       /* drop the generic xlog bits, then keep only the operation code */
+       opcode = record->xl_info & ~XLR_INFO_MASK;
+       opcode &= XLOG_BRIN_OPMASK;
+
+       switch (opcode)
+       {
+               case XLOG_BRIN_CREATE_INDEX:
+                       {
+                               xl_brin_createidx *xlrec = (xl_brin_createidx *) data;
+
+                               appendStringInfo(buf, "v%d pagesPerRange %u rel %u/%u/%u",
+                                                                xlrec->version, xlrec->pagesPerRange,
+                                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                                xlrec->node.relNode);
+                               break;
+                       }
+
+               case XLOG_BRIN_INSERT:
+                       {
+                               xl_brin_insert *xlrec = (xl_brin_insert *) data;
+
+                               appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u TID (%u,%u)",
+                                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                                xlrec->node.relNode,
+                                                                xlrec->heapBlk, xlrec->revmapBlk,
+                                                                xlrec->pagesPerRange,
+                                                                ItemPointerGetBlockNumber(&xlrec->tid),
+                                                                ItemPointerGetOffsetNumber(&xlrec->tid));
+                               break;
+                       }
+
+               case XLOG_BRIN_UPDATE:
+                       {
+                               xl_brin_update *xlrec = (xl_brin_update *) data;
+
+                               appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u old TID (%u,%u) TID (%u,%u)",
+                                                                xlrec->new.node.spcNode, xlrec->new.node.dbNode,
+                                                                xlrec->new.node.relNode,
+                                                                xlrec->new.heapBlk, xlrec->new.revmapBlk,
+                                                                xlrec->new.pagesPerRange,
+                                                                ItemPointerGetBlockNumber(&xlrec->oldtid),
+                                                                ItemPointerGetOffsetNumber(&xlrec->oldtid),
+                                                                ItemPointerGetBlockNumber(&xlrec->new.tid),
+                                                                ItemPointerGetOffsetNumber(&xlrec->new.tid));
+                               break;
+                       }
+
+               case XLOG_BRIN_SAMEPAGE_UPDATE:
+                       {
+                               xl_brin_samepage_update *xlrec = (xl_brin_samepage_update *) data;
+
+                               appendStringInfo(buf, "rel %u/%u/%u TID (%u,%u)",
+                                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                                xlrec->node.relNode,
+                                                                ItemPointerGetBlockNumber(&xlrec->tid),
+                                                                ItemPointerGetOffsetNumber(&xlrec->tid));
+                               break;
+                       }
+
+               case XLOG_BRIN_REVMAP_EXTEND:
+                       {
+                               xl_brin_revmap_extend *xlrec = (xl_brin_revmap_extend *) data;
+
+                               appendStringInfo(buf, "rel %u/%u/%u targetBlk %u",
+                                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                                xlrec->node.relNode, xlrec->targetBlk);
+                               break;
+                       }
+
+               default:
+                       /* unknown operation: describe nothing */
+                       break;
+       }
+}
+
+/*
+ * brin_identify
+ *             Return a short name for the BRIN record type encoded in info, or
+ *             NULL if the record type is not recognized.
+ *
+ * The XLOG_BRIN_INIT_PAGE flag is part of the value being matched, so the
+ * "+INIT" variants are listed as separate cases.
+ */
+const char *
+brin_identify(uint8 info)
+{
+       switch (info & ~XLR_INFO_MASK)
+       {
+               case XLOG_BRIN_CREATE_INDEX:
+                       return "CREATE_INDEX";
+               case XLOG_BRIN_INSERT:
+                       return "INSERT";
+               case XLOG_BRIN_INSERT | XLOG_BRIN_INIT_PAGE:
+                       return "INSERT+INIT";
+               case XLOG_BRIN_UPDATE:
+                       return "UPDATE";
+               case XLOG_BRIN_UPDATE | XLOG_BRIN_INIT_PAGE:
+                       return "UPDATE+INIT";
+               case XLOG_BRIN_SAMEPAGE_UPDATE:
+                       return "SAMEPAGE_UPDATE";
+               case XLOG_BRIN_REVMAP_EXTEND:
+                       return "REVMAP_EXTEND";
+       }
+
+       /* no match above */
+       return NULL;
+}
index 2645a7a368551d783883656e438139b69e4aff95..befd60f2d3777f1f3d072aac594e963d5a19c7a5 100644 (file)
@@ -12,6 +12,7 @@
 #include "access/gist_private.h"
 #include "access/hash.h"
 #include "access/heapam_xlog.h"
+#include "access/brin_xlog.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
 #include "access/spgist.h"
index 0c31aa95d70a169bb83f3ce7db735f46eee53e03..912038a712efe10c04579cb450b3e1586f20c8dc 100644 (file)
@@ -2103,6 +2103,27 @@ IndexBuildHeapScan(Relation heapRelation,
                                   bool allow_sync,
                                   IndexBuildCallback callback,
                                   void *callback_state)
+{
+       return IndexBuildHeapRangeScan(heapRelation, indexRelation,
+                                                                  indexInfo, allow_sync,
+                                                                  0, InvalidBlockNumber,
+                                                                  callback, callback_state);
+}
+
+/*
+ * As above, except that instead of scanning the complete heap, only the given
+ * number of blocks are scanned, starting at start_blockno.  Scan to end-of-rel
+ * can be signalled by passing InvalidBlockNumber as numblocks.
+ */
+double
+IndexBuildHeapRangeScan(Relation heapRelation,
+                                               Relation indexRelation,
+                                               IndexInfo *indexInfo,
+                                               bool allow_sync,
+                                               BlockNumber start_blockno,
+                                               BlockNumber numblocks,
+                                               IndexBuildCallback callback,
+                                               void *callback_state)
 {
        bool            is_system_catalog;
        bool            checking_uniqueness;
@@ -2174,6 +2195,9 @@ IndexBuildHeapScan(Relation heapRelation,
                                                                true,   /* buffer access strategy OK */
                                                                allow_sync);    /* syncscan OK? */
 
+       /* set our scan endpoints */
+       heap_setscanlimits(scan, start_blockno, numblocks);
+
        reltuples = 0;
 
        /*
index 9f1b20e04abc77496c5a0f6b265209fb2ff74303..8e78aafda7cbf01914df86fd03a5657eda93fea3 100644 (file)
@@ -132,6 +132,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record)
                case RM_GIST_ID:
                case RM_SEQ_ID:
                case RM_SPGIST_ID:
+               case RM_BRIN_ID:
                        break;
                case RM_NEXT_ID:
                        elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid);
index 6351a9bea47779573862f222998ffb4afdeb248b..2b858c8271910c544aacac7a40db0959dfb9a990 100644 (file)
@@ -399,7 +399,8 @@ PageRestoreTempPage(Page tempPage, Page oldPage)
 }
 
 /*
- * sorting support for PageRepairFragmentation and PageIndexMultiDelete
+ * sorting support for PageRepairFragmentation, PageIndexMultiDelete,
+ * PageIndexDeleteNoCompact
  */
 typedef struct itemIdSortData
 {
@@ -896,6 +897,182 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
        phdr->pd_upper = upper;
 }
 
+/*
+ * PageIndexDeleteNoCompact
+ *             Delete the given items for an index page, and defragment the resulting
+ *             free space, but do not compact the item pointers array.
+ *
+ * itemnos is the array of offset numbers of the tuples to delete; nitems is
+ * its size.  The offsets must be in ascending order and each must refer to
+ * an item that is currently in use; otherwise an error is raised.
+ *
+ * Used items that have no storage are marked unused as well.  If no item with
+ * storage remains, the page is reset to hold no line pointers at all.
+ * Otherwise the line pointer array keeps all of its entries, including any
+ * unused ones at its end, and only the tuple data area is compacted.
+ *
+ * This is used for index AMs that require that existing TIDs of live tuples
+ * remain unchanged.
+ */
+void
+PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems)
+{
+       PageHeader      phdr = (PageHeader) page;
+       LocationIndex pd_lower = phdr->pd_lower;
+       LocationIndex pd_upper = phdr->pd_upper;
+       LocationIndex pd_special = phdr->pd_special;
+       int                     nline;
+       bool            empty;
+       OffsetNumber offnum;
+       int                     nextitm;
+
+       /*
+        * As with PageRepairFragmentation, paranoia seems justified.
+        */
+       if (pd_lower < SizeOfPageHeaderData ||
+               pd_lower > pd_upper ||
+               pd_upper > pd_special ||
+               pd_special > BLCKSZ ||
+               pd_special != MAXALIGN(pd_special))
+               ereport(ERROR,
+                               (errcode(ERRCODE_DATA_CORRUPTED),
+                                errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+                                               pd_lower, pd_upper, pd_special)));
+
+       /*
+        * Scan the existing item pointer array and mark as unused those that are
+        * in our kill-list; make sure any non-interesting ones are marked unused
+        * as well.
+        */
+       nline = PageGetMaxOffsetNumber(page);
+       empty = true;
+       nextitm = 0;
+       for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+       {
+               ItemId          lp;
+               ItemLength      itemlen;
+               ItemOffset      offset;
+
+               lp = PageGetItemId(page, offnum);
+
+               itemlen = ItemIdGetLength(lp);
+               offset = ItemIdGetOffset(lp);
+
+               if (ItemIdIsUsed(lp))
+               {
+                       /* sanity-check the item's location before touching it */
+                       if (offset < pd_upper ||
+                               (offset + itemlen) > pd_special ||
+                               offset != MAXALIGN(offset))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_DATA_CORRUPTED),
+                                                errmsg("corrupted item pointer: offset = %u, length = %u",
+                                                               offset, (unsigned int) itemlen)));
+
+                       /*
+                        * itemnos[] is consumed strictly front to back while offnum
+                        * ascends, which is why it must be sorted.
+                        */
+                       if (nextitm < nitems && offnum == itemnos[nextitm])
+                       {
+                               /* this one is on our list to delete, so mark it unused */
+                               ItemIdSetUnused(lp);
+                               nextitm++;
+                       }
+                       else if (ItemIdHasStorage(lp))
+                       {
+                               /* This one's live -- must do the compaction dance */
+                               empty = false;
+                       }
+                       else
+                       {
+                               /* get rid of this one too */
+                               ItemIdSetUnused(lp);
+                       }
+               }
+       }
+
+       /* this will catch invalid or out-of-order itemnos[] */
+       if (nextitm != nitems)
+               elog(ERROR, "incorrect index offsets supplied");
+
+       if (empty)
+       {
+               /* Page is completely empty, so just reset it quickly */
+               phdr->pd_lower = SizeOfPageHeaderData;
+               phdr->pd_upper = pd_special;
+       }
+       else
+       {
+               /* There are live items: need to compact the page the hard way */
+               itemIdSortData itemidbase[MaxOffsetNumber];
+               itemIdSort      itemidptr;
+               int                     i;
+               Size            totallen;
+               Offset          upper;
+
+               /*
+                * Scan the page taking note of each item that we need to preserve.
+                * This includes both live items (those that contain data) and
+                * interspersed unused ones.  It's critical to preserve these unused
+                * items, because otherwise the offset numbers for later live items
+                * would change, which is not acceptable.  Unused items might get used
+                * again later; that is fine.
+                */
+               itemidptr = itemidbase;
+               totallen = 0;
+               for (i = 0; i < nline; i++, itemidptr++)
+               {
+                       ItemId          lp;
+
+                       /* offsetindex is zero-based; the line pointer is number i + 1 */
+                       itemidptr->offsetindex = i;
+
+                       lp = PageGetItemId(page, i + 1);
+                       if (ItemIdHasStorage(lp))
+                       {
+                               itemidptr->itemoff = ItemIdGetOffset(lp);
+                               itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
+                               totallen += itemidptr->alignedlen;
+                       }
+                       else
+                       {
+                               /* a zero alignedlen marks an entry that has no tuple data */
+                               itemidptr->itemoff = 0;
+                               itemidptr->alignedlen = 0;
+                       }
+               }
+               /* By here, there are exactly nline elements in itemidbase array */
+
+               if (totallen > (Size) (pd_special - pd_lower))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_DATA_CORRUPTED),
+                                        errmsg("corrupted item lengths: total %u, available space %u",
+                                                       (unsigned int) totallen, pd_special - pd_lower)));
+
+               /* sort itemIdSortData array into decreasing itemoff order */
+               qsort((char *) itemidbase, nline, sizeof(itemIdSortData),
+                         itemoffcompare);
+
+               /*
+                * Defragment the data areas of each tuple, being careful to preserve
+                * each item's position in the linp array.
+                */
+               upper = pd_special;
+               PageClearHasFreeLinePointers(page);
+               for (i = 0, itemidptr = itemidbase; i < nline; i++, itemidptr++)
+               {
+                       ItemId          lp;
+
+                       lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+                       if (itemidptr->alignedlen == 0)
+                       {
+                               /* entry without data: keep its slot, flag it as reusable */
+                               PageSetHasFreeLinePointers(page);
+                               ItemIdSetUnused(lp);
+                               continue;
+                       }
+                       /* pack this tuple directly below the previously placed one */
+                       upper -= itemidptr->alignedlen;
+                       memmove((char *) page + upper,
+                                       (char *) page + itemidptr->itemoff,
+                                       itemidptr->alignedlen);
+                       lp->lp_off = upper;
+                       /* lp_flags and lp_len remain the same as originally */
+               }
+
+               /* Set the new page limits */
+               phdr->pd_upper = upper;
+               /* i equals nline here, so every line pointer slot is retained */
+               phdr->pd_lower = SizeOfPageHeaderData + i * sizeof(ItemIdData);
+       }
+}
 
 /*
  * Set checksum for a page in shared buffers.
index e932ccf0da51cc83f74143cffd2146eaa70c240c..ea9150b23f01d8573950ab2219db6d7ac65f35e4 100644 (file)
@@ -6081,7 +6081,7 @@ genericcostestimate(PlannerInfo *root,
        else
                numIndexPages = 1.0;
 
-       /* fetch estimated page cost for schema containing index */
+       /* fetch estimated page cost for tablespace containing index */
        get_tablespace_page_costs(index->reltablespace,
                                                          &spc_random_page_cost,
                                                          NULL);
@@ -7162,7 +7162,7 @@ gincostestimate(PG_FUNCTION_ARGS)
                                                                                           JOIN_INNER,
                                                                                           NULL);
 
-       /* fetch estimated page cost for schema containing index */
+       /* fetch estimated page cost for tablespace containing index */
        get_tablespace_page_costs(index->reltablespace,
                                                          &spc_random_page_cost,
                                                          NULL);
@@ -7349,3 +7349,73 @@ gincostestimate(PG_FUNCTION_ARGS)
 
        PG_RETURN_VOID();
 }
+
+/*
+ * brincostestimate
+ *             Cost estimator for BRIN index scans.
+ *
+ * The search behavior of BRIN is completely different from that of the other
+ * index types, so the estimate is put together here rather than delegated to
+ * genericcostestimate.  Arguments 0-2 carry the planner info, the index path
+ * and the loop count; the startup cost, total cost, selectivity and
+ * correlation are stored through the pointers received as arguments 3-6.
+ */
+Datum
+brincostestimate(PG_FUNCTION_ARGS)
+{
+       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+       IndexPath  *ipath = (IndexPath *) PG_GETARG_POINTER(1);
+       double          nloops = PG_GETARG_FLOAT8(2);
+       Cost       *startupCost = (Cost *) PG_GETARG_POINTER(3);
+       Cost       *totalCost = (Cost *) PG_GETARG_POINTER(4);
+       Selectivity *selectivity = (Selectivity *) PG_GETARG_POINTER(5);
+       double     *correlation = (double *) PG_GETARG_POINTER(6);
+       IndexOptInfo *index = ipath->indexinfo;
+       List       *quals = ipath->indexquals;
+       List       *orderbys = ipath->indexorderbys;
+       double          idxPages = index->pages;
+       double          idxTuples = index->tuples;
+       Cost            seqPageCost;
+       Cost            randomPageCost;
+       QualCost        qcost;
+       double          opCost;
+       double          argCost;
+
+       /* look up the page costs of the tablespace holding the index */
+       get_tablespace_page_costs(index->reltablespace,
+                                                         &randomPageCost,
+                                                         &seqPageCost);
+
+       /*
+        * A BRIN index is always read in its entirety, so charge a sequential
+        * read of every page as the startup cost.
+        * XXX maybe only include revmap pages here?
+        */
+       *startupCost = seqPageCost * idxPages * nloops;
+
+       /*
+        * The revmap may point to regular pages out of sequential order, causing
+        * some back and forth while reading; account for that by charging the
+        * whole index as random-order reads in the total cost.
+        */
+       *totalCost = randomPageCost * idxPages * nloops;
+
+       *selectivity = clauselist_selectivity(root, quals,
+                                                                                 index->rel->relid,
+                                                                                 JOIN_INNER, NULL);
+       *correlation = 1;
+
+       /*
+        * Index qual evaluation costs, much as in genericcostestimate: sum the
+        * startup and per-tuple costs of the quals and of the ORDER BY
+        * expressions, then take out the operator charges, which are applied
+        * per tuple further down.
+        */
+       cost_qual_eval(&qcost, quals, root);
+       argCost = qcost.startup + qcost.per_tuple;
+       cost_qual_eval(&qcost, orderbys, root);
+       argCost += qcost.startup + qcost.per_tuple;
+
+       opCost = cpu_operator_cost *
+               (list_length(quals) + list_length(orderbys));
+       argCost -= opCost;
+       if (argCost < 0)
+       {
+               /* just in case... */
+               argCost = 0;
+       }
+
+       *startupCost += argCost;
+       *totalCost += argCost;
+       *totalCost += (idxTuples * *selectivity) *
+               (cpu_index_tuple_cost + opCost);
+
+       /* XXX what about pages_per_range? */
+
+       PG_RETURN_VOID();
+}
diff --git a/src/include/access/brin.h b/src/include/access/brin.h
new file mode 100644 (file)
index 0000000..a522c20
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * AM-callable functions for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *             src/include/access/brin.h
+ */
+#ifndef BRIN_H
+#define BRIN_H
+
+#include "fmgr.h"
+#include "nodes/execnodes.h"
+#include "utils/relcache.h"
+
+
+/*
+ * prototypes for functions in brin.c (external entry points for BRIN)
+ */
+extern Datum brinbuild(PG_FUNCTION_ARGS);
+extern Datum brinbuildempty(PG_FUNCTION_ARGS);
+extern Datum brininsert(PG_FUNCTION_ARGS);
+extern Datum brinbeginscan(PG_FUNCTION_ARGS);
+extern Datum bringettuple(PG_FUNCTION_ARGS);
+extern Datum bringetbitmap(PG_FUNCTION_ARGS);
+extern Datum brinrescan(PG_FUNCTION_ARGS);
+extern Datum brinendscan(PG_FUNCTION_ARGS);
+extern Datum brinmarkpos(PG_FUNCTION_ARGS);
+extern Datum brinrestrpos(PG_FUNCTION_ARGS);
+extern Datum brinbulkdelete(PG_FUNCTION_ARGS);
+extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS);
+extern Datum brincanreturn(PG_FUNCTION_ARGS);
+extern Datum brincostestimate(PG_FUNCTION_ARGS);
+extern Datum brinoptions(PG_FUNCTION_ARGS);
+
+/*
+ * Storage type for BRIN's reloptions.
+ *
+ * BrinGetPagesPerRange(), defined right after the struct, reads
+ * pagesPerRange from a relation's rd_options when that pointer is set, and
+ * falls back to BRIN_DEFAULT_PAGES_PER_RANGE when it is not.
+ */
+typedef struct BrinOptions
+{
+       int32           vl_len_;                /* varlena header (do not touch directly!) */
+       BlockNumber pagesPerRange;      /* number of pages in each block range */
+} BrinOptions;
+
+#define BRIN_DEFAULT_PAGES_PER_RANGE   128
+#define BrinGetPagesPerRange(relation) \
+       ((relation)->rd_options ? \
+        ((BrinOptions *) (relation)->rd_options)->pagesPerRange : \
+         BRIN_DEFAULT_PAGES_PER_RANGE)
+
+#endif   /* BRIN_H */
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
new file mode 100644 (file)
index 0000000..651ab5f
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * brin_internal.h
+ *             internal declarations for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *             src/include/access/brin_internal.h
+ */
+#ifndef BRIN_INTERNAL_H
+#define BRIN_INTERNAL_H
+
+#include "fmgr.h"
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "storage/off.h"
+#include "utils/relcache.h"
+
+
+/*
+ * A BrinDesc is a struct designed to enable decoding a BRIN tuple from the
+ * on-disk format to an in-memory tuple and vice-versa.
+ */
+
+/* struct returned by "OpcInfo" amproc */
+typedef struct BrinOpcInfo
+{
+       /* Number of columns stored in an index column of this opclass */
+       uint16          oi_nstored;
+
+       /* Opaque pointer for the opclass' private use */
+       void       *oi_opaque;
+
+       /* Type IDs of the stored columns */
+       Oid                     oi_typids[FLEXIBLE_ARRAY_MEMBER];
+} BrinOpcInfo;
+
+/*
+ * The size of a BrinOpcInfo for the given number of stored columns.  The
+ * argument is parenthesized so that an expression such as "n + 1" is
+ * multiplied by sizeof(Oid) as a whole.
+ */
+#define SizeofBrinOpcInfo(ncols) \
+       (offsetof(BrinOpcInfo, oi_typids) + sizeof(Oid) * (ncols))
+
+typedef struct BrinDesc
+{
+       /* Containing memory context */
+       MemoryContext bd_context;
+
+       /* the index relation itself */
+       Relation        bd_index;
+
+       /* tuple descriptor of the index relation */
+       TupleDesc       bd_tupdesc;
+
+       /* cached copy for on-disk tuples; generated at first use */
+       TupleDesc       bd_disktdesc;
+
+       /* total number of Datum entries that are stored on-disk for all columns */
+       int                     bd_totalstored;
+
+       /* per-column info; bd_tupdesc->natts entries long */
+       BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER];
+} BrinDesc;
+
+/*
+ * Globally-known function support numbers for BRIN indexes.  Individual
+ * opclasses define their own function support numbers, which must not collide
+ * with the definitions here.
+ */
+#define BRIN_PROCNUM_OPCINFO           1
+#define BRIN_PROCNUM_ADDVALUE          2
+#define BRIN_PROCNUM_CONSISTENT                3
+#define BRIN_PROCNUM_UNION                     4
+/* procedure numbers up to 10 are reserved for BRIN future expansion */
+
+#define BRIN_DEBUG
+
+/*
+ * Debug logging is compiled in only when BRIN_DEBUG is defined and the
+ * compiler is GCC.  Otherwise BRIN_elog() expands to a no-op expression; it
+ * takes the same (level, format, ...) argument list as the real thing so
+ * that call sites compile either way (this needs C99 variadic macros).
+ */
+#if defined(BRIN_DEBUG) && defined(__GNUC__)
+#define BRIN_elog(level, ...)          elog(level, __VA_ARGS__)
+#else
+#define BRIN_elog(level, ...)          ((void) 0)
+#endif
+
+/* brin.c */
+extern BrinDesc *brin_build_desc(Relation rel);
+extern void brin_free_desc(BrinDesc *bdesc);
+
+#endif   /* BRIN_INTERNAL_H */
diff --git a/src/include/access/brin_page.h b/src/include/access/brin_page.h
new file mode 100644 (file)
index 0000000..636cf86
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * brin_page.h
+ *             Prototypes and definitions for BRIN page layouts
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *             src/include/access/brin_page.h
+ *
+ * NOTES
+ *
+ * These structs should really be private to specific BRIN files, but it's
+ * useful to have them here so that they can be used by pageinspect and similar
+ * tools.
+ */
+#ifndef BRIN_PAGE_H
+#define BRIN_PAGE_H
+
+#include "storage/block.h"
+#include "storage/itemptr.h"
+
+/* special space on all BRIN pages stores a "type" identifier */
+#define                BRIN_PAGETYPE_META                      0xF091
+#define                BRIN_PAGETYPE_REVMAP            0xF092
+#define                BRIN_PAGETYPE_REGULAR           0xF093
+
+#define BRIN_PAGE_TYPE(page)   \
+       (((BrinSpecialSpace *) PageGetSpecialPointer(page))->type)
+#define BRIN_IS_REVMAP_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REVMAP)
+#define BRIN_IS_REGULAR_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REGULAR)
+
+/*
+ * Flags for BrinSpecialSpace (bit values stored in its "flags" member).
+ *
+ * BrinSpecialSpace itself is what BRIN_PAGE_TYPE() reads through
+ * PageGetSpecialPointer() to find out the type of a page.
+ */
+#define                BRIN_EVACUATE_PAGE                      (1 << 0)
+
+typedef struct BrinSpecialSpace
+{
+       uint16          flags;          /* bitmask of the flags above */
+       uint16          type;           /* page type identifier, BRIN_PAGETYPE_* */
+} BrinSpecialSpace;
+
+/* Metapage definitions */
+typedef struct BrinMetaPageData
+{
+       uint32          brinMagic;
+       uint32          brinVersion;
+       BlockNumber pagesPerRange;
+       BlockNumber lastRevmapPage;
+} BrinMetaPageData;
+
+#define BRIN_CURRENT_VERSION           1
+#define BRIN_META_MAGIC                        0xA8109CFA
+
+#define BRIN_METAPAGE_BLKNO            0
+
+/* Definitions for revmap pages */
+typedef struct RevmapContents
+{
+       ItemPointerData rm_tids[1]; /* really REVMAP_PAGE_MAXITEMS */
+} RevmapContents;
+
+#define REVMAP_CONTENT_SIZE \
+       (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
+        offsetof(RevmapContents, rm_tids) - \
+        MAXALIGN(sizeof(BrinSpecialSpace)))
+/* max num of items in the array */
+#define REVMAP_PAGE_MAXITEMS \
+       (REVMAP_CONTENT_SIZE / sizeof(ItemPointerData))
+
+#endif   /* BRIN_PAGE_H */
diff --git a/src/include/access/brin_pageops.h b/src/include/access/brin_pageops.h
new file mode 100644 (file)
index 0000000..86a9e81
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * brin_pageops.h
+ *             Prototypes for operating on BRIN pages.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/include/access/brin_pageops.h
+ */
+#ifndef BRIN_PAGEOPS_H
+#define BRIN_PAGEOPS_H
+
+#include "access/brin_revmap.h"
+
+extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+                         BrinRevmap *revmap, BlockNumber heapBlk,
+                         Buffer oldbuf, OffsetNumber oldoff,
+                         const BrinTuple *origtup, Size origsz,
+                         const BrinTuple *newtup, Size newsz,
+                         bool samepage);
+extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz,
+                                                       Size newsz);
+extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+                         BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+                         BrinTuple *tup, Size itemsz);
+
+extern void brin_page_init(Page page, uint16 type);
+extern void brin_metapage_init(Page page, BlockNumber pagesPerRange,
+                                  uint16 version);
+
+extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
+extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+                                  BrinRevmap *revmap, Buffer buf);
+
+#endif   /* BRIN_PAGEOPS_H */
diff --git a/src/include/access/brin_revmap.h b/src/include/access/brin_revmap.h
new file mode 100644 (file)
index 0000000..ff0e7e6
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * brin_revmap.h
+ *             Prototypes for BRIN reverse range maps
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *             src/include/access/brin_revmap.h
+ */
+
+#ifndef BRIN_REVMAP_H
+#define BRIN_REVMAP_H
+
+#include "access/brin_tuple.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/itemptr.h"
+#include "storage/off.h"
+#include "utils/relcache.h"
+
+/* struct definition lives in brin_revmap.c */
+typedef struct BrinRevmap BrinRevmap;
+
+extern BrinRevmap *brinRevmapInitialize(Relation idxrel,
+                                        BlockNumber *pagesPerRange);
+extern void brinRevmapTerminate(BrinRevmap *revmap);
+
+extern void brinRevmapExtend(BrinRevmap *revmap,
+                                BlockNumber heapBlk);
+extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap,
+                                                       BlockNumber heapBlk);
+extern void brinSetHeapBlockItemptr(Buffer rmbuf, BlockNumber pagesPerRange,
+                                               BlockNumber heapBlk, ItemPointerData tid);
+extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap,
+                                                BlockNumber heapBlk, Buffer *buf, OffsetNumber *off,
+                                                Size *size, int mode);
+
+#endif   /* BRIN_REVMAP_H */
diff --git a/src/include/access/brin_tuple.h b/src/include/access/brin_tuple.h
new file mode 100644 (file)
index 0000000..00f55e7
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * brin_tuple.h
+ *             Declarations for dealing with BRIN-specific tuples.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/include/access/brin_tuple.h
+ */
+#ifndef BRIN_TUPLE_H
+#define BRIN_TUPLE_H
+
+#include "access/brin_internal.h"
+#include "access/tupdesc.h"
+
+
+/*
+ * A BRIN index stores one index tuple per page range.  Each index tuple
+ * has one BrinValues struct for each indexed column; in turn, each BrinValues
+ * has (besides the null flags) an array of Datum whose size is determined by
+ * the opclass.
+ */
+typedef struct BrinValues
+{
+       AttrNumber      bv_attno;               /* index attribute number */
+       bool            bv_hasnulls;    /* is there any nulls in the page range? */
+       bool            bv_allnulls;    /* are all values nulls in the page range? */
+       Datum      *bv_values;          /* current accumulated values */
+} BrinValues;
+
+/*
+ * This struct is used to represent an in-memory index tuple.  The values can
+ * only be meaningfully decoded with an appropriate BrinDesc.
+ */
+typedef struct BrinMemTuple
+{
+       bool            bt_placeholder; /* this is a placeholder tuple */
+       BlockNumber bt_blkno;           /* heap blkno that the tuple is for */
+       MemoryContext bt_context;       /* memcxt holding the bt_columns values */
+       BrinValues      bt_columns[FLEXIBLE_ARRAY_MEMBER];      /* one per indexed column */
+} BrinMemTuple;
+
+/*
+ * An on-disk BRIN tuple.  This is possibly followed by a nulls bitmask, with
+ * room for two null bits for each indexed column; after that follow an
+ * opclass-defined number of Datum values for each column.
+ *
+ * NOTE(review): the two bits per column presumably correspond to the
+ * bv_allnulls and bv_hasnulls flags of BrinValues -- confirm against the
+ * tuple forming/deforming code.
+ */
+typedef struct BrinTuple
+{
+       /* heap block number that the tuple is for */
+       BlockNumber bt_blkno;
+
+       /* ---------------
+        * bt_info is laid out in the following fashion:
+        *
+        * 7th (high) bit: has nulls (BRIN_NULLS_MASK)
+        * 6th bit: is placeholder tuple (BRIN_PLACEHOLDER_MASK)
+        * 5th bit: unused
+        * 4-0 bit: offset of data (BRIN_OFFSET_MASK)
+        * ---------------
+        */
+       uint8           bt_info;
+} BrinTuple;
+
+#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8))
+
+/*
+ * bt_info manipulation macros
+ */
+#define BRIN_OFFSET_MASK               0x1F
+/* bit 0x20 is not used at present */
+#define BRIN_PLACEHOLDER_MASK  0x40
+#define BRIN_NULLS_MASK                        0x80
+
+#define BrinTupleDataOffset(tup)       ((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK))
+#define BrinTupleHasNulls(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0)
+#define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0)
+
+
+extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,
+                               BrinMemTuple *tuple, Size *size);
+extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc,
+                                                       BlockNumber blkno, Size *size);
+extern void brin_free_tuple(BrinTuple *tuple);
+extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len);
+extern bool brin_tuples_equal(const BrinTuple *a, Size alen,
+                                 const BrinTuple *b, Size blen);
+
+extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc);
+extern void brin_memtuple_initialize(BrinMemTuple *dtuple,
+                                                BrinDesc *brdesc);
+extern BrinMemTuple *brin_deform_tuple(BrinDesc *brdesc,
+                                 BrinTuple *tuple);
+
+#endif   /* BRIN_TUPLE_H */
diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h
new file mode 100644 (file)
index 0000000..3d959e8
--- /dev/null
@@ -0,0 +1,109 @@
+/*-------------------------------------------------------------------------
+ *
+ * brin_xlog.h
+ *       POSTGRES BRIN access XLOG definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_xlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BRIN_XLOG_H
+#define BRIN_XLOG_H
+
+#include "access/xlogrecord.h"
+#include "lib/stringinfo.h"
+#include "storage/bufpage.h"
+#include "storage/itemptr.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+
+/*
+ * WAL record definitions for BRIN's WAL operations
+ *
+ * XLOG allows us to store some information in the high 4 bits of the log
+ * record's xl_info field.
+ */
+#define XLOG_BRIN_CREATE_INDEX         0x00
+#define XLOG_BRIN_INSERT                       0x10
+#define XLOG_BRIN_UPDATE                       0x20
+#define XLOG_BRIN_SAMEPAGE_UPDATE      0x30
+#define XLOG_BRIN_REVMAP_EXTEND                0x40
+#define XLOG_BRIN_REVMAP_VACUUM                0x50
+
+#define XLOG_BRIN_OPMASK                       0x70
+/*
+ * When we insert the first item on a new page, we restore the entire page in
+ * redo.
+ */
+#define XLOG_BRIN_INIT_PAGE            0x80
+
+/* This is what we need to know about a BRIN index create */
+typedef struct xl_brin_createidx
+{
+       BlockNumber pagesPerRange;
+       RelFileNode node;
+       uint16          version;
+} xl_brin_createidx;
+#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16))
+
+/*
+ * This is what we need to know about a BRIN tuple insert
+ */
+typedef struct xl_brin_insert
+{
+       RelFileNode node;
+       BlockNumber heapBlk;
+
+       /* extra information needed to update the revmap */
+       BlockNumber revmapBlk;
+       BlockNumber pagesPerRange;
+
+       uint16          tuplen;
+       ItemPointerData tid;
+       /* tuple data follows at end of struct */
+} xl_brin_insert;
+
+#define SizeOfBrinInsert       (offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData))
+
+/*
+ * A cross-page update is the same as an insert, but also stores the old tid.
+ *
+ * NOTE(review): the member name "new" is a reserved word in C++, so this
+ * header cannot be included from C++ code as it stands; renaming it would
+ * require updating every user of the struct.
+ */
+typedef struct xl_brin_update
+{
+       ItemPointerData oldtid;
+       xl_brin_insert new;
+} xl_brin_update;
+
+#define SizeOfBrinUpdate       (offsetof(xl_brin_update, new) + SizeOfBrinInsert)
+
+/* This is what we need to know about a BRIN tuple samepage update */
+typedef struct xl_brin_samepage_update
+{
+       RelFileNode node;
+       ItemPointerData tid;
+       /* tuple data follows at end of struct */
+} xl_brin_samepage_update;
+
+#define SizeOfBrinSamepageUpdate               (offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData))
+
+/* This is what we need to know about a revmap extension */
+typedef struct xl_brin_revmap_extend
+{
+       RelFileNode node;
+       BlockNumber targetBlk;
+} xl_brin_revmap_extend;
+
+#define SizeOfBrinRevmapExtend (offsetof(xl_brin_revmap_extend, targetBlk) + \
+                                                                sizeof(BlockNumber))
+
+
+extern void brin_desc(StringInfo buf, XLogRecord *record);
+extern void brin_redo(XLogRecPtr lsn, XLogRecord *record);
+extern const char *brin_identify(uint8 info);
+
+#endif   /* BRIN_XLOG_H */
index 7f7166d832e9db320c4243d86d939773fd424ac9..9cd66a1b0f9b518d0e754eecd1eafa76d5dfe32e 100644 (file)
@@ -113,6 +113,8 @@ extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot,
                                         bool allow_strat, bool allow_sync);
 extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot,
                                  int nkeys, ScanKey key);
+extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk,
+                  BlockNumber endBlk);
 extern void heap_rescan(HeapScanDesc scan, ScanKey key);
 extern void heap_endscan(HeapScanDesc scan);
 extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
index c22644841f93e007a7f7b031a83dc2b8d7666621..a538830be585e43334dd1c802e07624d1d6122df 100644 (file)
@@ -45,8 +45,9 @@ typedef enum relopt_kind
        RELOPT_KIND_TABLESPACE = (1 << 7),
        RELOPT_KIND_SPGIST = (1 << 8),
        RELOPT_KIND_VIEW = (1 << 9),
+       RELOPT_KIND_BRIN = (1 << 10),
        /* if you add a new kind, make sure you update "last_default" too */
-       RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_VIEW,
+       RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_BRIN,
        /* some compilers treat enums as signed ints, so we can't use 1 << 31 */
        RELOPT_KIND_MAX = (1 << 30)
 } relopt_kind;
index 8a57698feb676d0fa358ea1483404034323228a2..8beb1be8829d86d8e87394ca65f1739cdf197e1f 100644 (file)
@@ -35,8 +35,10 @@ typedef struct HeapScanDescData
        bool            rs_temp_snap;   /* unregister snapshot at scan end? */
 
        /* state set up at initscan time */
-       BlockNumber rs_nblocks;         /* number of blocks to scan */
+       BlockNumber rs_nblocks;         /* total number of blocks in rel */
        BlockNumber rs_startblock;      /* block # to start at */
+       BlockNumber     rs_initblock;   /* block # to consider initial of rel */
+       BlockNumber     rs_numblocks;   /* number of blocks to scan */
        BufferAccessStrategy rs_strategy;       /* access strategy for reads */
        bool            rs_syncscan;    /* report location to syncscan logic? */
 
index 77d4574ed177e0ea51644d792a97d8ac3d966459..76a6421fb6855c989aad1c98ca6e8d9019b94cc0 100644 (file)
@@ -42,3 +42,4 @@ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gi
 PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
 PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
 PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
+PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)
index 2c059c88c188c4c425732a204cd05e6607feae70..b5c5e7aa5e8222ccb7b35146e29310d3536378c1 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     201411041
+#define CATALOG_VERSION_NO     201411071
 
 #endif
index 098ac7df19956eb9c18abd7960ee74eabe04e395..c36a729c91f90b036535420533311f3a6966a2e1 100644 (file)
@@ -98,6 +98,14 @@ extern double IndexBuildHeapScan(Relation heapRelation,
                                   bool allow_sync,
                                   IndexBuildCallback callback,
                                   void *callback_state);
+extern double IndexBuildHeapRangeScan(Relation heapRelation,
+                                               Relation indexRelation,
+                                               IndexInfo *indexInfo,
+                                               bool allow_sync,
+                                               BlockNumber start_blockno,
+                                               BlockNumber end_blockno,
+                                               IndexBuildCallback callback,
+                                               void *callback_state);
 
 extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
 
index 759ea705702a290ea610ad08cf52a40d3402a602..67b57cda9150296301511f54f9710047ed6470bc 100644 (file)
@@ -132,5 +132,7 @@ DESCR("GIN index access method");
 DATA(insert OID = 4000 (  spgist       0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
 DESCR("SP-GiST index access method");
 #define SPGIST_AM_OID 4000
+DATA(insert OID = 3580 (  brin 5 14 f f f f t t f t t f f 0 brininsert brinbeginscan - bringetbitmap brinrescan brinendscan brinmarkpos brinrestrpos brinbuild brinbuildempty brinbulkdelete brinvacuumcleanup - brincostestimate brinoptions ));
+#define BRIN_AM_OID 3580
 
 #endif   /* PG_AM_H */
index 3ef5a49cc9bd67245690f822d82bcc8a8f92d667..e72cc6c093a38f09fcb4149c0d080ece84f4e5b7 100644 (file)
@@ -845,4 +845,168 @@ DATA(insert (     3550    869 869 25 s    932 783 0 ));
 DATA(insert (  3550    869 869 26 s    933 783 0 ));
 DATA(insert (  3550    869 869 27 s    934 783 0 ));
 
+/* BRIN opclasses */
+/* minmax bytea */
+DATA(insert (   4064     17   17 1 s      1957    3580 0 ));
+DATA(insert (   4064     17   17 2 s      1958    3580 0 ));
+DATA(insert (   4064     17   17 3 s      1955    3580 0 ));
+DATA(insert (   4064     17   17 4 s      1960    3580 0 ));
+DATA(insert (   4064     17   17 5 s      1959    3580 0 ));
+/* minmax "char" */
+DATA(insert (   4062     18   18 1 s       631    3580 0 ));
+DATA(insert (   4062     18   18 2 s       632    3580 0 ));
+DATA(insert (   4062     18   18 3 s        92    3580 0 ));
+DATA(insert (   4062     18   18 4 s       634    3580 0 ));
+DATA(insert (   4062     18   18 5 s       633    3580 0 ));
+/* minmax name */
+DATA(insert (   4065     19   19 1 s       660    3580 0 ));
+DATA(insert (   4065     19   19 2 s       661    3580 0 ));
+DATA(insert (   4065     19   19 3 s        93    3580 0 ));
+DATA(insert (   4065     19   19 4 s       663    3580 0 ));
+DATA(insert (   4065     19   19 5 s       662    3580 0 ));
+/* minmax bigint */
+DATA(insert (   4063     20   20 1 s       412    3580 0 ));
+DATA(insert (   4063     20   20 2 s       414    3580 0 ));
+DATA(insert (   4063     20   20 3 s       410    3580 0 ));
+DATA(insert (   4063     20   20 4 s       415    3580 0 ));
+DATA(insert (   4063     20   20 5 s       413    3580 0 ));
+/* minmax smallint */
+DATA(insert (   4067     21   21 1 s        95    3580 0 ));
+DATA(insert (   4067     21   21 2 s       522    3580 0 ));
+DATA(insert (   4067     21   21 3 s        94    3580 0 ));
+DATA(insert (   4067     21   21 4 s       524    3580 0 ));
+DATA(insert (   4067     21   21 5 s       520    3580 0 ));
+/* minmax integer */
+DATA(insert (   4054     23   23 1 s        97    3580 0 ));
+DATA(insert (   4054     23   23 2 s       523    3580 0 ));
+DATA(insert (   4054     23   23 3 s        96    3580 0 ));
+DATA(insert (   4054     23   23 4 s       525    3580 0 ));
+DATA(insert (   4054     23   23 5 s       521    3580 0 ));
+/* minmax text */
+DATA(insert (   4056     25   25 1 s       664    3580 0 ));
+DATA(insert (   4056     25   25 2 s       665    3580 0 ));
+DATA(insert (   4056     25   25 3 s        98    3580 0 ));
+DATA(insert (   4056     25   25 4 s       667    3580 0 ));
+DATA(insert (   4056     25   25 5 s       666    3580 0 ));
+/* minmax oid */
+DATA(insert (   4068     26   26 1 s       609    3580 0 ));
+DATA(insert (   4068     26   26 2 s       611    3580 0 ));
+DATA(insert (   4068     26   26 3 s       607    3580 0 ));
+DATA(insert (   4068     26   26 4 s       612    3580 0 ));
+DATA(insert (   4068     26   26 5 s       610    3580 0 ));
+/* minmax tid */
+DATA(insert (   4069     27   27 1 s      2799    3580 0 ));
+DATA(insert (   4069     27   27 2 s      2801    3580 0 ));
+DATA(insert (   4069     27   27 3 s       387    3580 0 ));
+DATA(insert (   4069     27   27 4 s      2802    3580 0 ));
+DATA(insert (   4069     27   27 5 s      2800    3580 0 ));
+/* minmax real */
+DATA(insert (   4070    700  700 1 s       622    3580 0 ));
+DATA(insert (   4070    700  700 2 s       624    3580 0 ));
+DATA(insert (   4070    700  700 3 s       620    3580 0 ));
+DATA(insert (   4070    700  700 4 s       625    3580 0 ));
+DATA(insert (   4070    700  700 5 s       623    3580 0 ));
+/* minmax double precision */
+DATA(insert (   4071    701  701 1 s       672    3580 0 ));
+DATA(insert (   4071    701  701 2 s       673    3580 0 ));
+DATA(insert (   4071    701  701 3 s       670    3580 0 ));
+DATA(insert (   4071    701  701 4 s       675    3580 0 ));
+DATA(insert (   4071    701  701 5 s       674    3580 0 ));
+/* minmax abstime */
+DATA(insert (   4072    702  702 1 s       562    3580 0 ));
+DATA(insert (   4072    702  702 2 s       564    3580 0 ));
+DATA(insert (   4072    702  702 3 s       560    3580 0 ));
+DATA(insert (   4072    702  702 4 s       565    3580 0 ));
+DATA(insert (   4072    702  702 5 s       563    3580 0 ));
+/* minmax reltime */
+DATA(insert (   4073    703  703 1 s       568    3580 0 ));
+DATA(insert (   4073    703  703 2 s       570    3580 0 ));
+DATA(insert (   4073    703  703 3 s       566    3580 0 ));
+DATA(insert (   4073    703  703 4 s       571    3580 0 ));
+DATA(insert (   4073    703  703 5 s       569    3580 0 ));
+/* minmax macaddr */
+DATA(insert (   4074    829  829 1 s      1222    3580 0 ));
+DATA(insert (   4074    829  829 2 s      1223    3580 0 ));
+DATA(insert (   4074    829  829 3 s      1220    3580 0 ));
+DATA(insert (   4074    829  829 4 s      1225    3580 0 ));
+DATA(insert (   4074    829  829 5 s      1224    3580 0 ));
+/* minmax inet */
+DATA(insert (   4075    869  869 1 s      1203    3580 0 ));
+DATA(insert (   4075    869  869 2 s      1204    3580 0 ));
+DATA(insert (   4075    869  869 3 s      1201    3580 0 ));
+DATA(insert (   4075    869  869 4 s      1206    3580 0 ));
+DATA(insert (   4075    869  869 5 s      1205    3580 0 ));
+/* minmax character */
+DATA(insert (   4076   1042 1042 1 s      1058    3580 0 ));
+DATA(insert (   4076   1042 1042 2 s      1059    3580 0 ));
+DATA(insert (   4076   1042 1042 3 s      1054    3580 0 ));
+DATA(insert (   4076   1042 1042 4 s      1061    3580 0 ));
+DATA(insert (   4076   1042 1042 5 s      1060    3580 0 ));
+/* minmax date */
+DATA(insert (   4061   1082 1082 1 s      1095    3580 0 ));
+DATA(insert (   4061   1082 1082 2 s      1096    3580 0 ));
+DATA(insert (   4061   1082 1082 3 s      1093    3580 0 ));
+DATA(insert (   4061   1082 1082 4 s      1098    3580 0 ));
+DATA(insert (   4061   1082 1082 5 s      1097    3580 0 ));
+/* minmax time without time zone */
+DATA(insert (   4077   1083 1083 1 s      1110    3580 0 ));
+DATA(insert (   4077   1083 1083 2 s      1111    3580 0 ));
+DATA(insert (   4077   1083 1083 3 s      1108    3580 0 ));
+DATA(insert (   4077   1083 1083 4 s      1113    3580 0 ));
+DATA(insert (   4077   1083 1083 5 s      1112    3580 0 ));
+/* minmax timestamp without time zone */
+DATA(insert (   4059   1114 1114 1 s      2062    3580 0 ));
+DATA(insert (   4059   1114 1114 2 s      2063    3580 0 ));
+DATA(insert (   4059   1114 1114 3 s      2060    3580 0 ));
+DATA(insert (   4059   1114 1114 4 s      2065    3580 0 ));
+DATA(insert (   4059   1114 1114 5 s      2064    3580 0 ));
+/* minmax timestamp with time zone */
+DATA(insert (   4060   1184 1184 1 s      1322    3580 0 ));
+DATA(insert (   4060   1184 1184 2 s      1323    3580 0 ));
+DATA(insert (   4060   1184 1184 3 s      1320    3580 0 ));
+DATA(insert (   4060   1184 1184 4 s      1325    3580 0 ));
+DATA(insert (   4060   1184 1184 5 s      1324    3580 0 ));
+/* minmax interval */
+DATA(insert (   4078   1186 1186 1 s      1332    3580 0 ));
+DATA(insert (   4078   1186 1186 2 s      1333    3580 0 ));
+DATA(insert (   4078   1186 1186 3 s      1330    3580 0 ));
+DATA(insert (   4078   1186 1186 4 s      1335    3580 0 ));
+DATA(insert (   4078   1186 1186 5 s      1334    3580 0 ));
+/* minmax time with time zone */
+DATA(insert (   4058   1266 1266 1 s      1552    3580 0 ));
+DATA(insert (   4058   1266 1266 2 s      1553    3580 0 ));
+DATA(insert (   4058   1266 1266 3 s      1550    3580 0 ));
+DATA(insert (   4058   1266 1266 4 s      1555    3580 0 ));
+DATA(insert (   4058   1266 1266 5 s      1554    3580 0 ));
+/* minmax bit */
+DATA(insert (   4079   1560 1560 1 s      1786    3580 0 ));
+DATA(insert (   4079   1560 1560 2 s      1788    3580 0 ));
+DATA(insert (   4079   1560 1560 3 s      1784    3580 0 ));
+DATA(insert (   4079   1560 1560 4 s      1789    3580 0 ));
+DATA(insert (   4079   1560 1560 5 s      1787    3580 0 ));
+/* minmax bit varying */
+DATA(insert (   4080   1562 1562 1 s      1806    3580 0 ));
+DATA(insert (   4080   1562 1562 2 s      1808    3580 0 ));
+DATA(insert (   4080   1562 1562 3 s      1804    3580 0 ));
+DATA(insert (   4080   1562 1562 4 s      1809    3580 0 ));
+DATA(insert (   4080   1562 1562 5 s      1807    3580 0 ));
+/* minmax numeric */
+DATA(insert (   4055   1700 1700 1 s      1754    3580 0 ));
+DATA(insert (   4055   1700 1700 2 s      1755    3580 0 ));
+DATA(insert (   4055   1700 1700 3 s      1752    3580 0 ));
+DATA(insert (   4055   1700 1700 4 s      1757    3580 0 ));
+DATA(insert (   4055   1700 1700 5 s      1756    3580 0 ));
+/* minmax uuid */
+DATA(insert (   4081   2950 2950 1 s      2974    3580 0 ));
+DATA(insert (   4081   2950 2950 2 s      2976    3580 0 ));
+DATA(insert (   4081   2950 2950 3 s      2972    3580 0 ));
+DATA(insert (   4081   2950 2950 4 s      2977    3580 0 ));
+DATA(insert (   4081   2950 2950 5 s      2975    3580 0 ));
+/* minmax pg_lsn */
+DATA(insert (   4082   3220 3220 1 s      3224    3580 0 ));
+DATA(insert (   4082   3220 3220 2 s      3226    3580 0 ));
+DATA(insert (   4082   3220 3220 3 s      3222    3580 0 ));
+DATA(insert (   4082   3220 3220 4 s      3227    3580 0 ));
+DATA(insert (   4082   3220 3220 5 s      3225    3580 0 ));
+
 #endif   /* PG_AMOP_H */
index a1de3363e6f274db38298fb2ab349d48d834c6b3..e09f5578d792061dda6ec45b48bd28f51f84e07f 100644 (file)
@@ -432,4 +432,249 @@ DATA(insert (     4017   25 25 3 4029 ));
 DATA(insert (  4017   25 25 4 4030 ));
 DATA(insert (  4017   25 25 5 4031 ));
 
+/* BRIN opclasses: procs 1-4 are the shared minmax support functions (3383-3386); procs 11-14 are type-specific */
+/* minmax bytea */
+DATA(insert (   4064    17    17  1  3383 ));
+DATA(insert (   4064    17    17  2  3384 ));
+DATA(insert (   4064    17    17  3  3385 ));
+DATA(insert (   4064    17    17  4  3386 ));
+DATA(insert (   4064    17    17  11 1949 ));
+DATA(insert (   4064    17    17  12 1950 ));
+DATA(insert (   4064    17    17  13 1952 ));
+DATA(insert (   4064    17    17  14 1951 ));
+/* minmax "char" */
+DATA(insert (   4062    18    18  1  3383 ));
+DATA(insert (   4062    18    18  2  3384 ));
+DATA(insert (   4062    18    18  3  3385 ));
+DATA(insert (   4062    18    18  4  3386 ));
+DATA(insert (   4062    18    18  11 1246 ));
+DATA(insert (   4062    18    18  12   72 ));
+DATA(insert (   4062    18    18  13   74 ));
+DATA(insert (   4062    18    18  14   73 ));
+/* minmax name */
+DATA(insert (   4065    19    19  1  3383 ));
+DATA(insert (   4065    19    19  2  3384 ));
+DATA(insert (   4065    19    19  3  3385 ));
+DATA(insert (   4065    19    19  4  3386 ));
+DATA(insert (   4065    19    19  11  655 ));
+DATA(insert (   4065    19    19  12  656 ));
+DATA(insert (   4065    19    19  13  658 ));
+DATA(insert (   4065    19    19  14  657 ));
+/* minmax bigint */
+DATA(insert (   4063    20    20  1  3383 ));
+DATA(insert (   4063    20    20  2  3384 ));
+DATA(insert (   4063    20    20  3  3385 ));
+DATA(insert (   4063    20    20  4  3386 ));
+DATA(insert (   4063    20    20  11  469 ));
+DATA(insert (   4063    20    20  12  471 ));
+DATA(insert (   4063    20    20  13  472 ));
+DATA(insert (   4063    20    20  14  470 ));
+/* minmax smallint */
+DATA(insert (   4067    21    21  1  3383 ));
+DATA(insert (   4067    21    21  2  3384 ));
+DATA(insert (   4067    21    21  3  3385 ));
+DATA(insert (   4067    21    21  4  3386 ));
+DATA(insert (   4067    21    21  11   64 ));
+DATA(insert (   4067    21    21  12  148 ));
+DATA(insert (   4067    21    21  13  151 ));
+DATA(insert (   4067    21    21  14  146 ));
+/* minmax integer */
+DATA(insert (   4054    23    23  1  3383 ));
+DATA(insert (   4054    23    23  2  3384 ));
+DATA(insert (   4054    23    23  3  3385 ));
+DATA(insert (   4054    23    23  4  3386 ));
+DATA(insert (   4054    23    23  11   66 ));
+DATA(insert (   4054    23    23  12  149 ));
+DATA(insert (   4054    23    23  13  150 ));
+DATA(insert (   4054    23    23  14  147 ));
+/* minmax text */
+DATA(insert (   4056    25    25  1  3383 ));
+DATA(insert (   4056    25    25  2  3384 ));
+DATA(insert (   4056    25    25  3  3385 ));
+DATA(insert (   4056    25    25  4  3386 ));
+DATA(insert (   4056    25    25  11  740 ));
+DATA(insert (   4056    25    25  12  741 ));
+DATA(insert (   4056    25    25  13  743 ));
+DATA(insert (   4056    25    25  14  742 ));
+/* minmax oid */
+DATA(insert (   4068    26    26  1  3383 ));
+DATA(insert (   4068    26    26  2  3384 ));
+DATA(insert (   4068    26    26  3  3385 ));
+DATA(insert (   4068    26    26  4  3386 ));
+DATA(insert (   4068    26    26  11  716 ));
+DATA(insert (   4068    26    26  12  717 ));
+DATA(insert (   4068    26    26  13 1639 ));
+DATA(insert (   4068    26    26  14 1638 ));
+/* minmax tid */
+DATA(insert (   4069    27    27  1  3383 ));
+DATA(insert (   4069    27    27  2  3384 ));
+DATA(insert (   4069    27    27  3  3385 ));
+DATA(insert (   4069    27    27  4  3386 ));
+DATA(insert (   4069    27    27  11 2791 ));
+DATA(insert (   4069    27    27  12 2793 ));
+DATA(insert (   4069    27    27  13 2792 ));
+DATA(insert (   4069    27    27  14 2790 ));
+/* minmax real */
+DATA(insert (   4070   700   700  1  3383 ));
+DATA(insert (   4070   700   700  2  3384 ));
+DATA(insert (   4070   700   700  3  3385 ));
+DATA(insert (   4070   700   700  4  3386 ));
+DATA(insert (   4070   700   700  11  289 ));
+DATA(insert (   4070   700   700  12  290 ));
+DATA(insert (   4070   700   700  13  292 ));
+DATA(insert (   4070   700   700  14  291 ));
+/* minmax double precision */
+DATA(insert (   4071   701   701  1  3383 ));
+DATA(insert (   4071   701   701  2  3384 ));
+DATA(insert (   4071   701   701  3  3385 ));
+DATA(insert (   4071   701   701  4  3386 ));
+DATA(insert (   4071   701   701  11  295 ));
+DATA(insert (   4071   701   701  12  296 ));
+DATA(insert (   4071   701   701  13  298 ));
+DATA(insert (   4071   701   701  14  297 ));
+/* minmax abstime */
+DATA(insert (   4072   702   702  1  3383 ));
+DATA(insert (   4072   702   702  2  3384 ));
+DATA(insert (   4072   702   702  3  3385 ));
+DATA(insert (   4072   702   702  4  3386 ));
+DATA(insert (   4072   702   702  11  253 ));
+DATA(insert (   4072   702   702  12  255 ));
+DATA(insert (   4072   702   702  13  256 ));
+DATA(insert (   4072   702   702  14  254 ));
+/* minmax reltime */
+DATA(insert (   4073   703   703  1  3383 ));
+DATA(insert (   4073   703   703  2  3384 ));
+DATA(insert (   4073   703   703  3  3385 ));
+DATA(insert (   4073   703   703  4  3386 ));
+DATA(insert (   4073   703   703  11  259 ));
+DATA(insert (   4073   703   703  12  261 ));
+DATA(insert (   4073   703   703  13  262 ));
+DATA(insert (   4073   703   703  14  260 ));
+/* minmax macaddr */
+DATA(insert (   4074   829   829  1  3383 ));
+DATA(insert (   4074   829   829  2  3384 ));
+DATA(insert (   4074   829   829  3  3385 ));
+DATA(insert (   4074   829   829  4  3386 ));
+DATA(insert (   4074   829   829  11  831 ));
+DATA(insert (   4074   829   829  12  832 ));
+DATA(insert (   4074   829   829  13  834 ));
+DATA(insert (   4074   829   829  14  833 ));
+/* minmax inet */
+DATA(insert (   4075   869   869  1  3383 ));
+DATA(insert (   4075   869   869  2  3384 ));
+DATA(insert (   4075   869   869  3  3385 ));
+DATA(insert (   4075   869   869  4  3386 ));
+DATA(insert (   4075   869   869  11  921 ));
+DATA(insert (   4075   869   869  12  922 ));
+DATA(insert (   4075   869   869  13  924 ));
+DATA(insert (   4075   869   869  14  923 ));
+/* minmax character */
+DATA(insert (   4076  1042  1042  1  3383 ));
+DATA(insert (   4076  1042  1042  2  3384 ));
+DATA(insert (   4076  1042  1042  3  3385 ));
+DATA(insert (   4076  1042  1042  4  3386 ));
+DATA(insert (   4076  1042  1042  11 1049 ));
+DATA(insert (   4076  1042  1042  12 1050 ));
+DATA(insert (   4076  1042  1042  13 1052 ));
+DATA(insert (   4076  1042  1042  14 1051 ));
+/* minmax date */
+DATA(insert (   4061  1082  1082  1  3383 ));
+DATA(insert (   4061  1082  1082  2  3384 ));
+DATA(insert (   4061  1082  1082  3  3385 ));
+DATA(insert (   4061  1082  1082  4  3386 ));
+DATA(insert (   4061  1082  1082  11 1087 ));
+DATA(insert (   4061  1082  1082  12 1088 ));
+DATA(insert (   4061  1082  1082  13 1090 ));
+DATA(insert (   4061  1082  1082  14 1089 ));
+/* minmax time without time zone */
+DATA(insert (   4077  1083  1083  1  3383 ));
+DATA(insert (   4077  1083  1083  2  3384 ));
+DATA(insert (   4077  1083  1083  3  3385 ));
+DATA(insert (   4077  1083  1083  4  3386 ));
+DATA(insert (   4077  1083  1083  11 1102 ));
+DATA(insert (   4077  1083  1083  12 1103 ));
+DATA(insert (   4077  1083  1083  13 1105 ));
+DATA(insert (   4077  1083  1083  14 1104 ));
+/* minmax timestamp without time zone */
+DATA(insert (   4059  1114  1114  1  3383 ));
+DATA(insert (   4059  1114  1114  2  3384 ));
+DATA(insert (   4059  1114  1114  3  3385 ));
+DATA(insert (   4059  1114  1114  4  3386 ));
+DATA(insert (   4059  1114  1114  11 2054 ));
+DATA(insert (   4059  1114  1114  12 2055 ));
+DATA(insert (   4059  1114  1114  13 2056 ));
+DATA(insert (   4059  1114  1114  14 2057 ));
+/* minmax timestamp with time zone */
+DATA(insert (   4060  1184  1184  1  3383 ));
+DATA(insert (   4060  1184  1184  2  3384 ));
+DATA(insert (   4060  1184  1184  3  3385 ));
+DATA(insert (   4060  1184  1184  4  3386 ));
+DATA(insert (   4060  1184  1184  11 1154 ));
+DATA(insert (   4060  1184  1184  12 1155 ));
+DATA(insert (   4060  1184  1184  13 1156 ));
+DATA(insert (   4060  1184  1184  14 1157 ));
+/* minmax interval */
+DATA(insert (   4078  1186  1186  1  3383 ));
+DATA(insert (   4078  1186  1186  2  3384 ));
+DATA(insert (   4078  1186  1186  3  3385 ));
+DATA(insert (   4078  1186  1186  4  3386 ));
+DATA(insert (   4078  1186  1186  11 1164 ));
+DATA(insert (   4078  1186  1186  12 1165 ));
+DATA(insert (   4078  1186  1186  13 1166 ));
+DATA(insert (   4078  1186  1186  14 1167 ));
+/* minmax time with time zone */
+DATA(insert (   4058  1266  1266  1  3383 ));
+DATA(insert (   4058  1266  1266  2  3384 ));
+DATA(insert (   4058  1266  1266  3  3385 ));
+DATA(insert (   4058  1266  1266  4  3386 ));
+DATA(insert (   4058  1266  1266  11 1354 ));
+DATA(insert (   4058  1266  1266  12 1355 ));
+DATA(insert (   4058  1266  1266  13 1356 ));
+DATA(insert (   4058  1266  1266  14 1357 ));
+/* minmax bit */
+DATA(insert (   4079  1560  1560  1  3383 ));
+DATA(insert (   4079  1560  1560  2  3384 ));
+DATA(insert (   4079  1560  1560  3  3385 ));
+DATA(insert (   4079  1560  1560  4  3386 ));
+DATA(insert (   4079  1560  1560  11 1595 ));
+DATA(insert (   4079  1560  1560  12 1594 ));
+DATA(insert (   4079  1560  1560  13 1592 ));
+DATA(insert (   4079  1560  1560  14 1593 ));
+/* minmax bit varying */
+DATA(insert (   4080  1562  1562  1  3383 ));
+DATA(insert (   4080  1562  1562  2  3384 ));
+DATA(insert (   4080  1562  1562  3  3385 ));
+DATA(insert (   4080  1562  1562  4  3386 ));
+DATA(insert (   4080  1562  1562  11 1671 ));
+DATA(insert (   4080  1562  1562  12 1670 ));
+DATA(insert (   4080  1562  1562  13 1668 ));
+DATA(insert (   4080  1562  1562  14 1669 ));
+/* minmax numeric */
+DATA(insert (   4055  1700  1700  1  3383 ));
+DATA(insert (   4055  1700  1700  2  3384 ));
+DATA(insert (   4055  1700  1700  3  3385 ));
+DATA(insert (   4055  1700  1700  4  3386 ));
+DATA(insert (   4055  1700  1700  11 1722 ));
+DATA(insert (   4055  1700  1700  12 1723 ));
+DATA(insert (   4055  1700  1700  13 1721 ));
+DATA(insert (   4055  1700  1700  14 1720 ));
+/* minmax uuid */
+DATA(insert (   4081  2950  2950  1  3383 ));
+DATA(insert (   4081  2950  2950  2  3384 ));
+DATA(insert (   4081  2950  2950  3  3385 ));
+DATA(insert (   4081  2950  2950  4  3386 ));
+DATA(insert (   4081  2950  2950  11 2954 ));
+DATA(insert (   4081  2950  2950  12 2955 ));
+DATA(insert (   4081  2950  2950  13 2957 ));
+DATA(insert (   4081  2950  2950  14 2958 ));
+/* minmax pg_lsn */
+DATA(insert (   4082  3220  3220  1  3383 ));
+DATA(insert (   4082  3220  3220  2  3384 ));
+DATA(insert (   4082  3220  3220  3  3385 ));
+DATA(insert (   4082  3220  3220  4  3386 ));
+DATA(insert (   4082  3220  3220  11 3231 ));
+DATA(insert (   4082  3220  3220  12 3232 ));
+DATA(insert (   4082  3220  3220  13 3234 ));
+DATA(insert (   4082  3220  3220  14 3235 ));
+
 #endif   /* PG_AMPROC_H */
index dc523416c9251c5badf3379a79b3cb80d92f69ad..595cd7f4879a9754d559ef6c3b155703f1a28655 100644 (file)
@@ -236,4 +236,36 @@ DATA(insert (      405             jsonb_ops                       PGNSP PGUID 4034  3802 t 0 ));
 DATA(insert (  2742    jsonb_ops                       PGNSP PGUID 4036  3802 t 25 ));
 DATA(insert (  2742    jsonb_path_ops          PGNSP PGUID 4037  3802 f 23 ));
 
+/* BRIN operator classes */
+/* no brin opclass for bool */
+DATA(insert (  3580    bytea_minmax_ops                PGNSP PGUID 4064    17 t 0 ));
+DATA(insert (  3580    char_minmax_ops                 PGNSP PGUID 4062    18 t 0 ));
+DATA(insert (  3580    name_minmax_ops                 PGNSP PGUID 4065    19 t 0 ));
+DATA(insert (  3580    int8_minmax_ops                 PGNSP PGUID 4063    20 t 0 ));
+DATA(insert (  3580    int2_minmax_ops                 PGNSP PGUID 4067    21 t 0 ));
+DATA(insert (  3580    int4_minmax_ops                 PGNSP PGUID 4054    23 t 0 ));
+DATA(insert (  3580    text_minmax_ops                 PGNSP PGUID 4056    25 t 0 ));
+DATA(insert (  3580    oid_minmax_ops                  PGNSP PGUID 4068    26 t 0 ));
+DATA(insert (  3580    tid_minmax_ops                  PGNSP PGUID 4069    27 t 0 ));
+DATA(insert (  3580    float4_minmax_ops               PGNSP PGUID 4070   700 t 0 ));
+DATA(insert (  3580    float8_minmax_ops               PGNSP PGUID 4071   701 t 0 ));
+DATA(insert (  3580    abstime_minmax_ops              PGNSP PGUID 4072   702 t 0 ));
+DATA(insert (  3580    reltime_minmax_ops              PGNSP PGUID 4073   703 t 0 ));
+DATA(insert (  3580    macaddr_minmax_ops              PGNSP PGUID 4074   829 t 0 ));
+DATA(insert (  3580    inet_minmax_ops                 PGNSP PGUID 4075   869 t 0 ));
+DATA(insert (  3580    bpchar_minmax_ops               PGNSP PGUID 4076  1042 t 0 ));
+DATA(insert (  3580    date_minmax_ops                 PGNSP PGUID 4061  1082 t 0 ));
+DATA(insert (  3580    time_minmax_ops                 PGNSP PGUID 4077  1083 t 0 ));
+DATA(insert (  3580    timestamp_minmax_ops    PGNSP PGUID 4059  1114 t 0 ));
+DATA(insert (  3580    timestamptz_minmax_ops  PGNSP PGUID 4060  1184 t 0 ));
+DATA(insert (  3580    interval_minmax_ops             PGNSP PGUID 4078  1186 t 0 ));
+DATA(insert (  3580    timetz_minmax_ops               PGNSP PGUID 4058  1266 t 0 ));
+DATA(insert (  3580    bit_minmax_ops                  PGNSP PGUID 4079  1560 t 0 ));
+DATA(insert (  3580    varbit_minmax_ops               PGNSP PGUID 4080  1562 t 0 ));
+DATA(insert (  3580    numeric_minmax_ops              PGNSP PGUID 4055  1700 t 0 ));
+/* no brin opclass for record, anyarray */
+DATA(insert (  3580    uuid_minmax_ops                 PGNSP PGUID 4081  2950 t 0 ));
+DATA(insert (  3580    pg_lsn_minmax_ops               PGNSP PGUID 4082  3220 t 0 ));
+/* no brin opclass for enum, tsvector, tsquery, jsonb, range */
+
 #endif   /* PG_OPCLASS_H */
index 26297ced0da8f821e894b746abdaa6dc06f72f20..2d8af766025621978f87d69b543a0a1901579ccb 100644 (file)
@@ -157,4 +157,32 @@ DATA(insert OID = 4035 (   783             jsonb_ops               PGNSP PGUID ));
 DATA(insert OID = 4036 (       2742    jsonb_ops               PGNSP PGUID ));
 DATA(insert OID = 4037 (       2742    jsonb_path_ops  PGNSP PGUID ));
 
+DATA(insert OID = 4054 (       3580    int4_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4055 (       3580    numeric_minmax_ops              PGNSP PGUID ));
+DATA(insert OID = 4056 (       3580    text_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4058 (       3580    timetz_minmax_ops               PGNSP PGUID ));
+DATA(insert OID = 4059 (       3580    timestamp_minmax_ops    PGNSP PGUID ));
+DATA(insert OID = 4060 (       3580    timestamptz_minmax_ops  PGNSP PGUID ));
+DATA(insert OID = 4061 (       3580    date_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4062 (       3580    char_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4063 (       3580    int8_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4064 (       3580    bytea_minmax_ops                PGNSP PGUID ));
+DATA(insert OID = 4065 (       3580    name_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4067 (       3580    int2_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4068 (       3580    oid_minmax_ops                  PGNSP PGUID ));
+DATA(insert OID = 4069 (       3580    tid_minmax_ops                  PGNSP PGUID ));
+DATA(insert OID = 4070 (       3580    float4_minmax_ops               PGNSP PGUID ));
+DATA(insert OID = 4071 (       3580    float8_minmax_ops               PGNSP PGUID ));
+DATA(insert OID = 4072 (       3580    abstime_minmax_ops              PGNSP PGUID ));
+DATA(insert OID = 4073 (       3580    reltime_minmax_ops              PGNSP PGUID ));
+DATA(insert OID = 4074 (       3580    macaddr_minmax_ops              PGNSP PGUID ));
+DATA(insert OID = 4075 (       3580    inet_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4076 (       3580    bpchar_minmax_ops               PGNSP PGUID ));
+DATA(insert OID = 4077 (       3580    time_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4078 (       3580    interval_minmax_ops             PGNSP PGUID ));
+DATA(insert OID = 4079 (       3580    bit_minmax_ops                  PGNSP PGUID ));
+DATA(insert OID = 4080 (       3580    varbit_minmax_ops               PGNSP PGUID ));
+DATA(insert OID = 4081 (       3580    uuid_minmax_ops                 PGNSP PGUID ));
+DATA(insert OID = 4082 (       3580    pg_lsn_minmax_ops               PGNSP PGUID ));
+
 #endif   /* PG_OPFAMILY_H */
index b6dc1b82adbaeb385d42a1ff54f081e2b11c47d0..497e652674b2221d5501b1685032b634c66ade77 100644 (file)
@@ -565,6 +565,35 @@ DESCR("btree(internal)");
 DATA(insert OID = 2785 (  btoptions               PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_  btoptions _null_ _null_ _null_ ));
 DESCR("btree(internal)");
 
+DATA(insert OID = 3789 (  bringetbitmap           PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_  bringetbitmap _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3790 (  brininsert              PGNSP PGUID 12 1 0 0 0 f f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_      brininsert _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3791 (  brinbeginscan           PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_   brinbeginscan _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3792 (  brinrescan              PGNSP PGUID 12 1 0 0 0 f f f f t f v 5 0 2278 "2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinrescan _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3793 (  brinendscan             PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinendscan _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3794 (  brinmarkpos             PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinmarkpos _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3795 (  brinrestrpos            PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinrestrpos _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3796 (  brinbuild               PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ brinbuild _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3797 (  brinbuildempty          PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinbuildempty _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3798 (  brinbulkdelete          PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinbulkdelete _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3799 (  brinvacuumcleanup   PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ brinvacuumcleanup _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3800 (  brincostestimate   PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brincostestimate _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3801 (  brinoptions             PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_  brinoptions _null_ _null_ _null_ ));
+DESCR("brin(internal)");
+DATA(insert OID = 3952 (  brin_summarize_new_values PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 23 "2205" _null_ _null_ _null_ _null_ brin_summarize_new_values _null_ _null_ _null_ ));
+DESCR("brin: standalone scan new table pages");
+
 DATA(insert OID = 339 (  poly_same                PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ ));
 DATA(insert OID = 340 (  poly_contain     PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ ));
 DATA(insert OID = 341 (  poly_left                PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_left _null_ _null_ _null_ ));
@@ -4078,6 +4107,16 @@ DATA(insert OID = 2747 (  arrayoverlap              PGNSP PGUID 12 1 0 0 0 f f f f t f i
 DATA(insert OID = 2748 (  arraycontains                   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontains _null_ _null_ _null_ ));
 DATA(insert OID = 2749 (  arraycontained          PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontained _null_ _null_ _null_ ));
 
+/* BRIN minmax support functions (used as support procs 1-4 by every minmax entry in pg_amproc.h) */
+DATA(insert OID = 3383 ( brin_minmax_opcinfo PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2281 "2281" _null_ _null_ _null_ _null_ minmaxOpcInfo _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
+DATA(insert OID = 3384 ( brin_minmax_add_value PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 16 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ minmaxAddValue _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
+DATA(insert OID = 3385 ( brin_minmax_consistent PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxConsistent _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
+DATA(insert OID = 3386 ( brin_minmax_union PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxUnion _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
+
 /* userlock replacements */
 DATA(insert OID = 2880 (  pg_advisory_lock                             PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "20" _null_ _null_ _null_ _null_ pg_advisory_lock_int8 _null_ _null_ _null_ ));
 DESCR("obtain exclusive advisory lock");
index d96e375f3f55f06faac6562fbcd97e1ab1e9c412..db7075f387b76616ebcfd4a46e714bed67ccd9bd 100644 (file)
@@ -403,6 +403,8 @@ extern Size PageGetExactFreeSpace(Page page);
 extern Size PageGetHeapFreeSpace(Page page);
 extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
 extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+extern void PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos,
+                                                int nitems);
 extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
 extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
 
index 0f662ec8bb4465f8b41459f5c148662a5cf6e907..25cb3fa85fe4b09a11edd81c5e1be604278ff751 100644 (file)
@@ -190,6 +190,7 @@ extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
 extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey,
                                                 double nbuckets);
 
+extern Datum brincostestimate(PG_FUNCTION_ARGS);
 extern Datum btcostestimate(PG_FUNCTION_ARGS);
 extern Datum hashcostestimate(PG_FUNCTION_ARGS);
 extern Datum gistcostestimate(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out
new file mode 100644 (file)
index 0000000..f8be27e
--- /dev/null
@@ -0,0 +1,179 @@
+SET synchronous_commit = 0;
+CREATE TABLE brintest (byteacol bytea,
+       charcol "char",
+       namecol name,
+       int8col bigint,
+       int2col smallint,
+       int4col integer,
+       textcol text,
+       oidcol oid,
+       tidcol tid,
+       float4col real,
+       float8col double precision,
+       macaddrcol macaddr,
+       inetcol inet,
+       bpcharcol character,
+       datecol date,
+       timecol time without time zone,
+       timestampcol timestamp without time zone,
+       timestamptzcol timestamp with time zone,
+       intervalcol interval,
+       timetzcol time with time zone,
+       bitcol bit(10),
+       varbitcol bit varying(16),
+       numericcol numeric,
+       uuidcol uuid,
+       lsncol pg_lsn
+) WITH (fillfactor=50);
+INSERT INTO brintest SELECT
+       repeat(stringu1, 42)::bytea,
+       substr(stringu1, 1, 1)::"char",
+       stringu1::name, 142857 * tenthous,
+       thousand,
+       twothousand,
+       repeat(stringu1, 42),
+       unique1::oid,
+       format('(%s,%s)', tenthous, twenty)::tid,
+       (four + 1.0)/(hundred+1),
+       odd::float8 / (tenthous + 1),
+       format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+       inet '10.2.3.4' + tenthous,
+       substr(stringu1, 1, 1)::bpchar,
+       date '1995-08-15' + tenthous,
+       time '01:20:30' + thousand * interval '18.5 second',
+       timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+       timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+       justify_days(justify_hours(tenthous * interval '12 minutes')),
+       timetz '01:30:20' + hundred * interval '15 seconds',
+       thousand::bit(10),
+       tenthous::bit(16)::varbit,
+       tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+       format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+       format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+CREATE INDEX brinidx ON brintest USING brin (
+       byteacol,
+       charcol,
+       namecol,
+       int8col,
+       int2col,
+       int4col,
+       textcol,
+       oidcol,
+       tidcol,
+       float4col,
+       float8col,
+       macaddrcol,
+       inetcol,
+       bpcharcol,
+       datecol,
+       timecol,
+       timestampcol,
+       timestamptzcol,
+       intervalcol,
+       timetzcol,
+       bitcol,
+       varbitcol,
+       numericcol,
+       uuidcol,
+       lsncol
+) with (pages_per_range = 1);
+CREATE TABLE brinopers (colname name, op text[], value text[],
+       check (cardinality(op) = cardinality(value)));
+INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}');
+INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}');
+INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}');
+INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}');
+INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}');
+INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}');
+INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}');
+INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}');
+INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}');
+INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}');
+INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}');
+INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}');
+INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}');
+INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}');
+INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}');
+INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}');
+DO $x$
+DECLARE
+        r record;
+        tabname text;
+        tabname_ss text;
+               count int;
+               query text;
+               plan text;
+BEGIN
+        FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP
+                tabname := format('qry_%s', r.row_number);
+                tabname_ss := tabname || '_ss';
+                               query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+                        tabname, r.colname, r.oper, r.value);
+                               -- run the query using the brin index
+                SET enable_seqscan = 0;
+                SET enable_bitmapscan = 1;
+                EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname);
+                EXECUTE query;
+
+                               -- run the query using a seqscan
+                SET enable_seqscan = 1;
+                SET enable_bitmapscan = 0;
+                               query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+                        tabname_ss, r.colname, r.oper, r.value);
+                EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss);
+                EXECUTE query;
+
+                               -- make sure both return the same results
+                EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss);
+                               GET DIAGNOSTICS count = ROW_COUNT;
+                IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+                EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname);
+                               GET DIAGNOSTICS count = ROW_COUNT;
+                IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+        end loop;
+end;
+$x$;
+INSERT INTO brintest SELECT
+       repeat(stringu1, 42)::bytea,
+       substr(stringu1, 1, 1)::"char",
+       stringu1::name, 142857 * tenthous,
+       thousand,
+       twothousand,
+       repeat(stringu1, 42),
+       unique1::oid,
+       format('(%s,%s)', tenthous, twenty)::tid,
+       (four + 1.0)/(hundred+1),
+       odd::float8 / (tenthous + 1),
+       format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+       inet '10.2.3.4' + tenthous,
+       substr(stringu1, 1, 1)::bpchar,
+       date '1995-08-15' + tenthous,
+       time '01:20:30' + thousand * interval '18.5 second',
+       timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+       timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+       justify_days(justify_hours(tenthous * interval '12 minutes')),
+       timetz '01:30:20' + hundred * interval '15 seconds',
+       thousand::bit(10),
+       tenthous::bit(16)::varbit,
+       tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+       format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+       format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+SELECT brin_summarize_new_values('brinidx'::regclass);
+ brin_summarize_new_values 
+---------------------------
+                      2000
+(1 row)
+
+UPDATE brintest SET int8col = int8col * int4col;
+SET synchronous_commit = 1;
index 992522ea3f147e7fc6651b25945cc69dc1684cf8..9870bfaa0185bc9db5de600ab43e55d83172f9c1 100644 (file)
@@ -1658,6 +1658,11 @@ ORDER BY 1, 2, 3;
        2742 |            9 | ?
        2742 |           10 | ?|
        2742 |           11 | ?&
+       3580 |            1 | <
+       3580 |            2 | <=
+       3580 |            3 | =
+       3580 |            4 | >=
+       3580 |            5 | >
        4000 |            1 | <<
        4000 |            1 | ~<~
        4000 |            2 | &<
@@ -1680,7 +1685,7 @@ ORDER BY 1, 2, 3;
        4000 |           15 | >
        4000 |           16 | @>
        4000 |           18 | =
-(80 rows)
+(85 rows)
 
 -- Check that all opclass search operators have selectivity estimators.
 -- This is not absolutely required, but it seems a reasonable thing
@@ -1842,11 +1847,13 @@ WHERE NOT (
   -- GIN has six support functions. 1-3 are mandatory, 5 is optional, and
   --   at least one of 4 and 6 must be given.
   -- SP-GiST has five support functions, all mandatory
+  -- BRIN has four mandatory support functions, and a bunch of optionals
   amname = 'btree' AND procnums @> '{1}' OR
   amname = 'hash' AND procnums = '{1}' OR
   amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
   amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
-  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+  amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
 );
  amname | opfname | amproclefttype | amprocrighttype | procnums 
 --------+---------+----------------+-----------------+----------
@@ -1867,7 +1874,8 @@ WHERE NOT (
   amname = 'hash' AND procnums = '{1}' OR
   amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
   amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
-  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+  amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
 );
  amname | opcname | procnums 
 --------+---------+----------
index 7015bfda2c2474781a229ff6d86e564ca13b36c1..06606081ad3f8027130c1232dacdc07a7e1fa96a 100644 (file)
@@ -591,6 +591,8 @@ SELECT user_relns() AS user_relns
  bb
  box_tbl
  bprime
+ brinopers
+ brintest
  bt_f8_heap
  bt_i4_heap
  bt_name_heap
@@ -698,7 +700,7 @@ SELECT user_relns() AS user_relns
  tvvmv
  varchar_tbl
  xacttest
-(120 rows)
+(122 rows)
 
 SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer')));
  name 
index 9902dbeb39c1f2bad9703d93c6b596f7d3979cbc..d4f02e5703a8560adde003a802043385216fbf16 100644 (file)
@@ -83,7 +83,7 @@ test: select_into select_distinct select_distinct_on select_implicit select_havi
 # ----------
 # Another group of parallel tests
 # ----------
-test: privileges security_label collate matview lock replica_identity rowsecurity
+test: brin privileges security_label collate matview lock replica_identity rowsecurity
 
 # ----------
 # Another group of parallel tests
index 2902a05dfb6fea209a7a1a1bab341e7ca72453c2..b1e44b3bf30ad68b0522da59f977e646ac956dc2 100644 (file)
@@ -106,6 +106,8 @@ test: alter_generic
+# brin must run before misc: misc's expected output lists brinopers and brintest
+test: brin
 test: misc
 test: psql
 test: async
 test: rules
 test: event_trigger
 test: select_views
diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql
new file mode 100644 (file)
index 0000000..244652f
--- /dev/null
@@ -0,0 +1,184 @@
+SET synchronous_commit = 0;
+
+CREATE TABLE brintest (byteacol bytea,
+       charcol "char",
+       namecol name,
+       int8col bigint,
+       int2col smallint,
+       int4col integer,
+       textcol text,
+       oidcol oid,
+       tidcol tid,
+       float4col real,
+       float8col double precision,
+       macaddrcol macaddr,
+       inetcol inet,
+       bpcharcol character,
+       datecol date,
+       timecol time without time zone,
+       timestampcol timestamp without time zone,
+       timestamptzcol timestamp with time zone,
+       intervalcol interval,
+       timetzcol time with time zone,
+       bitcol bit(10),
+       varbitcol bit varying(16),
+       numericcol numeric,
+       uuidcol uuid,
+       lsncol pg_lsn
+) WITH (fillfactor=50);
+
+INSERT INTO brintest SELECT
+       repeat(stringu1, 42)::bytea,
+       substr(stringu1, 1, 1)::"char",
+       stringu1::name, 142857 * tenthous,
+       thousand,
+       twothousand,
+       repeat(stringu1, 42),
+       unique1::oid,
+       format('(%s,%s)', tenthous, twenty)::tid,
+       (four + 1.0)/(hundred+1),
+       odd::float8 / (tenthous + 1),
+       format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+       inet '10.2.3.4' + tenthous,
+       substr(stringu1, 1, 1)::bpchar,
+       date '1995-08-15' + tenthous,
+       time '01:20:30' + thousand * interval '18.5 second',
+       timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+       timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+       justify_days(justify_hours(tenthous * interval '12 minutes')),
+       timetz '01:30:20' + hundred * interval '15 seconds',
+       thousand::bit(10),
+       tenthous::bit(16)::varbit,
+       tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+       format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+       format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+
+CREATE INDEX brinidx ON brintest USING brin (
+       byteacol,
+       charcol,
+       namecol,
+       int8col,
+       int2col,
+       int4col,
+       textcol,
+       oidcol,
+       tidcol,
+       float4col,
+       float8col,
+       macaddrcol,
+       inetcol,
+       bpcharcol,
+       datecol,
+       timecol,
+       timestampcol,
+       timestamptzcol,
+       intervalcol,
+       timetzcol,
+       bitcol,
+       varbitcol,
+       numericcol,
+       uuidcol,
+       lsncol
+) with (pages_per_range = 1);
+
+CREATE TABLE brinopers (colname name, op text[], value text[],
+       check (cardinality(op) = cardinality(value)));
+
+INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}');
+INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}');
+INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}');
+INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}');
+INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}');
+INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}');
+INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}');
+INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}');
+INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}');
+INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}');
+INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}');
+INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}');
+INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}');
+INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}');
+INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}');
+INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}');
+
+DO $x$
+DECLARE
+        r record;
+        tabname text;
+        tabname_ss text;
+               count int;
+               query text;
+               plan text;
+BEGIN
+        FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP
+                tabname := format('qry_%s', r.row_number);
+                tabname_ss := tabname || '_ss';
+                               query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+                        tabname, r.colname, r.oper, r.value);
+                               -- run the query using the brin index
+                SET enable_seqscan = 0;
+                SET enable_bitmapscan = 1;
+                EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname);
+                EXECUTE query;
+
+                               -- run the query using a seqscan
+                SET enable_seqscan = 1;
+                SET enable_bitmapscan = 0;
+                               query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+                        tabname_ss, r.colname, r.oper, r.value);
+                EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss);
+                EXECUTE query;
+
+                               -- make sure both return the same results
+                EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss);
+                               GET DIAGNOSTICS count = ROW_COUNT;
+                IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+                EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname);
+                               GET DIAGNOSTICS count = ROW_COUNT;
+                IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+        end loop;
+end;
+$x$;
+
+INSERT INTO brintest SELECT
+       repeat(stringu1, 42)::bytea,
+       substr(stringu1, 1, 1)::"char",
+       stringu1::name, 142857 * tenthous,
+       thousand,
+       twothousand,
+       repeat(stringu1, 42),
+       unique1::oid,
+       format('(%s,%s)', tenthous, twenty)::tid,
+       (four + 1.0)/(hundred+1),
+       odd::float8 / (tenthous + 1),
+       format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+       inet '10.2.3.4' + tenthous,
+       substr(stringu1, 1, 1)::bpchar,
+       date '1995-08-15' + tenthous,
+       time '01:20:30' + thousand * interval '18.5 second',
+       timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+       timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+       justify_days(justify_hours(tenthous * interval '12 minutes')),
+       timetz '01:30:20' + hundred * interval '15 seconds',
+       thousand::bit(10),
+       tenthous::bit(16)::varbit,
+       tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+       format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+       format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+
+SELECT brin_summarize_new_values('brinidx'::regclass);
+
+UPDATE brintest SET int8col = int8col * int4col;
+
+SET synchronous_commit = 1;
index b394c3007699f8e42ab9a92456efa5d13c1028b8..7159a8377ee3f689e26f78efd78316465c41c5a5 100644 (file)
@@ -1195,11 +1195,13 @@ WHERE NOT (
   -- GIN has six support functions. 1-3 are mandatory, 5 is optional, and
   --   at least one of 4 and 6 must be given.
   -- SP-GiST has five support functions, all mandatory
+  -- BRIN has four mandatory support functions, and a bunch of optionals
   amname = 'btree' AND procnums @> '{1}' OR
   amname = 'hash' AND procnums = '{1}' OR
   amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
   amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
-  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+  amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
 );
 
 -- Also, check if there are any pg_opclass entries that don't seem to have
@@ -1218,7 +1220,8 @@ WHERE NOT (
   amname = 'hash' AND procnums = '{1}' OR
   amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
   amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
-  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+  amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+  amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
 );
 
 -- Unfortunately, we can't check the amproc link very well because the