# contrib/pageinspect/Makefile
MODULE_big = pageinspect
-OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o $(WIN32RES)
+OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o brinfuncs.o $(WIN32RES)
EXTENSION = pageinspect
-DATA = pageinspect--1.2.sql pageinspect--1.0--1.1.sql \
+DATA = pageinspect--1.3.sql pageinspect--1.0--1.1.sql \
+ pageinspect--1.2--1.3.sql \
pageinspect--1.1--1.2.sql pageinspect--unpackaged--1.0.sql
PGFILEDESC = "pageinspect - functions to inspect contents of database pages"
--- /dev/null
+ * brinfuncs.c
+ * Functions to investigate BRIN indexes
+ *
+ * Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * contrib/pageinspect/brinfuncs.c
+ */
+#include "postgres.h"
+#include "access/htup_details.h"
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_page.h"
+#include "access/brin_revmap.h"
+#include "access/brin_tuple.h"
+#include "catalog/index.h"
+#include "catalog/pg_type.h"
+#include "funcapi.h"
+#include "lib/stringinfo.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "miscadmin.h"
+/*
+ * Per-column output state for brin_page_items.  nstored is copied from the
+ * opclass' oi_nstored when the state is built.
+ */
+typedef struct brin_column_state
+ int nstored;
+} brin_column_state;
+/* Cross-call state kept in fctx->user_fctx while brin_page_items iterates */
+typedef struct brin_page_state
+ BrinDesc *bdesc; /* descriptor obtained from brin_build_desc() */
+ Page page; /* the verified BRIN page being decoded */
+ OffsetNumber offset; /* item offset currently being examined */
+ bool unusedItem; /* set when the item at "offset" is not in use */
+ bool done; /* set once "offset" passes the page's max offset */
+ AttrNumber attno; /* 1-based attribute of dtup being output */
+ BrinMemTuple *dtup; /* deformed current tuple; NULL means fetch next */
+ brin_column_state *columns[FLEXIBLE_ARRAY_MEMBER]; /* one pointer per index attribute */
+} brin_page_state;
+static Page verify_brin_page(bytea *raw_page, uint16 type,
+ const char *strtype);
+ bytea *raw_page = PG_GETARG_BYTEA_P(0);
+ Page page = VARDATA(raw_page);
+ BrinSpecialSpace *special;
+ char *type;
+ special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+ switch (special->type)
+ {
+ type = "meta";
+ break;
+ type = "revmap";
+ break;
+ type = "regular";
+ break;
+ default:
+ type = psprintf("unknown (%02x)", special->type);
+ break;
+ }
+ PG_RETURN_TEXT_P(cstring_to_text(type));
+ * Verify that the given bytea contains a BRIN page of the indicated page
+ * type, or die in the attempt. A pointer to the page is returned.
+ */
+static Page
+verify_brin_page(bytea *raw_page, uint16 type, const char *strtype)
+ Page page;
+ int raw_page_size;
+ BrinSpecialSpace *special;
+ /* payload length of the bytea, excluding the varlena header */
+ raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+ if (raw_page_size < SizeOfPageHeaderData)
+ ereport(ERROR,
+ errmsg("input page too small"),
+ /* expected size first, then the size actually received */
+ errdetail("Expected size %d, got %d", BLCKSZ, raw_page_size)));
+ page = VARDATA(raw_page);
+ /* verify the special space says this page is what we want */
+ special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+ if (special->type != type)
+ ereport(ERROR,
+ errmsg("page is not a BRIN page of type \"%s\"", strtype),
+ errdetail("Expected special type %08x, got %08x.",
+ type, special->type)));
+ return page;
+ * Extract all item values from a BRIN index page
+ *
+ * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass);
+ */
+ brin_page_state *state;
+ FuncCallContext *fctx;
+ if (!superuser())
+ ereport(ERROR,
+ (errmsg("must be superuser to use raw page functions"))));
+ {
+ bytea *raw_page = PG_GETARG_BYTEA_P(0);
+ Oid indexRelid = PG_GETARG_OID(1);
+ Page page;
+ TupleDesc tupdesc;
+ MemoryContext mctx;
+ Relation indexRel;
+ AttrNumber attno;
+ /* minimally verify the page we got */
+ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular");
+ /* create a function context for cross-call persistence */
+ /* switch to memory context appropriate for multiple function calls */
+ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+ indexRel = index_open(indexRelid, AccessShareLock);
+ /*
+ * columns[] is an array of pointers, one per index attribute, so the
+ * trailing array is sized by the pointer type rather than by the
+ * pointed-to struct.
+ */
+ state = palloc(offsetof(brin_page_state, columns) +
+ sizeof(brin_column_state *) * RelationGetDescr(indexRel)->natts);
+ state->bdesc = brin_build_desc(indexRel);
+ state->page = page;
+ state->offset = FirstOffsetNumber;
+ state->unusedItem = false;
+ state->done = false;
+ state->dtup = NULL;
+ /*
+ * Initialize output functions for all indexed datatypes; simplifies
+ * calling them later.
+ */
+ for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++)
+ {
+ Oid output;
+ bool isVarlena;
+ BrinOpcInfo *opcinfo;
+ int i;
+ brin_column_state *column;
+ opcinfo = state->bdesc->bd_info[attno - 1];
+ /* one FmgrInfo slot per value the opclass stores for this column */
+ column = palloc(offsetof(brin_column_state, outputFn) +
+ sizeof(FmgrInfo) * opcinfo->oi_nstored);
+ column->nstored = opcinfo->oi_nstored;
+ for (i = 0; i < opcinfo->oi_nstored; i++)
+ {
+ getTypeOutputInfo(opcinfo->oi_typids[i], &output, &isVarlena);
+ fmgr_info(output, &column->outputFn[i]);
+ }
+ state->columns[attno - 1] = column;
+ }
+ index_close(indexRel, AccessShareLock);
+ fctx->user_fctx = state;
+ fctx->tuple_desc = BlessTupleDesc(tupdesc);
+ MemoryContextSwitchTo(mctx);
+ }
+ state = fctx->user_fctx;
+ if (!state->done)
+ {
+ HeapTuple result;
+ Datum values[7];
+ bool nulls[7];
+ /*
+ * This loop is called once for every attribute of every tuple in the
+ * page. At the start of a tuple, we get a NULL dtup; that's our
+ * signal for obtaining and decoding the next one. If that's not the
+ * case, we output the next attribute.
+ */
+ if (state->dtup == NULL)
+ {
+ BrinTuple *tup;
+ MemoryContext mctx;
+ ItemId itemId;
+ /* deformed tuple must live across calls */
+ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+ /* verify item status: if there's no data, we can't decode */
+ itemId = PageGetItemId(state->page, state->offset);
+ if (ItemIdIsUsed(itemId))
+ {
+ tup = (BrinTuple *) PageGetItem(state->page,
+ PageGetItemId(state->page,
+ state->offset));
+ state->dtup = brin_deform_tuple(state->bdesc, tup);
+ state->attno = 1;
+ state->unusedItem = false;
+ }
+ else
+ state->unusedItem = true;
+ MemoryContextSwitchTo(mctx);
+ }
+ else
+ state->attno++;
+ MemSet(nulls, 0, sizeof(nulls));
+ if (state->unusedItem)
+ {
+ values[0] = UInt16GetDatum(state->offset);
+ nulls[1] = true;
+ nulls[2] = true;
+ nulls[3] = true;
+ nulls[4] = true;
+ nulls[5] = true;
+ nulls[6] = true;
+ }
+ else
+ {
+ int att = state->attno - 1;
+ values[0] = UInt16GetDatum(state->offset);
+ values[1] = UInt32GetDatum(state->dtup->bt_blkno);
+ values[2] = UInt16GetDatum(state->attno);
+ values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls);
+ values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls);
+ values[5] = BoolGetDatum(state->dtup->bt_placeholder);
+ if (!state->dtup->bt_columns[att].bv_allnulls)
+ {
+ BrinValues *bvalues = &state->dtup->bt_columns[att];
+ StringInfoData s;
+ bool first;
+ int i;
+ initStringInfo(&s);
+ appendStringInfoChar(&s, '{');
+ first = true;
+ for (i = 0; i < state->columns[att]->nstored; i++)
+ {
+ char *val;
+ if (!first)
+ appendStringInfoString(&s, " .. ");
+ first = false;
+ val = OutputFunctionCall(&state->columns[att]->outputFn[i],
+ bvalues->bv_values[i]);
+ appendStringInfoString(&s, val);
+ pfree(val);
+ }
+ appendStringInfoChar(&s, '}');
+ values[6] = CStringGetTextDatum(s.data);
+ pfree(s.data);
+ }
+ else
+ {
+ nulls[6] = true;
+ }
+ }
+ result = heap_form_tuple(fctx->tuple_desc, values, nulls);
+ /*
+ * If the item was unused, jump straight to the next one; otherwise,
+ * the only cleanup needed here is to set our signal to go to the next
+ * tuple in the following iteration, by freeing the current one.
+ */
+ if (state->unusedItem)
+ state->offset = OffsetNumberNext(state->offset);
+ else if (state->attno >= state->bdesc->bd_tupdesc->natts)
+ {
+ pfree(state->dtup);
+ state->dtup = NULL;
+ state->offset = OffsetNumberNext(state->offset);
+ }
+ /*
+ * If we're beyond the end of the page, set flag to end the function in
+ * the following iteration.
+ */
+ if (state->offset > PageGetMaxOffsetNumber(state->page))
+ state->done = true;
+ SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result));
+ }
+ brin_free_desc(state->bdesc);
+ bytea *raw_page = PG_GETARG_BYTEA_P(0);
+ Page page;
+ BrinMetaPageData *meta;
+ TupleDesc tupdesc;
+ Datum values[4];
+ bool nulls[4];
+ HeapTuple htup;
+ page = verify_brin_page(raw_page, BRIN_PAGETYPE_META, "metapage");
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+ tupdesc = BlessTupleDesc(tupdesc);
+ /* Extract values from the metapage */
+ meta = (BrinMetaPageData *) PageGetContents(page);
+ MemSet(nulls, 0, sizeof(nulls));
+ values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->brinMagic));
+ values[1] = Int32GetDatum(meta->brinVersion);
+ values[2] = Int32GetDatum(meta->pagesPerRange);
+ values[3] = Int64GetDatum(meta->lastRevmapPage);
+ htup = heap_form_tuple(tupdesc, values, nulls);
+ PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+ * Return the TID array stored in a BRIN revmap page
+ */
+ struct
+ {
+ ItemPointerData *tids;
+ int idx;
+ } *state;
+ FuncCallContext *fctx;
+ if (!superuser())
+ ereport(ERROR,
+ (errmsg("must be superuser to use raw page functions"))));
+ {
+ bytea *raw_page = PG_GETARG_BYTEA_P(0);
+ MemoryContext mctx;
+ Page page;
+ /* minimally verify the page we got */
+ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REVMAP, "revmap");
+ /* create a function context for cross-call persistence */
+ /* switch to memory context appropriate for multiple function calls */
+ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+ state = palloc(sizeof(*state));
+ state->tids = ((RevmapContents *) PageGetContents(page))->rm_tids;
+ state->idx = 0;
+ fctx->user_fctx = state;
+ MemoryContextSwitchTo(mctx);
+ }
+ state = fctx->user_fctx;
+ if (state->idx < REVMAP_PAGE_MAXITEMS)
+ SRF_RETURN_NEXT(fctx, PointerGetDatum(&state->tids[state->idx++]));
--- /dev/null
+/* contrib/pageinspect/pageinspect--1.2--1.3.sql */
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.3'" to load this file. \quit
+-- brin_page_type()
+CREATE FUNCTION brin_page_type(IN page bytea)
+AS 'MODULE_PATHNAME', 'brin_page_type'
+-- brin_metapage_info()
+CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
+ OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
+AS 'MODULE_PATHNAME', 'brin_metapage_info'
+-- brin_revmap_data()
+CREATE FUNCTION brin_revmap_data(IN page bytea,
+ OUT pages tid)
+AS 'MODULE_PATHNAME', 'brin_revmap_data'
+-- brin_page_items()
+CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
+ OUT itemoffset int,
+ OUT blknum int,
+ OUT attnum int,
+ OUT allnulls bool,
+ OUT hasnulls bool,
+ OUT placeholder bool,
+ OUT value text)
+AS 'MODULE_PATHNAME', 'brin_page_items'
-/* contrib/pageinspect/pageinspect--1.2.sql */
+/* contrib/pageinspect/pageinspect--1.3.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pageinspect" to load this file. \quit
AS 'MODULE_PATHNAME', 'bt_page_items'
+-- brin_page_type()
+CREATE FUNCTION brin_page_type(IN page bytea)
+AS 'MODULE_PATHNAME', 'brin_page_type'
+-- brin_metapage_info()
+CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
+ OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
+AS 'MODULE_PATHNAME', 'brin_metapage_info'
+-- brin_revmap_data()
+CREATE FUNCTION brin_revmap_data(IN page bytea,
+ OUT pages tid)
+AS 'MODULE_PATHNAME', 'brin_revmap_data'
+-- brin_page_items()
+CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
+ OUT itemoffset int,
+ OUT blknum int,
+ OUT attnum int,
+ OUT allnulls bool,
+ OUT hasnulls bool,
+ OUT placeholder bool,
+ OUT value text)
+AS 'MODULE_PATHNAME', 'brin_page_items'
-- fsm_page_contents()
# pageinspect extension
comment = 'inspect the contents of database pages at a low level'
-default_version = '1.2'
+default_version = '1.3'
module_pathname = '$libdir/pageinspect'
relocatable = true
#define FRONTEND 1
#include "postgres.h"
+#include "access/brin_xlog.h"
#include "access/clog.h"
#include "access/gin.h"
#include "access/gist_private.h"
--- /dev/null
+<!-- doc/src/sgml/brin.sgml -->
+<chapter id="BRIN">
+<title>BRIN Indexes</title>
+ <indexterm>
+ <primary>index</primary>
+ <secondary>BRIN</secondary>
+ </indexterm>
+<sect1 id="brin-intro">
+ <title>Introduction</title>
+ <para>
+ <acronym>BRIN</acronym> stands for Block Range Index.
+ <acronym>BRIN</acronym> is designed for handling very large tables
+ in which certain columns have some natural correlation with their
+ physical location within the table.
+ A <firstterm>block range</> is a group of pages that are physically
+ adjacent in the table; for each block range, some summary info is stored
+ by the index.
+ For example, a table storing a store's sale orders might have
+ a date column on which each order was placed, and most of the time
+ the entries for earlier orders will appear earlier in the table as well;
+ a table storing a ZIP code column might have all codes for a city
+ grouped together naturally.
+ </para>
+ <para>
+ <acronym>BRIN</acronym> indexes can satisfy queries via regular bitmap
+ index scans, and will return all tuples in all pages within each range if
+ the summary info stored by the index is <firstterm>consistent</> with the
+ query conditions.
+ The query executor is in charge of rechecking these tuples and discarding
+ those that do not match the query conditions — in other words, these
+ indexes are lossy.
+ Because a <acronym>BRIN</acronym> index is very small, scanning the index
+ adds little overhead compared to a sequential scan, but may avoid scanning
+ large parts of the table that are known not to contain matching tuples.
+ </para>
+ <para>
+ The specific data that a <acronym>BRIN</acronym> index will store,
+ as well as the specific queries that the index will be able to satisfy,
+ depend on the operator class selected for each column of the index.
+ Data types having a linear sort order can have operator classes that
+ store the minimum and maximum value within each block range, for instance;
+ geometrical types might store the bounding box for all the objects
+ in the block range.
+ </para>
+ <para>
+ The size of the block range is determined at index creation time by
+ the <literal>pages_per_range</> storage parameter. The number of index
+ entries will be equal to the size of the relation in pages divided by
+ the selected value for <literal>pages_per_range</>. Therefore, the smaller
+ the number, the larger the index becomes (because of the need to
+ store more index entries), but at the same time the summary data stored can
+ be more precise and more data blocks can be skipped during an index scan.
+ </para>
+<sect1 id="brin-builtin-opclasses">
+ <title>Built-in Operator Classes</title>
+ <para>
+ The core <productname>PostgreSQL</productname> distribution
+ includes the <acronym>BRIN</acronym> operator classes shown in
+ <xref linkend="brin-builtin-opclasses-table">.
+ </para>
+ <para>
+ The <firstterm>minmax</>
+ operator classes store the minimum and the maximum values appearing
+ in the indexed column within the range.
+ </para>
+ <table id="brin-builtin-opclasses-table">
+ <title>Built-in <acronym>BRIN</acronym> Operator Classes</title>
+ <tgroup cols="3">
+ <thead>
+ <row>
+ <entry>Name</entry>
+ <entry>Indexed Data Type</entry>
+ <entry>Indexable Operators</entry>
+ </row>
+ </thead>
+ <tbody>
+ <row>
+ <entry><literal>bytea_minmax_ops</literal></entry>
+ <entry><type>bytea</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>char_minmax_ops</literal></entry>
+ <entry><type>"char"</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>name_minmax_ops</literal></entry>
+ <entry><type>name</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>int8_minmax_ops</literal></entry>
+ <entry><type>bigint</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>int2_minmax_ops</literal></entry>
+ <entry><type>smallint</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>int4_minmax_ops</literal></entry>
+ <entry><type>integer</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>text_minmax_ops</literal></entry>
+ <entry><type>text</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>oid_minmax_ops</literal></entry>
+ <entry><type>oid</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>tid_minmax_ops</literal></entry>
+ <entry><type>tid</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>float4_minmax_ops</literal></entry>
+ <entry><type>real</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>float8_minmax_ops</literal></entry>
+ <entry><type>double precision</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>abstime_minmax_ops</literal></entry>
+ <entry><type>abstime</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>reltime_minmax_ops</literal></entry>
+ <entry><type>reltime</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>macaddr_minmax_ops</literal></entry>
+ <entry><type>macaddr</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>inet_minmax_ops</literal></entry>
+ <entry><type>inet</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>bpchar_minmax_ops</literal></entry>
+ <entry><type>character</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>date_minmax_ops</literal></entry>
+ <entry><type>date</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>time_minmax_ops</literal></entry>
+ <entry><type>time without time zone</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>timestamp_minmax_ops</literal></entry>
+ <entry><type>timestamp without time zone</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>timestamptz_minmax_ops</literal></entry>
+ <entry><type>timestamp with time zone</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>interval_minmax_ops</literal></entry>
+ <entry><type>interval</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>timetz_minmax_ops</literal></entry>
+ <entry><type>time with time zone</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>bit_minmax_ops</literal></entry>
+ <entry><type>bit</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>varbit_minmax_ops</literal></entry>
+ <entry><type>bit varying</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>numeric_minmax_ops</literal></entry>
+ <entry><type>numeric</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>uuid_minmax_ops</literal></entry>
+ <entry><type>uuid</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ <row>
+ <entry><literal>pg_lsn_minmax_ops</literal></entry>
+ <entry><type>pg_lsn</type></entry>
+ <entry>
+ <literal><</literal>
+ <literal><=</literal>
+ <literal>=</literal>
+ <literal>>=</literal>
+ <literal>></literal>
+ </entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+<sect1 id="brin-extensibility">
+ <title>Extensibility</title>
+ <para>
+ The <acronym>BRIN</acronym> interface has a high level of abstraction,
+ requiring the access method implementer only to implement the semantics
+ of the data type being accessed. The <acronym>BRIN</acronym> layer
+ itself takes care of concurrency, logging and searching the index structure.
+ </para>
+ <para>
+ All it takes to get a <acronym>BRIN</acronym> access method working is to
+ implement a few user-defined methods, which define the behavior of
+ summary values stored in the index and the way they interact with
+ scan keys.
+ In short, <acronym>BRIN</acronym> combines
+ extensibility with generality, code reuse, and a clean interface.
+ </para>
+ <para>
+ There are four methods that an operator class for <acronym>BRIN</acronym>
+ must provide:
+ <variablelist>
+ <varlistentry>
+ <term><function>BrinOpcInfo *opcInfo(Oid type_oid)</></term>
+ <listitem>
+ <para>
+ Returns internal information about the indexed columns' summary data.
+ The return value must point to a palloc'd <structname>BrinOpcInfo</>,
+ which has this definition:
+typedef struct BrinOpcInfo
+ /* Number of columns stored in an index column of this opclass */
+ uint16 oi_nstored;
+ /* Opaque pointer for the opclass' private use */
+ void *oi_opaque;
+ /* Type IDs of the stored columns */
+} BrinOpcInfo;
+ <structname>BrinOpcInfo</>.<structfield>oi_opaque</> can be used by the
+ operator class routines to pass information between support procedures
+ during an index scan.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><function>bool consistent(BrinDesc *bdesc, BrinValues *column,
+ ScanKey key)</function></term>
+ <listitem>
+ <para>
+ Returns whether the ScanKey is consistent with the given indexed
+ values for a range.
+ The attribute number to use is passed as part of the scan key.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><function>bool addValue(BrinDesc *bdesc, BrinValues *column,
+ Datum newval, bool isnull)</function></term>
+ <listitem>
+ <para>
+ Given an index tuple and an indexed value, modifies the indicated
+ attribute of the tuple so that it additionally represents the new value.
+ If any modification was done to the tuple, <literal>true</literal> is
+ returned.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><function>bool unionTuples(BrinDesc *bdesc, BrinValues *a,
+ BrinValues *b)</function></term>
+ <listitem>
+ <para>
+ Consolidates two index tuples. Given two index tuples, modifies the
+ indicated attribute of the first of them so that it represents both tuples.
+ The second tuple is not modified.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ To implement these methods in a generic way, the operator class
+ defines its own internal support functions.
+ (For instance, <quote>min/max</> operator classes implement
+ support functions for the four inequality operators for the data type.)
+ Additionally, the operator class must supply appropriate
+ operator entries,
+ to enable the optimizer to use the index when those operators are
+ used in queries.
+ </para>
<!ENTITY gist SYSTEM "gist.sgml">
<!ENTITY spgist SYSTEM "spgist.sgml">
<!ENTITY gin SYSTEM "gin.sgml">
+<!ENTITY brin SYSTEM "brin.sgml">
<!ENTITY planstats SYSTEM "planstats.sgml">
<!ENTITY indexam SYSTEM "indexam.sgml">
<!ENTITY nls SYSTEM "nls.sgml">
<productname>PostgreSQL</productname> provides several index types:
- B-tree, Hash, GiST, SP-GiST and GIN. Each index type uses a different
+ B-tree, Hash, GiST, SP-GiST, GIN and BRIN.
+ Each index type uses a different
algorithm that is best suited to different types of queries.
By default, the <command>CREATE INDEX</command> command creates
B-tree indexes, which fit the most common situations.
classes are available in the <literal>contrib</> collection or as separate
projects. For more information see <xref linkend="GIN">.
+ <para>
+ <indexterm>
+ <primary>index</primary>
+ <secondary>BRIN</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>BRIN</primary>
+ <see>index</see>
+ </indexterm>
+ BRIN indexes (a shorthand for Block Range indexes)
+ store summaries about the values stored in consecutive physical block ranges of a table.
+ Like GiST, SP-GiST and GIN,
+ BRIN can support many different indexing strategies,
+ and the particular operators with which a BRIN index can be used
+ vary depending on the indexing strategy.
+ For data types that have a linear sort order, the indexed data
+ consists of the minimum and maximum values
+ in the column for each block range;
+ this supports indexed queries using these operators:
+ <simplelist>
+ <member><literal><</literal></member>
+ <member><literal><=</literal></member>
+ <member><literal>=</literal></member>
+ <member><literal>>=</literal></member>
+ <member><literal>></literal></member>
+ </simplelist>
+ The BRIN operator classes included in the standard distribution are
+ documented in <xref linkend="brin-builtin-opclasses-table">.
+ For more information see <xref linkend="BRIN">.
+ </para>
+ <varlistentry>
+ <term>
+ <function>brin_page_type(page bytea) returns text</function>
+ <indexterm>
+ <primary>brin_page_type</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ <function>brin_page_type</function> returns the page type of the given
+ <acronym>BRIN</acronym> index page, or throws an error if the page is
+ not a valid <acronym>BRIN</acronym> page. For example:
+brintest=# select brin_page_type(get_raw_page('brinidx', 0));
+ brin_page_type
+ meta
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>
+ <function>brin_metapage_info(page bytea) returns record</function>
+ <indexterm>
+ <primary>brin_metapage_info</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ <function>brin_metapage_info</function> returns assorted information
+ about a <acronym>BRIN</acronym> index metapage. For example:
+brintest=# select * from brin_metapage_info(get_raw_page('brinidx', 0));
+ magic | version | pagesperrange | lastrevmappage
+ 0xA8109CFA | 1 | 4 | 2
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>
+ <function>brin_revmap_data(page bytea) returns setof tid</function>
+ <indexterm>
+ <primary>brin_revmap_data</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ <function>brin_revmap_data</function> returns the list of tuple
+ identifiers in a <acronym>BRIN</acronym> index range map page.
+ For example:
+brintest=# select * from brin_revmap_data(get_raw_page('brinidx', 2)) limit 5;
+ pages
+ (6,137)
+ (6,138)
+ (6,139)
+ (6,140)
+ (6,141)
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>
+ <function>brin_page_items(page bytea, index_oid regclass) returns setof record</function>
+ <indexterm>
+ <primary>brin_page_items</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ <function>brin_page_items</function> returns the data stored in the
+ <acronym>BRIN</acronym> data page. For example:
+brintest=# select * from brin_page_items(get_raw_page('brinidx', 5),
+brintest(# 'brinidx')
+brintest-# order by blknum, attnum limit 6;
+ itemoffset | blknum | attnum | allnulls | hasnulls | placeholder | value
+ 137 | 0 | 1 | t | f | f |
+ 137 | 0 | 2 | f | f | f | {1 .. 88}
+ 138 | 4 | 1 | t | f | f |
+ 138 | 4 | 2 | f | f | f | {89 .. 176}
+ 139 | 8 | 1 | t | f | f |
+ 139 | 8 | 2 | f | f | f | {177 .. 264}
+ The returned columns correspond to the fields in the
+ <structname>BrinMemTuple</> and <structname>BrinValues</> structs.
+ See <filename>src/include/access/brin_tuple.h</> for details.
+ </para>
+ </listitem>
+ </varlistentry>
<function>fsm_page_contents(page bytea) returns text</function>
+ &brin;
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam
+SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam
include $(top_srcdir)/src/backend/common.mk
--- /dev/null
+# Makefile--
+# Makefile for access/brin
+# src/backend/access/brin/Makefile
+subdir = src/backend/access/brin
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+OBJS = brin.o brin_pageops.o brin_revmap.o brin_tuple.o brin_xlog.o \
+ brin_minmax.o
+include $(top_srcdir)/src/backend/common.mk
--- /dev/null
+Block Range Indexes (BRIN)
+BRIN indexes are intended to enable very fast scanning of extremely large tables.
+The essential idea of a BRIN index is to keep track of summarizing values in
+consecutive groups of heap pages (page ranges); for example, the minimum and
+maximum values for datatypes with a btree opclass, or the bounding box for
+geometric types. These values can be used to avoid scanning such pages
+during a table scan, depending on query quals.
+The cost of this is having to update the stored summary values of each page
+range as tuples are inserted into them.
+Access Method Design
+Since item pointers are not stored inside indexes of this type, it is not
+possible to support the amgettuple interface. Instead, we only provide
+amgetbitmap support. The amgetbitmap routine returns a lossy TIDBitmap
+comprising all pages in those page ranges that match the query
+qualifications.  The recheck step in the BitmapHeapScan node prunes tuples
+that do not satisfy the query qualifications.
+An operator class must have the following entries:
+- generic support procedures (pg_amproc), identical for all opclasses:
+ * "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index
+ creation or scanning
+ * "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item,
+ and possibly changes the index tuple so that it includes the heap item
+ values
+ * "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query
+ quals, and returns whether the index tuple values match the query quals.
+ * "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first
+ one so that it represents the union of the two.
+Procedure numbers up to 10 are reserved for future expansion.
+Additionally, each opclass needs additional support functions:
+- Minmax-style operator classes:
+ * Proc numbers 11-14 are used for the functions implementing inequality
+ operators for the type, in this order: less than, less or equal,
+ greater or equal, greater than.
+Opclasses using a different design will require different additional procedure
+numbers.
+Operator classes also need to have operator (pg_amop) entries so that the
+optimizer can choose the index to execute queries.
+- Minmax-style operator classes:
+ * The same operators as btree (<=, <, =, >=, >)
+Each index tuple stores some NULL bits and some opclass-specified values. The
+NULL bits are stored in a single null bitmask of length twice the number of
+columns. The generic NULL bits indicate, for each column:
+ * bt_hasnulls: Whether there's any NULL value at all in the page range
+ * bt_allnulls: Whether all values are NULLs in the page range
+The opclass-specified values are:
+- Minmax-style operator classes
+ * minimum value across all tuples in the range
+ * maximum value across all tuples in the range
+Note that the addValue and Union support procedures must be careful to
+datumCopy() the values they want to store in the in-memory BRIN tuple, and
+must pfree() the old copies when replacing older ones. Since some values
+referenced from the tuple persist and others go away, there is no
+well-defined lifetime for a memory context that would make this automatic.
+The Range Map
+To find the index tuple for a particular page range, we have an internal
+structure we call the range map, or "revmap" for short. This stores one TID
+per page range, which is the address of the index tuple summarizing that
+range. Since the map entries are fixed size, it is possible to compute the
+address of the range map entry for any given heap page by simple arithmetic.
+When a new heap tuple is inserted in a summarized page range, we compare the
+existing index tuple with the new heap tuple. If the heap tuple is outside
+the summarization data given by the index tuple for any indexed column (or
+if the new heap tuple contains null values but the index tuple indicates
+there are no nulls), the index is updated with the new values. In many
+cases it is possible to update the index tuple in-place, but if the new
+index tuple is larger than the old one and there's not enough space in the
+page, it is necessary to create a new index tuple with the new values. The
+range map can be updated quickly to point to it; the old index tuple is
+removed.
+If the range map points to an invalid TID, the corresponding page range is
+considered to be not summarized. When tuples are added to unsummarized
+pages, nothing needs to happen.
+To scan a table following a BRIN index, we scan the range map sequentially.
+This yields index tuples in ascending page range order. Query quals are
+matched to each index tuple; if they match, each page within the page range
+is returned as part of the output TID bitmap. If there's no match, they are
+skipped. Range map entries returning invalid index TIDs, that is
+unsummarized page ranges, are also returned in the TID bitmap.
+The revmap is stored in the first few blocks of the index main fork,
+immediately following the metapage. Whenever the revmap needs to be
+extended by another page, existing tuples in that page are moved to some
+other page.
+Heap tuples can be removed from anywhere without restriction. It might be
+useful to mark the corresponding index tuple somehow, if the heap tuple is
+one of the constraining values of the summary data (i.e. either min or max
+in the case of a btree-opclass-bearing datatype), so that in the future we
+are aware of the need to re-execute summarization on that range, leading to
+a possible tightening of the summary values.
+At index creation time, the whole table is scanned; for each page range the
+summarizing values of each indexed column and nulls bitmap are collected and
+stored in the index. The partially-filled page range at the end of the
+table is also summarized.
+As new tuples get inserted at the end of the table, they may update the
+index tuple that summarizes the partial page range at the end. Eventually
+that page range is complete and new tuples belong in a new page range that
+hasn't yet been summarized. Those insertions do not create a new index
+entry; instead, the page range remains unsummarized until later.
+When VACUUM is run on the table, all unsummarized page ranges are
+summarized. This action can also be invoked by the user via
+brin_summarize_new_values(). Both these procedures scan all the
+unsummarized ranges, and create a summary tuple. Again, this includes the
+partially-filled page range at the end of the table.
+Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the
+index when heap tuples are removed. It might be that some summary values can
+be tightened if heap tuples have been deleted; but this would represent an
+optimization opportunity only, not a correctness issue. It's simpler to
+represent this as the need to re-run summarization on the affected page range
+rather than "subtracting" values from the existing one. This is not
+currently implemented.
+Note that if there are no indexes on the table other than the BRIN index,
+usage of maintenance_work_mem by vacuum can be decreased significantly, because
+no detailed index scan needs to take place (and thus it's not necessary for
+vacuum to save TIDs to remove). It's unlikely that BRIN indexes would be the
+only indexes on a table, though, because primary keys can be btrees only, and so
+we don't implement this optimization.
+The optimizer selects the index based on the operator class' pg_amop
+entries for the column.
+Future improvements
+* Different-size page ranges?
+ In the current design, each "index entry" in a BRIN index covers the same
+ number of pages. There's no hard reason for this; it might make sense to
+ allow the index to self-tune so that some index entries cover smaller page
+ ranges, if this allows the summary values to be more compact. This would incur
+ larger BRIN overhead for the index itself, but might allow better pruning of
+ page ranges during scan. In the limit of one index tuple per page, the index
+ itself would occupy too much space, even though we would be able to skip
+ reading the most heap pages, because the summary values are tight; in the
+ opposite limit of a single tuple that summarizes the whole table, we wouldn't
+ be able to prune anything even though the index is very small. This can
+ probably be made to work by using the range map as an index in itself.
+* More compact representation for TIDBitmap?
+ TIDBitmap is the structure used to represent bitmap scans. The
+ representation of lossy page ranges is not optimal for our purposes, because
+ it uses a Bitmapset to represent pages in the range; since we're going to return
+ all pages in a large range, it might be more convenient to allow for a
+ struct that uses start and end page numbers to represent the range, instead.
+* Better vacuuming?
+ It might be useful to enable passing more useful info to BRIN indexes during
+ vacuuming about tuples that are deleted, i.e. do not require the callback to
+ pass each tuple's TID. For instance we might need a callback that passes a
+ block number instead of a TID. That would help determine when to re-run
+ summarization on blocks that have seen lots of tuple deletions.
--- /dev/null
+ * brin.c
+ * Implementation of BRIN indexes for Postgres
+ *
+ * See src/backend/access/brin/README for details.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/brin/brin.c
+ *
+ * TODO
+ * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
+ */
+#include "postgres.h"
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/reloptions.h"
+#include "access/relscan.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+ * We use a BrinBuildState during initial construction of a BRIN index.
+ * The running state is kept in a BrinMemTuple.
+ */
+typedef struct BrinBuildState /* working state while building or summarizing a BRIN index */
+ Relation bs_irel; /* index relation tuples are inserted into */
+ int bs_numtuples; /* reported by brinbuild as the index tuple count */
+ Buffer bs_currentInsertBuf; /* last index buffer used; released at termination */
+ BlockNumber bs_pagesPerRange; /* number of heap pages covered by one range */
+ BlockNumber bs_currRangeStart; /* first heap block of the range being accumulated */
+ BrinRevmap *bs_rmAccess; /* revmap access handle supplied by the caller */
+ BrinDesc *bs_bdesc; /* descriptor obtained from brin_build_desc() */
+ BrinMemTuple *bs_dtuple; /* in-memory tuple accumulating the current range */
+} BrinBuildState;
+ * Struct used as "opaque" during index scans
+ */
+typedef struct BrinOpaque /* per-scan state hung off scan->opaque */
+ BlockNumber bo_pagesPerRange; /* filled in by brinRevmapInitialize() at scan start */
+ BrinRevmap *bo_rmAccess; /* revmap access handle; terminated at end of scan */
+ BrinDesc *bo_bdesc; /* descriptor from brin_build_desc(); freed at end of scan */
+} BrinOpaque;
+static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
+ BrinRevmap *revmap, BlockNumber pagesPerRange);
+static void terminate_brin_buildstate(BrinBuildState *state);
+static void brinsummarize(Relation index, Relation heapRel,
+ double *numSummarized, double *numExisting);
+static void form_and_insert_tuple(BrinBuildState *state);
+static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
+ BrinTuple *b);
+ * A tuple in the heap is being inserted. To keep a brin index up to date,
+ * we need to obtain the relevant index tuple and compare its stored values
+ * with those of the new tuple. If the tuple values are not consistent with
+ * the summary tuple, we need to update the index tuple.
+ *
+ * If the range is not currently summarized (i.e. the revmap returns NULL for
+ * it), there's nothing to do.
+ */
+ Relation idxRel = (Relation) PG_GETARG_POINTER(0);
+ Datum *values = (Datum *) PG_GETARG_POINTER(1);
+ bool *nulls = (bool *) PG_GETARG_POINTER(2);
+ ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3);
+ /* we ignore the rest of our arguments */
+ BlockNumber pagesPerRange;
+ BrinDesc *bdesc = NULL;
+ BrinRevmap *revmap;
+ Buffer buf = InvalidBuffer;
+ MemoryContext tupcxt = NULL;
+ MemoryContext oldcxt = NULL;
+ revmap = brinRevmapInitialize(idxRel, &pagesPerRange);
+ for (;;)
+ {
+ bool need_insert = false;
+ OffsetNumber off;
+ BrinTuple *brtup;
+ BrinMemTuple *dtup;
+ BlockNumber heapBlk;
+ int keyno;
+ BrinTuple *tmptup PG_USED_FOR_ASSERTS_ONLY;
+ BrinMemTuple *tmpdtup PG_USED_FOR_ASSERTS_ONLY;
+ heapBlk = ItemPointerGetBlockNumber(heaptid);
+ /* normalize the block number to be the first block in the range */
+ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange;
+ brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
+ /* if range is unsummarized, there's nothing to do */
+ if (!brtup)
+ break;
+ /* First time through? */
+ if (bdesc == NULL)
+ {
+ bdesc = brin_build_desc(idxRel);
+ tupcxt = AllocSetContextCreate(CurrentMemoryContext,
+ "brininsert cxt",
+ oldcxt = MemoryContextSwitchTo(tupcxt);
+ }
+ dtup = brin_deform_tuple(bdesc, brtup);
+ {
+ /*
+ * When assertions are enabled, we use this as an opportunity to
+ * test the "union" method, which would otherwise be used very
+ * rarely: first create a placeholder tuple, and addValue the
+ * value we just got into it. Then union the existing index tuple
+ * with the updated placeholder tuple. The tuple resulting from
+ * that union should be identical to the one resulting from the
+ * regular operation (straight addValue) below.
+ *
+ * Here we create the tuple to compare with; the actual comparison
+ * is below.
+ */
+ tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz);
+ tmpdtup = brin_deform_tuple(bdesc, tmptup);
+ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+ {
+ BrinValues *bval;
+ FmgrInfo *addValue;
+ bval = &tmpdtup->bt_columns[keyno];
+ addValue = index_getprocinfo(idxRel, keyno + 1,
+ FunctionCall4Coll(addValue,
+ idxRel->rd_indcollation[keyno],
+ PointerGetDatum(bdesc),
+ PointerGetDatum(bval),
+ values[keyno],
+ nulls[keyno]);
+ }
+ union_tuples(bdesc, tmpdtup, brtup);
+ tmpdtup->bt_placeholder = dtup->bt_placeholder;
+ tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz);
+ }
+ /*
+ * Compare the key values of the new tuple to the stored index values;
+ * our deformed tuple will get updated if the new tuple doesn't fit
+ * the original range (note this means we can't break out of the loop
+ * early). Make a note of whether this happens, so that we know to
+ * insert the modified tuple later.
+ */
+ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+ {
+ Datum result;
+ BrinValues *bval;
+ FmgrInfo *addValue;
+ bval = &dtup->bt_columns[keyno];
+ addValue = index_getprocinfo(idxRel, keyno + 1,
+ result = FunctionCall4Coll(addValue,
+ idxRel->rd_indcollation[keyno],
+ PointerGetDatum(bdesc),
+ PointerGetDatum(bval),
+ values[keyno],
+ nulls[keyno]);
+ /* if that returned true, we need to insert the updated tuple */
+ need_insert |= DatumGetBool(result);
+ }
+ {
+ /*
+ * Now we can compare the tuple produced by the union function
+ * with the one from plain addValue.
+ */
+ BrinTuple *cmptup;
+ Size cmpsz;
+ cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz);
+ Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz));
+ }
+ if (!need_insert)
+ {
+ /*
+ * The tuple is consistent with the new values, so there's nothing
+ * to do.
+ */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ else
+ {
+ Page page = BufferGetPage(buf);
+ ItemId lp = PageGetItemId(page, off);
+ Size origsz;
+ BrinTuple *origtup;
+ Size newsz;
+ BrinTuple *newtup;
+ bool samepage;
+ /*
+ * Make a copy of the old tuple, so that we can compare it after
+ * re-acquiring the lock.
+ */
+ origsz = ItemIdGetLength(lp);
+ origtup = brin_copy_tuple(brtup, origsz);
+ /*
+ * Form the updated tuple first: the same-page check just below needs
+ * its size (newsz), which is only set by brin_form_tuple. Doing the
+ * check before forming the tuple would read newsz uninitialized.
+ */
+ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
+ /*
+ * Before releasing the lock, check if we can attempt a same-page
+ * update. Another process could insert a tuple concurrently in
+ * the same page though, so downstream we must be prepared to cope
+ * if this turns out to not be possible after all.
+ */
+ samepage = brin_can_do_samepage_update(buf, origsz, newsz);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ /*
+ * Try to update the tuple. If this doesn't work for whatever
+ * reason, we need to restart from the top; the revmap might be
+ * pointing at a different tuple for this block now, so we need to
+ * recompute to ensure both our new heap tuple and the other
+ * inserter's are covered by the combined tuple. It might be that
+ * we don't need to update at all.
+ */
+ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
+ buf, off, origtup, origsz, newtup, newsz,
+ samepage))
+ {
+ /* no luck; start over */
+ MemoryContextResetAndDeleteChildren(tupcxt);
+ continue;
+ }
+ }
+ /* success! */
+ break;
+ }
+ brinRevmapTerminate(revmap);
+ if (BufferIsValid(buf))
+ ReleaseBuffer(buf);
+ /* bdesc is only built once a summarized range was found; undo that setup */
+ if (bdesc != NULL)
+ {
+ brin_free_desc(bdesc);
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(tupcxt);
+ }
+ return BoolGetDatum(false);
+ * Initialize state for a BRIN index scan.
+ *
+ * We read the metapage here to determine the pages-per-range number that this
+ * index was built with. Note that since this cannot be changed while we're
+ * holding lock on index, it's not necessary to recompute it during brinrescan.
+ */
+ Relation r = (Relation) PG_GETARG_POINTER(0); /* index relation to be scanned */
+ int nkeys = PG_GETARG_INT32(1);
+ int norderbys = PG_GETARG_INT32(2);
+ IndexScanDesc scan;
+ BrinOpaque *opaque;
+ scan = RelationGetIndexScan(r, nkeys, norderbys);
+ opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque));
+ opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange); /* also fills in bo_pagesPerRange */
+ opaque->bo_bdesc = brin_build_desc(r);
+ scan->opaque = opaque; /* private scan state, read back by the other scan routines */
+ * Execute the index scan.
+ *
+ * This works by reading index TIDs from the revmap, and obtaining the index
+ * tuples pointed to by them; the summary values in the index tuples are
+ * compared to the scan keys. We return into the TID bitmap all the pages in
+ * ranges corresponding to index tuples that match the scan keys.
+ *
+ * If a TID from the revmap is read as InvalidTID, we know that range is
+ * unsummarized. Pages in those ranges need to be returned regardless of scan
+ * keys.
+ *
+ * XXX see _bt_first on what to do about sk_subtype.
+ */
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
+ Relation idxRel = scan->indexRelation;
+ Buffer buf = InvalidBuffer;
+ BrinDesc *bdesc;
+ Oid heapOid;
+ Relation heapRel;
+ BrinOpaque *opaque;
+ BlockNumber nblocks;
+ BlockNumber heapBlk;
+ /*
+ * Page counter is 64 bits wide: the return value is totalpages * 10, and
+ * on very large tables that product does not fit in a 32-bit int.
+ */
+ int64 totalpages = 0;
+ int keyno;
+ FmgrInfo *consistentFn;
+ MemoryContext oldcxt;
+ MemoryContext perRangeCxt;
+ opaque = (BrinOpaque *) scan->opaque;
+ bdesc = opaque->bo_bdesc;
+ pgstat_count_index_scan(idxRel);
+ /*
+ * We need to know the size of the table so that we know how long to
+ * iterate on the revmap.
+ */
+ heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
+ heapRel = heap_open(heapOid, AccessShareLock);
+ nblocks = RelationGetNumberOfBlocks(heapRel);
+ heap_close(heapRel, AccessShareLock);
+ /*
+ * Obtain consistent functions for all indexed columns. Maybe it'd be
+ * possible to do this lazily only the first time we see a scan key that
+ * involves each particular attribute.
+ */
+ consistentFn = palloc(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts);
+ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+ {
+ FmgrInfo *tmp;
+ tmp = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_CONSISTENT);
+ fmgr_info_copy(&consistentFn[keyno], tmp, CurrentMemoryContext);
+ }
+ /*
+ * Setup and use a per-range memory context, which is reset every time we
+ * loop below. This avoids having to free the tuples within the loop.
+ */
+ perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "bringetbitmap cxt",
+ oldcxt = MemoryContextSwitchTo(perRangeCxt);
+ /*
+ * Now scan the revmap. We start by querying for heap page 0,
+ * incrementing by the number of pages per range; this gives us a full
+ * view of the table.
+ */
+ for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
+ {
+ bool addrange;
+ BrinTuple *tup;
+ OffsetNumber off;
+ Size size;
+ MemoryContextResetAndDeleteChildren(perRangeCxt);
+ tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
+ &off, &size, BUFFER_LOCK_SHARE);
+ if (tup)
+ {
+ /* work on a private copy so the buffer lock can be dropped right away */
+ tup = brin_copy_tuple(tup, size);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ /*
+ * For page ranges with no indexed tuple, we must return the whole
+ * range; otherwise, compare it to the scan keys.
+ */
+ if (tup == NULL)
+ {
+ addrange = true;
+ }
+ else
+ {
+ BrinMemTuple *dtup;
+ int keyno;
+ dtup = brin_deform_tuple(bdesc, tup);
+ if (dtup->bt_placeholder)
+ {
+ /*
+ * Placeholder tuples are always returned, regardless of the
+ * values stored in them.
+ */
+ addrange = true;
+ }
+ else
+ {
+ /*
+ * Compare scan keys with summary values stored for the range.
+ * If scan keys are matched, the page range must be added to
+ * the bitmap. We initially assume the range needs to be
+ * added; in particular this serves the case where there are
+ * no keys.
+ */
+ addrange = true;
+ for (keyno = 0; keyno < scan->numberOfKeys; keyno++)
+ {
+ ScanKey key = &scan->keyData[keyno];
+ AttrNumber keyattno = key->sk_attno;
+ BrinValues *bval = &dtup->bt_columns[keyattno - 1];
+ Datum add;
+ /*
+ * The collation of the scan key must match the collation
+ * used in the index column (but only if the search is not
+ * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using
+ * this index ...
+ */
+ Assert((key->sk_flags & SK_ISNULL) ||
+ (key->sk_collation ==
+ bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation));
+ /*
+ * Check whether the scan key is consistent with the page
+ * range values; if so, have the pages in the range added
+ * to the output bitmap.
+ *
+ * When there are multiple scan keys, failure to meet the
+ * criteria for a single one of them is enough to discard
+ * the range as a whole, so break out of the loop as soon
+ * as a false return value is obtained.
+ */
+ add = FunctionCall3Coll(&consistentFn[keyattno - 1],
+ key->sk_collation,
+ PointerGetDatum(bdesc),
+ PointerGetDatum(bval),
+ PointerGetDatum(key));
+ addrange = DatumGetBool(add);
+ if (!addrange)
+ break;
+ }
+ }
+ }
+ /* add the pages in the range to the output bitmap, if needed */
+ if (addrange)
+ {
+ BlockNumber pageno;
+ for (pageno = heapBlk;
+ pageno <= heapBlk + opaque->bo_pagesPerRange - 1;
+ pageno++)
+ {
+ /* the bitmap must not live in the per-range context, which is reset */
+ MemoryContextSwitchTo(oldcxt);
+ tbm_add_page(tbm, pageno);
+ totalpages++;
+ MemoryContextSwitchTo(perRangeCxt);
+ }
+ }
+ }
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(perRangeCxt);
+ if (buf != InvalidBuffer)
+ ReleaseBuffer(buf);
+ /*
+ * XXX We have an approximation of the number of *pages* that our scan
+ * returns, but we don't have a precise idea of the number of heap tuples
+ * involved.
+ */
+ PG_RETURN_INT64(totalpages * 10);
+ * Re-initialize state for a BRIN index scan
+ */
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); /* replacement scan keys; tested for NULL below */
+ /* other arguments ignored */
+ if (scankey && scan->numberOfKeys > 0) /* nothing to copy when there are no keys */
+ memmove(scan->keyData, scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData)); /* overwrite the scan's key array in place */
+ * Close down a BRIN index scan
+ */
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ BrinOpaque *opaque = (BrinOpaque *) scan->opaque; /* state set up when the scan began */
+ brinRevmapTerminate(opaque->bo_rmAccess); /* release the revmap access handle */
+ brin_free_desc(opaque->bo_bdesc); /* deletes the descriptor's memory context */
+ pfree(opaque);
+ elog(ERROR, "BRIN does not support mark/restore");
+ elog(ERROR, "BRIN does not support mark/restore");
+ * Per-heap-tuple callback for IndexBuildHeapScan.
+ *
+ * Note we don't worry about the page range at the end of the table here; it is
+ * present in the build state struct after we're called the last time, but not
+ * inserted into the index. Caller must ensure to do so, if appropriate.
+ */
+static void
+brinbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *brstate) /* really a BrinBuildState *, see the cast below */
+ BrinBuildState *state = (BrinBuildState *) brstate;
+ BlockNumber thisblock;
+ int i;
+ thisblock = ItemPointerGetBlockNumber(&htup->t_self); /* heap block holding this tuple */
+ /*
+ * If we're in a block that belongs to a future range, summarize what we've
+ * got and start afresh. Note the scan might have skipped many pages,
+ * if they were devoid of live tuples; make sure to insert index tuples
+ * for those too.
+ */
+ while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1) /* loops once per range that ended before thisblock */
+ {
+ BRIN_elog(DEBUG2, "brinbuildCallback: completed a range: %u--%u",
+ state->bs_currRangeStart,
+ state->bs_currRangeStart + state->bs_pagesPerRange); /* NOTE(review): prints one past the last block of the range */
+ /* create the index tuple and insert it */
+ form_and_insert_tuple(state);
+ /* set state to correspond to the next range */
+ state->bs_currRangeStart += state->bs_pagesPerRange;
+ /* re-initialize state for it */
+ brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+ }
+ /* Accumulate the current tuple into the running state */
+ for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++) /* one pass per indexed column */
+ {
+ FmgrInfo *addValue;
+ BrinValues *col;
+ col = &state->bs_dtuple->bt_columns[i];
+ addValue = index_getprocinfo(index, i + 1,
+ /*
+ * Update dtuple state, if and as necessary.
+ */
+ FunctionCall4Coll(addValue,
+ state->bs_bdesc->bd_tupdesc->attrs[i]->attcollation,
+ PointerGetDatum(state->bs_bdesc),
+ PointerGetDatum(col),
+ values[i], isnull[i]); /* return value is deliberately not inspected here */
+ }
+ * brinbuild() -- build a new BRIN index.
+ */
+ Relation heap = (Relation) PG_GETARG_POINTER(0); /* table being indexed */
+ Relation index = (Relation) PG_GETARG_POINTER(1); /* new, still-empty index */
+ IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+ IndexBuildResult *result;
+ double reltuples;
+ double idxtuples;
+ BrinRevmap *revmap;
+ BrinBuildState *state;
+ Buffer meta;
+ BlockNumber pagesPerRange;
+ /*
+ * We expect to be called exactly once for any index relation.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+ /*
+ * Critical section not required, because on error the creation of the
+ * whole relation will be rolled back.
+ */
+ meta = ReadBuffer(index, P_NEW); /* extends the index; asserted below to be the metapage block */
+ Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
+ LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
+ brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
+ MarkBufferDirty(meta);
+ if (RelationNeedsWAL(index)) /* WAL-log the index creation only when the relation needs WAL */
+ {
+ xl_brin_createidx xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata;
+ Page page;
+ xlrec.node = index->rd_node;
+ xlrec.version = BRIN_CURRENT_VERSION;
+ xlrec.pagesPerRange = BrinGetPagesPerRange(index);
+ rdata.buffer = InvalidBuffer;
+ rdata.data = (char *) &xlrec;
+ rdata.len = SizeOfBrinCreateIdx;
+ rdata.next = NULL;
+ recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX, &rdata);
+ page = BufferGetPage(meta);
+ PageSetLSN(page, recptr); /* stamp the metapage with the record's LSN */
+ }
+ UnlockReleaseBuffer(meta);
+ /*
+ * Initialize our state, including the deformed tuple state.
+ */
+ revmap = brinRevmapInitialize(index, &pagesPerRange);
+ state = initialize_brin_buildstate(index, revmap, pagesPerRange);
+ /*
+ * Now scan the relation. No syncscan allowed here because we want the
+ * heap blocks in physical order.
+ */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
+ brinbuildCallback, (void *) state);
+ /* process the final batch */
+ form_and_insert_tuple(state); /* the callback never inserts the last range itself */
+ /* release resources */
+ idxtuples = state->bs_numtuples; /* read before the state is freed */
+ brinRevmapTerminate(state->bs_rmAccess);
+ terminate_brin_buildstate(state);
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+ result->heap_tuples = reltuples;
+ result->index_tuples = idxtuples;
+ Relation index = (Relation) PG_GETARG_POINTER(0);
+ Buffer metabuf;
+ /* An empty BRIN index has a metapage only. */
+ metabuf =
+ ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); /* new page in the init fork, not the main fork */
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ /* Initialize and xlog metabuffer. */
+ brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
+ MarkBufferDirty(metabuf);
+ log_newpage_buffer(metabuf, false); /* logged unconditionally, unlike the RelationNeedsWAL test in brinbuild */
+ UnlockReleaseBuffer(metabuf);
+ * brinbulkdelete
+ * Since there are no per-heap-tuple index tuples in BRIN indexes,
+ * there's not a lot we can do here.
+ *
+ * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
+ * tuple is deleted), meaning the need to re-run summarization on the affected
+ * range. Need to add an extra flag in mmtuples for that.
+ */
+ /* other arguments are not currently used */
+ IndexBulkDeleteResult *stats =
+ (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); /* may be NULL on the first call */
+ /* allocate stats if first time through, else re-use existing struct */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* zero-filled: nothing is deleted here */
+ * This routine is in charge of "vacuuming" a BRIN index: we just summarize
+ * ranges that are currently unsummarized.
+ */
+ IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+ IndexBulkDeleteResult *stats =
+ (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); /* may be NULL; allocated below in that case */
+ Relation heapRel;
+ /* No-op in ANALYZE ONLY mode */
+ if (info->analyze_only)
+ if (!stats)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ stats->num_pages = RelationGetNumberOfBlocks(info->index); /* current size of the index in blocks */
+ /* rest of stats is initialized by zeroing */
+ heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false),
+ AccessShareLock);
+ brinsummarize(info->index, heapRel,
+ &stats->num_index_tuples, &stats->num_index_tuples); /* same counter passed for both numSummarized and numExisting */
+ heap_close(heapRel, AccessShareLock);
+ * reloptions processor for BRIN indexes
+ */
+ Datum reloptions = PG_GETARG_DATUM(0); /* raw reloptions to parse */
+ bool validate = PG_GETARG_BOOL(1); /* passed through to the reloptions parser */
+ relopt_value *options;
+ BrinOptions *rdopts;
+ int numoptions;
+ static const relopt_parse_elt tab[] = { /* maps option names to BrinOptions fields */
+ {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}
+ };
+ options = parseRelOptions(reloptions, validate, RELOPT_KIND_BRIN,
+ &numoptions);
+ /* if none set, we're done */
+ if (numoptions == 0)
+ rdopts = allocateReloptStruct(sizeof(BrinOptions), options, numoptions);
+ fillRelOptions((void *) rdopts, sizeof(BrinOptions), options, numoptions,
+ validate, tab, lengthof(tab));
+ pfree(options); /* parsed values have been copied into rdopts */
+ PG_RETURN_BYTEA_P(rdopts);
+ * SQL-callable function to scan through an index and summarize all ranges
+ * that are not currently summarized.
+ */
+ Oid indexoid = PG_GETARG_OID(0); /* OID of the BRIN index to summarize */
+ Relation indexRel;
+ Relation heapRel;
+ double numSummarized = 0;
+ heapRel = heap_open(IndexGetRelation(indexoid, false),
+ ShareUpdateExclusiveLock); /* heap is opened before the index */
+ indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
+ brinsummarize(indexRel, heapRel, &numSummarized, NULL); /* NULL: count of existing ranges is not wanted */
+ relation_close(indexRel, ShareUpdateExclusiveLock);
+ relation_close(heapRel, ShareUpdateExclusiveLock);
+ PG_RETURN_INT32((int32) numSummarized); /* double counter narrowed to int32 for the SQL result */
+ * Build a BrinDesc used to create or scan a BRIN index
+ */
+BrinDesc *
+brin_build_desc(Relation rel) /* rel: the BRIN index relation; result is freed with brin_free_desc() */
+ BrinOpcInfo **opcinfo;
+ BrinDesc *bdesc;
+ TupleDesc tupdesc;
+ int totalstored = 0;
+ int keyno;
+ long totalsize;
+ MemoryContext cxt;
+ MemoryContext oldcxt;
+ cxt = AllocSetContextCreate(CurrentMemoryContext,
+ "brin desc cxt",
+ oldcxt = MemoryContextSwitchTo(cxt); /* everything below is allocated in the descriptor's own context */
+ tupdesc = RelationGetDescr(rel);
+ /*
+ * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
+ * the number of columns stored, since the number is opclass-defined.
+ */
+ opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts);
+ for (keyno = 0; keyno < tupdesc->natts; keyno++)
+ {
+ FmgrInfo *opcInfoFn;
+ opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
+ opcinfo[keyno] = (BrinOpcInfo *)
+ DatumGetPointer(FunctionCall1(opcInfoFn,
+ tupdesc->attrs[keyno]->atttypid)); /* opcinfo procedure is called with the column's type OID */
+ totalstored += opcinfo[keyno]->oi_nstored;
+ }
+ /* Allocate our result struct and fill it in */
+ totalsize = offsetof(BrinDesc, bd_info) +
+ sizeof(BrinOpcInfo *) * tupdesc->natts; /* header plus one bd_info pointer per column */
+ bdesc = palloc(totalsize);
+ bdesc->bd_context = cxt; /* remembered so brin_free_desc() can delete it */
+ bdesc->bd_index = rel;
+ bdesc->bd_tupdesc = tupdesc;
+ bdesc->bd_disktdesc = NULL; /* generated lazily */
+ bdesc->bd_totalstored = totalstored;
+ for (keyno = 0; keyno < tupdesc->natts; keyno++)
+ bdesc->bd_info[keyno] = opcinfo[keyno];
+ pfree(opcinfo); /* only the temporary pointer array; the BrinOpcInfo structs stay referenced */
+ MemoryContextSwitchTo(oldcxt);
+ return bdesc;
+brin_free_desc(BrinDesc *bdesc) /* releases a descriptor made by brin_build_desc() */
+ /* make sure the tupdesc is still valid */
+ Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
+ /* no need for retail pfree */
+ MemoryContextDelete(bdesc->bd_context); /* bdesc itself lives in this context, so it is gone afterwards */
+ * Initialize a BrinBuildState appropriate to create tuples on the given index.
+ */
+static BrinBuildState *
+initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
+ BlockNumber pagesPerRange) /* returns a palloc'd state; pair with terminate_brin_buildstate() */
+ BrinBuildState *state;
+ state = palloc(sizeof(BrinBuildState));
+ state->bs_irel = idxRel;
+ state->bs_numtuples = 0;
+ state->bs_currentInsertBuf = InvalidBuffer; /* no insertion buffer held yet */
+ state->bs_pagesPerRange = pagesPerRange;
+ state->bs_currRangeStart = 0; /* accumulation starts at heap block 0 */
+ state->bs_rmAccess = revmap; /* stored as given; this function does not create it */
+ state->bs_bdesc = brin_build_desc(idxRel);
+ state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
+ brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+ return state;
+ * Release resources associated with a BrinBuildState.
+ */
+static void
+terminate_brin_buildstate(BrinBuildState *state) /* frees state; the revmap handle is not touched here */
+ /* release the last index buffer used */
+ if (!BufferIsInvalid(state->bs_currentInsertBuf))
+ {
+ Page page;
+ page = BufferGetPage(state->bs_currentInsertBuf);
+ RecordPageWithFreeSpace(state->bs_irel,
+ BufferGetBlockNumber(state->bs_currentInsertBuf),
+ PageGetFreeSpace(page)); /* record the page's remaining free space before letting go of it */
+ ReleaseBuffer(state->bs_currentInsertBuf);
+ }
+ brin_free_desc(state->bs_bdesc);
+ pfree(state->bs_dtuple);
+ pfree(state); /* state must not be used after this point */
+ * Summarize the given page range of the given index.
+ *
+ * This routine can run in parallel with insertions into the heap. To avoid
+ * missing those values from the summary tuple, we first insert a placeholder
+ * index tuple into the index, then execute the heap scan; transactions
+ * concurrent with the scan update the placeholder tuple. After the scan, we
+ * union the placeholder tuple with the one computed by this routine. The
+ * update of the index value happens in a loop, so that if somebody updates
+ * the placeholder tuple after we read it, we detect the case and try again.
+ * This ensures that the concurrently inserted tuples are not lost.
+ */
+static void
+summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
+ BlockNumber heapBlk)
+ Buffer phbuf;
+ BrinTuple *phtup;
+ Size phsz;
+ OffsetNumber offset;
+ /*
+ * Insert the placeholder tuple. phbuf starts out invalid, so brin_doinsert
+ * chooses the buffer and hands it back through &phbuf; 'offset' is the
+ * position at which the placeholder was stored.
+ */
+ phbuf = InvalidBuffer;
+ phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
+ offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
+ state->bs_rmAccess, &phbuf,
+ heapBlk, phtup, phsz);
+ /*
+ * Execute the partial heap scan covering the heap blocks in the specified
+ * page range, summarizing the heap tuples in it. This scan stops just
+ * short of brinbuildCallback creating the new index entry.
+ */
+ state->bs_currRangeStart = heapBlk;
+ IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false,
+ heapBlk, state->bs_pagesPerRange,
+ brinbuildCallback, (void *) state);
+ /*
+ * Now we update the values obtained by the scan with the placeholder
+ * tuple. We do this in a loop which only terminates if we're able to
+ * update the placeholder tuple successfully; if we are not, this means
+ * somebody else modified the placeholder tuple after we read it.
+ */
+ for (;;)
+ {
+ BrinTuple *newtup;
+ Size newsize;
+ bool didupdate;
+ bool samepage;
+ /*
+ * Form the on-disk summary tuple from the values accumulated in
+ * state->bs_dtuple and try to replace the placeholder with it.
+ */
+ newtup = brin_form_tuple(state->bs_bdesc,
+ heapBlk, state->bs_dtuple, &newsize);
+ samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
+ didupdate =
+ brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
+ state->bs_rmAccess, heapBlk, phbuf, offset,
+ phtup, phsz, newtup, newsize, samepage);
+ brin_free_tuple(phtup); /* both tuples are freed whether or not the update succeeded */
+ brin_free_tuple(newtup);
+ /* If the update succeeded, we're done. */
+ if (didupdate)
+ break;
+ /*
+ * If the update didn't work, it might be because somebody updated the
+ * placeholder tuple concurrently. Extract the new version, union it
+ * with the values we have from the scan, and start over. (There are
+ * other reasons for the update to fail, but it's simple to treat them
+ * the same.)
+ */
+ phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
+ &offset, &phsz, BUFFER_LOCK_SHARE);
+ /* the placeholder tuple must exist */
+ if (phtup == NULL)
+ elog(ERROR, "missing placeholder tuple");
+ phtup = brin_copy_tuple(phtup, phsz); /* take a private copy before the buffer lock is dropped */
+ LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
+ /* merge it into the tuple from the heap scan */
+ union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
+ }
+ ReleaseBuffer(phbuf); /* drop the pin on the buffer that held the placeholder */
+ * Scan a complete BRIN index, and summarize each page range that's not already
+ * summarized. The index and heap must have been locked by caller in at
+ * least ShareUpdateExclusiveLock mode.
+ *
+ * For each new index tuple inserted, *numSummarized (if not NULL) is
+ * incremented; for each existing tuple, *numExisting (if not NULL) is
+ * incremented.
+ */
+static void
+brinsummarize(Relation index, Relation heapRel, double *numSummarized,
+ double *numExisting)
+ BrinRevmap *revmap;
+ BrinBuildState *state = NULL;
+ IndexInfo *indexInfo = NULL;
+ BlockNumber heapNumBlocks;
+ BlockNumber heapBlk;
+ BlockNumber pagesPerRange;
+ Buffer buf;
+ revmap = brinRevmapInitialize(index, &pagesPerRange);
+ /*
+ * Scan the revmap to find unsummarized items. The heap size is read
+ * once, before the loop, and the loop advances one page range at a time.
+ */
+ buf = InvalidBuffer;
+ heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
+ for (heapBlk = 0; heapBlk < heapNumBlocks; heapBlk += pagesPerRange)
+ {
+ BrinTuple *tup;
+ OffsetNumber off;
+ tup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, /* NOTE(review): the argument list appears truncated here (no lock mode, no closing parenthesis) -- confirm against the full file */
+ if (tup == NULL)
+ {
+ /* no revmap entry for this heap range. Summarize it. */
+ if (state == NULL)
+ {
+ /* first time through: the build state and IndexInfo are created lazily, only if some range needs summarizing */
+ Assert(!indexInfo);
+ state = initialize_brin_buildstate(index, revmap,
+ pagesPerRange);
+ indexInfo = BuildIndexInfo(index);
+ /*
+ * We only have ShareUpdateExclusiveLock on the table, and
+ * therefore other sessions may insert tuples into the range
+ * we're going to scan. This is okay, because we take
+ * additional precautions to avoid losing the additional
+ * tuples; see comments in summarize_range. Set the
+ * concurrent flag, which causes IndexBuildHeapRangeScan to
+ * use a snapshot other than SnapshotAny, and silences
+ * warnings emitted there.
+ */
+ indexInfo->ii_Concurrent = true;
+ /*
+ * In transaction-snapshot mode, if we have already acquired
+ * a snapshot, another transaction could insert a tuple that
+ * is not visible to it; therefore, disallow running in such
+ * a transaction unless no snapshot has been acquired yet.
+ *
+ * This code is called by VACUUM and
+ * brin_summarize_new_values. Have the error message mention
+ * the latter because VACUUM cannot run in a transaction and
+ * thus cannot cause this issue.
+ */
+ if (IsolationUsesXactSnapshot() && FirstSnapshotSet)
+ ereport(ERROR, /* NOTE(review): parentheses do not balance with the errmsg line below; an errcode line looks to be missing -- confirm */
+ errmsg("brin_summarize_new_values() cannot run in a transaction that has already obtained a snapshot")));
+ }
+ summarize_range(indexInfo, state, heapRel, heapBlk);
+ /* and re-initialize state for the next range */
+ brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+ if (numSummarized)
+ *numSummarized += 1.0;
+ }
+ else
+ {
+ if (numExisting)
+ *numExisting += 1.0;
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* unlock only; buf is released after the loop */
+ }
+ }
+ if (BufferIsValid(buf))
+ ReleaseBuffer(buf);
+ /* free resources */
+ brinRevmapTerminate(revmap);
+ if (state)
+ terminate_brin_buildstate(state); /* NOTE(review): indexInfo is not freed here -- confirm whether that is intentional */
+ * Given a deformed tuple in the build state, convert it into the on-disk
+ * format and insert it into the index, making the revmap point to it.
+ */
+static void
+form_and_insert_tuple(BrinBuildState *state)
+ BrinTuple *tup;
+ Size size;
+ tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
+ state->bs_dtuple, &size);
+ brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
+ &state->bs_currentInsertBuf, state->bs_currRangeStart,
+ tup, size); /* the offset returned by brin_doinsert is not needed here; the insertion buffer is kept in the build state */
+ state->bs_numtuples++;
+ pfree(tup); /* the formed tuple is not needed once brin_doinsert has returned */
+ * Given a deformed tuple 'a' and an on-disk tuple 'b', adjust 'a' so that
+ * it's consistent with the summary values in both.
+ */
+static void
+union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
+ int keyno;
+ BrinMemTuple *db;
+ MemoryContext cxt;
+ MemoryContext oldcxt;
+ /* Use our own memory context to avoid retail pfree */
+ cxt = AllocSetContextCreate(CurrentMemoryContext,
+ "brin union", /* NOTE(review): the remaining arguments and closing parenthesis of this call are not visible -- confirm against the full file */
+ oldcxt = MemoryContextSwitchTo(cxt);
+ db = brin_deform_tuple(bdesc, b); /* the deformed copy of b is allocated in the temporary context */
+ MemoryContextSwitchTo(oldcxt); /* switch back before the union functions run */
+ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+ {
+ FmgrInfo *unionFn;
+ BrinValues *col_a = &a->bt_columns[keyno];
+ BrinValues *col_b = &db->bt_columns[keyno];
+ unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1, /* attribute numbers are 1-based; NOTE(review): the procnum argument appears to be missing from this call -- confirm */
+ FunctionCall3Coll(unionFn,
+ bdesc->bd_index->rd_indcollation[keyno],
+ PointerGetDatum(bdesc),
+ PointerGetDatum(col_a),
+ PointerGetDatum(col_b));
+ }
+ MemoryContextDelete(cxt); /* discards db and anything else allocated in the temporary context */
--- /dev/null
+ * brin_minmax.c
+ * Implementation of Min/Max opclass for BRIN
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/brin/brin_minmax.c
+ */
+#include "postgres.h"
+#include "access/genam.h"
+#include "access/brin_internal.h"
+#include "access/brin_tuple.h"
+#include "access/skey.h"
+#include "catalog/pg_type.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+ * Procedure numbers must not collide with BRIN_PROCNUM defines in
+ * brin_internal.h. Note we only need inequality functions.
+ */
+#define MINMAX_NUM_PROCNUMS 4 /* # support procs we need */
+#define PROCNUM_LESS 11
+ * Subtract this from procnum to obtain index in MinmaxOpaque arrays
+ * (Must be equal to minimum of private procnums)
+ */
+#define PROCNUM_BASE 11
+static FmgrInfo *minmax_get_procinfo(BrinDesc *bdesc, uint16 attno,
+ uint16 procnum);
+typedef struct MinmaxOpaque /* per-column cache of support-procedure lookups, filled in by minmax_get_procinfo */
+ FmgrInfo operators[MINMAX_NUM_PROCNUMS]; /* indexed by (procnum - PROCNUM_BASE) */
+ bool inited[MINMAX_NUM_PROCNUMS]; /* true once the matching operators[] slot has been filled */
+} MinmaxOpaque;
+ Oid typoid = PG_GETARG_OID(0);
+ BrinOpcInfo *result;
+ /*
+ * opaque->operators is initialized lazily, as indicated by 'inited' which
+ * is initialized to all false by palloc0.
+ *
+ * A single zeroed allocation holds the BrinOpcInfo (sized for two stored
+ * values) followed, at the next MAXALIGN boundary, by the MinmaxOpaque.
+ */
+ result = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) +
+ sizeof(MinmaxOpaque));
+ result->oi_nstored = 2; /* two stored values per column */
+ result->oi_opaque = (MinmaxOpaque *)
+ MAXALIGN((char *) result + SizeofBrinOpcInfo(2)); /* points into the same allocation, just past the BrinOpcInfo */
+ result->oi_typids[0] = typoid;
+ result->oi_typids[1] = typoid; /* both stored values have the type passed as argument 0; NOTE(review): no statement returning 'result' is visible after this line -- confirm */
+ * Examine the given index tuple (which contains partial status of a certain
+ * page range) by comparing it to the given value that comes from another heap
+ * tuple. If the new value is outside the min/max range specified by the
+ * existing tuple values, update the index tuple and return true. Otherwise,
+ * return false and leave the index tuple unmodified.
+ */
+ /*
+ * Arguments: 0 = BrinDesc for the index, 1 = BrinValues of the column
+ * being updated, 2 = the new value, 3 = whether the new value is null.
+ * The result is true if the BrinValues were modified, false otherwise.
+ */
+ BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+ BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+ Datum newval = PG_GETARG_DATUM(2);
+ bool isnull = PG_GETARG_BOOL(3); /* a boolean argument: fetch it as one, not as a raw Datum */
+ Oid colloid = PG_GET_COLLATION();
+ FmgrInfo *cmpFn;
+ Datum compar;
+ bool updated = false;
+ Form_pg_attribute attr;
+ AttrNumber attno;
+ /*
+ * If the new value is null, we record that we saw it if it's the first
+ * one; otherwise, there's nothing to do. Either way we return here: a
+ * null must never be stored as a boundary nor reach the comparisons
+ * below.
+ */
+ if (isnull)
+ {
+ if (column->bv_hasnulls)
+ PG_RETURN_BOOL(false);
+ column->bv_hasnulls = true;
+ PG_RETURN_BOOL(true);
+ }
+ attno = column->bv_attno;
+ attr = bdesc->bd_tupdesc->attrs[attno - 1]; /* attribute numbers are 1-based */
+ /*
+ * If the recorded value is null, store the new value (which we know to be
+ * not null) as both minimum and maximum, and we're done. We must report
+ * the modification; falling through would compare the value with itself
+ * and wrongly report "not updated".
+ */
+ if (column->bv_allnulls)
+ {
+ column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+ column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+ column->bv_allnulls = false;
+ PG_RETURN_BOOL(true);
+ }
+ /*
+ * Otherwise, need to compare the new value with the existing boundaries
+ * and update them accordingly. First check if it's less than the
+ * existing minimum.
+ */
+ cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_LESS);
+ compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]);
+ if (DatumGetBool(compar))
+ {
+ /* by-reference values were copied with datumCopy; free the old copy */
+ if (!attr->attbyval)
+ pfree(DatumGetPointer(column->bv_values[0]));
+ column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+ updated = true;
+ }
+ /*
+ * And now compare it to the existing maximum.
+ */
+ cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_GREATER);
+ compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]);
+ if (DatumGetBool(compar))
+ {
+ if (!attr->attbyval)
+ pfree(DatumGetPointer(column->bv_values[1]));
+ column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+ updated = true;
+ }
+ PG_RETURN_BOOL(updated);
+ * Given an index tuple corresponding to a certain page range and a scan key,
+ * return whether the scan key is consistent with the index tuple's min/max
+ * values. Return true if so, false otherwise.
+ */
+ BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+ BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+ ScanKey key = (ScanKey) PG_GETARG_POINTER(2);
+ Oid colloid = PG_GET_COLLATION();
+ AttrNumber attno;
+ Datum value;
+ Datum matches;
+ Assert(key->sk_attno == column->bv_attno);
+ /* handle IS NULL/IS NOT NULL tests */
+ if (key->sk_flags & SK_ISNULL)
+ {
+ if (key->sk_flags & SK_SEARCHNULL)
+ {
+ if (column->bv_allnulls || column->bv_hasnulls)
+ PG_RETURN_BOOL(false); /* NOTE(review): as written, an IS NULL key is reported inconsistent exactly when the range does contain nulls, and control then reaches the SK_SEARCHNOTNULL assertion below; this looks inverted or truncated -- confirm against the full file */
+ }
+ /*
+ * For IS NOT NULL, we can only skip ranges that are known to have
+ * only nulls.
+ */
+ Assert(key->sk_flags & SK_SEARCHNOTNULL);
+ PG_RETURN_BOOL(!column->bv_allnulls);
+ }
+ /* if the range has no non-null values (bv_allnulls), it cannot possibly be consistent */
+ if (column->bv_allnulls)
+ PG_RETURN_BOOL(false);
+ attno = key->sk_attno;
+ value = key->sk_argument;
+ switch (key->sk_strategy) /* bv_values[0] is compared for the "less" strategies, bv_values[1] for the "greater" ones */
+ {
+ case BTLessStrategyNumber:
+ matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, /* NOTE(review): here and in every case below, the procnum argument and closing parenthesis of minmax_get_procinfo are not visible -- confirm */
+ colloid, column->bv_values[0], value);
+ break;
+ case BTLessEqualStrategyNumber:
+ matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+ colloid, column->bv_values[0], value);
+ break;
+ case BTEqualStrategyNumber:
+ /*
+ * In the equality case (WHERE col = someval), we want to return
+ * the current page range if the minimum value in the range <=
+ * scan key, and the maximum value >= scan key.
+ */
+ matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+ colloid, column->bv_values[0], value);
+ if (!DatumGetBool(matches))
+ break; /* first test already failed; 'matches' holds that false result */
+ /* max() >= scankey */
+ matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+ colloid, column->bv_values[1], value);
+ break;
+ case BTGreaterEqualStrategyNumber:
+ matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+ colloid, column->bv_values[1], value);
+ break;
+ case BTGreaterStrategyNumber:
+ matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+ colloid, column->bv_values[1], value);
+ break;
+ default:
+ /* shouldn't happen */
+ elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+ matches = 0; /* assignment after elog(ERROR); presumably only there to keep compilers quiet */
+ break;
+ }
+ PG_RETURN_DATUM(matches); /* the comparison function's Datum result is returned unchanged */
+ * Given two BrinValues, update the first of them as a union of the summary
+ * values contained in both. The second one is untouched.
+ */
+ /*
+ * Arguments: 0 = BrinDesc for the index, 1 = BrinValues to be updated (A),
+ * 2 = BrinValues to merge in (B, read only). Nothing is returned.
+ */
+ BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+ BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+ BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+ Oid colloid = PG_GET_COLLATION();
+ AttrNumber attno;
+ Form_pg_attribute attr;
+ bool needsadj;
+ Assert(col_a->bv_attno == col_b->bv_attno);
+ /*
+ * Adjust "hasnulls". This is done before the early exit below, so that
+ * nulls recorded in B are not lost when B holds no values.
+ */
+ if (col_b->bv_hasnulls && !col_a->bv_hasnulls)
+ col_a->bv_hasnulls = true;
+ /* If there are no values in B, there's nothing more to do */
+ if (col_b->bv_allnulls)
+ PG_RETURN_VOID();
+ attno = col_a->bv_attno;
+ attr = bdesc->bd_tupdesc->attrs[attno - 1]; /* attribute numbers are 1-based */
+ /*
+ * Adjust "allnulls". If B has values but A doesn't, just copy the values
+ * from B into A, and we're done. (We cannot run the operators in this
+ * case, because values in A might contain garbage.)
+ */
+ if (!col_b->bv_allnulls && col_a->bv_allnulls)
+ {
+ col_a->bv_allnulls = false;
+ col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
+ attr->attbyval, attr->attlen);
+ col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
+ attr->attbyval, attr->attlen);
+ PG_RETURN_VOID();
+ }
+ /* Adjust minimum, if B's min is less than A's min */
+ needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+ PROCNUM_LESS),
+ colloid, col_b->bv_values[0], col_a->bv_values[0]));
+ if (needsadj)
+ {
+ /* by-reference values are private copies; free the one being replaced */
+ if (!attr->attbyval)
+ pfree(DatumGetPointer(col_a->bv_values[0]));
+ col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
+ attr->attbyval, attr->attlen);
+ }
+ /* Adjust maximum, if B's max is greater than A's max */
+ needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+ PROCNUM_GREATER),
+ colloid, col_b->bv_values[1], col_a->bv_values[1]));
+ if (needsadj)
+ {
+ if (!attr->attbyval)
+ pfree(DatumGetPointer(col_a->bv_values[1]));
+ col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
+ attr->attbyval, attr->attlen);
+ }
+ PG_RETURN_VOID();
+ * Return the procedure corresponding to the given function support number.
+ */
+static FmgrInfo *
+minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
+ MinmaxOpaque *opaque;
+ uint16 basenum = procnum - PROCNUM_BASE; /* index into the MinmaxOpaque arrays; no range check is made, so procnum must lie in PROCNUM_BASE .. PROCNUM_BASE + MINMAX_NUM_PROCNUMS - 1 */
+ opaque = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; /* attno is 1-based, bd_info is 0-based */
+ /*
+ * We cache these in the opaque struct, to avoid repetitive syscache
+ * lookups. The FmgrInfo is copied into bdesc->bd_context.
+ */
+ if (!opaque->inited[basenum])
+ {
+ fmgr_info_copy(&opaque->operators[basenum],
+ index_getprocinfo(bdesc->bd_index, attno, procnum),
+ bdesc->bd_context);
+ opaque->inited[basenum] = true;
+ }
+ return &opaque->operators[basenum]; /* pointer into the cache; callers must not free it */
--- /dev/null
+ * brin_pageops.c
+ * Page-handling routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/brin/brin_pageops.c
+ */
+#include "postgres.h"
+#include "access/brin_pageops.h"
+#include "access/brin_page.h"
+#include "access/brin_revmap.h"
+#include "access/brin_xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+ bool *was_extended);
+static Size br_page_get_freespace(Page page);
+ * Update tuple origtup (size origsz), located in offset oldoff of buffer
+ * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
+ * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit.
+ *
+ * If samepage is true, attempt to put the new tuple in the same page, but if
+ * there's no room, use some other one.
+ *
+ * If the update is successful, return true; the revmap is updated to point to
+ * the new tuple. If the update is not done for whatever reason, return false.
+ * Caller may retry the update if this happens.
+ */
+brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, BlockNumber heapBlk,
+ Buffer oldbuf, OffsetNumber oldoff,
+ const BrinTuple *origtup, Size origsz,
+ const BrinTuple *newtup, Size newsz,
+ bool samepage)
+ Page oldpage;
+ ItemId oldlp;
+ BrinTuple *oldtup;
+ Size oldsz;
+ Buffer newbuf;
+ BrinSpecialSpace *special;
+ bool extended = false;
+ newsz = MAXALIGN(newsz); /* the aligned size is what gets stored and logged from here on */
+ /* make sure the revmap is long enough to contain the entry we need */
+ brinRevmapExtend(revmap, heapBlk);
+ if (!samepage)
+ {
+ /* need a page on which to put the item; on success brin_getinsertbuffer also leaves oldbuf locked */
+ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
+ /* XXX delay vacuuming FSM until locks are released? */
+ if (extended)
+ FreeSpaceMapVacuum(idxrel);
+ if (!BufferIsValid(newbuf))
+ return false; /* old page stopped being a regular page; oldbuf was already unlocked by brin_getinsertbuffer */
+ /*
+ * Note: it's possible (though unlikely) that the returned newbuf is
+ * the same as oldbuf, if brin_getinsertbuffer determined that the old
+ * buffer does in fact have enough space.
+ */
+ if (newbuf == oldbuf)
+ newbuf = InvalidBuffer;
+ }
+ else
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); /* same-page attempt: only the old buffer is locked */
+ newbuf = InvalidBuffer;
+ }
+ oldpage = BufferGetPage(oldbuf);
+ oldlp = PageGetItemId(oldpage, oldoff);
+ /*
+ * Check that the old tuple wasn't updated concurrently: it might have
+ * moved someplace else entirely ...
+ */
+ if (!ItemIdIsNormal(oldlp))
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ if (BufferIsValid(newbuf))
+ UnlockReleaseBuffer(newbuf);
+ return false;
+ }
+ oldsz = ItemIdGetLength(oldlp);
+ oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
+ /*
+ * ... or it might have been updated in place to different contents.
+ */
+ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ if (BufferIsValid(newbuf))
+ UnlockReleaseBuffer(newbuf);
+ return false;
+ }
+ special = (BrinSpecialSpace *) PageGetSpecialPointer(oldpage);
+ /*
+ * Great, the old tuple is intact. We can proceed with the update.
+ *
+ * If there's enough room in the old page for the new tuple, replace it.
+ * This is not attempted when the page is flagged BRIN_EVACUATE_PAGE.
+ *
+ * Note that there might now be enough space on the page even though the
+ * caller told us there isn't, if a concurrent update moved another tuple
+ * elsewhere or replaced a tuple with a smaller one.
+ */
+ if (((special->flags & BRIN_EVACUATE_PAGE) == 0) &&
+ brin_can_do_samepage_update(oldbuf, origsz, newsz))
+ {
+ if (BufferIsValid(newbuf))
+ UnlockReleaseBuffer(newbuf); /* the other page turned out not to be needed */
+ PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
+ if (PageAddItem(oldpage, (Item) newtup, newsz, oldoff, true,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add BRIN tuple");
+ MarkBufferDirty(oldbuf);
+ /* XLOG stuff */
+ if (RelationNeedsWAL(idxrel))
+ {
+ BlockNumber blk = BufferGetBlockNumber(oldbuf);
+ xl_brin_samepage_update xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[2];
+ xlrec.node = idxrel->rd_node;
+ ItemPointerSetBlockNumber(&xlrec.tid, blk);
+ ItemPointerSetOffsetNumber(&xlrec.tid, oldoff);
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBrinSamepageUpdate;
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &(rdata[1]);
+ rdata[1].data = (char *) newtup;
+ rdata[1].len = newsz;
+ rdata[1].buffer = oldbuf;
+ rdata[1].buffer_std = true;
+ rdata[1].next = NULL;
+ recptr = XLogInsert(RM_BRIN_ID, info, rdata); /* NOTE(review): no declaration of 'info' is visible in this branch, and rdata[0].buffer_std is not set -- confirm against the full file */
+ PageSetLSN(oldpage, recptr);
+ }
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* the revmap is not touched on this path: the tuple keeps its block and offset */
+ return true;
+ }
+ else if (newbuf == InvalidBuffer)
+ {
+ /*
+ * Not enough space, but caller said that there was. Tell them to
+ * start over.
+ */
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ return false;
+ }
+ else
+ {
+ /*
+ * Not enough free space on the oldpage. Put the new tuple on the new
+ * page, and update the revmap.
+ */
+ Page newpage = BufferGetPage(newbuf);
+ Buffer revmapbuf;
+ ItemPointerData newtid;
+ OffsetNumber newoff;
+ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+ PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
+ newoff = PageAddItem(newpage, (Item) newtup, newsz,
+ InvalidOffsetNumber, false, false);
+ if (newoff == InvalidOffsetNumber)
+ elog(ERROR, "failed to add BRIN tuple to new page");
+ MarkBufferDirty(oldbuf);
+ MarkBufferDirty(newbuf);
+ ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
+ brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
+ MarkBufferDirty(revmapbuf);
+ /* XLOG stuff */
+ if (RelationNeedsWAL(idxrel))
+ {
+ xl_brin_update xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[4];
+ uint8 info;
+ info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+ xlrec.new.node = idxrel->rd_node;
+ ItemPointerSet(&xlrec.new.tid, BufferGetBlockNumber(newbuf), newoff);
+ xlrec.new.heapBlk = heapBlk;
+ xlrec.new.tuplen = newsz;
+ xlrec.new.revmapBlk = BufferGetBlockNumber(revmapbuf);
+ xlrec.new.pagesPerRange = pagesPerRange;
+ ItemPointerSet(&xlrec.oldtid, BufferGetBlockNumber(oldbuf), oldoff);
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBrinUpdate;
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &(rdata[1]);
+ rdata[1].data = (char *) newtup;
+ rdata[1].len = newsz;
+ rdata[1].buffer = extended ? InvalidBuffer : newbuf; /* a freshly extended page is logged with XLOG_BRIN_INIT_PAGE and not attached as a buffer */
+ rdata[1].buffer_std = true;
+ rdata[1].next = &(rdata[2]);
+ rdata[2].data = (char *) NULL;
+ rdata[2].len = 0;
+ rdata[2].buffer = revmapbuf;
+ rdata[2].buffer_std = true;
+ rdata[2].next = &(rdata[3]);
+ rdata[3].data = (char *) NULL;
+ rdata[3].len = 0;
+ rdata[3].buffer = oldbuf;
+ rdata[3].buffer_std = true;
+ rdata[3].next = NULL;
+ recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+ PageSetLSN(oldpage, recptr);
+ PageSetLSN(newpage, recptr);
+ PageSetLSN(BufferGetPage(revmapbuf), recptr);
+ }
+ LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* oldbuf is only unlocked; newbuf is unlocked and released just below */
+ UnlockReleaseBuffer(newbuf);
+ return true;
+ }
+ * Return whether brin_doupdate can do a samepage update.
+ */
+brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
+ return /* true if the new tuple is no larger than the old one, or the page's exact free space covers the growth */
+ ((newsz <= origsz) ||
+ PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz)); /* the subtraction is only evaluated when newsz > origsz, so it cannot wrap around */
+ * Insert an index tuple into the index relation. The revmap is updated to
+ * mark the range containing the given page as pointing to the inserted entry.
+ * A WAL record is written.
+ *
+ * The buffer, if valid, is first checked for free space to insert the new
+ * entry; if there isn't enough, a new buffer is obtained and pinned. No
+ * buffer lock must be held on entry, no buffer lock is held on exit.
+ *
+ * Return value is the offset number where the tuple was inserted.
+ */
+brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+ BrinTuple *tup, Size itemsz)
+ Page page;
+ BlockNumber blk;
+ OffsetNumber off;
+ Buffer revmapbuf;
+ ItemPointerData tid;
+ bool extended = false;
+ itemsz = MAXALIGN(itemsz); /* the aligned size is what gets stored and logged */
+ /* Make sure the revmap is long enough to contain the entry we need */
+ brinRevmapExtend(revmap, heapBlk);
+ /*
+ * Obtain a locked buffer to insert the new tuple. Note
+ * brin_getinsertbuffer ensures there's enough space in the returned
+ * buffer.
+ */
+ if (BufferIsValid(*buffer))
+ {
+ /*
+ * It's possible that another backend (or ourselves!) extended the
+ * revmap over the page we held a pin on, so we cannot assume that
+ * it's still a regular page.
+ */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
+ {
+ UnlockReleaseBuffer(*buffer); /* the caller's buffer is unusable; drop it and look for another below */
+ *buffer = InvalidBuffer;
+ }
+ }
+ if (!BufferIsValid(*buffer))
+ {
+ *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
+ Assert(BufferIsValid(*buffer));
+ Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz);
+ }
+ /* Now obtain lock on revmap buffer */
+ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+ page = BufferGetPage(*buffer);
+ blk = BufferGetBlockNumber(*buffer);
+ off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
+ false, false);
+ if (off == InvalidOffsetNumber)
+ elog(ERROR, "could not insert new index tuple to page");
+ MarkBufferDirty(*buffer);
+ BRIN_elog(DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+ blk, off, heapBlk);
+ ItemPointerSet(&tid, blk, off);
+ brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid); /* make the revmap entry for heapBlk point at the new tuple */
+ MarkBufferDirty(revmapbuf);
+ /* XLOG stuff */
+ if (RelationNeedsWAL(idxrel))
+ {
+ xl_brin_insert xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[3];
+ uint8 info;
+ info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+ xlrec.node = idxrel->rd_node;
+ xlrec.heapBlk = heapBlk;
+ xlrec.pagesPerRange = pagesPerRange;
+ xlrec.revmapBlk = BufferGetBlockNumber(revmapbuf);
+ xlrec.tuplen = itemsz;
+ ItemPointerSet(&xlrec.tid, blk, off);
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBrinInsert;
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].buffer_std = false;
+ rdata[0].next = &(rdata[1]);
+ rdata[1].data = (char *) tup;
+ rdata[1].len = itemsz;
+ rdata[1].buffer = extended ? InvalidBuffer : *buffer; /* a freshly extended page is logged with XLOG_BRIN_INIT_PAGE and not attached as a buffer */
+ rdata[1].buffer_std = true;
+ rdata[1].next = &(rdata[2]);
+ rdata[2].data = (char *) NULL;
+ rdata[2].len = 0;
+ rdata[2].buffer = revmapbuf;
+ rdata[2].buffer_std = false;
+ rdata[2].next = NULL;
+ recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+ PageSetLSN(page, recptr);
+ PageSetLSN(BufferGetPage(revmapbuf), recptr);
+ }
+ /* Tuple is firmly on buffer; we can release our locks (only the locks: *buffer is handed back to the caller without being released) */
+ LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+ LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+ if (extended)
+ FreeSpaceMapVacuum(idxrel);
+ return off;
+ * Initialize a page with the given type.
+ *
+ * Caller is responsible for marking it dirty, as appropriate.
+ */
+brin_page_init(Page page, uint16 type)
+ BrinSpecialSpace *special;
+ PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace)); /* reserve a special area large enough for BrinSpecialSpace */
+ special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+ special->type = type; /* only the type is assigned here; 'flags' is not written by this function */
+ * Initialize a new BRIN index's metapage.
+ */
+brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
+ BrinMetaPageData *metadata;
+ brin_page_init(page, BRIN_PAGETYPE_META);
+ metadata = (BrinMetaPageData *) PageGetContents(page); /* the metadata struct lives in the page's content area */
+ metadata->brinMagic = BRIN_META_MAGIC;
+ metadata->brinVersion = version;
+ metadata->pagesPerRange = pagesPerRange;
+ /*
+ * Note we cheat here a little. 0 is not a valid revmap block number
+ * (because it's the metapage buffer), but doing this enables the first
+ * revmap page to be created when the index is.
+ */
+ metadata->lastRevmapPage = 0;
+ * Initiate page evacuation protocol.
+ *
+ * The page must be locked in exclusive mode by the caller.
+ *
+ * If the page is not yet initialized or empty, return false without doing
+ * anything; it can be used for revmap without any further changes. If it
+ * contains tuples, mark it for evacuation and return true.
+ */
+brin_start_evacuating_page(Relation idxRel, Buffer buf)
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ BrinSpecialSpace *special;
+ Page page;
+ page = BufferGetPage(buf);
+ if (PageIsNew(page))
+ return false; /* uninitialized page: nothing to evacuate */
+ special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (off = FirstOffsetNumber; off <= maxoff; off++) /* stops at the first used line pointer */
+ {
+ ItemId lp;
+ lp = PageGetItemId(page, off);
+ if (ItemIdIsUsed(lp))
+ {
+ /* prevent other backends from adding more stuff to this page */
+ special->flags |= BRIN_EVACUATE_PAGE;
+ MarkBufferDirtyHint(buf, true);
+ return true;
+ }
+ }
+ return false; /* no used items: the flag is left unset (idxRel is not referenced in this function) */
+ * Move all tuples out of a page.
+ *
+ * The caller must hold lock on the page. The lock and pin are released.
+ */
+brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer buf)
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ Page page;
+ page = BufferGetPage(buf);
+ Assert(((BrinSpecialSpace *)
+ PageGetSpecialPointer(page))->flags & BRIN_EVACUATE_PAGE);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (off = FirstOffsetNumber; off <= maxoff; off++)
+ {
+ BrinTuple *tup;
+ Size sz;
+ ItemId lp;
+ lp = PageGetItemId(page, off);
+ if (ItemIdIsUsed(lp))
+ {
+ sz = ItemIdGetLength(lp);
+ tup = (BrinTuple *) PageGetItem(page, lp);
+ tup = brin_copy_tuple(tup, sz); /* private copy, taken before the lock is dropped */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* brin_doupdate requires oldbuf to be unlocked on entry */
+ if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
+ buf, off, tup, sz, tup, sz, false)) /* the tuple is "updated" to identical contents with samepage = false, asking for a different page */
+ off--; /* retry: the loop increment brings us back to the same offset */
+ LockBuffer(buf, BUFFER_LOCK_SHARE); /* re-lock before looking at the page again */
+ /* It's possible that someone extended the revmap over this page */
+ if (!BRIN_IS_REGULAR_PAGE(page))
+ break;
+ }
+ }
+ UnlockReleaseBuffer(buf);
+ * Return a pinned and exclusively locked buffer which can be used to insert an
+ * index item of size itemsz. If oldbuf is a valid buffer, it is also locked
+ * (in an order determined to avoid deadlocks).
+ *
+ * If there's no existing page with enough free space to accommodate the new
+ * item, the relation is extended. If this happens, *was_extended is set to true.
+ *
+ * If we find that the old page is no longer a regular index page (because
+ * of a revmap extension), the old buffer is unlocked and we return
+ * InvalidBuffer.
+ */
+static Buffer
+brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+ bool *was_extended)
+ BlockNumber oldblk;
+ BlockNumber newblk;
+ Page page;
+ Size freespace; /* same type that br_page_get_freespace returns */
+ if (BufferIsValid(oldbuf))
+ oldblk = BufferGetBlockNumber(oldbuf);
+ else
+ oldblk = InvalidBlockNumber;
+ /*
+ * Loop until we find a page with sufficient free space. By the time we
+ * return to caller out of this loop, both buffers are valid and locked;
+ * if we have to restart here, neither buffer is locked and buf is not a
+ * pinned buffer.
+ */
+ newblk = RelationGetTargetBlock(irel);
+ if (newblk == InvalidBlockNumber)
+ newblk = GetPageWithFreeSpace(irel, itemsz);
+ for (;;)
+ {
+ Buffer buf;
+ bool extensionLockHeld = false;
+ bool extended = false;
+ if (newblk == InvalidBlockNumber)
+ {
+ /*
+ * There's not enough free space in any existing index page,
+ * according to the FSM: extend the relation to obtain a shiny new
+ * page.
+ */
+ if (!RELATION_IS_LOCAL(irel))
+ {
+ LockRelationForExtension(irel, ExclusiveLock);
+ extensionLockHeld = true;
+ }
+ buf = ReadBuffer(irel, P_NEW);
+ newblk = BufferGetBlockNumber(buf);
+ *was_extended = extended = true;
+ BRIN_elog(DEBUG2, "brin_getinsertbuffer: extending to page %u",
+ BufferGetBlockNumber(buf));
+ }
+ else if (newblk == oldblk)
+ {
+ /*
+ * There's an odd corner-case here where the FSM is out-of-date,
+ * and gave us the old page.
+ */
+ buf = oldbuf;
+ }
+ else
+ {
+ buf = ReadBuffer(irel, newblk);
+ }
+ /*
+ * We lock the old buffer first, if it's earlier than the new one; but
+ * before we do, we need to check that it hasn't been turned into a
+ * revmap page concurrently; if we detect that it happened, give up
+ * and tell caller to start over.
+ */
+ if (BufferIsValid(oldbuf) && oldblk < newblk)
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ /*
+ * If we extended the relation above, the extension lock is
+ * still held at this point; release it before giving up so
+ * that it is not kept past this function.
+ */
+ if (extensionLockHeld)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+ ReleaseBuffer(buf);
+ return InvalidBuffer;
+ }
+ }
+ if (extensionLockHeld)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+ page = BufferGetPage(buf);
+ if (extended)
+ brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+ /*
+ * We have a new buffer to insert into. Check that the new page has
+ * enough free space, and return it if it does; otherwise start over.
+ * Note that we allow for the FSM to be out of date here, and in that
+ * case we update it and move on.
+ *
+ * (br_page_get_freespace also checks that the FSM didn't hand us a
+ * page that has since been repurposed for the revmap.)
+ */
+ freespace = br_page_get_freespace(page);
+ if (freespace >= itemsz)
+ {
+ RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
+ /*
+ * Since the target block specification can get lost on cache
+ * invalidations, make sure we update the more permanent FSM with
+ * data about it before going away.
+ */
+ if (extended)
+ RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
+ freespace);
+ /*
+ * Lock the old buffer if not locked already. Note that in this
+ * case we know for sure it's a regular page: it's later than the
+ * new page we just got, which is not a revmap page, and revmap
+ * pages are always consecutive.
+ */
+ if (BufferIsValid(oldbuf) && oldblk > newblk)
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
+ }
+ return buf;
+ }
+ /* This page is no good. */
+ /*
+ * If an entirely new page does not contain enough free space for the
+ * new item, then surely that item is oversized. Complain loudly; but
+ * first make sure we record the page as free, for next time.
+ */
+ if (extended)
+ {
+ RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
+ freespace);
+ ereport(ERROR,
+ errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
+ (unsigned long) itemsz,
+ (unsigned long) freespace,
+ RelationGetRelationName(irel))));
+ return InvalidBuffer; /* keep compiler quiet */
+ }
+ /* buf is the caller's oldbuf when newblk == oldblk; keep its pin then */
+ if (newblk != oldblk)
+ UnlockReleaseBuffer(buf);
+ if (BufferIsValid(oldbuf) && oldblk <= newblk)
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ /* Tell the FSM what this page really has, and ask it for another one */
+ newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
+ }
+ * Return the amount of free space on a regular BRIN index page.
+ *
+ * If the page is not a regular page, or has been marked with the
+ * BRIN_EVACUATE_PAGE flag, returns 0.
+ */
+static Size
+br_page_get_freespace(Page page)
+ BrinSpecialSpace *special;
+ special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+ /*
+ * Report the real free space only for a regular page that does not have
+ * BRIN_EVACUATE_PAGE set; every other page is reported as full.
+ */
+ if (BRIN_IS_REGULAR_PAGE(page) &&
+ (special->flags & BRIN_EVACUATE_PAGE) == 0)
+ return PageGetFreeSpace(page);
+ return 0;
--- /dev/null
+ * brin_revmap.c
+ * Range map for BRIN indexes
+ *
+ * The range map (revmap) is a translation structure for BRIN indexes: for each
+ * page range there is one summary tuple, and its location is tracked by the
+ * revmap. Whenever a new tuple is inserted into a table that violates the
+ * previously recorded summary values, a new tuple is inserted into the index
+ * and the revmap is updated to point to it.
+ *
+ * The revmap is stored in the first pages of the index, immediately following
+ * the metapage. When the revmap needs to be expanded, all tuples on the
+ * regular BRIN page at that block (if any) are moved out of the way.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/brin/brin_revmap.c
+ */
+#include "postgres.h"
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_revmap.h"
+#include "access/brin_tuple.h"
+#include "access/brin_xlog.h"
+#include "access/rmgr.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "utils/rel.h"
+ * In revmap pages, each item stores an ItemPointerData. These defines let one
+ * find the logical revmap page number and index number of the revmap item for
+ * the given heap block number.
+ */
+/* Arguments are parenthesized so that expression arguments expand safely. */
+#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
+ (((heapBlk) / (pagesPerRange)) / REVMAP_PAGE_MAXITEMS)
+#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
+ (((heapBlk) / (pagesPerRange)) % REVMAP_PAGE_MAXITEMS)
+struct BrinRevmap
+ Relation rm_irel; /* index relation this revmap belongs to */
+ BlockNumber rm_pagesPerRange; /* copied from the metapage by brinRevmapInitialize */
+ BlockNumber rm_lastRevmapPage; /* cached from the metapage */
+ Buffer rm_metaBuf; /* metapage buffer; kept pinned until brinRevmapTerminate */
+ Buffer rm_currBuf; /* most recently read revmap page, or InvalidBuffer */
+/* typedef appears in brin_revmap.h */
+static BlockNumber revmap_get_blkno(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk);
+static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+static void revmap_physical_extend(BrinRevmap *revmap);
+ * Initialize an access object for a range map. This must be freed by
+ * brinRevmapTerminate when caller is done with it.
+ */
+BrinRevmap *
+brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
+ BrinRevmap *rm;
+ Buffer metabuf;
+ BrinMetaPageData *md;
+ /* Pin the metapage and hold a share lock while its fields are copied */
+ metabuf = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
+ LockBuffer(metabuf, BUFFER_LOCK_SHARE);
+ md = (BrinMetaPageData *) PageGetContents(BufferGetPage(metabuf));
+ rm = palloc(sizeof(BrinRevmap));
+ rm->rm_irel = idxrel;
+ rm->rm_metaBuf = metabuf;
+ rm->rm_currBuf = InvalidBuffer;
+ rm->rm_lastRevmapPage = md->lastRevmapPage;
+ rm->rm_pagesPerRange = md->pagesPerRange;
+ /* Also hand the range size back to the caller */
+ *pagesPerRange = rm->rm_pagesPerRange;
+ /* Drop the lock but keep the pin; brinRevmapTerminate releases it */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ return rm;
+ * Release resources associated with a revmap access object.
+ */
+brinRevmapTerminate(BrinRevmap *revmap)
+ ReleaseBuffer(revmap->rm_metaBuf); /* drop the metapage pin taken by brinRevmapInitialize */
+ if (revmap->rm_currBuf != InvalidBuffer) /* no revmap page may have been read yet */
+ ReleaseBuffer(revmap->rm_currBuf);
+ pfree(revmap); /* the struct itself was palloc'd by brinRevmapInitialize */
+ * Extend the revmap to cover the given heap block number.
+ */
+brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk)
+ BlockNumber mapBlk; /* physical revmap block covering heapBlk; only read by the Assert below */
+ mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk); /* loops until the revmap is long enough */
+ /* Ensure the buffer we got is in the expected range */
+ Assert(mapBlk != InvalidBlockNumber &&
+ mapBlk <= revmap->rm_lastRevmapPage);
+ * Prepare to insert an entry into the revmap; the revmap buffer in which the
+ * entry is to reside is locked and returned. Most callers should call
+ * brinRevmapExtend beforehand, as this routine does not extend the revmap if
+ * it's not long enough.
+ *
+ * The returned buffer is also recorded in the revmap struct; finishing that
+ * releases the buffer, therefore the caller needn't do it explicitly.
+ */
+brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
+ Buffer rmBuf; /* revmap buffer covering heapBlk */
+ rmBuf = revmap_get_buffer(revmap, heapBlk); /* raises an error if the revmap does not cover heapBlk */
+ return rmBuf; /* NOTE(review): no LockBuffer call is visible here although the header comment says the buffer is returned locked -- confirm */
+ * In the given revmap buffer (locked appropriately by caller), which is used
+ * in a BRIN index of pagesPerRange pages per range, set the element
+ * corresponding to heap block number heapBlk to the given TID.
+ *
+ * Once the operation is complete, the caller must update the LSN on the
+ * returned buffer.
+ *
+ * This is used both in regular operation and during WAL replay.
+ */
+brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
+ BlockNumber heapBlk, ItemPointerData tid)
+ RevmapContents *contents;
+ ItemPointerData *iptr; /* slot in rm_tids for the range containing heapBlk */
+ Page page;
+ /* The correct page should already be pinned and locked */
+ page = BufferGetPage(buf);
+ contents = (RevmapContents *) PageGetContents(page);
+ iptr = (ItemPointerData *) contents->rm_tids;
+ iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk); /* step to this range's slot */
+ ItemPointerSet(iptr, /* overwrite the slot with tid's block and offset */
+ ItemPointerGetBlockNumber(&tid),
+ ItemPointerGetOffsetNumber(&tid));
+ * Fetch the BrinTuple for a given heap block.
+ *
+ * The buffer containing the tuple is locked, and returned in *buf. As an
+ * optimization, the caller can pass a pinned buffer *buf on entry, which will
+ * avoid a pin-unpin cycle when the next tuple is on the same page as a
+ * previous one.
+ *
+ * If no tuple is found for the given heap range, returns NULL. In that case,
+ * *buf might still be updated, but it's not locked.
+ *
+ * The output tuple offset within the buffer is returned in *off, and its size
+ * is returned in *size.
+ */
+BrinTuple *
+brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
+ Buffer *buf, OffsetNumber *off, Size *size, int mode)
+ Relation idxRel = revmap->rm_irel;
+ BlockNumber mapBlk; /* revmap page holding the entry for heapBlk */
+ RevmapContents *contents;
+ ItemPointerData *iptr; /* revmap entry for heapBlk's range */
+ BlockNumber blk; /* index block the revmap entry points to */
+ Page page;
+ ItemId lp;
+ BrinTuple *tup;
+ ItemPointerData previptr; /* TID read on the previous loop iteration, if any */
+ /* normalize the heap block number to be the first page in the range */
+ heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
+ /* Compute the revmap page number we need */
+ mapBlk = revmap_get_blkno(revmap, heapBlk);
+ if (mapBlk == InvalidBlockNumber) /* revmap does not reach this range yet */
+ {
+ *off = InvalidOffsetNumber;
+ return NULL;
+ }
+ ItemPointerSetInvalid(&previptr);
+ for (;;)
+ {
+ if (revmap->rm_currBuf == InvalidBuffer ||
+ BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
+ {
+ if (revmap->rm_currBuf != InvalidBuffer)
+ ReleaseBuffer(revmap->rm_currBuf);
+ Assert(mapBlk != InvalidBlockNumber);
+ revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
+ }
+ LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);
+ contents = (RevmapContents *)
+ PageGetContents(BufferGetPage(revmap->rm_currBuf));
+ iptr = contents->rm_tids;
+ iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
+ if (!ItemPointerIsValid(iptr)) /* no tuple recorded for this range */
+ {
+ LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+ return NULL;
+ }
+ /*
+ * Check the TID we got in a previous iteration, if any, and save the
+ * current TID we got from the revmap; if we loop, we can sanity-check
+ * that the next one we get is different. Otherwise we might be stuck
+ * looping forever if the revmap is somehow badly broken.
+ */
+ if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
+ ereport(ERROR,
+ errmsg_internal("corrupted BRIN index: inconsistent range map")));
+ previptr = *iptr;
+ blk = ItemPointerGetBlockNumber(iptr);
+ *off = ItemPointerGetOffsetNumber(iptr);
+ LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); /* revmap page stays pinned in rm_currBuf */
+ /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
+ if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
+ {
+ if (BufferIsValid(*buf))
+ ReleaseBuffer(*buf);
+ *buf = ReadBuffer(idxRel, blk);
+ }
+ LockBuffer(*buf, mode); /* lock mode is chosen by the caller */
+ page = BufferGetPage(*buf);
+ /*
+ * If we land on a revmap page, start over. NOTE(review): the
+ * page-type test that should guard the block below is not visible
+ * here -- confirm against the full file.
+ */
+ {
+ lp = PageGetItemId(page, *off);
+ if (ItemIdIsUsed(lp))
+ {
+ tup = (BrinTuple *) PageGetItem(page, lp);
+ if (tup->bt_blkno == heapBlk) /* tuple really describes the range we asked for */
+ {
+ if (size) /* size output is optional */
+ *size = ItemIdGetLength(lp);
+ /* found it! */
+ return tup;
+ }
+ }
+ }
+ /*
+ * No luck. Assume that the revmap was updated concurrently.
+ */
+ LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+ }
+ /* not reached, but keep compiler quiet */
+ return NULL;
+ * Given a heap block number, find the corresponding physical revmap block
+ * number and return it. If the revmap page hasn't been allocated yet, return
+ * InvalidBlockNumber.
+ */
+static BlockNumber
+revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+ BlockNumber mapBlk;
+ /* logical revmap page number, shifted by one to step over the metapage */
+ mapBlk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+ /* Anything past the last known revmap page has not been allocated yet */
+ if (mapBlk > revmap->rm_lastRevmapPage)
+ return InvalidBlockNumber;
+ return mapBlk;
+ * Obtain and return a buffer containing the revmap page for the given heap
+ * page. The revmap must have been previously extended to cover that page.
+ * The returned buffer is also recorded in the revmap struct; finishing that
+ * releases the buffer, therefore the caller needn't do it explicitly.
+ */
+static Buffer
+revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk)
+ BlockNumber mapBlk; /* physical revmap block covering heapBlk */
+ /* Translate the heap block number to physical index location. */
+ mapBlk = revmap_get_blkno(revmap, heapBlk);
+ if (mapBlk == InvalidBlockNumber) /* caller did not extend the revmap first */
+ elog(ERROR, "revmap does not cover heap block %u", heapBlk);
+ /* Ensure the buffer we got is in the expected range */
+ Assert(mapBlk != BRIN_METAPAGE_BLKNO &&
+ mapBlk <= revmap->rm_lastRevmapPage);
+ BRIN_elog(DEBUG2, "getting revmap page for logical page %lu (physical %u) for heap %u",
+ HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk),
+ mapBlk, heapBlk);
+ /*
+ * Obtain the buffer from which we need to read. If we already have the
+ * correct buffer in our access struct, use that; otherwise, release that,
+ * (if valid) and read the one we need.
+ */
+ if (revmap->rm_currBuf == InvalidBuffer ||
+ mapBlk != BufferGetBlockNumber(revmap->rm_currBuf))
+ {
+ if (revmap->rm_currBuf != InvalidBuffer)
+ ReleaseBuffer(revmap->rm_currBuf);
+ revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
+ }
+ return revmap->rm_currBuf; /* pinned; no lock is taken in this function */
+ * Given a heap block number, find the corresponding physical revmap block
+ * number and return it. If the revmap page hasn't been allocated yet, extend
+ * the revmap until it is.
+ */
+static BlockNumber
+revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+ BlockNumber mapBlk;
+ /* revmap block for this heap block; the +1 steps over the metapage */
+ mapBlk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+ /*
+ * revmap_physical_extend tries to add one page per call and may return
+ * without adding any, so keep calling it until the target is covered.
+ */
+ while (revmap->rm_lastRevmapPage < mapBlk)
+ revmap_physical_extend(revmap);
+ return mapBlk;
+ * Try to extend the revmap by one page. This might not happen for a number of
+ * reasons; caller is expected to retry until the expected outcome is obtained.
+ */
+static void
+revmap_physical_extend(BrinRevmap *revmap)
+ Buffer buf;
+ Page page;
+ Page metapage;
+ BrinMetaPageData *metadata;
+ BlockNumber mapBlk; /* block that is to become the next revmap page */
+ BlockNumber nblocks;
+ Relation irel = revmap->rm_irel;
+ bool needLock = !RELATION_IS_LOCAL(irel);
+ /*
+ * Lock the metapage. This locks out concurrent extensions of the revmap,
+ * but note that we still need to grab the relation extension lock because
+ * another backend can extend the index with regular BRIN pages.
+ */
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
+ metapage = BufferGetPage(revmap->rm_metaBuf);
+ metadata = (BrinMetaPageData *) PageGetContents(metapage);
+ /*
+ * Check that our cached lastRevmapPage value was up-to-date; if it
+ * wasn't, update the cached copy and have caller start over.
+ */
+ if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage)
+ {
+ revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+ return;
+ }
+ mapBlk = metadata->lastRevmapPage + 1;
+ nblocks = RelationGetNumberOfBlocks(irel);
+ if (mapBlk < nblocks)
+ {
+ /* The block already exists in the relation; just read it */
+ buf = ReadBuffer(irel, mapBlk);
+ page = BufferGetPage(buf);
+ }
+ else
+ {
+ if (needLock)
+ LockRelationForExtension(irel, ExclusiveLock);
+ buf = ReadBuffer(irel, P_NEW);
+ if (BufferGetBlockNumber(buf) != mapBlk)
+ {
+ /*
+ * Very rare corner case: somebody extended the relation
+ * concurrently after we read its length. If this happens, give
+ * up and have caller start over. We will have to evacuate that
+ * page from under whoever is using it.
+ */
+ if (needLock)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+ /* Drop the pin ReadBuffer gave us; we are not going to use it */
+ ReleaseBuffer(buf);
+ return;
+ }
+ page = BufferGetPage(buf);
+ if (needLock)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+ }
+ /*
+ * Check that it's a regular block (or an empty page).
+ *
+ * NOTE(review): the message below has three conversions (%04X, %s, %u)
+ * but only two arguments are visible here; the page-type argument
+ * appears to be missing -- confirm against the full file.
+ */
+ if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page))
+ ereport(ERROR,
+ errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u",
+ RelationGetRelationName(irel),
+ BufferGetBlockNumber(buf))));
+ /* If the page is in use, evacuate it and restart */
+ if (brin_start_evacuating_page(irel, buf))
+ {
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+ brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf);
+ /* have caller start over */
+ return;
+ }
+ /*
+ * Ok, we have now locked the metapage and the target block. Re-initialize
+ * it as a revmap page.
+ */
+ /* the rm_tids array is initialized to all invalid by PageInit */
+ brin_page_init(page, BRIN_PAGETYPE_REVMAP);
+ MarkBufferDirty(buf);
+ metadata->lastRevmapPage = mapBlk;
+ MarkBufferDirty(revmap->rm_metaBuf);
+ if (RelationNeedsWAL(revmap->rm_irel))
+ {
+ xl_brin_revmap_extend xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[2];
+ xlrec.node = revmap->rm_irel->rd_node;
+ xlrec.targetBlk = mapBlk;
+ /* first chunk: the record's fixed-size payload */
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBrinRevmapExtend;
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].buffer_std = false;
+ rdata[0].next = &(rdata[1]);
+ /* second chunk: no data, only a reference to the metapage buffer */
+ rdata[1].data = (char *) NULL;
+ rdata[1].len = 0;
+ rdata[1].buffer = revmap->rm_metaBuf;
+ rdata[1].buffer_std = false;
+ rdata[1].next = NULL;
+ recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata);
+ /* stamp both modified pages with the record's LSN */
+ PageSetLSN(metapage, recptr);
+ PageSetLSN(page, recptr);
+ }
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+ UnlockReleaseBuffer(buf);
--- /dev/null
+ * brin_tuple.c
+ * Method implementations for tuples in BRIN indexes.
+ *
+ * Intended usage is that code outside this file only deals with
+ * BrinMemTuples, and convert to and from the on-disk representation through
+ * functions in this file.
+ *
+ *
+ * A BRIN tuple is similar to a heap tuple, with a few key differences. The
+ * first interesting difference is that the tuple header is much simpler, only
+ * containing its total length and a small area for flags. Also, the stored
+ * data does not match the relation tuple descriptor exactly: for each
+ * attribute in the descriptor, the index tuple carries an arbitrary number
+ * of values, depending on the opclass.
+ *
+ * Also, for each column of the index relation there are two null bits: one
+ * (hasnulls) stores whether any tuple within the page range has that column
+ * set to null; the other one (allnulls) stores whether the column values are
+ * all null. If allnulls is true, then the tuple data area does not contain
+ * values for that column at all; whereas it does if the hasnulls is set.
+ * Note the size of the null bitmask may not be the same as that of the
+ * datum array.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/brin/brin_tuple.c
+ */
+#include "postgres.h"
+#include "access/htup_details.h"
+#include "access/brin_tuple.h"
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
+ char *tp, bits8 *nullbits, bool nulls,
+ Datum *values, bool *allnulls, bool *hasnulls);
+ * Return a tuple descriptor used for on-disk storage of BRIN tuples.
+ */
+static TupleDesc
+brtuple_disk_tupdesc(BrinDesc *brdesc)
+ /* We cache these in the BrinDesc */
+ if (brdesc->bd_disktdesc == NULL)
+ {
+ int i;
+ int j;
+ AttrNumber attno = 1;
+ TupleDesc tupdesc;
+ MemoryContext oldcxt;
+ /* make sure it's in the bdesc's context */
+ oldcxt = MemoryContextSwitchTo(brdesc->bd_context);
+ tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored, false);
+ for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
+ {
+ for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++)
+ TupleDescInitEntry(tupdesc, attno++, NULL,
+ brdesc->bd_info[i]->oi_typids[j],
+ -1, 0);
+ }
+ MemoryContextSwitchTo(oldcxt);
+ brdesc->bd_disktdesc = tupdesc;
+ }
+ return brdesc->bd_disktdesc;
+ * Generate a new on-disk tuple to be inserted in a BRIN index.
+ *
+ * See brin_form_placeholder_tuple if you touch this.
+ */
+BrinTuple *
+brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
+ Size *size)
+ Datum *values;
+ bool *nulls;
+ bool anynulls = false; /* does the tuple need the null bitmasks? */
+ BrinTuple *rettuple;
+ int keyno;
+ int idxattno; /* running index into values[] / nulls[] */
+ uint16 phony_infomask;
+ bits8 *phony_nullbitmap;
+ Size len,
+ hoff,
+ data_len;
+ Assert(brdesc->bd_totalstored > 0);
+ values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
+ nulls = palloc0(sizeof(bool) * brdesc->bd_totalstored);
+ phony_nullbitmap = palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
+ /*
+ * Set up the values/nulls arrays for heap_fill_tuple
+ */
+ idxattno = 0;
+ for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ int datumno;
+ /*
+ * "allnulls" is set when there's no nonnull value in any row in the
+ * column; when this happens, there is no data to store. Thus set the
+ * nullable bits for all data elements of this column and we're done.
+ */
+ if (tuple->bt_columns[keyno].bv_allnulls)
+ {
+ for (datumno = 0;
+ datumno < brdesc->bd_info[keyno]->oi_nstored;
+ datumno++)
+ nulls[idxattno++] = true;
+ anynulls = true;
+ continue;
+ }
+ /*
+ * The "hasnulls" bit is set when there are some null values in the
+ * data. We still need to store a real value, but the presence of
+ * this means we need a null bitmap.
+ */
+ if (tuple->bt_columns[keyno].bv_hasnulls)
+ anynulls = true;
+ for (datumno = 0;
+ datumno < brdesc->bd_info[keyno]->oi_nstored;
+ datumno++)
+ values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno];
+ }
+ /* compute total space needed */
+ len = SizeOfBrinTuple;
+ if (anynulls)
+ {
+ /*
+ * We need a double-length bitmap on an on-disk BRIN index tuple; the
+ * first half stores the "allnulls" bits, the second stores
+ * "hasnulls".
+ */
+ len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
+ }
+ len = hoff = MAXALIGN(len);
+ data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc),
+ values, nulls);
+ len += data_len;
+ rettuple = palloc0(len);
+ rettuple->bt_blkno = blkno;
+ rettuple->bt_info = hoff;
+ Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff);
+ /*
+ * The infomask and null bitmap as computed by heap_fill_tuple are useless
+ * to us. However, that function will not accept a null infomask; and we
+ * need to pass a valid null bitmap so that it will correctly skip
+ * outputting null attributes in the data area.
+ */
+ heap_fill_tuple(brtuple_disk_tupdesc(brdesc),
+ values,
+ nulls,
+ (char *) rettuple + hoff,
+ data_len,
+ &phony_infomask,
+ phony_nullbitmap);
+ /* done with these */
+ pfree(values);
+ pfree(nulls);
+ pfree(phony_nullbitmap);
+ /*
+ * Now fill in the real null bitmasks. allnulls first.
+ */
+ if (anynulls)
+ {
+ bits8 *bitP;
+ int bitmask;
+ rettuple->bt_info |= BRIN_NULLS_MASK;
+ /*
+ * Note that we reverse the sense of null bits in this module: we
+ * store a 1 for a null attribute rather than a 0. So we must reverse
+ * the sense of the att_isnull test in brin_deconstruct_tuple as well.
+ */
+ /*
+ * Start one byte before the bitmap with the mask at HIGHBIT, so that
+ * the first loop iteration steps onto the first bitmap byte.
+ */
+ bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
+ bitmask = HIGHBIT;
+ for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ if (bitmask != HIGHBIT)
+ bitmask <<= 1;
+ else
+ {
+ bitP += 1;
+ *bitP = 0x0;
+ bitmask = 1;
+ }
+ if (!tuple->bt_columns[keyno].bv_allnulls)
+ continue;
+ *bitP |= bitmask;
+ }
+ /* hasnulls bits follow */
+ for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ if (bitmask != HIGHBIT)
+ bitmask <<= 1;
+ else
+ {
+ bitP += 1;
+ *bitP = 0x0;
+ bitmask = 1;
+ }
+ if (!tuple->bt_columns[keyno].bv_hasnulls)
+ continue;
+ *bitP |= bitmask;
+ }
+ }
+ if (tuple->bt_placeholder)
+ rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
+ *size = len;
+ return rettuple;
+ * Generate a new on-disk tuple with no data values, marked as placeholder.
+ *
+ * This is a cut-down version of brin_form_tuple.
+ */
+BrinTuple *
+brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
+ Size len;
+ Size hoff;
+ BrinTuple *rettuple;
+ int keyno;
+ bits8 *bitP; /* current byte of the null bitmask */
+ int bitmask; /* current bit within *bitP */
+ /* compute total space needed: always add nulls */
+ len = SizeOfBrinTuple;
+ len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
+ len = hoff = MAXALIGN(len); /* no data area follows, so the header is the whole tuple */
+ rettuple = palloc0(len);
+ rettuple->bt_blkno = blkno;
+ rettuple->bt_info = hoff;
+ bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; /* one byte before the bitmap; the loop advances first */
+ bitmask = HIGHBIT;
+ /* set allnulls true for all attributes */
+ for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ if (bitmask != HIGHBIT)
+ bitmask <<= 1;
+ else
+ {
+ bitP += 1;
+ *bitP = 0x0;
+ bitmask = 1;
+ }
+ *bitP |= bitmask;
+ }
+ /* no need to set hasnulls */
+ *size = len;
+ return rettuple; /* NOTE(review): BRIN_NULLS_MASK / BRIN_PLACEHOLDER_MASK are not visibly set in bt_info here -- confirm */
+ * Free a tuple created by brin_form_tuple
+ */
+brin_free_tuple(BrinTuple *tuple)
+ pfree(tuple); /* brin_form_tuple builds the tuple as a single palloc'd chunk */
+ * Create a palloc'd copy of a BrinTuple.
+ */
+BrinTuple *
+brin_copy_tuple(BrinTuple *tuple, Size len)
+ BrinTuple *copy = palloc(len);
+ /* a flat copy of the first len bytes is the whole job */
+ memcpy(copy, tuple, len);
+ return copy;
+ * Return whether two BrinTuples are bitwise identical.
+ */
+brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen)
+ /* equal only when the lengths match and every byte compares the same */
+ return alen == blen && memcmp(a, b, alen) == 0;
+ * Create a new BrinMemTuple from scratch, and initialize it to an empty
+ * state.
+ *
+ * Note: we don't provide any means to free a deformed tuple, so make sure to
+ * use a temporary memory context.
+ */
+BrinMemTuple *
+brin_new_memtuple(BrinDesc *brdesc)
+ BrinMemTuple *dtup;
+ char *currdatum; /* next unassigned position in the trailing Datum area */
+ long basesize; /* aligned size of the struct plus its per-column array */
+ int i;
+ basesize = MAXALIGN(sizeof(BrinMemTuple) +
+ sizeof(BrinValues) * brdesc->bd_tupdesc->natts);
+ dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); /* one zeroed chunk: header, columns, then all Datum slots */
+ currdatum = (char *) dtup + basesize;
+ for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
+ {
+ dtup->bt_columns[i].bv_attno = i + 1; /* bv_attno counts from 1 */
+ dtup->bt_columns[i].bv_allnulls = true;
+ dtup->bt_columns[i].bv_hasnulls = false;
+ dtup->bt_columns[i].bv_values = (Datum *) currdatum; /* this column's slice of the Datum area */
+ currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
+ }
+ dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
+ "brin dtuple", /* NOTE(review): the remaining arguments and closing parenthesis are not visible here -- confirm */
+ return dtup;
+ * Reset a BrinMemTuple to initial state
+ */
+brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
+ int attidx = 0;
+ /* throw away everything that was allocated in the tuple's own context */
+ MemoryContextReset(dtuple->bt_context);
+ /* put every column back to the "all nulls, no nulls seen" state */
+ while (attidx < brdesc->bd_tupdesc->natts)
+ {
+ dtuple->bt_columns[attidx].bv_allnulls = true;
+ dtuple->bt_columns[attidx].bv_hasnulls = false;
+ attidx++;
+ }
+ * Convert a BrinTuple back to a BrinMemTuple. This is the reverse of
+ * brin_form_tuple.
+ *
+ * Note we don't need the "on disk tupdesc" here; we rely on our own routine to
+ * deconstruct the tuple from the on-disk format.
+ */
+BrinMemTuple *
+brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple)
+ BrinMemTuple *dtup;
+ Datum *values; /* scratch: one slot per stored datum */
+ bool *allnulls; /* scratch: one flag per index column */
+ bool *hasnulls; /* scratch: one flag per index column */
+ char *tp; /* start of the on-disk tuple's data area */
+ bits8 *nullbits; /* on-disk null bitmask, or NULL if the tuple has none */
+ int keyno;
+ int valueno; /* running index into values[] */
+ MemoryContext oldcxt;
+ dtup = brin_new_memtuple(brdesc);
+ if (BrinTupleIsPlaceholder(tuple))
+ dtup->bt_placeholder = true;
+ dtup->bt_blkno = tuple->bt_blkno;
+ values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
+ allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
+ hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
+ tp = (char *) tuple + BrinTupleDataOffset(tuple);
+ if (BrinTupleHasNulls(tuple))
+ nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple); /* bitmask sits right after the fixed header */
+ else
+ nullbits = NULL;
+ brin_deconstruct_tuple(brdesc,
+ tp, nullbits, BrinTupleHasNulls(tuple),
+ values, allnulls, hasnulls);
+ /*
+ * Iterate to assign each of the values to the corresponding item in the
+ * values array of each column. The copies occur in the tuple's context.
+ */
+ oldcxt = MemoryContextSwitchTo(dtup->bt_context);
+ for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ int i;
+ if (allnulls[keyno])
+ {
+ valueno += brdesc->bd_info[keyno]->oi_nstored; /* skip this column's slots; it stays allnulls */
+ continue;
+ }
+ /*
+ * We would like to skip datumCopy'ing the values datum in some cases,
+ * caller permitting ...
+ *
+ * NOTE(review): byval/len below come from the index column's
+ * descriptor, while the stored datums are typed by oi_typids in
+ * brtuple_disk_tupdesc -- confirm these always agree.
+ */
+ for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++)
+ dtup->bt_columns[keyno].bv_values[i] =
+ datumCopy(values[valueno++],
+ brdesc->bd_tupdesc->attrs[keyno]->attbyval,
+ brdesc->bd_tupdesc->attrs[keyno]->attlen);
+ dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
+ dtup->bt_columns[keyno].bv_allnulls = false;
+ }
+ MemoryContextSwitchTo(oldcxt);
+ pfree(values);
+ pfree(allnulls);
+ pfree(hasnulls);
+ return dtup;
+ * brin_deconstruct_tuple
+ * Guts of attribute extraction from an on-disk BRIN tuple.
+ *
+ * Its arguments are:
+ * brdesc BRIN descriptor for the stored tuple
+ * tp pointer to the tuple data area
+ * nullbits pointer to the tuple nulls bitmask
+ * nulls "has nulls" bit in tuple infomask
+ * values output values, array of size brdesc->bd_totalstored
+ * allnulls output "allnulls", size brdesc->bd_tupdesc->natts
+ * hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts
+ *
+ * Output arrays must have been allocated by caller.
+ */
+static inline void
+brin_deconstruct_tuple(BrinDesc *brdesc,
+ char *tp, bits8 *nullbits, bool nulls,
+ Datum *values, bool *allnulls, bool *hasnulls)
+ int attnum;
+ int stored; /* running index into values[] and the disk tupdesc */
+ TupleDesc diskdsc;
+ long off; /* current byte offset into the data area at tp */
+ /*
+ * First iterate to natts to obtain both null flags for each attribute.
+ * Note that we reverse the sense of the att_isnull test, because we store
+ * 1 for a null value (rather than a 1 for a not null value as is the
+ * att_isnull convention used elsewhere.) See brin_form_tuple.
+ */
+ for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
+ {
+ /*
+ * the "all nulls" bit means that all values in the page range for
+ * this column are nulls. Therefore there are no values in the tuple
+ * data area.
+ */
+ allnulls[attnum] = nulls && !att_isnull(attnum, nullbits); /* nullbits is only consulted when nulls is true */
+ /*
+ * the "has nulls" bit means that some tuples have nulls, but others
+ * have not-null values. Therefore we know the tuple contains data
+ * for this column.
+ *
+ * The hasnulls bits follow the allnulls bits in the same bitmask.
+ */
+ hasnulls[attnum] =
+ nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits);
+ }
+ /*
+ * Iterate to obtain each attribute's stored values. Note that since we
+ * may reuse attribute entries for more than one column, we cannot cache
+ * offsets here.
+ */
+ diskdsc = brtuple_disk_tupdesc(brdesc);
+ stored = 0;
+ off = 0;
+ for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
+ {
+ int datumno;
+ if (allnulls[attnum])
+ {
+ stored += brdesc->bd_info[attnum]->oi_nstored; /* nothing stored for this column; leave its values[] slots unset */
+ continue;
+ }
+ for (datumno = 0;
+ datumno < brdesc->bd_info[attnum]->oi_nstored;
+ datumno++)
+ {
+ Form_pg_attribute thisatt = diskdsc->attrs[stored];
+ if (thisatt->attlen == -1)
+ {
+ off = att_align_pointer(off, thisatt->attalign, -1,
+ tp + off);
+ }
+ else
+ {
+ /* not varlena, so safe to use att_align_nominal */
+ off = att_align_nominal(off, thisatt->attalign);
+ }
+ values[stored++] = fetchatt(thisatt, tp + off);
+ off = att_addlength_pointer(off, thisatt->attlen, tp + off); /* step past the datum just fetched */
+ }
+ }
--- /dev/null
+ * brin_xlog.c
+ * XLog replay routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/brin/brin_xlog.c
+ */
+#include "postgres.h"
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/xlogutils.h"
+ * xlog replay routines
+ */
+static void
+brin_xlog_createidx(XLogRecPtr lsn, XLogRecord *record)
+ xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record);
+ Buffer buf;
+ Page page;
+ /* Backup blocks are not used in create_index records */
+ Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ /* create the index' metapage */
+ buf = XLogReadBuffer(xlrec->node, BRIN_METAPAGE_BLKNO, true);
+ Assert(BufferIsValid(buf));
+ page = (Page) BufferGetPage(buf);
+ brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version); /* range size and version come from the WAL record */
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ UnlockReleaseBuffer(buf);
+ * Common part of an insert or update. Inserts the new tuple and updates the
+ * revmap.
+ */
+static void
+brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record,
+ xl_brin_insert *xlrec, BrinTuple *tuple)
+ BlockNumber blkno;
+ Buffer buffer;
+ Page page;
+ XLogRedoAction action;
+ /* index block that receives the tuple, taken from the target TID */
+ blkno = ItemPointerGetBlockNumber(&xlrec->tid);
+ /*
+ * If we inserted the first and only tuple on the page, re-initialize the
+ * page from scratch.
+ */
+ if (record->xl_info & XLOG_BRIN_INIT_PAGE)
+ {
+ XLogReadBufferForRedoExtended(lsn, record, 0,
+ xlrec->node, MAIN_FORKNUM, blkno,
+ RBM_ZERO, false, &buffer);
+ page = BufferGetPage(buffer);
+ brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+ /* a freshly initialized page always needs the tuple re-added */
+ action = BLK_NEEDS_REDO;
+ }
+ else
+ {
+ action = XLogReadBufferForRedo(lsn, record, 0,
+ xlrec->node, blkno, &buffer);
+ }
+ /* insert the index item into the page */
+ if (action == BLK_NEEDS_REDO)
+ {
+ OffsetNumber offnum;
+ Assert(tuple->bt_blkno == xlrec->heapBlk);
+ page = (Page) BufferGetPage(buffer);
+ offnum = ItemPointerGetOffsetNumber(&(xlrec->tid));
+ /* the target offset may be at most one past the page's last item */
+ if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+ elog(PANIC, "brin_xlog_insert_update: invalid max offset number");
+ offnum = PageAddItem(page, (Item) tuple, xlrec->tuplen, offnum, true,
+ false);
+ if (offnum == InvalidOffsetNumber)
+ elog(PANIC, "brin_xlog_insert_update: failed to add tuple");
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ /* update the revmap: point the entry for heapBlk at the tuple's new TID */
+ action = XLogReadBufferForRedo(lsn, record, 1, xlrec->node,
+ xlrec->revmapBlk, &buffer);
+ if (action == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+ brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk,
+ xlrec->tid);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ /* XXX no FSM updates here ... */
+ * replay a BRIN index insertion
+ */
+static void
+brin_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
+ xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record);
+ BrinTuple *newtup;
+ /* the tuple data starts SizeOfBrinInsert bytes into the record payload */
+ newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinInsert);
+ /* all the actual work is shared with the cross-page update replay */
+ brin_xlog_insert_update(lsn, record, xlrec, newtup);
+ * replay a BRIN index update
+ */
+static void
+brin_xlog_update(XLogRecPtr lsn, XLogRecord *record)
+ xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record);
+ BlockNumber blkno;
+ Buffer buffer;
+ BrinTuple *newtup;
+ XLogRedoAction action;
+ /* the new tuple's data starts SizeOfBrinUpdate bytes into the payload */
+ newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinUpdate);
+ /* First remove the old tuple */
+ blkno = ItemPointerGetBlockNumber(&(xlrec->oldtid));
+ action = XLogReadBufferForRedo(lsn, record, 2, xlrec->new.node,
+ blkno, &buffer);
+ if (action == BLK_NEEDS_REDO)
+ {
+ Page page;
+ OffsetNumber offnum;
+ page = (Page) BufferGetPage(buffer);
+ offnum = ItemPointerGetOffsetNumber(&(xlrec->oldtid));
+ if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+ elog(PANIC, "brin_xlog_update: invalid max offset number");
+ /* delete without renumbering the other items on the page */
+ PageIndexDeleteNoCompact(page, &offnum, 1);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ /* Then insert the new tuple and update revmap, like in an insertion. */
+ brin_xlog_insert_update(lsn, record, &xlrec->new, newtup);
+ /* the old page's buffer is released only after the insertion is replayed */
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ * Update a tuple on a single page.
+ */
+static void
+brin_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record)
+ xl_brin_samepage_update *xlrec;
+ BlockNumber blkno;
+ Buffer buffer;
+ XLogRedoAction action;
+ xlrec = (xl_brin_samepage_update *) XLogRecGetData(record);
+ blkno = ItemPointerGetBlockNumber(&(xlrec->tid));
+ action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, blkno,
+ &buffer);
+ if (action == BLK_NEEDS_REDO)
+ {
+ int tuplen;
+ BrinTuple *mmtuple;
+ Page page;
+ OffsetNumber offnum;
+ /* the record has no length field: the tuple is whatever follows the header */
+ tuplen = record->xl_len - SizeOfBrinSamepageUpdate;
+ mmtuple = (BrinTuple *) ((char *) xlrec + SizeOfBrinSamepageUpdate);
+ page = (Page) BufferGetPage(buffer);
+ offnum = ItemPointerGetOffsetNumber(&(xlrec->tid));
+ if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+ elog(PANIC, "brin_xlog_samepage_update: invalid max offset number");
+ /* replace in place: drop the old tuple, then add the new one at the same offset */
+ PageIndexDeleteNoCompact(page, &offnum, 1);
+ offnum = PageAddItem(page, (Item) mmtuple, tuplen, offnum, true, false);
+ if (offnum == InvalidOffsetNumber)
+ elog(PANIC, "brin_xlog_samepage_update: failed to add tuple");
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ /* XXX no FSM updates here ... */
+ * Replay a revmap page extension
+ */
+static void
+brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record)
+ xl_brin_revmap_extend *xlrec;
+ Buffer metabuf;
+ Buffer buf;
+ Page page;
+ XLogRedoAction action;
+ xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record);
+ /*
+ * Update the metapage, which lives at BRIN_METAPAGE_BLKNO. metabuf is
+ * released only after the new revmap page has been initialized below.
+ */
+ action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node,
+ BRIN_METAPAGE_BLKNO, &metabuf);
+ if (action == BLK_NEEDS_REDO)
+ {
+ Page metapg;
+ BrinMetaPageData *metadata;
+ metapg = BufferGetPage(metabuf);
+ metadata = (BrinMetaPageData *) PageGetContents(metapg);
+ /* the record must extend the revmap by exactly one block */
+ Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
+ metadata->lastRevmapPage = xlrec->targetBlk;
+ PageSetLSN(metapg, lsn);
+ MarkBufferDirty(metabuf);
+ }
+ /*
+ * Re-init the target block as a revmap page. There's never a full-page
+ * image here.
+ */
+ buf = XLogReadBuffer(xlrec->node, xlrec->targetBlk, true);
+ page = (Page) BufferGetPage(buf);
+ brin_page_init(page, BRIN_PAGETYPE_REVMAP);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ UnlockReleaseBuffer(buf);
+ if (BufferIsValid(metabuf))
+ UnlockReleaseBuffer(metabuf);
+/*
+ * Entry point for replaying a BRIN WAL record: dispatch to the replay
+ * routine for the record's operation, or PANIC on an unknown op code.
+ */
+brin_redo(XLogRecPtr lsn, XLogRecord *record)
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ /*
+ * Dispatch on the operation bits only (XLOG_BRIN_OPMASK).
+ * NOTE(review): the case labels are not visible here; presumably one
+ * XLOG_BRIN_* op code per call below -- confirm.
+ */
+ switch (info & XLOG_BRIN_OPMASK)
+ {
+ brin_xlog_createidx(lsn, record);
+ break;
+ brin_xlog_insert(lsn, record);
+ break;
+ brin_xlog_update(lsn, record);
+ break;
+ brin_xlog_samepage_update(lsn, record);
+ break;
+ brin_xlog_revmap_extend(lsn, record);
+ break;
+ default:
+ elog(PANIC, "brin_redo: unknown op code %u", info);
+ }
}, -1, 0, 2000000000
+ {
+ {
+ "pages_per_range",
+ "Number of pages that each page range covers in a BRIN index",
+ }, 128, 1, 131072
+ },
/* list terminator */
scan->rs_startblock = 0;
+ scan->rs_initblock = 0;
+ scan->rs_numblocks = InvalidBlockNumber;
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
+/*
+ * heap_setscanlimits - restrict a heap scan to a range of blocks
+ *
+ * startBlk becomes both the block the scan starts at and its initial block;
+ * numBlks is the number of blocks to scan. The scan loops treat a
+ * rs_numblocks of InvalidBlockNumber as "no limit".
+ */
+heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
+ scan->rs_startblock = startBlk;
+ scan->rs_initblock = startBlk;
+ scan->rs_numblocks = numBlks;
* heapgetpage - subroutine for heapgettup()
if (backward)
- finished = (page == scan->rs_startblock);
+ finished = (page == scan->rs_startblock) ||
+ (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
if (page == 0)
page = scan->rs_nblocks;
if (page >= scan->rs_nblocks)
page = 0;
- finished = (page == scan->rs_startblock);
+ finished = (page == scan->rs_startblock) ||
+ (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
* Report our new scan position for synchronization purposes. We
if (backward)
- finished = (page == scan->rs_startblock);
+ finished = (page == scan->rs_startblock) ||
+ (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
if (page == 0)
page = scan->rs_nblocks;
if (page >= scan->rs_nblocks)
page = 0;
- finished = (page == scan->rs_startblock);
+ finished = (page == scan->rs_startblock) ||
+ (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
* Report our new scan position for synchronization purposes. We
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \
+OBJS = brindesc.o clogdesc.o dbasedesc.o gindesc.o gistdesc.o \
+ hashdesc.o heapdesc.o \
mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \
standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
--- /dev/null
+ *
+ * brindesc.c
+ * rmgr descriptor routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * src/backend/access/rmgrdesc/brindesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/brin_xlog.h"
+/*
+ * Append a human-readable description of a BRIN WAL record to buf. Each
+ * branch casts the record payload to the struct for that record type and
+ * prints its fields.
+ */
+brin_desc(StringInfo buf, XLogRecord *record)
+ char *rec = XLogRecGetData(record);
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ /*
+ * NOTE(review): the condition guarding this first block is not visible
+ * here; presumably it tests for the create-index record type -- confirm.
+ */
+ {
+ xl_brin_createidx *xlrec = (xl_brin_createidx *) rec;
+ appendStringInfo(buf, "v%d pagesPerRange %u rel %u/%u/%u",
+ xlrec->version, xlrec->pagesPerRange,
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode);
+ }
+ else if (info == XLOG_BRIN_INSERT)
+ {
+ xl_brin_insert *xlrec = (xl_brin_insert *) rec;
+ appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u TID (%u,%u)",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode,
+ xlrec->heapBlk, xlrec->revmapBlk,
+ xlrec->pagesPerRange,
+ ItemPointerGetBlockNumber(&xlrec->tid),
+ ItemPointerGetOffsetNumber(&xlrec->tid));
+ }
+ else if (info == XLOG_BRIN_UPDATE)
+ {
+ xl_brin_update *xlrec = (xl_brin_update *) rec;
+ /* the insert-style fields live in the embedded xlrec->new */
+ appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u old TID (%u,%u) TID (%u,%u)",
+ xlrec->new.node.spcNode, xlrec->new.node.dbNode,
+ xlrec->new.node.relNode,
+ xlrec->new.heapBlk, xlrec->new.revmapBlk,
+ xlrec->new.pagesPerRange,
+ ItemPointerGetBlockNumber(&xlrec->oldtid),
+ ItemPointerGetOffsetNumber(&xlrec->oldtid),
+ ItemPointerGetBlockNumber(&xlrec->new.tid),
+ ItemPointerGetOffsetNumber(&xlrec->new.tid));
+ }
+ else if (info == XLOG_BRIN_SAMEPAGE_UPDATE)
+ {
+ xl_brin_samepage_update *xlrec = (xl_brin_samepage_update *) rec;
+ appendStringInfo(buf, "rel %u/%u/%u TID (%u,%u)",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode,
+ ItemPointerGetBlockNumber(&xlrec->tid),
+ ItemPointerGetOffsetNumber(&xlrec->tid));
+ }
+ else if (info == XLOG_BRIN_REVMAP_EXTEND)
+ {
+ xl_brin_revmap_extend *xlrec = (xl_brin_revmap_extend *) rec;
+ appendStringInfo(buf, "rel %u/%u/%u targetBlk %u",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode, xlrec->targetBlk);
+ }
+/*
+ * Map a BRIN WAL record's info byte to a short constant name for its record
+ * type. Returns NULL when no case assigns a name.
+ */
+const char *
+brin_identify(uint8 info)
+ const char *id = NULL;
+ /*
+ * NOTE(review): the case labels (and the names assigned before the last
+ * two breaks) are not visible here; presumably one XLOG_BRIN_* value,
+ * with or without XLOG_BRIN_INIT_PAGE, per name -- confirm.
+ */
+ switch (info & ~XLR_INFO_MASK)
+ {
+ id = "CREATE_INDEX";
+ break;
+ id = "INSERT";
+ break;
+ id = "INSERT+INIT";
+ break;
+ id = "UPDATE";
+ break;
+ id = "UPDATE+INIT";
+ break;
+ break;
+ break;
+ }
+ return id;
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/heapam_xlog.h"
+#include "access/brin_xlog.h"
#include "access/multixact.h"
#include "access/nbtree.h"
#include "access/spgist.h"
bool allow_sync,
IndexBuildCallback callback,
void *callback_state)
+ return IndexBuildHeapRangeScan(heapRelation, indexRelation,
+ indexInfo, allow_sync,
+ 0, InvalidBlockNumber,
+ callback, callback_state);
+ * As above, except that instead of scanning the complete heap, only the given
+ * number of blocks are scanned. Scan to end-of-rel can be signalled by
+ * passing InvalidBlockNumber as numblocks.
+ */
+IndexBuildHeapRangeScan(Relation heapRelation,
+ Relation indexRelation,
+ IndexInfo *indexInfo,
+ bool allow_sync,
+ BlockNumber start_blockno,
+ BlockNumber numblocks,
+ IndexBuildCallback callback,
+ void *callback_state)
bool is_system_catalog;
bool checking_uniqueness;
true, /* buffer access strategy OK */
allow_sync); /* syncscan OK? */
+ /* set our scan endpoints */
+ heap_setscanlimits(scan, start_blockno, numblocks);
reltuples = 0;
case RM_GIST_ID:
case RM_SEQ_ID:
+ case RM_BRIN_ID:
case RM_NEXT_ID:
elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid);
- * sorting support for PageRepairFragmentation and PageIndexMultiDelete
+ * sorting support for PageRepairFragmentation, PageIndexMultiDelete,
+ * PageIndexDeleteNoCompact
typedef struct itemIdSortData
phdr->pd_upper = upper;
+ * PageIndexDeleteNoCompact
+ * Delete the given items for an index page, and defragment the resulting
+ * free space, but do not compact the item pointers array.
+ *
+ * itemnos is the array of tuples to delete; nitems is its size. The offsets
+ * in itemnos must be listed in increasing order.
+ *
+ * Unused items at the end of the array are removed.
+ *
+ * This is used for index AMs that require that existing TIDs of live tuples
+ * remain unchanged.
+ */
+PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems)
+ PageHeader phdr = (PageHeader) page;
+ LocationIndex pd_lower = phdr->pd_lower;
+ LocationIndex pd_upper = phdr->pd_upper;
+ LocationIndex pd_special = phdr->pd_special;
+ int nline;
+ bool empty;
+ OffsetNumber offnum;
+ int nextitm;
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (pd_lower < SizeOfPageHeaderData ||
+ pd_lower > pd_upper ||
+ pd_upper > pd_special ||
+ pd_special > BLCKSZ ||
+ pd_special != MAXALIGN(pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ pd_lower, pd_upper, pd_special)));
+ /*
+ * Scan the existing item pointer array and mark as unused those that are
+ * in our kill-list; make sure any non-interesting ones are marked unused
+ * as well.
+ */
+ nline = PageGetMaxOffsetNumber(page);
+ empty = true;
+ nextitm = 0;
+ for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId lp;
+ ItemLength itemlen;
+ ItemOffset offset;
+ lp = PageGetItemId(page, offnum);
+ itemlen = ItemIdGetLength(lp);
+ offset = ItemIdGetOffset(lp);
+ if (ItemIdIsUsed(lp))
+ {
+ /* a used item must lie wholly between pd_upper and pd_special */
+ if (offset < pd_upper ||
+ (offset + itemlen) > pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted item pointer: offset = %u, length = %u",
+ offset, (unsigned int) itemlen)));
+ /*
+ * itemnos[] is consumed in step with the page scan, so it only
+ * matches when its entries are in increasing offset order.
+ */
+ if (nextitm < nitems && offnum == itemnos[nextitm])
+ {
+ /* this one is on our list to delete, so mark it unused */
+ ItemIdSetUnused(lp);
+ nextitm++;
+ }
+ else if (ItemIdHasStorage(lp))
+ {
+ /* This one's live -- must do the compaction dance */
+ empty = false;
+ }
+ else
+ {
+ /* get rid of this one too */
+ ItemIdSetUnused(lp);
+ }
+ }
+ }
+ /* this will catch invalid or out-of-order itemnos[] */
+ if (nextitm != nitems)
+ elog(ERROR, "incorrect index offsets supplied");
+ if (empty)
+ {
+ /* Page is completely empty, so just reset it quickly */
+ phdr->pd_lower = SizeOfPageHeaderData;
+ phdr->pd_upper = pd_special;
+ }
+ else
+ {
+ /* There are live items: need to compact the page the hard way */
+ itemIdSortData itemidbase[MaxOffsetNumber];
+ itemIdSort itemidptr;
+ int i;
+ Size totallen;
+ Offset upper;
+ /*
+ * Scan the page taking note of each item that we need to preserve.
+ * This includes both live items (those that contain data) and
+ * interspersed unused ones. It's critical to preserve these unused
+ * items, because otherwise the offset numbers for later live items
+ * would change, which is not acceptable. Unused items might get used
+ * again later; that is fine.
+ */
+ itemidptr = itemidbase;
+ totallen = 0;
+ for (i = 0; i < nline; i++, itemidptr++)
+ {
+ ItemId lp;
+ itemidptr->offsetindex = i;
+ lp = PageGetItemId(page, i + 1);
+ if (ItemIdHasStorage(lp))
+ {
+ itemidptr->itemoff = ItemIdGetOffset(lp);
+ itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
+ totallen += itemidptr->alignedlen;
+ }
+ else
+ {
+ /* placeholder entry: alignedlen == 0 marks "no data to move" */
+ itemidptr->itemoff = 0;
+ itemidptr->alignedlen = 0;
+ }
+ }
+ /* By here, there are exactly nline elements in itemidbase array */
+ if (totallen > (Size) (pd_special - pd_lower))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted item lengths: total %u, available space %u",
+ (unsigned int) totallen, pd_special - pd_lower)));
+ /* sort itemIdSortData array into decreasing itemoff order */
+ qsort((char *) itemidbase, nline, sizeof(itemIdSortData),
+ itemoffcompare);
+ /*
+ * Defragment the data areas of each tuple, being careful to preserve
+ * each item's position in the linp array.
+ */
+ upper = pd_special;
+ PageClearHasFreeLinePointers(page);
+ for (i = 0, itemidptr = itemidbase; i < nline; i++, itemidptr++)
+ {
+ ItemId lp;
+ lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+ if (itemidptr->alignedlen == 0)
+ {
+ PageSetHasFreeLinePointers(page);
+ ItemIdSetUnused(lp);
+ continue;
+ }
+ upper -= itemidptr->alignedlen;
+ /* source and destination may overlap, hence memmove */
+ memmove((char *) page + upper,
+ (char *) page + itemidptr->itemoff,
+ itemidptr->alignedlen);
+ lp->lp_off = upper;
+ /* lp_flags and lp_len remain the same as originally */
+ }
+ /*
+ * Set the new page limits. i equals nline here, so the line pointer
+ * array keeps its original length, trailing unused entries included.
+ */
+ phdr->pd_upper = upper;
+ phdr->pd_lower = SizeOfPageHeaderData + i * sizeof(ItemIdData);
+ }
* Set checksum for a page in shared buffers.
numIndexPages = 1.0;
- /* fetch estimated page cost for schema containing index */
+ /* fetch estimated page cost for tablespace containing index */
- /* fetch estimated page cost for schema containing index */
+ /* fetch estimated page cost for tablespace containing index */
+ * BRIN has search behavior completely different from other index types
+ */
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1);
+ double loop_count = PG_GETARG_FLOAT8(2);
+ Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
+ Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
+ Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
+ double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+ IndexOptInfo *index = path->indexinfo;
+ List *indexQuals = path->indexquals;
+ List *indexOrderBys = path->indexorderbys;
+ double numPages = index->pages;
+ double numTuples = index->tuples;
+ Cost spc_seq_page_cost;
+ Cost spc_random_page_cost;
+ QualCost index_qual_cost;
+ double qual_op_cost;
+ double qual_arg_cost;
+ /* fetch estimated page cost for tablespace containing index */
+ get_tablespace_page_costs(index->reltablespace,
+ &spc_random_page_cost,
+ &spc_seq_page_cost);
+ /*
+ * BRIN indexes are always read in full; use that as startup cost.
+ * XXX maybe only include revmap pages here?
+ */
+ *indexStartupCost = spc_seq_page_cost * numPages * loop_count;
+ /*
+ * To read a BRIN index there might be a bit of back and forth over regular
+ * pages, as revmap might point to them out of sequential order; calculate
+ * this as reading the whole index in random order.
+ */
+ *indexTotalCost = spc_random_page_cost * numPages * loop_count;
+ /*
+ * Selectivity is estimated from the index quals alone, evaluated as
+ * plain (inner-join style) restriction clauses on the indexed relation.
+ */
+ *indexSelectivity =
+ clauselist_selectivity(root, path->indexquals,
+ path->indexinfo->rel->relid,
+ JOIN_INNER, NULL);
+ *indexCorrelation = 1;
+ /*
+ * Add on index qual eval costs, much as in genericcostestimate.
+ */
+ cost_qual_eval(&index_qual_cost, indexQuals, root);
+ qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
+ cost_qual_eval(&index_qual_cost, indexOrderBys, root);
+ qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+ /* one operator evaluation per qual / order-by clause */
+ qual_op_cost = cpu_operator_cost *
+ (list_length(indexQuals) + list_length(indexOrderBys));
+ /* what is left after removing the operator cost is argument evaluation */
+ qual_arg_cost -= qual_op_cost;
+ if (qual_arg_cost < 0) /* just in case... */
+ qual_arg_cost = 0;
+ *indexStartupCost += qual_arg_cost;
+ *indexTotalCost += qual_arg_cost;
+ *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
+ /* XXX what about pages_per_range? */
--- /dev/null
+ * AM-callable functions for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin.h
+ */
+#ifndef BRIN_H
+#define BRIN_H
+#include "fmgr.h"
+#include "nodes/execnodes.h"
+#include "utils/relcache.h"
+ * prototypes for functions in brin.c (external entry points for BRIN)
+ */
+extern Datum brinbuild(PG_FUNCTION_ARGS);
+extern Datum brinbuildempty(PG_FUNCTION_ARGS);
+extern Datum brininsert(PG_FUNCTION_ARGS);
+extern Datum brinbeginscan(PG_FUNCTION_ARGS);
+extern Datum bringettuple(PG_FUNCTION_ARGS);
+extern Datum bringetbitmap(PG_FUNCTION_ARGS);
+extern Datum brinrescan(PG_FUNCTION_ARGS);
+extern Datum brinendscan(PG_FUNCTION_ARGS);
+extern Datum brinmarkpos(PG_FUNCTION_ARGS);
+extern Datum brinrestrpos(PG_FUNCTION_ARGS);
+extern Datum brinbulkdelete(PG_FUNCTION_ARGS);
+extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS);
+extern Datum brincanreturn(PG_FUNCTION_ARGS);
+extern Datum brincostestimate(PG_FUNCTION_ARGS);
+extern Datum brinoptions(PG_FUNCTION_ARGS);
+ * Storage type for BRIN's reloptions
+ */
+typedef struct BrinOptions
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ BlockNumber pagesPerRange;
+} BrinOptions;
+#define BrinGetPagesPerRange(relation) \
+ ((relation)->rd_options ? \
+ ((BrinOptions *) (relation)->rd_options)->pagesPerRange : \
+#endif /* BRIN_H */
--- /dev/null
+ * brin_internal.h
+ * internal declarations for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_internal.h
+ */
+#include "fmgr.h"
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "storage/off.h"
+#include "utils/relcache.h"
+ * A BrinDesc is a struct designed to enable decoding a BRIN tuple from the
+ * on-disk format to an in-memory tuple and vice-versa.
+ */
+/* struct returned by "OpcInfo" amproc */
+typedef struct BrinOpcInfo
+ /* Number of columns stored in an index column of this opclass */
+ uint16 oi_nstored;
+ /* Opaque pointer for the opclass' private use */
+ void *oi_opaque;
+ /* Type IDs of the stored columns */
+} BrinOpcInfo;
+/*
+ * the size of a BrinOpcInfo for the given number of columns; the argument
+ * is parenthesized so that expressions such as (a + b) expand correctly
+ */
+#define SizeofBrinOpcInfo(ncols) \
+ (offsetof(BrinOpcInfo, oi_typids) + sizeof(Oid) * (ncols))
+typedef struct BrinDesc
+ /* Containing memory context */
+ MemoryContext bd_context;
+ /* the index relation itself */
+ Relation bd_index;
+ /* tuple descriptor of the index relation */
+ TupleDesc bd_tupdesc;
+ /* cached copy for on-disk tuples; generated at first use */
+ TupleDesc bd_disktdesc;
+ /* total number of Datum entries that are stored on-disk for all columns */
+ int bd_totalstored;
+ /* per-column info; bd_tupdesc->natts entries long */
+ BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER];
+} BrinDesc;
+ * Globally-known function support numbers for BRIN indexes. Individual
+ * opclasses define their own function support numbers, which must not collide
+ * with the definitions here.
+ */
+/* procedure numbers up to 10 are reserved for BRIN future expansion */
+#define BRIN_DEBUG
+/*
+ * BRIN_elog forwards to elog() only when BRIN_DEBUG is defined and the
+ * compiler is GCC-compatible; otherwise it expands to a no-op expression
+ * that accepts the same argument list, so call sites compile either way.
+ */
+#if defined(BRIN_DEBUG) && defined(__GNUC__)
+#define BRIN_elog(level, ...) elog(level, __VA_ARGS__)
+#else
+#define BRIN_elog(level, ...) ((void) 0)
+#endif
+/* brin.c */
+extern BrinDesc *brin_build_desc(Relation rel);
+extern void brin_free_desc(BrinDesc *bdesc);
+#endif /* BRIN_INTERNAL_H */
--- /dev/null
+ * brin_page.h
+ * Prototypes and definitions for BRIN page layouts
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_page.h
+ *
+ *
+ * These structs should really be private to specific BRIN files, but it's
+ * useful to have them here so that they can be used by pageinspect and similar
+ * tools.
+ */
+#ifndef BRIN_PAGE_H
+#define BRIN_PAGE_H
+#include "storage/block.h"
+#include "storage/itemptr.h"
+/* special space on all BRIN pages stores a "type" identifier */
+#define BRIN_PAGETYPE_META 0xF091
+#define BRIN_PAGE_TYPE(page) \
+ (((BrinSpecialSpace *) PageGetSpecialPointer(page))->type)
+/* flags for BrinSpecialSpace */
+#define BRIN_EVACUATE_PAGE (1 << 0)
+typedef struct BrinSpecialSpace
+ uint16 flags;
+ uint16 type;
+} BrinSpecialSpace;
+/* Metapage definitions */
+typedef struct BrinMetaPageData
+ uint32 brinMagic;
+ uint32 brinVersion;
+ BlockNumber pagesPerRange;
+ BlockNumber lastRevmapPage;
+} BrinMetaPageData;
+#define BRIN_META_MAGIC 0xA8109CFA
+/* Definitions for revmap pages */
+typedef struct RevmapContents
+ ItemPointerData rm_tids[1]; /* really REVMAP_PAGE_MAXITEMS */
+} RevmapContents;
+ (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
+ offsetof(RevmapContents, rm_tids) - \
+ MAXALIGN(sizeof(BrinSpecialSpace)))
+/* max num of items in the array */
+ (REVMAP_CONTENT_SIZE / sizeof(ItemPointerData))
+#endif /* BRIN_PAGE_H */
--- /dev/null
+ * brin_pageops.h
+ * Prototypes for operating on BRIN pages.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_pageops.h
+ */
+#include "access/brin_revmap.h"
+extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, BlockNumber heapBlk,
+ Buffer oldbuf, OffsetNumber oldoff,
+ const BrinTuple *origtup, Size origsz,
+ const BrinTuple *newtup, Size newsz,
+ bool samepage);
+extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz,
+ Size newsz);
+extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+ BrinTuple *tup, Size itemsz);
+extern void brin_page_init(Page page, uint16 type);
+extern void brin_metapage_init(Page page, BlockNumber pagesPerRange,
+ uint16 version);
+extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
+extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer buf);
+#endif /* BRIN_PAGEOPS_H */
--- /dev/null
+ * brin_revmap.h
+ * Prototypes for BRIN reverse range maps
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_revmap.h
+ */
+#ifndef BRIN_REVMAP_H
+#define BRIN_REVMAP_H
+#include "access/brin_tuple.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/itemptr.h"
+#include "storage/off.h"
+#include "utils/relcache.h"
+/* struct definition lives in brin_revmap.c */
+typedef struct BrinRevmap BrinRevmap;
+extern BrinRevmap *brinRevmapInitialize(Relation idxrel,
+ BlockNumber *pagesPerRange);
+extern void brinRevmapTerminate(BrinRevmap *revmap);
+extern void brinRevmapExtend(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+extern void brinSetHeapBlockItemptr(Buffer rmbuf, BlockNumber pagesPerRange,
+ BlockNumber heapBlk, ItemPointerData tid);
+extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap,
+ BlockNumber heapBlk, Buffer *buf, OffsetNumber *off,
+ Size *size, int mode);
+#endif /* BRIN_REVMAP_H */
--- /dev/null
+ * brin_tuple.h
+ * Declarations for dealing with BRIN-specific tuples.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_tuple.h
+ */
+#ifndef BRIN_TUPLE_H
+#define BRIN_TUPLE_H
+#include "access/brin_internal.h"
+#include "access/tupdesc.h"
+ * A BRIN index stores one index tuple per page range. Each index tuple
+ * has one BrinValues struct for each indexed column; in turn, each BrinValues
+ * has (besides the null flags) an array of Datum whose size is determined by
+ * the opclass.
+ */
+typedef struct BrinValues
+ AttrNumber bv_attno; /* index attribute number */
+ bool bv_hasnulls; /* are there any nulls in the page range? */
+ bool bv_allnulls; /* are all values nulls in the page range? */
+ Datum *bv_values; /* current accumulated values */
+} BrinValues;
+ * This struct is used to represent an in-memory index tuple. The values can
+ * only be meaningfully decoded with an appropriate BrinDesc.
+ */
+typedef struct BrinMemTuple
+ bool bt_placeholder; /* this is a placeholder tuple */
+ BlockNumber bt_blkno; /* heap blkno that the tuple is for */
+ MemoryContext bt_context; /* memcxt holding the bt_columns values */
+ BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER]; /* one entry per indexed column */
+} BrinMemTuple;
+ * An on-disk BRIN tuple. This is possibly followed by a nulls bitmask, with
+ * room for two null bits for each indexed column; an opclass-defined
+ * number of Datum values for each column follows.
+ */
+typedef struct BrinTuple
+ /* heap block number that the tuple is for */
+ BlockNumber bt_blkno;
+ /* ---------------
+ * bt_info is laid out in the following fashion:
+ *
+ * 7th (high) bit: has nulls
+ * 6th bit: is placeholder tuple
+ * 5th bit: unused
+ * 4-0 bit: offset of data
+ * ---------------
+ */
+ uint8 bt_info;
+} BrinTuple;
+#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8))
+ * bt_info manipulation macros
+ */
+/* low five bits of bt_info: offset of the tuple's data */
+#define BRIN_OFFSET_MASK 0x1F
+/* bit 0x20 is not used at present */
+/* 6th bit of bt_info: the tuple is a placeholder */
+#define BRIN_PLACEHOLDER_MASK 0x40
+/* 7th (high) bit of bt_info: the tuple has a nulls bitmask */
+#define BRIN_NULLS_MASK 0x80
+#define BrinTupleDataOffset(tup) ((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK))
+#define BrinTupleHasNulls(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0)
+#define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0)
+extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,
+ BrinMemTuple *tuple, Size *size);
+extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc,
+ BlockNumber blkno, Size *size);
+extern void brin_free_tuple(BrinTuple *tuple);
+extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len);
+extern bool brin_tuples_equal(const BrinTuple *a, Size alen,
+ const BrinTuple *b, Size blen);
+extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc);
+extern void brin_memtuple_initialize(BrinMemTuple *dtuple,
+ BrinDesc *brdesc);
+extern BrinMemTuple *brin_deform_tuple(BrinDesc *brdesc,
+ BrinTuple *tuple);
+#endif /* BRIN_TUPLE_H */
--- /dev/null
+ *
+ * brin_xlog.h
+ * POSTGRES BRIN access XLOG definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_xlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BRIN_XLOG_H
+#define BRIN_XLOG_H
+#include "access/xlogrecord.h"
+#include "lib/stringinfo.h"
+#include "storage/bufpage.h"
+#include "storage/itemptr.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+ * WAL record definitions for BRIN's WAL operations
+ *
+ * XLOG allows some information to be stored in the high 4 bits of the log
+ * record's xl_info field.
+ */
+#define XLOG_BRIN_INSERT 0x10
+#define XLOG_BRIN_UPDATE 0x20
+#define XLOG_BRIN_OPMASK 0x70
+ * When we insert the first item on a new page, we restore the entire page in
+ * redo.
+ */
+#define XLOG_BRIN_INIT_PAGE 0x80
+/* This is what we need to know about a BRIN index create */
+typedef struct xl_brin_createidx
+ BlockNumber pagesPerRange;
+ RelFileNode node;
+ uint16 version;
+} xl_brin_createidx;
+#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16))
+ * This is what we need to know about a BRIN tuple insert
+ */
+typedef struct xl_brin_insert
+ RelFileNode node;
+ BlockNumber heapBlk;
+ /* extra information needed to update the revmap */
+ BlockNumber revmapBlk;
+ BlockNumber pagesPerRange;
+ uint16 tuplen;
+ ItemPointerData tid;
+ /* tuple data follows at end of struct */
+} xl_brin_insert;
+#define SizeOfBrinInsert (offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData))
+ * A cross-page update is the same as an insert, but also stores the old tid.
+ */
+typedef struct xl_brin_update
+ ItemPointerData oldtid;
+ xl_brin_insert new;
+} xl_brin_update;
+#define SizeOfBrinUpdate (offsetof(xl_brin_update, new) + SizeOfBrinInsert)
+/* This is what we need to know about a BRIN tuple samepage update */
+typedef struct xl_brin_samepage_update
+ RelFileNode node;
+ ItemPointerData tid;
+ /* tuple data follows at end of struct */
+} xl_brin_samepage_update;
+#define SizeOfBrinSamepageUpdate (offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData))
+/* This is what we need to know about a revmap extension */
+typedef struct xl_brin_revmap_extend
+ RelFileNode node;
+ BlockNumber targetBlk;
+} xl_brin_revmap_extend;
+#define SizeOfBrinRevmapExtend (offsetof(xl_brin_revmap_extend, targetBlk) + \
+ sizeof(BlockNumber))
+extern void brin_desc(StringInfo buf, XLogRecord *record);
+extern void brin_redo(XLogRecPtr lsn, XLogRecord *record);
+extern const char *brin_identify(uint8 info);
+#endif /* BRIN_XLOG_H */
bool allow_strat, bool allow_sync);
extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot,
int nkeys, ScanKey key);
+extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk,
+ BlockNumber endBlk);
extern void heap_rescan(HeapScanDesc scan, ScanKey key);
extern void heap_endscan(HeapScanDesc scan);
extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
RELOPT_KIND_VIEW = (1 << 9),
+ RELOPT_KIND_BRIN = (1 << 10),
/* if you add a new kind, make sure you update "last_default" too */
/* some compilers treat enums as signed ints, so we can't use 1 << 31 */
RELOPT_KIND_MAX = (1 << 30)
} relopt_kind;
bool rs_temp_snap; /* unregister snapshot at scan end? */
/* state set up at initscan time */
- BlockNumber rs_nblocks; /* number of blocks to scan */
+ BlockNumber rs_nblocks; /* total number of blocks in rel */
BlockNumber rs_startblock; /* block # to start at */
+ BlockNumber rs_initblock; /* block # to consider initial of rel */
+ BlockNumber rs_numblocks; /* number of blocks to scan */
BufferAccessStrategy rs_strategy; /* access strategy for reads */
bool rs_syncscan; /* report location to syncscan logic? */
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
+PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)
/* yyyymmddN */
-#define CATALOG_VERSION_NO 201411041
+#define CATALOG_VERSION_NO 201411071
bool allow_sync,
IndexBuildCallback callback,
void *callback_state);
+extern double IndexBuildHeapRangeScan(Relation heapRelation,
+ Relation indexRelation,
+ IndexInfo *indexInfo,
+ bool allow_sync,
+ BlockNumber start_blockno,
+ BlockNumber end_blockno,
+ IndexBuildCallback callback,
+ void *callback_state);
extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
DESCR("SP-GiST index access method");
#define SPGIST_AM_OID 4000
+DATA(insert OID = 3580 ( brin 5 14 f f f f t t f t t f f 0 brininsert brinbeginscan - bringetbitmap brinrescan brinendscan brinmarkpos brinrestrpos brinbuild brinbuildempty brinbulkdelete brinvacuumcleanup - brincostestimate brinoptions ));
+DESCR("block range index (BRIN) access method");
+#define BRIN_AM_OID 3580
#endif /* PG_AM_H */
DATA(insert ( 3550 869 869 26 s 933 783 0 ));
DATA(insert ( 3550 869 869 27 s 934 783 0 ));
+/* BRIN opclasses */
+/* minmax bytea */
+DATA(insert ( 4064 17 17 1 s 1957 3580 0 ));
+DATA(insert ( 4064 17 17 2 s 1958 3580 0 ));
+DATA(insert ( 4064 17 17 3 s 1955 3580 0 ));
+DATA(insert ( 4064 17 17 4 s 1960 3580 0 ));
+DATA(insert ( 4064 17 17 5 s 1959 3580 0 ));
+/* minmax "char" */
+DATA(insert ( 4062 18 18 1 s 631 3580 0 ));
+DATA(insert ( 4062 18 18 2 s 632 3580 0 ));
+DATA(insert ( 4062 18 18 3 s 92 3580 0 ));
+DATA(insert ( 4062 18 18 4 s 634 3580 0 ));
+DATA(insert ( 4062 18 18 5 s 633 3580 0 ));
+/* minmax name */
+DATA(insert ( 4065 19 19 1 s 660 3580 0 ));
+DATA(insert ( 4065 19 19 2 s 661 3580 0 ));
+DATA(insert ( 4065 19 19 3 s 93 3580 0 ));
+DATA(insert ( 4065 19 19 4 s 663 3580 0 ));
+DATA(insert ( 4065 19 19 5 s 662 3580 0 ));
+/* minmax bigint */
+DATA(insert ( 4063 20 20 1 s 412 3580 0 ));
+DATA(insert ( 4063 20 20 2 s 414 3580 0 ));
+DATA(insert ( 4063 20 20 3 s 410 3580 0 ));
+DATA(insert ( 4063 20 20 4 s 415 3580 0 ));
+DATA(insert ( 4063 20 20 5 s 413 3580 0 ));
+/* minmax smallint */
+DATA(insert ( 4067 21 21 1 s 95 3580 0 ));
+DATA(insert ( 4067 21 21 2 s 522 3580 0 ));
+DATA(insert ( 4067 21 21 3 s 94 3580 0 ));
+DATA(insert ( 4067 21 21 4 s 524 3580 0 ));
+DATA(insert ( 4067 21 21 5 s 520 3580 0 ));
+/* minmax integer */
+DATA(insert ( 4054 23 23 1 s 97 3580 0 ));
+DATA(insert ( 4054 23 23 2 s 523 3580 0 ));
+DATA(insert ( 4054 23 23 3 s 96 3580 0 ));
+DATA(insert ( 4054 23 23 4 s 525 3580 0 ));
+DATA(insert ( 4054 23 23 5 s 521 3580 0 ));
+/* minmax text */
+DATA(insert ( 4056 25 25 1 s 664 3580 0 ));
+DATA(insert ( 4056 25 25 2 s 665 3580 0 ));
+DATA(insert ( 4056 25 25 3 s 98 3580 0 ));
+DATA(insert ( 4056 25 25 4 s 667 3580 0 ));
+DATA(insert ( 4056 25 25 5 s 666 3580 0 ));
+/* minmax oid */
+DATA(insert ( 4068 26 26 1 s 609 3580 0 ));
+DATA(insert ( 4068 26 26 2 s 611 3580 0 ));
+DATA(insert ( 4068 26 26 3 s 607 3580 0 ));
+DATA(insert ( 4068 26 26 4 s 612 3580 0 ));
+DATA(insert ( 4068 26 26 5 s 610 3580 0 ));
+/* minmax tid */
+DATA(insert ( 4069 27 27 1 s 2799 3580 0 ));
+DATA(insert ( 4069 27 27 2 s 2801 3580 0 ));
+DATA(insert ( 4069 27 27 3 s 387 3580 0 ));
+DATA(insert ( 4069 27 27 4 s 2802 3580 0 ));
+DATA(insert ( 4069 27 27 5 s 2800 3580 0 ));
+/* minmax real */
+DATA(insert ( 4070 700 700 1 s 622 3580 0 ));
+DATA(insert ( 4070 700 700 2 s 624 3580 0 ));
+DATA(insert ( 4070 700 700 3 s 620 3580 0 ));
+DATA(insert ( 4070 700 700 4 s 625 3580 0 ));
+DATA(insert ( 4070 700 700 5 s 623 3580 0 ));
+/* minmax double precision */
+DATA(insert ( 4071 701 701 1 s 672 3580 0 ));
+DATA(insert ( 4071 701 701 2 s 673 3580 0 ));
+DATA(insert ( 4071 701 701 3 s 670 3580 0 ));
+DATA(insert ( 4071 701 701 4 s 675 3580 0 ));
+DATA(insert ( 4071 701 701 5 s 674 3580 0 ));
+/* minmax abstime */
+DATA(insert ( 4072 702 702 1 s 562 3580 0 ));
+DATA(insert ( 4072 702 702 2 s 564 3580 0 ));
+DATA(insert ( 4072 702 702 3 s 560 3580 0 ));
+DATA(insert ( 4072 702 702 4 s 565 3580 0 ));
+DATA(insert ( 4072 702 702 5 s 563 3580 0 ));
+/* minmax reltime */
+DATA(insert ( 4073 703 703 1 s 568 3580 0 ));
+DATA(insert ( 4073 703 703 2 s 570 3580 0 ));
+DATA(insert ( 4073 703 703 3 s 566 3580 0 ));
+DATA(insert ( 4073 703 703 4 s 571 3580 0 ));
+DATA(insert ( 4073 703 703 5 s 569 3580 0 ));
+/* minmax macaddr */
+DATA(insert ( 4074 829 829 1 s 1222 3580 0 ));
+DATA(insert ( 4074 829 829 2 s 1223 3580 0 ));
+DATA(insert ( 4074 829 829 3 s 1220 3580 0 ));
+DATA(insert ( 4074 829 829 4 s 1225 3580 0 ));
+DATA(insert ( 4074 829 829 5 s 1224 3580 0 ));
+/* minmax inet */
+DATA(insert ( 4075 869 869 1 s 1203 3580 0 ));
+DATA(insert ( 4075 869 869 2 s 1204 3580 0 ));
+DATA(insert ( 4075 869 869 3 s 1201 3580 0 ));
+DATA(insert ( 4075 869 869 4 s 1206 3580 0 ));
+DATA(insert ( 4075 869 869 5 s 1205 3580 0 ));
+/* minmax character */
+DATA(insert ( 4076 1042 1042 1 s 1058 3580 0 ));
+DATA(insert ( 4076 1042 1042 2 s 1059 3580 0 ));
+DATA(insert ( 4076 1042 1042 3 s 1054 3580 0 ));
+DATA(insert ( 4076 1042 1042 4 s 1061 3580 0 ));
+DATA(insert ( 4076 1042 1042 5 s 1060 3580 0 ));
+/* minmax date */
+DATA(insert ( 4061 1082 1082 1 s 1095 3580 0 ));
+DATA(insert ( 4061 1082 1082 2 s 1096 3580 0 ));
+DATA(insert ( 4061 1082 1082 3 s 1093 3580 0 ));
+DATA(insert ( 4061 1082 1082 4 s 1098 3580 0 ));
+DATA(insert ( 4061 1082 1082 5 s 1097 3580 0 ));
+/* minmax time without time zone */
+DATA(insert ( 4077 1083 1083 1 s 1110 3580 0 ));
+DATA(insert ( 4077 1083 1083 2 s 1111 3580 0 ));
+DATA(insert ( 4077 1083 1083 3 s 1108 3580 0 ));
+DATA(insert ( 4077 1083 1083 4 s 1113 3580 0 ));
+DATA(insert ( 4077 1083 1083 5 s 1112 3580 0 ));
+/* minmax timestamp without time zone */
+DATA(insert ( 4059 1114 1114 1 s 2062 3580 0 ));
+DATA(insert ( 4059 1114 1114 2 s 2063 3580 0 ));
+DATA(insert ( 4059 1114 1114 3 s 2060 3580 0 ));
+DATA(insert ( 4059 1114 1114 4 s 2065 3580 0 ));
+DATA(insert ( 4059 1114 1114 5 s 2064 3580 0 ));
+/* minmax timestamp with time zone */
+DATA(insert ( 4060 1184 1184 1 s 1322 3580 0 ));
+DATA(insert ( 4060 1184 1184 2 s 1323 3580 0 ));
+DATA(insert ( 4060 1184 1184 3 s 1320 3580 0 ));
+DATA(insert ( 4060 1184 1184 4 s 1325 3580 0 ));
+DATA(insert ( 4060 1184 1184 5 s 1324 3580 0 ));
+/* minmax interval */
+DATA(insert ( 4078 1186 1186 1 s 1332 3580 0 ));
+DATA(insert ( 4078 1186 1186 2 s 1333 3580 0 ));
+DATA(insert ( 4078 1186 1186 3 s 1330 3580 0 ));
+DATA(insert ( 4078 1186 1186 4 s 1335 3580 0 ));
+DATA(insert ( 4078 1186 1186 5 s 1334 3580 0 ));
+/* minmax time with time zone */
+DATA(insert ( 4058 1266 1266 1 s 1552 3580 0 ));
+DATA(insert ( 4058 1266 1266 2 s 1553 3580 0 ));
+DATA(insert ( 4058 1266 1266 3 s 1550 3580 0 ));
+DATA(insert ( 4058 1266 1266 4 s 1555 3580 0 ));
+DATA(insert ( 4058 1266 1266 5 s 1554 3580 0 ));
+/* minmax bit */
+DATA(insert ( 4079 1560 1560 1 s 1786 3580 0 ));
+DATA(insert ( 4079 1560 1560 2 s 1788 3580 0 ));
+DATA(insert ( 4079 1560 1560 3 s 1784 3580 0 ));
+DATA(insert ( 4079 1560 1560 4 s 1789 3580 0 ));
+DATA(insert ( 4079 1560 1560 5 s 1787 3580 0 ));
+/* minmax bit varying */
+DATA(insert ( 4080 1562 1562 1 s 1806 3580 0 ));
+DATA(insert ( 4080 1562 1562 2 s 1808 3580 0 ));
+DATA(insert ( 4080 1562 1562 3 s 1804 3580 0 ));
+DATA(insert ( 4080 1562 1562 4 s 1809 3580 0 ));
+DATA(insert ( 4080 1562 1562 5 s 1807 3580 0 ));
+/* minmax numeric */
+DATA(insert ( 4055 1700 1700 1 s 1754 3580 0 ));
+DATA(insert ( 4055 1700 1700 2 s 1755 3580 0 ));
+DATA(insert ( 4055 1700 1700 3 s 1752 3580 0 ));
+DATA(insert ( 4055 1700 1700 4 s 1757 3580 0 ));
+DATA(insert ( 4055 1700 1700 5 s 1756 3580 0 ));
+/* minmax uuid */
+DATA(insert ( 4081 2950 2950 1 s 2974 3580 0 ));
+DATA(insert ( 4081 2950 2950 2 s 2976 3580 0 ));
+DATA(insert ( 4081 2950 2950 3 s 2972 3580 0 ));
+DATA(insert ( 4081 2950 2950 4 s 2977 3580 0 ));
+DATA(insert ( 4081 2950 2950 5 s 2975 3580 0 ));
+/* minmax pg_lsn */
+DATA(insert ( 4082 3220 3220 1 s 3224 3580 0 ));
+DATA(insert ( 4082 3220 3220 2 s 3226 3580 0 ));
+DATA(insert ( 4082 3220 3220 3 s 3222 3580 0 ));
+DATA(insert ( 4082 3220 3220 4 s 3227 3580 0 ));
+DATA(insert ( 4082 3220 3220 5 s 3225 3580 0 ));
#endif /* PG_AMOP_H */
DATA(insert ( 4017 25 25 4 4030 ));
DATA(insert ( 4017 25 25 5 4031 ));
+/* BRIN opclasses */
+/* minmax bytea */
+DATA(insert ( 4064 17 17 1 3383 ));
+DATA(insert ( 4064 17 17 2 3384 ));
+DATA(insert ( 4064 17 17 3 3385 ));
+DATA(insert ( 4064 17 17 4 3386 ));
+DATA(insert ( 4064 17 17 11 1949 ));
+DATA(insert ( 4064 17 17 12 1950 ));
+DATA(insert ( 4064 17 17 13 1952 ));
+DATA(insert ( 4064 17 17 14 1951 ));
+/* minmax "char" */
+DATA(insert ( 4062 18 18 1 3383 ));
+DATA(insert ( 4062 18 18 2 3384 ));
+DATA(insert ( 4062 18 18 3 3385 ));
+DATA(insert ( 4062 18 18 4 3386 ));
+DATA(insert ( 4062 18 18 11 1246 ));
+DATA(insert ( 4062 18 18 12 72 ));
+DATA(insert ( 4062 18 18 13 74 ));
+DATA(insert ( 4062 18 18 14 73 ));
+/* minmax name */
+DATA(insert ( 4065 19 19 1 3383 ));
+DATA(insert ( 4065 19 19 2 3384 ));
+DATA(insert ( 4065 19 19 3 3385 ));
+DATA(insert ( 4065 19 19 4 3386 ));
+DATA(insert ( 4065 19 19 11 655 ));
+DATA(insert ( 4065 19 19 12 656 ));
+DATA(insert ( 4065 19 19 13 658 ));
+DATA(insert ( 4065 19 19 14 657 ));
+/* minmax bigint */
+DATA(insert ( 4063 20 20 1 3383 ));
+DATA(insert ( 4063 20 20 2 3384 ));
+DATA(insert ( 4063 20 20 3 3385 ));
+DATA(insert ( 4063 20 20 4 3386 ));
+DATA(insert ( 4063 20 20 11 469 ));
+DATA(insert ( 4063 20 20 12 471 ));
+DATA(insert ( 4063 20 20 13 472 ));
+DATA(insert ( 4063 20 20 14 470 ));
+/* minmax smallint */
+DATA(insert ( 4067 21 21 1 3383 ));
+DATA(insert ( 4067 21 21 2 3384 ));
+DATA(insert ( 4067 21 21 3 3385 ));
+DATA(insert ( 4067 21 21 4 3386 ));
+DATA(insert ( 4067 21 21 11 64 ));
+DATA(insert ( 4067 21 21 12 148 ));
+DATA(insert ( 4067 21 21 13 151 ));
+DATA(insert ( 4067 21 21 14 146 ));
+/* minmax integer */
+DATA(insert ( 4054 23 23 1 3383 ));
+DATA(insert ( 4054 23 23 2 3384 ));
+DATA(insert ( 4054 23 23 3 3385 ));
+DATA(insert ( 4054 23 23 4 3386 ));
+DATA(insert ( 4054 23 23 11 66 ));
+DATA(insert ( 4054 23 23 12 149 ));
+DATA(insert ( 4054 23 23 13 150 ));
+DATA(insert ( 4054 23 23 14 147 ));
+/* minmax text */
+DATA(insert ( 4056 25 25 1 3383 ));
+DATA(insert ( 4056 25 25 2 3384 ));
+DATA(insert ( 4056 25 25 3 3385 ));
+DATA(insert ( 4056 25 25 4 3386 ));
+DATA(insert ( 4056 25 25 11 740 ));
+DATA(insert ( 4056 25 25 12 741 ));
+DATA(insert ( 4056 25 25 13 743 ));
+DATA(insert ( 4056 25 25 14 742 ));
+/* minmax oid */
+DATA(insert ( 4068 26 26 1 3383 ));
+DATA(insert ( 4068 26 26 2 3384 ));
+DATA(insert ( 4068 26 26 3 3385 ));
+DATA(insert ( 4068 26 26 4 3386 ));
+DATA(insert ( 4068 26 26 11 716 ));
+DATA(insert ( 4068 26 26 12 717 ));
+DATA(insert ( 4068 26 26 13 1639 ));
+DATA(insert ( 4068 26 26 14 1638 ));
+/* minmax tid */
+DATA(insert ( 4069 27 27 1 3383 ));
+DATA(insert ( 4069 27 27 2 3384 ));
+DATA(insert ( 4069 27 27 3 3385 ));
+DATA(insert ( 4069 27 27 4 3386 ));
+DATA(insert ( 4069 27 27 11 2791 ));
+DATA(insert ( 4069 27 27 12 2793 ));
+DATA(insert ( 4069 27 27 13 2792 ));
+DATA(insert ( 4069 27 27 14 2790 ));
+/* minmax real */
+DATA(insert ( 4070 700 700 1 3383 ));
+DATA(insert ( 4070 700 700 2 3384 ));
+DATA(insert ( 4070 700 700 3 3385 ));
+DATA(insert ( 4070 700 700 4 3386 ));
+DATA(insert ( 4070 700 700 11 289 ));
+DATA(insert ( 4070 700 700 12 290 ));
+DATA(insert ( 4070 700 700 13 292 ));
+DATA(insert ( 4070 700 700 14 291 ));
+/* minmax double precision */
+DATA(insert ( 4071 701 701 1 3383 ));
+DATA(insert ( 4071 701 701 2 3384 ));
+DATA(insert ( 4071 701 701 3 3385 ));
+DATA(insert ( 4071 701 701 4 3386 ));
+DATA(insert ( 4071 701 701 11 295 ));
+DATA(insert ( 4071 701 701 12 296 ));
+DATA(insert ( 4071 701 701 13 298 ));
+DATA(insert ( 4071 701 701 14 297 ));
+/* minmax abstime */
+DATA(insert ( 4072 702 702 1 3383 ));
+DATA(insert ( 4072 702 702 2 3384 ));
+DATA(insert ( 4072 702 702 3 3385 ));
+DATA(insert ( 4072 702 702 4 3386 ));
+DATA(insert ( 4072 702 702 11 253 ));
+DATA(insert ( 4072 702 702 12 255 ));
+DATA(insert ( 4072 702 702 13 256 ));
+DATA(insert ( 4072 702 702 14 254 ));
+/* minmax reltime */
+DATA(insert ( 4073 703 703 1 3383 ));
+DATA(insert ( 4073 703 703 2 3384 ));
+DATA(insert ( 4073 703 703 3 3385 ));
+DATA(insert ( 4073 703 703 4 3386 ));
+DATA(insert ( 4073 703 703 11 259 ));
+DATA(insert ( 4073 703 703 12 261 ));
+DATA(insert ( 4073 703 703 13 262 ));
+DATA(insert ( 4073 703 703 14 260 ));
+/* minmax macaddr */
+DATA(insert ( 4074 829 829 1 3383 ));
+DATA(insert ( 4074 829 829 2 3384 ));
+DATA(insert ( 4074 829 829 3 3385 ));
+DATA(insert ( 4074 829 829 4 3386 ));
+DATA(insert ( 4074 829 829 11 831 ));
+DATA(insert ( 4074 829 829 12 832 ));
+DATA(insert ( 4074 829 829 13 834 ));
+DATA(insert ( 4074 829 829 14 833 ));
+/* minmax inet */
+DATA(insert ( 4075 869 869 1 3383 ));
+DATA(insert ( 4075 869 869 2 3384 ));
+DATA(insert ( 4075 869 869 3 3385 ));
+DATA(insert ( 4075 869 869 4 3386 ));
+DATA(insert ( 4075 869 869 11 921 ));
+DATA(insert ( 4075 869 869 12 922 ));
+DATA(insert ( 4075 869 869 13 924 ));
+DATA(insert ( 4075 869 869 14 923 ));
+/* minmax character */
+DATA(insert ( 4076 1042 1042 1 3383 ));
+DATA(insert ( 4076 1042 1042 2 3384 ));
+DATA(insert ( 4076 1042 1042 3 3385 ));
+DATA(insert ( 4076 1042 1042 4 3386 ));
+DATA(insert ( 4076 1042 1042 11 1049 ));
+DATA(insert ( 4076 1042 1042 12 1050 ));
+DATA(insert ( 4076 1042 1042 13 1052 ));
+DATA(insert ( 4076 1042 1042 14 1051 ));
+/* minmax date */
+DATA(insert ( 4061 1082 1082 1 3383 ));
+DATA(insert ( 4061 1082 1082 2 3384 ));
+DATA(insert ( 4061 1082 1082 3 3385 ));
+DATA(insert ( 4061 1082 1082 4 3386 ));
+DATA(insert ( 4061 1082 1082 11 1087 ));
+DATA(insert ( 4061 1082 1082 12 1088 ));
+DATA(insert ( 4061 1082 1082 13 1090 ));
+DATA(insert ( 4061 1082 1082 14 1089 ));
+/* minmax time without time zone */
+DATA(insert ( 4077 1083 1083 1 3383 ));
+DATA(insert ( 4077 1083 1083 2 3384 ));
+DATA(insert ( 4077 1083 1083 3 3385 ));
+DATA(insert ( 4077 1083 1083 4 3386 ));
+DATA(insert ( 4077 1083 1083 11 1102 ));
+DATA(insert ( 4077 1083 1083 12 1103 ));
+DATA(insert ( 4077 1083 1083 13 1105 ));
+DATA(insert ( 4077 1083 1083 14 1104 ));
+/* minmax timestamp without time zone */
+DATA(insert ( 4059 1114 1114 1 3383 ));
+DATA(insert ( 4059 1114 1114 2 3384 ));
+DATA(insert ( 4059 1114 1114 3 3385 ));
+DATA(insert ( 4059 1114 1114 4 3386 ));
+DATA(insert ( 4059 1114 1114 11 2054 ));
+DATA(insert ( 4059 1114 1114 12 2055 ));
+DATA(insert ( 4059 1114 1114 13 2056 ));
+DATA(insert ( 4059 1114 1114 14 2057 ));
+/* minmax timestamp with time zone */
+DATA(insert ( 4060 1184 1184 1 3383 ));
+DATA(insert ( 4060 1184 1184 2 3384 ));
+DATA(insert ( 4060 1184 1184 3 3385 ));
+DATA(insert ( 4060 1184 1184 4 3386 ));
+DATA(insert ( 4060 1184 1184 11 1154 ));
+DATA(insert ( 4060 1184 1184 12 1155 ));
+DATA(insert ( 4060 1184 1184 13 1156 ));
+DATA(insert ( 4060 1184 1184 14 1157 ));
+/* minmax interval */
+DATA(insert ( 4078 1186 1186 1 3383 ));
+DATA(insert ( 4078 1186 1186 2 3384 ));
+DATA(insert ( 4078 1186 1186 3 3385 ));
+DATA(insert ( 4078 1186 1186 4 3386 ));
+DATA(insert ( 4078 1186 1186 11 1164 ));
+DATA(insert ( 4078 1186 1186 12 1165 ));
+DATA(insert ( 4078 1186 1186 13 1166 ));
+DATA(insert ( 4078 1186 1186 14 1167 ));
+/* minmax time with time zone */
+DATA(insert ( 4058 1266 1266 1 3383 ));
+DATA(insert ( 4058 1266 1266 2 3384 ));
+DATA(insert ( 4058 1266 1266 3 3385 ));
+DATA(insert ( 4058 1266 1266 4 3386 ));
+DATA(insert ( 4058 1266 1266 11 1354 ));
+DATA(insert ( 4058 1266 1266 12 1355 ));
+DATA(insert ( 4058 1266 1266 13 1356 ));
+DATA(insert ( 4058 1266 1266 14 1357 ));
+/* minmax bit */
+DATA(insert ( 4079 1560 1560 1 3383 ));
+DATA(insert ( 4079 1560 1560 2 3384 ));
+DATA(insert ( 4079 1560 1560 3 3385 ));
+DATA(insert ( 4079 1560 1560 4 3386 ));
+DATA(insert ( 4079 1560 1560 11 1595 ));
+DATA(insert ( 4079 1560 1560 12 1594 ));
+DATA(insert ( 4079 1560 1560 13 1592 ));
+DATA(insert ( 4079 1560 1560 14 1593 ));
+/* minmax bit varying */
+DATA(insert ( 4080 1562 1562 1 3383 ));
+DATA(insert ( 4080 1562 1562 2 3384 ));
+DATA(insert ( 4080 1562 1562 3 3385 ));
+DATA(insert ( 4080 1562 1562 4 3386 ));
+DATA(insert ( 4080 1562 1562 11 1671 ));
+DATA(insert ( 4080 1562 1562 12 1670 ));
+DATA(insert ( 4080 1562 1562 13 1668 ));
+DATA(insert ( 4080 1562 1562 14 1669 ));
+/* minmax numeric */
+DATA(insert ( 4055 1700 1700 1 3383 ));
+DATA(insert ( 4055 1700 1700 2 3384 ));
+DATA(insert ( 4055 1700 1700 3 3385 ));
+DATA(insert ( 4055 1700 1700 4 3386 ));
+DATA(insert ( 4055 1700 1700 11 1722 ));
+DATA(insert ( 4055 1700 1700 12 1723 ));
+DATA(insert ( 4055 1700 1700 13 1721 ));
+DATA(insert ( 4055 1700 1700 14 1720 ));
+/* minmax uuid */
+DATA(insert ( 4081 2950 2950 1 3383 ));
+DATA(insert ( 4081 2950 2950 2 3384 ));
+DATA(insert ( 4081 2950 2950 3 3385 ));
+DATA(insert ( 4081 2950 2950 4 3386 ));
+DATA(insert ( 4081 2950 2950 11 2954 ));
+DATA(insert ( 4081 2950 2950 12 2955 ));
+DATA(insert ( 4081 2950 2950 13 2957 ));
+DATA(insert ( 4081 2950 2950 14 2958 ));
+/* minmax pg_lsn */
+DATA(insert ( 4082 3220 3220 1 3383 ));
+DATA(insert ( 4082 3220 3220 2 3384 ));
+DATA(insert ( 4082 3220 3220 3 3385 ));
+DATA(insert ( 4082 3220 3220 4 3386 ));
+DATA(insert ( 4082 3220 3220 11 3231 ));
+DATA(insert ( 4082 3220 3220 12 3232 ));
+DATA(insert ( 4082 3220 3220 13 3234 ));
+DATA(insert ( 4082 3220 3220 14 3235 ));
#endif /* PG_AMPROC_H */
DATA(insert ( 2742 jsonb_ops PGNSP PGUID 4036 3802 t 25 ));
DATA(insert ( 2742 jsonb_path_ops PGNSP PGUID 4037 3802 f 23 ));
+/* BRIN operator classes */
+/* no brin opclass for bool */
+DATA(insert ( 3580 bytea_minmax_ops PGNSP PGUID 4064 17 t 0 ));
+DATA(insert ( 3580 char_minmax_ops PGNSP PGUID 4062 18 t 0 ));
+DATA(insert ( 3580 name_minmax_ops PGNSP PGUID 4065 19 t 0 ));
+DATA(insert ( 3580 int8_minmax_ops PGNSP PGUID 4063 20 t 0 ));
+DATA(insert ( 3580 int2_minmax_ops PGNSP PGUID 4067 21 t 0 ));
+DATA(insert ( 3580 int4_minmax_ops PGNSP PGUID 4054 23 t 0 ));
+DATA(insert ( 3580 text_minmax_ops PGNSP PGUID 4056 25 t 0 ));
+DATA(insert ( 3580 oid_minmax_ops PGNSP PGUID 4068 26 t 0 ));
+DATA(insert ( 3580 tid_minmax_ops PGNSP PGUID 4069 27 t 0 ));
+DATA(insert ( 3580 float4_minmax_ops PGNSP PGUID 4070 700 t 0 ));
+DATA(insert ( 3580 float8_minmax_ops PGNSP PGUID 4071 701 t 0 ));
+DATA(insert ( 3580 abstime_minmax_ops PGNSP PGUID 4072 702 t 0 ));
+DATA(insert ( 3580 reltime_minmax_ops PGNSP PGUID 4073 703 t 0 ));
+DATA(insert ( 3580 macaddr_minmax_ops PGNSP PGUID 4074 829 t 0 ));
+DATA(insert ( 3580 inet_minmax_ops PGNSP PGUID 4075 869 t 0 ));
+DATA(insert ( 3580 bpchar_minmax_ops PGNSP PGUID 4076 1042 t 0 ));
+DATA(insert ( 3580 date_minmax_ops PGNSP PGUID 4061 1082 t 0 ));
+DATA(insert ( 3580 time_minmax_ops PGNSP PGUID 4077 1083 t 0 ));
+DATA(insert ( 3580 timestamp_minmax_ops PGNSP PGUID 4059 1114 t 0 ));
+DATA(insert ( 3580 timestamptz_minmax_ops PGNSP PGUID 4060 1184 t 0 ));
+DATA(insert ( 3580 interval_minmax_ops PGNSP PGUID 4078 1186 t 0 ));
+DATA(insert ( 3580 timetz_minmax_ops PGNSP PGUID 4058 1266 t 0 ));
+DATA(insert ( 3580 bit_minmax_ops PGNSP PGUID 4079 1560 t 0 ));
+DATA(insert ( 3580 varbit_minmax_ops PGNSP PGUID 4080 1562 t 0 ));
+DATA(insert ( 3580 numeric_minmax_ops PGNSP PGUID 4055 1700 t 0 ));
+/* no brin opclass for record, anyarray */
+DATA(insert ( 3580 uuid_minmax_ops PGNSP PGUID 4081 2950 t 0 ));
+DATA(insert ( 3580 pg_lsn_minmax_ops PGNSP PGUID 4082 3220 t 0 ));
+/* no brin opclass for enum, tsvector, tsquery, jsonb, range */
#endif /* PG_OPCLASS_H */
DATA(insert OID = 4036 ( 2742 jsonb_ops PGNSP PGUID ));
DATA(insert OID = 4037 ( 2742 jsonb_path_ops PGNSP PGUID ));
+DATA(insert OID = 4054 ( 3580 int4_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4055 ( 3580 numeric_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4056 ( 3580 text_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4058 ( 3580 timetz_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4059 ( 3580 timestamp_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4060 ( 3580 timestamptz_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4061 ( 3580 date_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4062 ( 3580 char_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4063 ( 3580 int8_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4064 ( 3580 bytea_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4065 ( 3580 name_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4067 ( 3580 int2_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4068 ( 3580 oid_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4069 ( 3580 tid_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4070 ( 3580 float4_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4071 ( 3580 float8_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4072 ( 3580 abstime_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4073 ( 3580 reltime_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4074 ( 3580 macaddr_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4075 ( 3580 inet_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4076 ( 3580 bpchar_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4077 ( 3580 time_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4078 ( 3580 interval_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4079 ( 3580 bit_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4080 ( 3580 varbit_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4081 ( 3580 uuid_minmax_ops PGNSP PGUID ));
+DATA(insert OID = 4082 ( 3580 pg_lsn_minmax_ops PGNSP PGUID ));
#endif /* PG_OPFAMILY_H */
DATA(insert OID = 2785 ( btoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ btoptions _null_ _null_ _null_ ));
+DATA(insert OID = 3789 ( bringetbitmap PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_ bringetbitmap _null_ _null_ _null_ ));
+DATA(insert OID = 3790 ( brininsert PGNSP PGUID 12 1 0 0 0 f f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brininsert _null_ _null_ _null_ ));
+DATA(insert OID = 3791 ( brinbeginscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ brinbeginscan _null_ _null_ _null_ ));
+DATA(insert OID = 3792 ( brinrescan PGNSP PGUID 12 1 0 0 0 f f f f t f v 5 0 2278 "2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinrescan _null_ _null_ _null_ ));
+DATA(insert OID = 3793 ( brinendscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinendscan _null_ _null_ _null_ ));
+DATA(insert OID = 3794 ( brinmarkpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinmarkpos _null_ _null_ _null_ ));
+DATA(insert OID = 3795 ( brinrestrpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinrestrpos _null_ _null_ _null_ ));
+DATA(insert OID = 3796 ( brinbuild PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ brinbuild _null_ _null_ _null_ ));
+DATA(insert OID = 3797 ( brinbuildempty PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinbuildempty _null_ _null_ _null_ ));
+DATA(insert OID = 3798 ( brinbulkdelete PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinbulkdelete _null_ _null_ _null_ ));
+DATA(insert OID = 3799 ( brinvacuumcleanup PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ brinvacuumcleanup _null_ _null_ _null_ ));
+DATA(insert OID = 3800 ( brincostestimate PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brincostestimate _null_ _null_ _null_ ));
+DATA(insert OID = 3801 ( brinoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ brinoptions _null_ _null_ _null_ ));
+DATA(insert OID = 3952 ( brin_summarize_new_values PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 23 "2205" _null_ _null_ _null_ _null_ brin_summarize_new_values _null_ _null_ _null_ ));
+DESCR("brin: standalone scan new table pages");
DATA(insert OID = 339 ( poly_same PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ ));
DATA(insert OID = 340 ( poly_contain PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ ));
DATA(insert OID = 341 ( poly_left PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_left _null_ _null_ _null_ ));
DATA(insert OID = 2748 ( arraycontains PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontains _null_ _null_ _null_ ));
DATA(insert OID = 2749 ( arraycontained PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontained _null_ _null_ _null_ ));
+/* BRIN minmax */
+DATA(insert OID = 3383 ( brin_minmax_opcinfo PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2281 "2281" _null_ _null_ _null_ _null_ minmaxOpcInfo _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
+DATA(insert OID = 3384 ( brin_minmax_add_value PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 16 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ minmaxAddValue _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
+DATA(insert OID = 3385 ( brin_minmax_consistent PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxConsistent _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
+DATA(insert OID = 3386 ( brin_minmax_union PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxUnion _null_ _null_ _null_ ));
+DESCR("BRIN minmax support");
/* userlock replacements */
DATA(insert OID = 2880 ( pg_advisory_lock PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "20" _null_ _null_ _null_ _null_ pg_advisory_lock_int8 _null_ _null_ _null_ ));
DESCR("obtain exclusive advisory lock");
extern Size PageGetHeapFreeSpace(Page page);
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+extern void PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos,
+ int nitems);
extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey,
double nbuckets);
+extern Datum brincostestimate(PG_FUNCTION_ARGS);
extern Datum btcostestimate(PG_FUNCTION_ARGS);
extern Datum hashcostestimate(PG_FUNCTION_ARGS);
extern Datum gistcostestimate(PG_FUNCTION_ARGS);
--- /dev/null
+SET synchronous_commit = 0;
+CREATE TABLE brintest (byteacol bytea,
+ charcol "char",
+ namecol name,
+ int8col bigint,
+ int2col smallint,
+ int4col integer,
+ textcol text,
+ oidcol oid,
+ tidcol tid,
+ float4col real,
+ float8col double precision,
+ macaddrcol macaddr,
+ inetcol inet,
+ bpcharcol character,
+ datecol date,
+ timecol time without time zone,
+ timestampcol timestamp without time zone,
+ timestamptzcol timestamp with time zone,
+ intervalcol interval,
+ timetzcol time with time zone,
+ bitcol bit(10),
+ varbitcol bit varying(16),
+ numericcol numeric,
+ uuidcol uuid,
+ lsncol pg_lsn
+) WITH (fillfactor=50);
+-- Load one row per tenk1 row, deriving a value for every column type.
+INSERT INTO brintest SELECT
+ repeat(stringu1, 42)::bytea,
+ substr(stringu1, 1, 1)::"char",
+ stringu1::name, 142857 * tenthous,
+ thousand,
+ twothousand,
+ repeat(stringu1, 42),
+ unique1::oid,
+ format('(%s,%s)', tenthous, twenty)::tid,
+ (four + 1.0)/(hundred+1),
+ odd::float8 / (tenthous + 1),
+ format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+ inet '10.2.3.4' + tenthous,
+ substr(stringu1, 1, 1)::bpchar,
+ date '1995-08-15' + tenthous,
+ time '01:20:30' + thousand * interval '18.5 second',
+ timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+ timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+ justify_days(justify_hours(tenthous * interval '12 minutes')),
+ timetz '01:30:20' + hundred * interval '15 seconds',
+ thousand::bit(10),
+ tenthous::bit(16)::varbit,
+ tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+ format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+ format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+CREATE INDEX brinidx ON brintest USING brin (
+ byteacol,
+ charcol,
+ namecol,
+ int8col,
+ int2col,
+ int4col,
+ textcol,
+ oidcol,
+ tidcol,
+ float4col,
+ float8col,
+ macaddrcol,
+ inetcol,
+ bpcharcol,
+ datecol,
+ timecol,
+ timestampcol,
+ timestamptzcol,
+ intervalcol,
+ timetzcol,
+ bitcol,
+ varbitcol,
+ numericcol,
+ uuidcol,
+ lsncol
+) with (pages_per_range = 1);
+-- brinopers drives the index-vs-seqscan comparison: for each brintest column,
+-- op[i] is an operator to test and value[i] the constant to compare against.
+-- The CHECK constraint keeps the two arrays the same length.
+CREATE TABLE brinopers (colname name, op text[], value text[],
+ check (cardinality(op) = cardinality(value)));
+INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}');
+INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}');
+INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}');
+INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}');
+INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}');
+INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}');
+INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}');
+INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}');
+INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}');
+INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}');
+INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}');
+INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}');
+INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}');
+INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}');
+INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}');
+INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}');
+-- For every (column, operator, value) combination in brinopers, run the same
+-- qualification once with the BRIN index and once with a sequential scan,
+-- and raise an exception if the two result sets differ in either direction.
+DO $x$
+DECLARE
+ r record;
+ tabname text;
+ tabname_ss text;
+ count int;
+ query text;
+BEGIN
+ FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP
+ tabname := format('qry_%s', r.row_number);
+ tabname_ss := tabname || '_ss';
+ query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+ tabname, r.colname, r.oper, r.value);
+ -- run the query using the brin index
+ SET enable_seqscan = 0;
+ SET enable_bitmapscan = 1;
+ EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname);
+ EXECUTE query;
+ -- run the query using a seqscan
+ SET enable_seqscan = 1;
+ SET enable_bitmapscan = 0;
+ query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+ tabname_ss, r.colname, r.oper, r.value);
+ EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss);
+ EXECUTE query;
+ -- make sure both return the same results; EXECUTE discards the rows, so
+ -- the row count has to be fetched explicitly before it can be tested
+ EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss);
+ GET DIAGNOSTICS count = ROW_COUNT;
+ IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+ EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname);
+ GET DIAGNOSTICS count = ROW_COUNT;
+ IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+ end loop;
+END;
+$x$;
+-- Add a second batch of rows after the index exists, so the following
+-- brin_summarize_new_values call has new page ranges to summarize.
+INSERT INTO brintest SELECT
+ repeat(stringu1, 42)::bytea,
+ substr(stringu1, 1, 1)::"char",
+ stringu1::name, 142857 * tenthous,
+ thousand,
+ twothousand,
+ repeat(stringu1, 42),
+ unique1::oid,
+ format('(%s,%s)', tenthous, twenty)::tid,
+ (four + 1.0)/(hundred+1),
+ odd::float8 / (tenthous + 1),
+ format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+ inet '10.2.3.4' + tenthous,
+ substr(stringu1, 1, 1)::bpchar,
+ date '1995-08-15' + tenthous,
+ time '01:20:30' + thousand * interval '18.5 second',
+ timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+ timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+ justify_days(justify_hours(tenthous * interval '12 minutes')),
+ timetz '01:30:20' + hundred * interval '15 seconds',
+ thousand::bit(10),
+ tenthous::bit(16)::varbit,
+ tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+ format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+ format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+SELECT brin_summarize_new_values('brinidx'::regclass);
+ brin_summarize_new_values
+ 2000
+(1 row)
+UPDATE brintest SET int8col = int8col * int4col;
+SET synchronous_commit = 1;
2742 | 9 | ?
2742 | 10 | ?|
2742 | 11 | ?&
+ 3580 | 1 | <
+ 3580 | 2 | <=
+ 3580 | 3 | =
+ 3580 | 4 | >=
+ 3580 | 5 | >
4000 | 1 | <<
4000 | 1 | ~<~
4000 | 2 | &<
4000 | 15 | >
4000 | 16 | @>
4000 | 18 | =
-(80 rows)
+(85 rows)
-- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing
-- GIN has six support functions. 1-3 are mandatory, 5 is optional, and
-- at least one of 4 and 6 must be given.
-- SP-GiST has five support functions, all mandatory
+ -- BRIN has four mandatory support functions, and a bunch of optionals
amname = 'btree' AND procnums @> '{1}' OR
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
- amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+ amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+ amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
amname | opfname | amproclefttype | amprocrighttype | procnums
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
- amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+ amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+ amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
amname | opcname | procnums
+ brinopers
+ brintest
-(120 rows)
+(122 rows)
SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer')));
# ----------
# Another group of parallel tests
# ----------
-test: privileges security_label collate matview lock replica_identity rowsecurity
+test: brin privileges security_label collate matview lock replica_identity rowsecurity
# ----------
# Another group of parallel tests
+# brin creates brintest and brinopers; keep it ahead of misc, matching the
+# parallel schedule, where brin runs in an earlier group.
+test: brin
test: misc
test: psql
test: async
test: rules
test: event_trigger
test: select_views
--- /dev/null
+SET synchronous_commit = 0;
+CREATE TABLE brintest (byteacol bytea,
+ charcol "char",
+ namecol name,
+ int8col bigint,
+ int2col smallint,
+ int4col integer,
+ textcol text,
+ oidcol oid,
+ tidcol tid,
+ float4col real,
+ float8col double precision,
+ macaddrcol macaddr,
+ inetcol inet,
+ bpcharcol character,
+ datecol date,
+ timecol time without time zone,
+ timestampcol timestamp without time zone,
+ timestamptzcol timestamp with time zone,
+ intervalcol interval,
+ timetzcol time with time zone,
+ bitcol bit(10),
+ varbitcol bit varying(16),
+ numericcol numeric,
+ uuidcol uuid,
+ lsncol pg_lsn
+) WITH (fillfactor=50);
+-- Load one row per tenk1 row, deriving a value for every column type.
+INSERT INTO brintest SELECT
+ repeat(stringu1, 42)::bytea,
+ substr(stringu1, 1, 1)::"char",
+ stringu1::name, 142857 * tenthous,
+ thousand,
+ twothousand,
+ repeat(stringu1, 42),
+ unique1::oid,
+ format('(%s,%s)', tenthous, twenty)::tid,
+ (four + 1.0)/(hundred+1),
+ odd::float8 / (tenthous + 1),
+ format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+ inet '10.2.3.4' + tenthous,
+ substr(stringu1, 1, 1)::bpchar,
+ date '1995-08-15' + tenthous,
+ time '01:20:30' + thousand * interval '18.5 second',
+ timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+ timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+ justify_days(justify_hours(tenthous * interval '12 minutes')),
+ timetz '01:30:20' + hundred * interval '15 seconds',
+ thousand::bit(10),
+ tenthous::bit(16)::varbit,
+ tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+ format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+ format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+CREATE INDEX brinidx ON brintest USING brin (
+ byteacol,
+ charcol,
+ namecol,
+ int8col,
+ int2col,
+ int4col,
+ textcol,
+ oidcol,
+ tidcol,
+ float4col,
+ float8col,
+ macaddrcol,
+ inetcol,
+ bpcharcol,
+ datecol,
+ timecol,
+ timestampcol,
+ timestamptzcol,
+ intervalcol,
+ timetzcol,
+ bitcol,
+ varbitcol,
+ numericcol,
+ uuidcol,
+ lsncol
+) with (pages_per_range = 1);
+-- brinopers drives the index-vs-seqscan comparison: for each brintest column,
+-- op[i] is an operator to test and value[i] the constant to compare against.
+-- The CHECK constraint keeps the two arrays the same length.
+CREATE TABLE brinopers (colname name, op text[], value text[],
+ check (cardinality(op) = cardinality(value)));
+INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
+INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}');
+INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}');
+INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}');
+INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}');
+INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}');
+INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
+INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}');
+INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}');
+INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}');
+INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}');
+INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}');
+INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}');
+INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}');
+INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}');
+INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}');
+INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}');
+INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}');
+INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}');
+-- For every (column, operator, value) combination in brinopers, run the same
+-- qualification once with the BRIN index and once with a sequential scan,
+-- and raise an exception if the two result sets differ in either direction.
+DO $x$
+DECLARE
+ r record;
+ tabname text;
+ tabname_ss text;
+ count int;
+ query text;
+BEGIN
+ FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP
+ tabname := format('qry_%s', r.row_number);
+ tabname_ss := tabname || '_ss';
+ query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+ tabname, r.colname, r.oper, r.value);
+ -- run the query using the brin index
+ SET enable_seqscan = 0;
+ SET enable_bitmapscan = 1;
+ EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname);
+ EXECUTE query;
+ -- run the query using a seqscan
+ SET enable_seqscan = 1;
+ SET enable_bitmapscan = 0;
+ query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
+ tabname_ss, r.colname, r.oper, r.value);
+ EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss);
+ EXECUTE query;
+ -- make sure both return the same results; EXECUTE discards the rows, so
+ -- the row count has to be fetched explicitly before it can be tested
+ EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss);
+ GET DIAGNOSTICS count = ROW_COUNT;
+ IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+ EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname);
+ GET DIAGNOSTICS count = ROW_COUNT;
+ IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
+ end loop;
+END;
+$x$;
+-- Add a second batch of rows after the index exists, so the following
+-- brin_summarize_new_values call has new page ranges to summarize.
+INSERT INTO brintest SELECT
+ repeat(stringu1, 42)::bytea,
+ substr(stringu1, 1, 1)::"char",
+ stringu1::name, 142857 * tenthous,
+ thousand,
+ twothousand,
+ repeat(stringu1, 42),
+ unique1::oid,
+ format('(%s,%s)', tenthous, twenty)::tid,
+ (four + 1.0)/(hundred+1),
+ odd::float8 / (tenthous + 1),
+ format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
+ inet '10.2.3.4' + tenthous,
+ substr(stringu1, 1, 1)::bpchar,
+ date '1995-08-15' + tenthous,
+ time '01:20:30' + thousand * interval '18.5 second',
+ timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
+ timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
+ justify_days(justify_hours(tenthous * interval '12 minutes')),
+ timetz '01:30:20' + hundred * interval '15 seconds',
+ thousand::bit(10),
+ tenthous::bit(16)::varbit,
+ tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
+ format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
+ format('%s/%s%s', odd, even, tenthous)::pg_lsn
+FROM tenk1;
+SELECT brin_summarize_new_values('brinidx'::regclass);
+UPDATE brintest SET int8col = int8col * int4col;
+SET synchronous_commit = 1;
-- GIN has six support functions. 1-3 are mandatory, 5 is optional, and
-- at least one of 4 and 6 must be given.
-- SP-GiST has five support functions, all mandatory
+ -- BRIN has four mandatory support functions, and a bunch of optionals
amname = 'btree' AND procnums @> '{1}' OR
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
- amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+ amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+ amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
-- Also, check if there are any pg_opclass entries that don't seem to have
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
- amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
+ amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
+ amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
-- Unfortunately, we can't check the amproc link very well because the