Fix overflow check and comment in GIN posting list encoding.

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Wed, 28 Aug 2019 09:55:33 +0000 (12:55 +0300)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Wed, 28 Aug 2019 09:55:33 +0000 (12:55 +0300)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 28 Aug 2019 09:55:33 +0000 (12:55 +0300)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 28 Aug 2019 09:55:33 +0000 (12:55 +0300)
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c

index 48ccfafdd224aa9e1aac4746d1c25cc3b9bbe13f..44aef77edb72d82bf7bfd9c416f48bfa768adba3 100644 (file)
--- a/src/backend/access/gin/ginpostinglist.c
+++ b/src/backend/access/gin/ginpostinglist.c
@@ -23,25 +23,32 @@
  /*
   * For encoding purposes, item pointers are represented as 64-bit unsigned
   * integers. The lowest 11 bits represent the offset number, and the next
- * lowest 32 bits are the block number. That leaves 17 bits unused, i.e.
+ * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
   * only 43 low bits are used.
   *
+ * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
+ * 2^11 on all supported block sizes. We are frugal with the bits, because
+ * smaller integers use fewer bytes in the varbyte encoding, saving disk
+ * space. (If we get a new table AM in the future that wants to use the full
+ * range of possible offset numbers, we'll need to change this.)
+ *
   * These 43-bit integers are encoded using varbyte encoding. In each byte,
   * the 7 low bits contain data, while the highest bit is a continuation bit.
   * When the continuation bit is set, the next byte is part of the same
- * integer, otherwise this is the last byte of this integer.  43 bits fit
- * conveniently in at most 6 bytes when varbyte encoded (the 6th byte does
- * not need a continuation bit, because we know the max size to be 43 bits):
+ * integer, otherwise this is the last byte of this integer. 43 bits need
+ * at most 7 bytes in this encoding:
   *
   * 0XXXXXXX
   * 1XXXXXXX 0XXXXYYY
   * 1XXXXXXX 1XXXXYYY 0YYYYYYY
   * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
   * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
- * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY YYYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
   *
   * X = bits used for offset number
   * Y = bits used for block number
+ * u = unused bit
   *
   * The bytes are in stored in little-endian order.
   *
@@ -73,6 +80,9 @@
   */
  #define MaxHeapTuplesPerPageBits               11
  
+/* Max. number of bytes needed to encode the largest supported integer. */
+#define MaxBytesPerInteger                             7
+
  static inline uint64
  itemptr_to_uint64(const ItemPointer iptr)
  {
@@ -126,33 +136,40 @@ decode_varbyte(unsigned char **ptr)
         unsigned char *p = *ptr;
         uint64          c;
  
+       /* 1st byte */
         c = *(p++);
         val = c & 0x7F;
         if (c & 0x80)
         {
+               /* 2nd byte */
                 c = *(p++);
                 val |= (c & 0x7F) << 7;
                 if (c & 0x80)
                 {
+                       /* 3rd byte */
                         c = *(p++);
                         val |= (c & 0x7F) << 14;
                         if (c & 0x80)
                         {
+                               /* 4th byte */
                                 c = *(p++);
                                 val |= (c & 0x7F) << 21;
                                 if (c & 0x80)
                                 {
+                                       /* 5th byte */
                                         c = *(p++);
                                         val |= (c & 0x7F) << 28;
                                         if (c & 0x80)
                                         {
+                                               /* 6th byte */
                                                 c = *(p++);
                                                 val |= (c & 0x7F) << 35;
                                                 if (c & 0x80)
                                                 {
-                                                       /* last byte, no continuation bit */
+                                                       /* 7th byte, should not have continuation bit */
                                                         c = *(p++);
                                                         val |= c << 42;
+                                                       Assert((c & 0x80) == 0);
                                                 }
                                         }
                                 }
@@ -208,15 +225,15 @@ ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
  
                 Assert(val > prev);
  
-               if (endptr - ptr >= 6)
+               if (endptr - ptr >= MaxBytesPerInteger)
                         encode_varbyte(delta, &ptr);
                 else
                 {
                         /*
-                        * There are less than 6 bytes left. Have to check if the next
+                        * There are less than 7 bytes left. Have to check if the next
                          * item fits in that space before writing it out.
                          */
-                       unsigned char buf[6];
+                       unsigned char buf[MaxBytesPerInteger];
                         unsigned char *p = buf;
  
                         encode_varbyte(delta, &p);
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile

index 60d6d7be1b256e32fb51e21d67d85c38fb861a1f..953905d1eff6f02094bf79bb8a621b0c11dcdba5 100644 (file)
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -12,6 +12,7 @@ SUBDIRS = \
                   test_bloomfilter \
                   test_ddl_deparse \
                   test_extensions \
+                 test_ginpostinglist \
                   test_integerset \
                   test_parser \
                   test_pg_dump \
diff --git a/src/test/modules/test_ginpostinglist/Makefile b/src/test/modules/test_ginpostinglist/Makefile

new file mode 100644 (file)

index 0000000..4d45ac9
--- /dev/null
+++ b/src/test/modules/test_ginpostinglist/Makefile
@@ -0,0 +1,21 @@
+# src/test/modules/test_ginpostinglist/Makefile
+
+MODULE_big = test_ginpostinglist
+OBJS = test_ginpostinglist.o $(WIN32RES)
+PGFILEDESC = "test_ginpostinglist - test code for src/backend/access/gin//ginpostinglist.c"
+
+EXTENSION = test_ginpostinglist
+DATA = test_ginpostinglist--1.0.sql
+
+REGRESS = test_ginpostinglist
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_ginpostinglist
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_ginpostinglist/README b/src/test/modules/test_ginpostinglist/README

new file mode 100644 (file)

index 0000000..66684dd
--- /dev/null
+++ b/src/test/modules/test_ginpostinglist/README
@@ -0,0 +1,2 @@
+test_ginpostinglist contains unit tests for the GIN posting list code in
+src/backend/access/gin/ginpostinglist.c.
diff --git a/src/test/modules/test_ginpostinglist/expected/test_ginpostinglist.out b/src/test/modules/test_ginpostinglist/expected/test_ginpostinglist.out

new file mode 100644 (file)

index 0000000..4d0beae
--- /dev/null
+++ b/src/test/modules/test_ginpostinglist/expected/test_ginpostinglist.out
@@ -0,0 +1,19 @@
+CREATE EXTENSION test_ginpostinglist;
+--
+-- All the logic is in the test_ginpostinglist() function. It will throw
+-- a error if something fails.
+--
+SELECT test_ginpostinglist();
+NOTICE:  testing with (0, 1), (0, 2), max 14 bytes
+NOTICE:  encoded 2 item pointers to 10 bytes
+NOTICE:  testing with (0, 1), (0, 291), max 14 bytes
+NOTICE:  encoded 2 item pointers to 10 bytes
+NOTICE:  testing with (0, 1), (4294967294, 291), max 14 bytes
+NOTICE:  encoded 1 item pointers to 8 bytes
+NOTICE:  testing with (0, 1), (4294967294, 291), max 16 bytes
+NOTICE:  encoded 2 item pointers to 16 bytes
+ test_ginpostinglist 
+---------------------
+ 
+(1 row)
+
diff --git a/src/test/modules/test_ginpostinglist/sql/test_ginpostinglist.sql b/src/test/modules/test_ginpostinglist/sql/test_ginpostinglist.sql

new file mode 100644 (file)

index 0000000..b8cab7a
--- /dev/null
+++ b/src/test/modules/test_ginpostinglist/sql/test_ginpostinglist.sql
@@ -0,0 +1,7 @@
+CREATE EXTENSION test_ginpostinglist;
+
+--
+-- All the logic is in the test_ginpostinglist() function. It will throw
+-- a error if something fails.
+--
+SELECT test_ginpostinglist();
diff --git a/src/test/modules/test_ginpostinglist/test_ginpostinglist--1.0.sql b/src/test/modules/test_ginpostinglist/test_ginpostinglist--1.0.sql

new file mode 100644 (file)

index 0000000..37396a4
--- /dev/null
+++ b/src/test/modules/test_ginpostinglist/test_ginpostinglist--1.0.sql
@@ -0,0 +1,8 @@
+/* src/test/modules/test_ginpostinglist/test_ginpostinglist--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_ginpostinglist" to load this file. \quit
+
+CREATE FUNCTION test_ginpostinglist()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_ginpostinglist/test_ginpostinglist.c b/src/test/modules/test_ginpostinglist/test_ginpostinglist.c

new file mode 100644 (file)

index 0000000..bab073b
--- /dev/null
+++ b/src/test/modules/test_ginpostinglist/test_ginpostinglist.c
@@ -0,0 +1,96 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_ginpostinglist.c
+ *             Test varbyte-encoding in ginpostinglist.c
+ *
+ * Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *             src/test/modules/test_ginpostinglist/test_ginpostinglist.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "access/ginblock.h"
+#include "access/gin_private.h"
+#include "access/htup_details.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(test_ginpostinglist);
+
+/*
+ * Encodes a pair of TIDs, and decodes it back. The first TID is always
+ * (0, 1), the second one is formed from the blk/off arguments. The 'maxsize'
+ * argument is passed to ginCompressPostingList(); it can be used to test the
+ * overflow checks.
+ *
+ * The reason that we test a pair, instead of just a single TID, is that
+ * the GinPostingList stores the first TID as is, and the varbyte-encoding
+ * is only used for the deltas between TIDs. So testing a single TID would
+ * not exercise the varbyte encoding at all.
+ *
+ * This function prints NOTICEs to describe what is tested, and how large the
+ * resulting GinPostingList is. Any incorrect results, e.g. if the encode +
+ * decode round trip doesn't return the original input, are reported as
+ * ERRORs.
+ */
+static void
+test_itemptr_pair(BlockNumber blk, OffsetNumber off, int maxsize)
+{
+       ItemPointerData orig_itemptrs[2];
+       ItemPointer decoded_itemptrs;
+       GinPostingList *pl;
+       int                     nwritten;
+       int                     ndecoded;
+
+       elog(NOTICE, "testing with (%u, %d), (%u, %d), max %d bytes",
+                0, 1, blk, off, maxsize);
+       ItemPointerSet(&orig_itemptrs[0], 0, 1);
+       ItemPointerSet(&orig_itemptrs[1], blk, off);
+
+       /* Encode, and decode it back */
+       pl = ginCompressPostingList(orig_itemptrs, 2, maxsize, &nwritten);
+       elog(NOTICE, "encoded %d item pointers to %zu bytes",
+                nwritten, SizeOfGinPostingList(pl));
+
+       if (SizeOfGinPostingList(pl) > maxsize)
+               elog(ERROR, "overflow: result was %zu bytes, max %d",
+                        SizeOfGinPostingList(pl), maxsize);
+
+       decoded_itemptrs = ginPostingListDecode(pl, &ndecoded);
+       if (nwritten != ndecoded)
+               elog(NOTICE, "encoded %d itemptrs, %d came back", nwritten, ndecoded);
+
+       /* Check the result */
+       if (!ItemPointerEquals(&orig_itemptrs[0], &decoded_itemptrs[0]))
+               elog(ERROR, "mismatch on first itemptr: (%u, %d) vs (%u, %d)",
+                        0, 1,
+                        ItemPointerGetBlockNumber(&decoded_itemptrs[0]),
+                        ItemPointerGetOffsetNumber(&decoded_itemptrs[0]));
+
+       if (ndecoded == 2 &&
+               !ItemPointerEquals(&orig_itemptrs[0], &decoded_itemptrs[0]))
+       {
+               elog(ERROR, "mismatch on second itemptr: (%u, %d) vs (%u, %d)",
+                        0, 1,
+                        ItemPointerGetBlockNumber(&decoded_itemptrs[0]),
+                        ItemPointerGetOffsetNumber(&decoded_itemptrs[0]));
+       }
+}
+
+/*
+ * SQL-callable entry point to perform all tests.
+ */
+Datum
+test_ginpostinglist(PG_FUNCTION_ARGS)
+{
+       test_itemptr_pair(0, 2, 14);
+       test_itemptr_pair(0, MaxHeapTuplesPerPage, 14);
+       test_itemptr_pair(MaxBlockNumber, MaxHeapTuplesPerPage, 14);
+       test_itemptr_pair(MaxBlockNumber, MaxHeapTuplesPerPage, 16);
+
+       PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_ginpostinglist/test_ginpostinglist.control b/src/test/modules/test_ginpostinglist/test_ginpostinglist.control

new file mode 100644 (file)

index 0000000..e4f5a7c
--- /dev/null
+++ b/src/test/modules/test_ginpostinglist/test_ginpostinglist.control
@@ -0,0 +1,4 @@
+comment = 'Test code for ginpostinglist.c'
+default_version = '1.0'
+module_pathname = '$libdir/test_ginpostinglist'
+relocatable = true
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Wed, 28 Aug 2019 09:55:33 +0000 (12:55 +0300)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Wed, 28 Aug 2019 09:55:33 +0000 (12:55 +0300)
src/backend/access/gin/ginpostinglist.c		patch \| blob \| history
src/test/modules/Makefile		patch \| blob \| history
src/test/modules/test_ginpostinglist/Makefile	[new file with mode: 0644]	patch \| blob
src/test/modules/test_ginpostinglist/README	[new file with mode: 0644]	patch \| blob
src/test/modules/test_ginpostinglist/expected/test_ginpostinglist.out	[new file with mode: 0644]	patch \| blob
src/test/modules/test_ginpostinglist/sql/test_ginpostinglist.sql	[new file with mode: 0644]	patch \| blob
src/test/modules/test_ginpostinglist/test_ginpostinglist--1.0.sql	[new file with mode: 0644]	patch \| blob
src/test/modules/test_ginpostinglist/test_ginpostinglist.c	[new file with mode: 0644]	patch \| blob
src/test/modules/test_ginpostinglist/test_ginpostinglist.control	[new file with mode: 0644]	patch \| blob