From: Robert Haas <rhaas@postgresql.org>
Date: Thu, 30 Aug 2012 17:05:45 +0000 (-0400)
Subject: Fix logic bug in gistchoose and gistRelocateBuildBuffersOnSplit.
X-Git-Tag: REL9_3_BETA1~991
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c8ba697a4bdb934f0c51424c654e8db6133ea255;p=postgresql

Fix logic bug in gistchoose and gistRelocateBuildBuffersOnSplit.

Every time the best-tuple-found-so-far changes, we need to reset all
the penalty values in which_grow[] to the penalties for the new best
tuple.  The old code failed to do this, resulting in inferior index
quality.

The original patch from Alexander Korotkov was just two lines; I took
the liberty of fleshing that out by adding a bunch of comments that I
hope will make this logic easier for others to understand than it was
for me.
---

diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c
index 39aec856f9..d1e246adad 100644
--- a/src/backend/access/gist/gistbuildbuffers.c
+++ b/src/backend/access/gist/gistbuildbuffers.c
@@ -625,8 +625,13 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
 	}
 
 	/*
-	 * Loop through all index tuples on the buffer on the splitted page,
-	 * moving them to buffers on the new pages.
+	 * Loop through all index tuples on the buffer on the page being split,
+	 * moving them to buffers on the new pages.  We try to move each tuple
+	 * the page that will result in the lowest penalty for the leading column
+	 * or, in the case of a tie, the lowest penalty for the earliest column
+	 * that is not tied.
+	 *
+	 * The guts of this loop are very similar to gistchoose().
 	 */
 	while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
 	{
@@ -637,14 +642,18 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
 		IndexTuple	newtup;
 		RelocationBufferInfo *targetBufferInfo;
 
-		/*
-		 * Choose which page this tuple should go to.
-		 */
 		gistDeCompressAtt(giststate, r,
 						  itup, NULL, (OffsetNumber) 0, entry, isnull);
 
 		which = -1;
 		*which_grow = -1.0f;
+
+		/*
+		 * Loop over possible target pages.  We'll exit early if we find an index key that
+		 * can accommodate the new key with no penalty on any column.  sum_grow is used to
+		 * track this condition.  It doesn't need to be exactly accurate, just >0 whenever
+		 * we want the loop to continue and equal to 0 when we want it to terminate.
+		 */
 		sum_grow = 1.0f;
 
 		for (i = 0; i < splitPagesCount && sum_grow; i++)
@@ -653,6 +662,8 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
 			RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
 
 			sum_grow = 0.0f;
+
+			/* Loop over index attributes. */
 			for (j = 0; j < r->rd_att->natts; j++)
 			{
 				float		usize;
@@ -664,16 +675,36 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
 
 				if (which_grow[j] < 0 || usize < which_grow[j])
 				{
+					/*
+					 * We get here in two cases.  First, we may have just discovered that the
+					 * current tuple is the best one we've seen so far; that is, for the first
+					 * column for which the penalty is not equal to the best tuple seen so far,
+					 * this one has a lower penalty than the previously-seen one.  But, when
+					 * a new best tuple is found, we must record the best penalty value for
+					 * all the remaining columns.  We'll end up here for each remaining index
+					 * column in that case, too.
+					 */
 					which = i;
 					which_grow[j] = usize;
-					if (j < r->rd_att->natts - 1 && i == 0)
+					if (j < r->rd_att->natts - 1)
 						which_grow[j + 1] = -1;
 					sum_grow += which_grow[j];
 				}
 				else if (which_grow[j] == usize)
+				{
+					/*
+					 * The current tuple is exactly as good for this column as the best tuple
+					 * seen so far.  The next iteration of this loop will compare the next
+					 * column.
+					 */
 					sum_grow += usize;
+				}
 				else
 				{
+					/*
+					 * The current tuple is worse for this column than the best tuple seen so
+					 * far.  Skip the remaining columns and move on to the next tuple, if any.
+					 */
 					sum_grow = 1;
 					break;
 				}
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index df1e2e396f..7c6ce8495c 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -363,7 +363,12 @@ gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *gis
 }
 
 /*
- * find entry with lowest penalty
+ * Search a page for the entry with lowest penalty.
+ *
+ * The index may have multiple columns, and there's a penalty value for each column.
+ * The penalty associated with a column which appears earlier in the index definition is
+ * strictly more important than the penalty of column which appears later in the index
+ * definition.
  */
 OffsetNumber
 gistchoose(Relation r, Page p, IndexTuple it,	/* it has compressed entry */
@@ -389,12 +394,28 @@ gistchoose(Relation r, Page p, IndexTuple it,	/* it has compressed entry */
 	Assert(maxoff >= FirstOffsetNumber);
 	Assert(!GistPageIsLeaf(p));
 
+	/*
+	 * Loop over tuples on page.
+	 *
+	 * We'll exit early if we find an index key that can accommodate the new key with no
+	 * penalty on any column.  sum_grow is used to track this condition.  Normally, it is the
+	 * sum of the penalties we've seen for this column so far, which is not a very useful
+	 * quantity in general because the penalties for each column are only considered
+	 * independently, but all we really care about is whether or not it's greater than zero.
+	 * Since penalties can't be negative, the sum of the penalties will be greater than
+	 * zero if and only if at least one penalty was greater than zero.  To make things just
+	 * a bit more complicated, we arbitrarily set sum_grow to 1.0 whenever we want to force
+	 * the at least one more iteration of this outer loop.  Any non-zero value would serve
+	 * just as well.
+	 */
 	for (i = FirstOffsetNumber; i <= maxoff && sum_grow; i = OffsetNumberNext(i))
 	{
 		int			j;
 		IndexTuple	itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
 
 		sum_grow = 0;
+
+		/* Loop over indexed attribtues. */
 		for (j = 0; j < r->rd_att->natts; j++)
 		{
 			Datum		datum;
@@ -409,16 +430,36 @@ gistchoose(Relation r, Page p, IndexTuple it,	/* it has compressed entry */
 
 			if (which_grow[j] < 0 || usize < which_grow[j])
 			{
+				/*
+				 * We get here in two cases.  First, we may have just discovered that the
+				 * current tuple is the best one we've seen so far; that is, for the first
+				 * column for which the penalty is not equal to the best tuple seen so far,
+				 * this one has a lower penalty than the previously-seen one.  But, when
+				 * a new best tuple is found, we must record the best penalty value for
+				 * all the remaining columns.  We'll end up here for each remaining index
+				 * column in that case, too.
+				 */
 				which = i;
 				which_grow[j] = usize;
-				if (j < r->rd_att->natts - 1 && i == FirstOffsetNumber)
+				if (j < r->rd_att->natts - 1)
 					which_grow[j + 1] = -1;
 				sum_grow += which_grow[j];
 			}
 			else if (which_grow[j] == usize)
+			{
+				/*
+				 * The current tuple is exactly as good for this column as the best tuple
+				 * seen so far.  The next iteration of this loop will compare the next
+				 * column.
+				 */
 				sum_grow += usize;
+			}
 			else
 			{
+				/*
+				 * The current tuple is worse for this column than the best tuple seen so
+				 * far.  Skip the remaining columns and move on to the next tuple, if any.
+				 */
 				sum_grow = 1;
 				break;
 			}