Add some randomness to the choice of which GiST page to insert to.

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Fri, 25 Jan 2013 14:49:13 +0000 (16:49 +0200)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Fri, 25 Jan 2013 14:58:38 +0000 (16:58 +0200)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Fri, 25 Jan 2013 14:49:13 +0000 (16:49 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Fri, 25 Jan 2013 14:58:38 +0000 (16:58 +0200)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c

index 8b60774f50349222ee04fef13613e3592120973b..a127627334e80704edeed38d3522e4000a695abb 100644 (file)
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -379,6 +379,7 @@ gistchoose(Relation r, Page p, IndexTuple it,       /* it has compressed entry */
         GISTENTRY       entry,
                                 identry[INDEX_MAX_KEYS];
         bool            isnull[INDEX_MAX_KEYS];
+       int                     keep_current_best;
  
         Assert(!GistPageIsLeaf(p));
  
@@ -401,6 +402,31 @@ gistchoose(Relation r, Page p, IndexTuple it,      /* it has compressed entry */
          */
         best_penalty[0] = -1;
  
+       /*
+        * If we find a tuple that's exactly as good as the currently best one, we
+        * could use either one.  When inserting a lot of tuples with the same or
+        * similar keys, it's preferable to descend down the same path when
+        * possible, as that's more cache-friendly.  On the other hand, if all
+        * inserts land on the same leaf page after a split, we're never going to
+        * insert anything to the other half of the split, and will end up using
+        * only 50% of the available space.  Distributing the inserts evenly would
+        * lead to better space usage, but that hurts cache-locality during
+        * insertion.  To get the best of both worlds, when we find a tuple that's
+        * exactly as good as the previous best, choose randomly whether to stick
+        * to the old best, or use the new one.  Once we decide to stick to the
+        * old best, we keep sticking to it for any subsequent equally good tuples
+        * we might find.  This favors tuples with low offsets, but still allows
+        * some inserts to go to other equally-good subtrees.
+        *
+        * keep_current_best is -1 if we haven't yet had to make a random choice
+        * whether to keep the current best tuple.  If we have done so, and
+        * decided to keep it, keep_current_best is 1; if we've decided to
+        * replace, keep_current_best is 0.  (This state will be reset to -1 as
+        * soon as we've made the replacement, but sometimes we make the choice in
+        * advance of actually finding a replacement best tuple.)
+        */
+       keep_current_best = -1;
+
         /*
          * Loop over tuples on page.
          */
@@ -446,6 +472,9 @@ gistchoose(Relation r, Page p, IndexTuple it,       /* it has compressed entry */
  
                                 if (j < r->rd_att->natts - 1)
                                         best_penalty[j + 1] = -1;
+
+                               /* we have new best, so reset keep-it decision */
+                               keep_current_best = -1;
                         }
                         else if (best_penalty[j] == usize)
                         {
@@ -468,12 +497,41 @@ gistchoose(Relation r, Page p, IndexTuple it,     /* it has compressed entry */
                 }
  
                 /*
-                * If we find a tuple with zero penalty for all columns, there's no
-                * need to examine remaining tuples; just break out of the loop and
-                * return it.
+                * If we looped past the last column, and did not update "result",
+                * then this tuple is exactly as good as the prior best tuple.
+                */
+               if (j == r->rd_att->natts && result != i)
+               {
+                       if (keep_current_best == -1)
+                       {
+                               /* we didn't make the random choice yet for this old best */
+                               keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
+                       }
+                       if (keep_current_best == 0)
+                       {
+                               /* we choose to use the new tuple */
+                               result = i;
+                               /* choose again if there are even more exactly-as-good ones */
+                               keep_current_best = -1;
+                       }
+               }
+
+               /*
+                * If we find a tuple with zero penalty for all columns, and we've
+                * decided we don't want to search for another tuple with equal
+                * penalty, there's no need to examine remaining tuples; just break
+                * out of the loop and return it.
                  */
                 if (zero_penalty)
-                       break;
+               {
+                       if (keep_current_best == -1)
+                       {
+                               /* we didn't make the random choice yet for this old best */
+                               keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
+                       }
+                       if (keep_current_best == 1)
+                               break;
+               }
         }
  
         return result;
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Fri, 25 Jan 2013 14:49:13 +0000 (16:49 +0200)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Fri, 25 Jan 2013 14:58:38 +0000 (16:58 +0200)