Improve contrib/pg_trgm's heuristics for regexp index searches.

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 6 Apr 2014 00:48:47 +0000 (20:48 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 6 Apr 2014 00:48:47 +0000 (20:48 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 6 Apr 2014 00:48:47 +0000 (20:48 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 6 Apr 2014 00:48:47 +0000 (20:48 -0400)
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c

index c4dfdf2a3db642a5212f659c342514268c9af839..2e6fa219350976337e4b08fe5bcb25dae8f1cf93 100644 (file)
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -122,9 +122,23 @@
   * thousands of trigrams would be slow, and would likely produce so many
   * false positives that we would have to traverse a large fraction of the
   * index, the graph is simplified further in a lossy fashion by removing
- * color trigrams until the number of trigrams after expansion is below
- * the MAX_TRGM_COUNT threshold.  When a color trigram is removed, the states
- * connected by any arcs labelled with that trigram are merged.
+ * color trigrams. When a color trigram is removed, the states connected by
+ * any arcs labelled with that trigram are merged.
+ *
+ * Trigrams do not all have equivalent value for searching: some of them are
+ * more frequent and some of them are less frequent. Ideally, we would like
+ * to know the distribution of trigrams, but we don't. But because of padding
+ * we know for sure that the empty character is more frequent than others,
+ * so we can penalize trigrams according to presence of whitespace. The
+ * penalty assigned to each color trigram is the number of simple trigrams
+ * it would produce, times the penalties[] multiplier associated with its
+ * whitespace content. (The penalties[] constants were calculated by analysis
+ * of some real-life text.) We eliminate color trigrams starting with the
+ * highest-penalty one, until we get to a total penalty of no more than
+ * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
+ * lead to merging the initial and final states, so we may not be able to
+ * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
+ * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
   *
   * 4) Pack the graph into a compact representation
   * -----------------------------------------------
@@ -199,13 +213,30 @@
   *     MAX_EXPANDED_STATES - How many states we allow in expanded graph
   *     MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
   *     MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
+ *     WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
   *     COLOR_COUNT_LIMIT - Maximum number of characters per color
   */
  #define MAX_EXPANDED_STATES 128
  #define MAX_EXPANDED_ARCS      1024
  #define MAX_TRGM_COUNT         256
+#define WISH_TRGM_PENALTY      16
  #define COLOR_COUNT_LIMIT      256
  
+/*
+ * Penalty multipliers for trigram counts depending on whitespace contents.
+ * Numbers based on analysis of real-life texts.
+ */
+const float4 penalties[8] = {
+       1.0,                                            /* "aaa" */
+       3.5,                                            /* "aa " */
+       0.0,                                            /* "a a" (impossible) */
+       0.0,                                            /* "a  " (impossible) */
+       4.2,                                            /* " aa" */
+       2.1,                                            /* " a " */
+       25.0,                                           /* "  a" */
+       0.0                                                     /* "   " (impossible) */
+};
+
  /* Struct representing a single pg_wchar, converted back to multibyte form */
  typedef struct
  {
@@ -339,6 +370,7 @@ typedef struct
         ColorTrgm       ctrgm;
         int                     number;
         int                     count;
+       float4          penalty;
         bool            expanded;
         List       *arcs;
  } ColorTrgmInfo;
@@ -459,7 +491,7 @@ static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
  static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
  static void mergeStates(TrgmState *state1, TrgmState *state2);
  static int     colorTrgmInfoCmp(const void *p1, const void *p2);
-static int     colorTrgmInfoCountCmp(const void *p1, const void *p2);
+static int     colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
  static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
  static int     packArcInfoCmp(const void *a1, const void *a2);
  
@@ -1424,6 +1456,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
         TrgmState  *state;
         ColorTrgmInfo *colorTrgms;
         int64           totalTrgmCount;
+       float4          totalTrgmPenalty;
         int                     number;
  
         /* Collect color trigrams from all arcs */
@@ -1482,53 +1515,67 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
         }
  
         /*
-        * Count number of simple trigrams generated by each color trigram.
+        * Count number of simple trigrams generated by each color trigram, and
+        * also compute a penalty value, which is the number of simple trigrams
+        * times a multiplier that depends on its whitespace content.
          *
          * Note: per-color-trigram counts cannot overflow an int so long as
          * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
          * 1290.  However, the grand total totalTrgmCount might conceivably
-        * overflow an int, so we use int64 for that within this routine.
+        * overflow an int, so we use int64 for that within this routine.  Also,
+        * penalties are calculated in float4 arithmetic to avoid any overflow
+        * worries.
          */
         totalTrgmCount = 0;
+       totalTrgmPenalty = 0.0f;
         for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
         {
                 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
                 int                     j,
-                                       count = 1;
+                                       count = 1,
+                                       typeIndex = 0;
  
                 for (j = 0; j < 3; j++)
                 {
                         TrgmColor       c = trgmInfo->ctrgm.colors[j];
  
-                       if (c != COLOR_BLANK)
+                       typeIndex *= 2;
+                       if (c == COLOR_BLANK)
+                               typeIndex++;
+                       else
                                 count *= trgmNFA->colorInfo[c].wordCharsCount;
                 }
                 trgmInfo->count = count;
                 totalTrgmCount += count;
+               trgmInfo->penalty = penalties[typeIndex] * (float4) count;
+               totalTrgmPenalty += trgmInfo->penalty;
         }
  
-       /* Sort color trigrams in descending order of simple trigram counts */
+       /* Sort color trigrams in descending order of their penalties */
         qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
-                 colorTrgmInfoCountCmp);
+                 colorTrgmInfoPenaltyCmp);
  
         /*
-        * Remove color trigrams from the graph so long as total number of simple
-        * trigrams exceeds MAX_TRGM_COUNT.  We prefer to remove color trigrams
-        * with the most associated simple trigrams, since those are the most
-        * promising for reducing the total number of simple trigrams.  When
-        * removing a color trigram we have to merge states connected by arcs
-        * labeled with that trigram.  It's necessary to not merge initial and
-        * final states, because our graph becomes useless if that happens; so we
-        * cannot always remove the trigram we'd prefer to.
+        * Remove color trigrams from the graph so long as total penalty of color
+        * trigrams exceeds WISH_TRGM_PENALTY.  (If we fail to get down to
+        * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
+        * MAX_TRGM_COUNT.)  We prefer to remove color trigrams with higher
+        * penalty, since those are the most promising for reducing the total
+        * penalty.  When removing a color trigram we have to merge states
+        * connected by arcs labeled with that trigram.  It's necessary to not
+        * merge initial and final states, because our graph becomes useless if
+        * that happens; so we cannot always remove the trigram we'd prefer to.
          */
-       for (i = 0;
-                (i < trgmNFA->colorTrgmsCount) && (totalTrgmCount > MAX_TRGM_COUNT);
-                i++)
+       for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
         {
                 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
                 bool            canRemove = true;
                 ListCell   *cell;
  
+               /* Done if we've reached the target */
+               if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
+                       break;
+
                 /*
                  * Does any arc of this color trigram connect initial and final
                  * states?      If so we can't remove it.
@@ -1570,9 +1617,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
                                 mergeStates(source, target);
                 }
  
-               /* Mark trigram unexpanded, and update totalTrgmCount */
+               /* Mark trigram unexpanded, and update totals */
                 trgmInfo->expanded = false;
                 totalTrgmCount -= trgmInfo->count;
+               totalTrgmPenalty -= trgmInfo->penalty;
         }
  
         /* Did we succeed in fitting into MAX_TRGM_COUNT? */
@@ -1746,17 +1794,17 @@ colorTrgmInfoCmp(const void *p1, const void *p2)
  
  /*
   * Compare function for sorting color trigrams in descending order of
- * their simple trigrams counts.
+ * their penalty fields.
   */
  static int
-colorTrgmInfoCountCmp(const void *p1, const void *p2)
+colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
  {
-       const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
-       const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
+       float4          penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
+       float4          penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
  
-       if (c1->count < c2->count)
+       if (penalty1 < penalty2)
                 return 1;
-       else if (c1->count == c2->count)
+       else if (penalty1 == penalty2)
                 return 0;
         else
                 return -1;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 6 Apr 2014 00:48:47 +0000 (20:48 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 6 Apr 2014 00:48:47 +0000 (20:48 -0400)