* thousands of trigrams would be slow, and would likely produce so many
* false positives that we would have to traverse a large fraction of the
* index, the graph is simplified further in a lossy fashion by removing
- * color trigrams until the number of trigrams after expansion is below
- * the MAX_TRGM_COUNT threshold. When a color trigram is removed, the states
- * connected by any arcs labelled with that trigram are merged.
+ * color trigrams. When a color trigram is removed, the states connected by
+ * any arcs labelled with that trigram are merged.
+ *
+ * Trigrams do not all have equivalent value for searching: some of them are
+ * more frequent and some of them are less frequent. Ideally, we would like
+ * to know the distribution of trigrams, but we don't. However, because of
+ * padding we know for sure that the blank character is more frequent than
+ * others, so we can penalize trigrams according to the presence of whitespace. The
+ * penalty assigned to each color trigram is the number of simple trigrams
+ * it would produce, times the penalties[] multiplier associated with its
+ * whitespace content. (The penalties[] constants were calculated by analysis
+ * of some real-life text.) We eliminate color trigrams starting with the
+ * highest-penalty one, until we get to a total penalty of no more than
+ * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
+ * lead to merging the initial and final states, so we may not be able to
+ * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
+ * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
*
* 4) Pack the graph into a compact representation
* -----------------------------------------------
* MAX_EXPANDED_STATES - How many states we allow in expanded graph
* MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
* MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
+ * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
* COLOR_COUNT_LIMIT - Maximum number of characters per color
*/
#define MAX_EXPANDED_STATES 128
#define MAX_EXPANDED_ARCS 1024
#define MAX_TRGM_COUNT 256
+#define WISH_TRGM_PENALTY 16
#define COLOR_COUNT_LIMIT 256
+/*
+ * Penalty multipliers for trigram counts depending on whitespace contents.
+ * Numbers based on analysis of real-life texts.
+ *
+ * The array index is a 3-bit value built from the trigram's three positions,
+ * first position in the most significant bit; a bit is set when that
+ * position is blank (COLOR_BLANK).  In the pattern comments below, 'a'
+ * stands for a non-blank character and ' ' for a blank.  Patterns marked
+ * "impossible" carry a multiplier of zero.
+ *
+ * Declared static so that the generic name "penalties" is not exported as a
+ * global symbol.
+ */
+static const float4 penalties[8] = {
+    1.0,    /* "aaa" */
+    3.5,    /* "aa " */
+    0.0,    /* "a a" (impossible) */
+    0.0,    /* "a  " (impossible) */
+    4.2,    /* " aa" */
+    2.1,    /* " a " */
+    25.0,   /* "  a" */
+    0.0     /* "   " (impossible) */
+};
+
/* Struct representing a single pg_wchar, converted back to multibyte form */
typedef struct
{
ColorTrgm ctrgm; /* the color trigram itself; its colors[0..2] are examined when counting */
int number; /* NOTE(review): not assigned in the visible code; presumably an ordinal for this trigram -- confirm */
int count; /* number of simple trigrams this color trigram expands to */
+ float4 penalty; /* count times the penalties[] multiplier for its blank pattern */
bool expanded; /* set to false once the trigram is removed from the graph */
List *arcs; /* NOTE(review): presumably the arcs labelled with this trigram -- confirm */
} ColorTrgmInfo; /* per-color-trigram bookkeeping used while choosing trigrams to remove */
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
static void mergeStates(TrgmState *state1, TrgmState *state2);
static int colorTrgmInfoCmp(const void *p1, const void *p2);
-static int colorTrgmInfoCountCmp(const void *p1, const void *p2);
+static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
static int packArcInfoCmp(const void *a1, const void *a2);
TrgmState *state;
ColorTrgmInfo *colorTrgms;
int64 totalTrgmCount;
+ float4 totalTrgmPenalty;
int number;
/* Collect color trigrams from all arcs */
}
/*
- * Count number of simple trigrams generated by each color trigram.
+ * Count number of simple trigrams generated by each color trigram, and
+ * also compute a penalty value, which is the number of simple trigrams
+ * times a multiplier that depends on its whitespace content.
*
* Note: per-color-trigram counts cannot overflow an int so long as
* COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
* 1290. However, the grand total totalTrgmCount might conceivably
- * overflow an int, so we use int64 for that within this routine.
+ * overflow an int, so we use int64 for that within this routine. Also,
+ * penalties are calculated in float4 arithmetic to avoid any overflow
+ * worries.
*/
totalTrgmCount = 0;
+ totalTrgmPenalty = 0.0f;
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
ColorTrgmInfo *trgmInfo = &colorTrgms[i];
int j,
- count = 1;
+ count = 1,
+ typeIndex = 0;
for (j = 0; j < 3; j++)
{
TrgmColor c = trgmInfo->ctrgm.colors[j];
- if (c != COLOR_BLANK)
+ typeIndex *= 2;
+ if (c == COLOR_BLANK)
+ typeIndex++;
+ else
count *= trgmNFA->colorInfo[c].wordCharsCount;
}
trgmInfo->count = count;
totalTrgmCount += count;
+ trgmInfo->penalty = penalties[typeIndex] * (float4) count;
+ totalTrgmPenalty += trgmInfo->penalty;
}
- /* Sort color trigrams in descending order of simple trigram counts */
+ /* Sort color trigrams in descending order of their penalties */
qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
- colorTrgmInfoCountCmp);
+ colorTrgmInfoPenaltyCmp);
/*
- * Remove color trigrams from the graph so long as total number of simple
- * trigrams exceeds MAX_TRGM_COUNT. We prefer to remove color trigrams
- * with the most associated simple trigrams, since those are the most
- * promising for reducing the total number of simple trigrams. When
- * removing a color trigram we have to merge states connected by arcs
- * labeled with that trigram. It's necessary to not merge initial and
- * final states, because our graph becomes useless if that happens; so we
- * cannot always remove the trigram we'd prefer to.
+ * Remove color trigrams from the graph so long as total penalty of color
+ * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
+ * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
+ * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
+ * penalty, since those are the most promising for reducing the total
+ * penalty. When removing a color trigram we have to merge states
+ * connected by arcs labeled with that trigram. It's necessary to not
+ * merge initial and final states, because our graph becomes useless if
+ * that happens; so we cannot always remove the trigram we'd prefer to.
*/
- for (i = 0;
- (i < trgmNFA->colorTrgmsCount) && (totalTrgmCount > MAX_TRGM_COUNT);
- i++)
+ for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
ColorTrgmInfo *trgmInfo = &colorTrgms[i];
bool canRemove = true;
ListCell *cell;
+ /* Done if we've reached the target */
+ if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
+ break;
+
/*
* Does any arc of this color trigram connect initial and final
* states? If so we can't remove it.
mergeStates(source, target);
}
- /* Mark trigram unexpanded, and update totalTrgmCount */
+ /* Mark trigram unexpanded, and update totals */
trgmInfo->expanded = false;
totalTrgmCount -= trgmInfo->count;
+ totalTrgmPenalty -= trgmInfo->penalty;
}
/* Did we succeed in fitting into MAX_TRGM_COUNT? */
/*
* Compare function for sorting color trigrams in descending order of
- * their simple trigrams counts.
+ * their penalty fields.
*/
static int
-colorTrgmInfoCountCmp(const void *p1, const void *p2)
+colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
{
- const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
- const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
+ float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
+ float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
- if (c1->count < c2->count)
+ if (penalty1 < penalty2)
return 1;
- else if (c1->count == c2->count)
+ else if (penalty1 == penalty2)
return 0;
else
return -1;