X-Git-Url: https://granicus.if.org/sourcecode?a=blobdiff_plain;f=src%2Fbackend%2Ftsearch%2Fts_typanalyze.c;h=0f851ead0607fcecb1fd5516593d65f78b1665e4;hb=ee943004466418595363d567f18c053bae407792;hp=1ca5cf0cb1d39905938f1983434c1272def1d21c;hpb=6416a82a62db4e66b2edb0fa8fc83a580c3f1931;p=postgresql

diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 1ca5cf0cb1..0f851ead06 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -3,7 +3,7 @@
  * ts_typanalyze.c
  *	  functions for gathering statistics from tsvector columns
  *
- * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
  *
  *
  * IDENTIFICATION
@@ -114,13 +114,13 @@ ts_typanalyze(PG_FUNCTION_ARGS)
  *	language's frequency table, where K is the target number of entries in
  *	the MCELEM array plus an arbitrary constant, meant to reflect the fact
  *	that the most common words in any language would usually be stopwords
- *	so we will not actually see them in the input.	We assume that the
+ *	so we will not actually see them in the input.  We assume that the
  *	distribution of word frequencies (including the stopwords) follows Zipf's
  *	law with an exponent of 1.
  *
  *	Assuming Zipfian distribution, the frequency of the K'th word is equal
  *	to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of
- *	words in the language.	Putting W as one million, we get roughly 0.07/K.
+ *	words in the language.  Putting W as one million, we get roughly 0.07/K.
  *	Assuming top 10 words are stopwords gives s = 0.07/(K + 10).  We set
  *	epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and
  *	maximum expected hashtable size of about 1000 * (K + 10).
@@ -161,7 +161,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 	TrackItem  *item;
 
 	/*
-	 * We want statistics_target * 10 lexemes in the MCELEM array.	This
+	 * We want statistics_target * 10 lexemes in the MCELEM array.  This
 	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
 	 * the number of individual lexeme values tracked in pg_statistic ought to
 	 * be more than the number of values for a simple scalar column.
@@ -186,7 +186,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 	hash_ctl.match = lexeme_match;
 	hash_ctl.hcxt = CurrentMemoryContext;
 	lexemes_tab = hash_create("Analyzed lexemes table",
-							  bucket_width * 7,
+							  num_mcelem,
 							  &hash_ctl,
 					HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
 
@@ -232,7 +232,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 
 		/*
 		 * We loop through the lexemes in the tsvector and add them to our
-		 * tracking hashtable.	Note: the hashtable entries will point into
+		 * tracking hashtable.  Note: the hashtable entries will point into
 		 * the (detoasted) tsvector value, therefore we cannot free that
 		 * storage until we're done.
 		 */
@@ -299,7 +299,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 
 		/*
 		 * Construct an array of the interesting hashtable items, that is,
-		 * those meeting the cutoff frequency (s - epsilon)*N.	Also identify
+		 * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
 		 * the minimum and maximum frequencies among these items.
 		 *
 		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
@@ -332,7 +332,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 
 		/*
 		 * If we obtained more lexemes than we really want, get rid of those
-		 * with least frequencies.	The easiest way is to qsort the array into
+		 * with least frequencies.  The easiest way is to qsort the array into
 		 * descending frequency order and truncate the array.
 		 */
 		if (num_mcelem < track_len)
@@ -363,7 +363,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 			 * they get sorted on frequencies. The rationale is that we
 			 * usually search through most common elements looking for a
 			 * specific value, so we can grab its frequency.  When values are
-			 * presorted we can employ binary search for that.	See
+			 * presorted we can employ binary search for that.  See
 			 * ts_selfuncs.c for a real usage scenario.
 			 */
 			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
@@ -377,6 +377,11 @@ compute_tsvector_stats(VacAttrStats *stats,
 			 * able to find out the minimal and maximal frequency without
 			 * going through all the values.  We keep those two extra
 			 * frequencies in two extra cells in mcelem_freqs.
+			 *
+			 * (Note: the MCELEM statistics slot definition allows for a third
+			 * extra number containing the frequency of nulls, but we don't
+			 * create that for a tsvector column, since null elements aren't
+			 * possible.)
 			 */
 			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
 			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));