Update copyright for 2016

[postgresql] / src / backend / tsearch / ts_typanalyze.c
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c

index 2654d644579fd1959282d83919474f42540ca703..0f851ead0607fcecb1fd5516593d65f78b1665e4 100644 (file)
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -3,7 +3,7 @@
   * ts_typanalyze.c
   *       functions for gathering statistics from tsvector columns
   *
- * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
   *
   *
   * IDENTIFICATION
@@ -18,7 +18,6 @@
  #include "commands/vacuum.h"
  #include "tsearch/ts_type.h"
  #include "utils/builtins.h"
-#include "utils/hsearch.h"
  
  
  /* A hash key for lexemes */
@@ -115,13 +114,13 @@ ts_typanalyze(PG_FUNCTION_ARGS)
   *     language's frequency table, where K is the target number of entries in
   *     the MCELEM array plus an arbitrary constant, meant to reflect the fact
   *     that the most common words in any language would usually be stopwords
- *     so we will not actually see them in the input.  We assume that the
+ *     so we will not actually see them in the input.  We assume that the
   *     distribution of word frequencies (including the stopwords) follows Zipf's
   *     law with an exponent of 1.
   *
   *     Assuming Zipfian distribution, the frequency of the K'th word is equal
   *     to 1/(K * H(W)) where H(n) is 1 + 1/2 + 1/3 + ... + 1/n and W is the number of
- *     words in the language.  Putting W as one million, we get roughly 0.07/K.
+ *     words in the language.  Putting W as one million, we get roughly 0.07/K.
   *     Assuming the top 10 words are stopwords gives s = 0.07/(K + 10).  We set
   *     epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and
   *     maximum expected hashtable size of about 1000 * (K + 10).
@@ -162,7 +161,7 @@ compute_tsvector_stats(VacAttrStats *stats,
         TrackItem  *item;
  
         /*
-        * We want statistics_target * 10 lexemes in the MCELEM array.  This
+        * We want statistics_target * 10 lexemes in the MCELEM array.  This
          * multiplier is pretty arbitrary, but is meant to reflect the fact that
          * the number of individual lexeme values tracked in pg_statistic ought to
          * be more than the number of values for a simple scalar column.
@@ -187,7 +186,7 @@ compute_tsvector_stats(VacAttrStats *stats,
         hash_ctl.match = lexeme_match;
         hash_ctl.hcxt = CurrentMemoryContext;
         lexemes_tab = hash_create("Analyzed lexemes table",
-                                                         bucket_width * 7,
+                                                         num_mcelem,
                                                           &hash_ctl,
                                         HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
  
@@ -233,7 +232,7 @@ compute_tsvector_stats(VacAttrStats *stats,
  
                 /*
                  * We loop through the lexemes in the tsvector and add them to our
-                * tracking hashtable.  Note: the hashtable entries will point into
+                * tracking hashtable.  Note: the hashtable entries will point into
                  * the (detoasted) tsvector value, therefore we cannot free that
                  * storage until we're done.
                  */
@@ -300,7 +299,7 @@ compute_tsvector_stats(VacAttrStats *stats,
  
                 /*
                  * Construct an array of the interesting hashtable items, that is,
-                * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
+                * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
                  * the minimum and maximum frequencies among these items.
                  *
                  * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
@@ -333,7 +332,7 @@ compute_tsvector_stats(VacAttrStats *stats,
  
                 /*
                  * If we obtained more lexemes than we really want, get rid of those
-                * with least frequencies.      The easiest way is to qsort the array into
+                * with least frequencies.  The easiest way is to qsort the array into
                  * descending frequency order and truncate the array.
                  */
                 if (num_mcelem < track_len)
@@ -364,7 +363,7 @@ compute_tsvector_stats(VacAttrStats *stats,
                          * they get sorted on frequencies. The rationale is that we
                          * usually search through most common elements looking for a
                          * specific value, so we can grab its frequency.  When values are
-                        * presorted we can employ binary search for that.      See
+                        * presorted we can employ binary search for that.  See
                          * ts_selfuncs.c for a real usage scenario.
                          */
                         qsort(sort_table, num_mcelem, sizeof(TrackItem *),
@@ -378,6 +377,11 @@ compute_tsvector_stats(VacAttrStats *stats,
                          * able to find out the minimal and maximal frequency without
                          * going through all the values.  We keep those two extra
                          * frequencies in two extra cells in mcelem_freqs.
+                        *
+                        * (Note: the MCELEM statistics slot definition allows for a third
+                        * extra number containing the frequency of nulls, but we don't
+                        * create that for a tsvector column, since null elements aren't
+                        * possible.)
                          */
                         mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
                         mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));