* ts_typanalyze.c
* functions for gathering statistics from tsvector columns
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/ts_typanalyze.c,v 1.9 2010/05/30 21:59:02 tgl Exp $
+ * src/backend/tsearch/ts_typanalyze.c
*
*-------------------------------------------------------------------------
*/
#include "commands/vacuum.h"
#include "tsearch/ts_type.h"
#include "utils/builtins.h"
-#include "utils/hsearch.h"
/* A hash key for lexemes */
/*
* We want statistics_target * 10 lexemes in the MCELEM array. This
* multiplier is pretty arbitrary, but is meant to reflect the fact that
- * the number of individual lexeme values tracked in pg_statistic ought
- * to be more than the number of values for a simple scalar column.
+ * the number of individual lexeme values tracked in pg_statistic ought to
+ * be more than the number of values for a simple scalar column.
*/
num_mcelem = stats->attr->attstattarget * 10;
hash_ctl.match = lexeme_match;
hash_ctl.hcxt = CurrentMemoryContext;
lexemes_tab = hash_create("Analyzed lexemes table",
- bucket_width * 7,
+ num_mcelem,
&hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
/*
* We loop through the lexemes in the tsvector and add them to our
- * tracking hashtable. Note: the hashtable entries will point into
+ * tracking hashtable. Note: the hashtable entries will point into
* the (detoasted) tsvector value, therefore we cannot free that
* storage until we're done.
*/
*/
cutoff_freq = 9 * lexeme_no / bucket_width;
- i = hash_get_num_entries(lexemes_tab); /* surely enough space */
+ i = hash_get_num_entries(lexemes_tab); /* surely enough space */
sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);
hash_seq_init(&scan_status, lexemes_tab);
num_mcelem, bucket_width, lexeme_no, i, track_len);
/*
- * If we obtained more lexemes than we really want, get rid of
- * those with least frequencies. The easiest way is to qsort the
- * array into descending frequency order and truncate the array.
+ * If we obtained more lexemes than we really want, get rid of those
+ * with least frequencies. The easiest way is to qsort the array into
+ * descending frequency order and truncate the array.
*/
if (num_mcelem < track_len)
{
* they get sorted on frequencies. The rationale is that we
* usually search through most common elements looking for a
* specific value, so we can grab its frequency. When values are
- * presorted we can employ binary search for that. See
+ * presorted we can employ binary search for that. See
* ts_selfuncs.c for a real usage scenario.
*/
qsort(sort_table, num_mcelem, sizeof(TrackItem *),
* able to find out the minimal and maximal frequency without
* going through all the values. We keep those two extra
* frequencies in two extra cells in mcelem_freqs.
+ *
+ * (Note: the MCELEM statistics slot definition allows for a third
+ * extra number containing the frequency of nulls, but we don't
+ * create that for a tsvector column, since null elements aren't
+ * possible.)
*/
mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));
/*
- * See comments above about use of nonnull_cnt as the divisor
- * for the final frequency estimates.
+ * See comments above about use of nonnull_cnt as the divisor for
+ * the final frequency estimates.
*/
for (i = 0; i < num_mcelem; i++)
{