granicus.if.org Git - postgresql/blob - src/include/catalog/pg_statistic.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * pg_statistic.h
   4  *        definition of the system "statistic" relation (pg_statistic).
   5  *        The relation has no initial contents (no DATA() statements here).
   6  *
   7  *
   8  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
   9  * Portions Copyright (c) 1994, Regents of the University of California
  10  *
  11  * $PostgreSQL: pgsql/src/include/catalog/pg_statistic.h,v 1.31 2006/03/05 15:58:55 momjian Exp $
  12  *
  13  * NOTES
  14  *        the genbki.sh script reads this file and generates .bki
  15  *        information from the DATA() statements.
  16  *
  17  *-------------------------------------------------------------------------
  18  */
  19 #ifndef PG_STATISTIC_H
  20 #define PG_STATISTIC_H
  21
  22 /* ----------------
  23  *              postgres.h contains the system type definitions and the
  24  *              CATALOG(), BKI_BOOTSTRAP and DATA() sugar words so this file
  25  *              can be read by both genbki.sh and the C compiler.
  26  * ----------------
  27  */
  28
  29 /*
  30  * Keep C compiler happy with anyarray, below.  This will need to go elsewhere
  31  * if we ever use anyarray for more than pg_statistic.
  32  */
  33 typedef struct varlena anyarray;
  34
  35 /* ----------------
  36  *              pg_statistic definition.  cpp turns this into
  37  *              typedef struct FormData_pg_statistic
  38  * ----------------
  39  */
  40 #define StatisticRelationId  2619
  41
  42 CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS
  43 {
  44         /* These fields form the unique key for the entry: */
  45         Oid                     starelid;               /* relation containing attribute */
  46         int2            staattnum;              /* attribute (column) stats are for */
  47
  48         /* the fraction of the column's entries that are NULL: */
  49         float4          stanullfrac;
  50
  51         /*
  52          * stawidth is the average width in bytes of non-null entries.  For
  53          * fixed-width datatypes this is of course the same as the typlen, but for
  54          * var-width types it is more useful.  Note that this is the average width
  55          * of the data as actually stored, post-TOASTing (eg, for a
  56          * moved-out-of-line value, only the size of the pointer object is
  57          * counted).  This is the appropriate definition for the primary use of
  58          * the statistic, which is to estimate sizes of in-memory hash tables of
  59          * tuples.
  60          */
  61         int4            stawidth;
  62
  63         /* ----------------
  64          * stadistinct indicates the (approximate) number of distinct non-null
  65          * data values in the column.  The interpretation is:
  66          *              0               unknown or not computed
  67          *              > 0             actual number of distinct values
  68          *              < 0             negative of multiplier for number of rows
  69          * The special negative case allows us to cope with columns that are
  70          * unique (stadistinct = -1) or nearly so (for example, a column in
  71          * which values appear about twice on the average could be represented
  72          * by stadistinct = -0.5).      Because the number-of-rows statistic in
  73          * pg_class may be updated more frequently than pg_statistic is, it's
  74          * important to be able to describe such situations as a multiple of
  75          * the number of rows, rather than a fixed number of distinct values.
  76          * But in other cases a fixed number is correct (eg, a boolean column).
  77          * ----------------
  78          */
  79         float4          stadistinct;
  80
  81         /* ----------------
  82          * To allow keeping statistics on different kinds of datatypes,
  83          * we do not hard-wire any particular meaning for the remaining
  84          * statistical fields.  Instead, we provide several "slots" in which
  85          * statistical data can be placed.      Each slot includes:
  86          *              kind                    integer code identifying kind of data
  87          *              op                              OID of associated operator, if needed
  88          *              numbers                 float4 array (for statistical values)
  89          *              values                  anyarray (for representations of data values)
  90          * The kind and operator fields are never NULL; they are zeroes in an
  91          * unused slot.  The numbers and values fields are NULL in an unused
  92          * slot, and might also be NULL in a used slot if the slot kind has
  93          * no need for one or the other.
  94          * ----------------
  95          */
  96
  97         int2            stakind1;               /* stakind1..4: kind code of each slot, 0 if unused */
  98         int2            stakind2;
  99         int2            stakind3;
 100         int2            stakind4;
 101
 102         Oid                     staop1;                 /* staop1..4: operator OID of each slot, 0 if unused */
 103         Oid                     staop2;
 104         Oid                     staop3;
 105         Oid                     staop4;
 106
 107         /*
 108          * THE REST OF THESE ARE VARIABLE LENGTH FIELDS, and may even be absent
 109          * (NULL). They cannot be accessed as C struct entries; you have to use
 110          * the full field access machinery (heap_getattr) for them.  We declare
 111          * them here for the catalog machinery.
 112          */
 113
 114         float4          stanumbers1[1];         /* stanumbers1..4: per-slot numeric array, or NULL */
 115         float4          stanumbers2[1];
 116         float4          stanumbers3[1];
 117         float4          stanumbers4[1];
 118
 119         /*
 120          * Values in these arrays are values of the column's data type.  We
 121          * presently have to cheat quite a bit to allow polymorphic arrays of this
 122          * kind, but perhaps someday it'll be a less bogus facility.
 123          */
 124         anyarray        stavalues1;             /* stavalues1..4: per-slot value array, or NULL */
 125         anyarray        stavalues2;
 126         anyarray        stavalues3;
 127         anyarray        stavalues4;
 128 } FormData_pg_statistic;
 129
 130 #define STATISTIC_NUM_SLOTS  4  /* count of stakindN/staopN/stanumbersN/stavaluesN slots */
 131
 132 /* ----------------
 133  *              Form_pg_statistic corresponds to a pointer to a tuple with
 134  *              the format of pg_statistic relation.
 135  * ----------------
 136  */
 137 typedef FormData_pg_statistic *Form_pg_statistic;
 138
 139 /* ----------------
 140  *              compiler constants for pg_statistic: column count and 1-based column numbers
 141  * ----------------
 142  */
 143 #define Natts_pg_statistic                              21      /* number of fields in FormData_pg_statistic */
 144 #define Anum_pg_statistic_starelid              1
 145 #define Anum_pg_statistic_staattnum             2
 146 #define Anum_pg_statistic_stanullfrac   3
 147 #define Anum_pg_statistic_stawidth              4
 148 #define Anum_pg_statistic_stadistinct   5
 149 #define Anum_pg_statistic_stakind1              6
 150 #define Anum_pg_statistic_stakind2              7
 151 #define Anum_pg_statistic_stakind3              8
 152 #define Anum_pg_statistic_stakind4              9
 153 #define Anum_pg_statistic_staop1                10
 154 #define Anum_pg_statistic_staop2                11
 155 #define Anum_pg_statistic_staop3                12
 156 #define Anum_pg_statistic_staop4                13
 157 #define Anum_pg_statistic_stanumbers1   14
 158 #define Anum_pg_statistic_stanumbers2   15
 159 #define Anum_pg_statistic_stanumbers3   16
 160 #define Anum_pg_statistic_stanumbers4   17
 161 #define Anum_pg_statistic_stavalues1    18
 162 #define Anum_pg_statistic_stavalues2    19
 163 #define Anum_pg_statistic_stavalues3    20
 164 #define Anum_pg_statistic_stavalues4    21
 165
 166 /*
 167  * Currently, three statistical slot "kinds" are defined: most common values,
 168  * histogram, and correlation.  Additional "kinds" will probably appear in
 169  * future to help cope with non-scalar datatypes.  Also, custom data types
 170  * can define their own "kind" codes by mutual agreement between a custom
 171  * typanalyze routine and the selectivity estimation functions of the type's
 172  * operators.
 173  *
 174  * Code reading the pg_statistic relation should not assume that a particular
 175  * data "kind" will appear in any particular slot.      Instead, search the
 176  * stakind fields to see if the desired data is available.      (The standard
 177  * function get_attstatsslot() may be used for this.)
 178  */
 179
 180 /*
 181  * The present allocation of "kind" codes is:
 182  *
 183  *      1-99:           reserved for assignment by the core PostgreSQL project
 184  *                              (values in this range will be documented in this file)
 185  *      100-199:        reserved for assignment by the PostGIS project
 186  *                              (values to be documented in PostGIS documentation)
 187  *      200-9999:       reserved for future public assignments
 188  *
 189  * For private use you may choose a "kind" code at random in the range
 190  * 10000-30000.  However, for code that is to be widely disseminated it is
 191  * better to obtain a publicly defined "kind" code by request from the
 192  * PostgreSQL Global Development Group.
 193  */
 194
 195 /*
 196  * In a "most common values" slot, staop is the OID of the "=" operator
 197  * used to decide whether values are the same or not.  stavalues contains
 198  * the K most common non-null values appearing in the column, and stanumbers
 199  * contains their frequencies (fractions of total row count).  The values
 200  * shall be ordered in decreasing frequency.  Note that since the arrays are
 201  * variable-size, K may be chosen by the statistics collector.  Values should
 202  * not appear in MCV unless they have been observed to occur more than once;
 203  * a unique column will have no MCV slot.
 204  */
 205 #define STATISTIC_KIND_MCV      1
 206
 207 /*
 208  * A "histogram" slot describes the distribution of scalar data.  staop is
 209  * the OID of the "<" operator that describes the sort ordering.  (In theory,
 210  * more than one histogram could appear, if a datatype has more than one
 211  * useful sort operator.)  stavalues contains M (>=2) non-null values that
 212  * divide the non-null column data values into M-1 bins of approximately equal
 213  * population.  The first stavalues item is the MIN and the last is the MAX.
 214  * stanumbers is not used and should be NULL.  IMPORTANT POINT: if an MCV
 215  * slot is also provided, then the histogram describes the data distribution
 216  * *after removing the values listed in MCV* (thus, it's a "compressed
 217  * histogram" in the technical parlance).  This allows a more accurate
 218  * representation of the distribution of a column with some very-common
 219  * values.      In a column with only a few distinct values, it's possible that
 220  * the MCV list describes the entire data population; in this case the
 221  * histogram reduces to empty and should be omitted.
 222  */
 223 #define STATISTIC_KIND_HISTOGRAM  2
 224
 225 /*
 226  * A "correlation" slot describes the correlation between the physical order
 227  * of table tuples and the ordering of data values of this column, as seen
 228  * by the "<" operator identified by staop.  (As with the histogram, more
 229  * than one entry could theoretically appear.)  stavalues is not used and
 230  * should be NULL.      stanumbers contains a single entry, the correlation
 231  * coefficient between the sequence of data values and the sequence of
 232  * their actual tuple positions.  The coefficient ranges from +1 to -1.
 233  */
 234 #define STATISTIC_KIND_CORRELATION      3
 235
 236 #endif   /* PG_STATISTIC_H */