granicus.if.org Git - postgresql/blob - src/include/catalog/pg_statistic.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * pg_statistic.h
   4  *        definition of the system "statistic" relation (pg_statistic)
   5  *        along with the relation's initial contents.
   6  *
   7  *
   8  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
   9  * Portions Copyright (c) 1994, Regents of the University of California
  10  *
  11  * $Id: pg_statistic.h,v 1.12 2001/08/10 18:57:41 tgl Exp $
  12  *
  13  * NOTES
  14  *        the genbki.sh script reads this file and generates .bki
  15  *        information from the DATA() statements.
  16  *
  17  *-------------------------------------------------------------------------
  18  */
  19 #ifndef PG_STATISTIC_H
  20 #define PG_STATISTIC_H
  21
  22 /* ----------------
  23  *              postgres.h contains the system type definitions and the
  24  *              CATALOG(), BOOTSTRAP and DATA() sugar words so this file
  25  *              can be read by both genbki.sh and the C compiler.
  26  * ----------------
  27  */
  28
  29 /* ----------------
  30  *              pg_statistic definition.  cpp turns this into
  31  *              typedef struct FormData_pg_statistic
  32  * ----------------
  33  */
  34 CATALOG(pg_statistic) BKI_WITHOUT_OIDS
  35 {
  36         /* These fields form the unique key for the entry: */
  37         Oid                     starelid;               /* relation containing attribute */
  38         int2            staattnum;              /* attribute (column) stats are for */
  39
  40         /* the fraction of the column's entries that are NULL: */
  41         float4          stanullfrac;
  42
  43         /*
  44          * stawidth is the average width in bytes of non-null entries.  For
  45          * fixed-width datatypes this is of course the same as the typlen, but
  46          * for varlena types it is more useful.  Note that this is the average
  47          * width of the data as actually stored, post-TOASTing (eg, for a
  48          * moved-out-of-line value, only the size of the pointer object is
  49          * counted).  This is the appropriate definition for the primary use of
  50          * the statistic, which is to estimate sizes of in-memory hash tables of
  51          * tuples.
  52          */
  53         int4            stawidth;
  54
  55         /* ----------------
  56          * stadistinct indicates the (approximate) number of distinct non-null
  57          * data values in the column.  The interpretation is:
  58          *              0               unknown or not computed
  59          *              > 0             actual number of distinct values
  60          *              < 0             negative of multiplier for number of rows
  61          * The special negative case allows us to cope with columns that are
  62          * unique (stadistinct = -1) or nearly so (for example, a column in
  63          * which values appear about twice on the average could be represented
  64          * by stadistinct = -0.5).  Because the number-of-rows statistic in
  65          * pg_class may be updated more frequently than pg_statistic is, it's
  66          * important to be able to describe such situations as a multiple of
  67          * the number of rows, rather than a fixed number of distinct values.
  68          * But in other cases a fixed number is correct (eg, a boolean column).
  69          * ----------------
  70          */
  71         float4          stadistinct;
  72
  73         /* ----------------
  74          * To allow keeping statistics on different kinds of datatypes,
  75          * we do not hard-wire any particular meaning for the remaining
  76          * statistical fields.  Instead, we provide several "slots" in which
  77          * statistical data can be placed.  Each slot includes:
  78          *              kind                    integer code identifying kind of data
  79          *              op                              OID of associated operator, if needed
  80          *              numbers                 float4 array (for statistical values)
  81          *              values                  text array (for representations of data values)
  82          * The kind and operator fields are never NULL; they are zeroes in an
  83          * unused slot.  The numbers and values fields are NULL in an unused
  84          * slot, and might also be NULL in a used slot if the slot kind has
  85          * no need for one or the other.
  86          * ----------------
  87          */
  88
  89         int2            stakind1;               /* stakindN: kind code of slot N, 0 if unused */
  90         int2            stakind2;
  91         int2            stakind3;
  92         int2            stakind4;
  93
  94         Oid                     staop1;                 /* staopN: operator OID of slot N, 0 if unused */
  95         Oid                     staop2;
  96         Oid                     staop3;
  97         Oid                     staop4;
  98
  99         /*
 100          * THE REST OF THESE ARE VARIABLE LENGTH FIELDS, and may even be absent
 101          * (NULL). They cannot be accessed as C struct entries; you have to use
 102          * the full field access machinery (heap_getattr) for them.  We declare
 103          * them here for the catalog machinery.
 104          */
 105
 106         float4          stanumbers1[1];         /* stanumbersN: float4 array of slot N */
 107         float4          stanumbers2[1];
 108         float4          stanumbers3[1];
 109         float4          stanumbers4[1];
 110
 111         /*
 112          * Values in these text arrays are external representations of values
 113          * of the column's data type.  To re-create the actual Datum, do
 114          * datatypein(textout(arrayelement)).
 115          */
 116         text            stavalues1[1];
 117         text            stavalues2[1];
 118         text            stavalues3[1];
 119         text            stavalues4[1];
 120 } FormData_pg_statistic;
 121
 122 #define STATISTIC_NUM_SLOTS  4  /* number of kind/op/numbers/values slots per row */
 123
 124 /* ----------------
 125  *              Form_pg_statistic corresponds to a pointer to a tuple with
 126  *              the format of the pg_statistic relation.
 127  * ----------------
 128  */
 129 typedef FormData_pg_statistic *Form_pg_statistic;
 130
 131 /* ----------------
 132  *              compiler constants for pg_statistic
 133  * ----------------
 134  */
 135 #define Natts_pg_statistic                              21      /* total number of columns */
 136 #define Anum_pg_statistic_starelid              1       /* Anum_*: 1-based column numbers */
 137 #define Anum_pg_statistic_staattnum             2
 138 #define Anum_pg_statistic_stanullfrac   3
 139 #define Anum_pg_statistic_stawidth              4
 140 #define Anum_pg_statistic_stadistinct   5
 141 #define Anum_pg_statistic_stakind1              6
 142 #define Anum_pg_statistic_stakind2              7
 143 #define Anum_pg_statistic_stakind3              8
 144 #define Anum_pg_statistic_stakind4              9
 145 #define Anum_pg_statistic_staop1                10
 146 #define Anum_pg_statistic_staop2                11
 147 #define Anum_pg_statistic_staop3                12
 148 #define Anum_pg_statistic_staop4                13
 149 #define Anum_pg_statistic_stanumbers1   14
 150 #define Anum_pg_statistic_stanumbers2   15
 151 #define Anum_pg_statistic_stanumbers3   16
 152 #define Anum_pg_statistic_stanumbers4   17
 153 #define Anum_pg_statistic_stavalues1    18
 154 #define Anum_pg_statistic_stavalues2    19
 155 #define Anum_pg_statistic_stavalues3    20
 156 #define Anum_pg_statistic_stavalues4    21
 157
 158 /*
 159  * Currently, three statistical slot "kinds" are defined: most common values,
 160  * histogram, and correlation.  Additional "kinds" will probably appear in
 161  * future to help cope with non-scalar datatypes.
 162  *
 163  * Code reading the pg_statistic relation should not assume that a particular
 164  * data "kind" will appear in any particular slot.  Instead, search the
 165  * stakind fields to see if the desired data is available.
 166  */
 167
 168 /*
 169  * In a "most common values" slot, staop is the OID of the "=" operator
 170  * used to decide whether values are the same or not.  stavalues contains
 171  * the K most common non-null values appearing in the column, and stanumbers
 172  * contains their frequencies (fractions of total row count).  The values
 173  * shall be ordered in decreasing frequency.  Note that since the arrays are
 174  * variable-size, K may be chosen by the statistics collector.  Values should
 175  * not appear in MCV unless they have been observed to occur more than once;
 176  * a unique column will have no MCV slot.
 177  */
 178 #define STATISTIC_KIND_MCV  1   /* stakind code: most common values */
 179
 180 /*
 181  * A "histogram" slot describes the distribution of scalar data.  staop is
 182  * the OID of the "<" operator that describes the sort ordering.  (In theory,
 183  * more than one histogram could appear, if a datatype has more than one
 184  * useful sort operator.)  stavalues contains M (>=2) non-null values that
 185  * divide the non-null column data values into M-1 bins of approximately equal
 186  * population.  The first stavalues item is the MIN and the last is the MAX.
 187  * stanumbers is not used and should be NULL.  IMPORTANT POINT: if an MCV
 188  * slot is also provided, then the histogram describes the data distribution
 189  * *after removing the values listed in MCV* (thus, it's a "compressed
 190  * histogram" in the technical parlance).  This allows a more accurate
 191  * representation of the distribution of a column with some very-common
 192  * values.  In a column with only a few distinct values, it's possible that
 193  * the MCV list describes the entire data population; in this case the
 194  * histogram reduces to empty and should be omitted.
 195  */
 196 #define STATISTIC_KIND_HISTOGRAM  2     /* stakind code: histogram */
 197
 198 /*
 199  * A "correlation" slot describes the correlation between the physical order
 200  * of table tuples and the ordering of data values of this column, as seen
 201  * by the "<" operator identified by staop.  (As with the histogram, more
 202  * than one entry could theoretically appear.)  stavalues is not used and
 203  * should be NULL.  stanumbers contains a single entry, the correlation
 204  * coefficient between the sequence of data values and the sequence of
 205  * their actual tuple positions.  The coefficient ranges from +1 to -1.
 206  */
 207 #define STATISTIC_KIND_CORRELATION  3   /* stakind code: correlation */
 208
 209 #endif   /* PG_STATISTIC_H */