granicus.if.org Git - postgresql/blob - src/backend/utils/adt/array_selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * array_selfuncs.c
   4  *        Functions for selectivity estimation of array operators
   5  *
   6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/utils/adt/array_selfuncs.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <math.h>
  18
  19 #include "access/htup_details.h"
  20 #include "catalog/pg_collation.h"
  21 #include "catalog/pg_operator.h"
  22 #include "catalog/pg_statistic.h"
  23 #include "utils/array.h"
  24 #include "utils/builtins.h"
  25 #include "utils/lsyscache.h"
  26 #include "utils/selfuncs.h"
  27 #include "utils/typcache.h"
  28
  29
  30 /* Default selectivity constant for "@>" and "<@" operators */
  31 #define DEFAULT_CONTAIN_SEL 0.005
  32
  33 /* Default selectivity constant for "&&" operator */
  34 #define DEFAULT_OVERLAP_SEL 0.01
  35
  36 /* Default selectivity for given operator */
  37 #define DEFAULT_SEL(operator) \
  38         ((operator) == OID_ARRAY_OVERLAP_OP ? \
  39                 DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
  40
  41 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
  42                                   Oid elemtype, Oid operator);
  43 static Selectivity mcelem_array_selec(ArrayType *array,
  44                                    TypeCacheEntry *typentry,
  45                                    Datum *mcelem, int nmcelem,
  46                                    float4 *numbers, int nnumbers,
  47                                    float4 *hist, int nhist,
  48                                    Oid operator);
  49 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
  50                                                                    float4 *numbers, int nnumbers,
  51                                                                    Datum *array_data, int nitems,
  52                                                                    Oid operator, TypeCacheEntry *typentry);
  53 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
  54                                                          float4 *numbers, int nnumbers,
  55                                                          Datum *array_data, int nitems,
  56                                                          float4 *hist, int nhist,
  57                                                          Oid operator, TypeCacheEntry *typentry);
  58 static float *calc_hist(const float4 *hist, int nhist, int n);
  59 static float *calc_distr(const float *p, int n, int m, float rest);
  60 static int      floor_log2(uint32 n);
  61 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
  62                                  int *index, TypeCacheEntry *typentry);
  63 static int      element_compare(const void *key1, const void *key2, void *arg);
  64 static int      float_compare_desc(const void *key1, const void *key2);
  65
  66
  67 /*
  68  * scalararraysel_containment
  69  *              Estimate selectivity of ScalarArrayOpExpr via array containment.
  70  *
  71  * If we have const =/<> ANY/ALL (array_var) then we can estimate the
  72  * selectivity as though this were an array containment operator,
  73  * array_var op ARRAY[const].
  74  *
  75  * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
  76  * is the array element type's default equality or inequality operator, and
  77  * has aggressively simplified both inputs to constants.
  78  *
  79  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
  80  */
  81 Selectivity
  82 scalararraysel_containment(PlannerInfo *root,
  83                                                    Node *leftop, Node *rightop,
  84                                                    Oid elemtype, bool isEquality, bool useOr,
  85                                                    int varRelid)
  86 {
  87         Selectivity selec;
  88         VariableStatData vardata;
  89         Datum           constval;
  90         TypeCacheEntry *typentry;
  91         FmgrInfo   *cmpfunc;
  92
  93         /*
  94          * rightop must be a variable, else punt.
  95          */
  96         examine_variable(root, rightop, varRelid, &vardata);
  97         if (!vardata.rel)
  98         {
  99                 ReleaseVariableStats(vardata);
 100                 return -1.0;
 101         }
 102
 103         /*
 104          * leftop must be a constant, else punt.
 105          */
 106         if (!IsA(leftop, Const))
 107         {
 108                 ReleaseVariableStats(vardata);
 109                 return -1.0;
 110         }
 111         if (((Const *) leftop)->constisnull)
 112         {
 113                 /* qual can't succeed if null on left */
 114                 ReleaseVariableStats(vardata);
 115                 return (Selectivity) 0.0;
 116         }
 117         constval = ((Const *) leftop)->constvalue;
 118
 119         /* Get element type's default comparison function */
 120         typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
 121         if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
 122         {
 123                 ReleaseVariableStats(vardata);
 124                 return -1.0;
 125         }
 126         cmpfunc = &typentry->cmp_proc_finfo;
 127
 128         /*
 129          * If the operator is <>, swap ANY/ALL, then invert the result later.
 130          */
 131         if (!isEquality)
 132                 useOr = !useOr;
 133
 134         /* Get array element stats for var, if available */
 135         if (HeapTupleIsValid(vardata.statsTuple) &&
 136                 statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
 137         {
 138                 Form_pg_statistic stats;
 139                 AttStatsSlot sslot;
 140                 AttStatsSlot hslot;
 141
 142                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 143
 144                 /* MCELEM will be an array of same type as element */
 145                 if (get_attstatsslot(&sslot, vardata.statsTuple,
 146                                                          STATISTIC_KIND_MCELEM, InvalidOid,
 147                                                          ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
 148                 {
 149                         /* For ALL case, also get histogram of distinct-element counts */
 150                         if (useOr ||
 151                                 !get_attstatsslot(&hslot, vardata.statsTuple,
 152                                                                   STATISTIC_KIND_DECHIST, InvalidOid,
 153                                                                   ATTSTATSSLOT_NUMBERS))
 154                                 memset(&hslot, 0, sizeof(hslot));
 155
 156                         /*
 157                          * For = ANY, estimate as var @> ARRAY[const].
 158                          *
 159                          * For = ALL, estimate as var <@ ARRAY[const].
 160                          */
 161                         if (useOr)
 162                                 selec = mcelem_array_contain_overlap_selec(sslot.values,
 163                                                                                                                    sslot.nvalues,
 164                                                                                                                    sslot.numbers,
 165                                                                                                                    sslot.nnumbers,
 166                                                                                                                    &constval, 1,
 167                                                                                                                    OID_ARRAY_CONTAINS_OP,
 168                                                                                                                    typentry);
 169                         else
 170                                 selec = mcelem_array_contained_selec(sslot.values,
 171                                                                                                          sslot.nvalues,
 172                                                                                                          sslot.numbers,
 173                                                                                                          sslot.nnumbers,
 174                                                                                                          &constval, 1,
 175                                                                                                          hslot.numbers,
 176                                                                                                          hslot.nnumbers,
 177                                                                                                          OID_ARRAY_CONTAINED_OP,
 178                                                                                                          typentry);
 179
 180                         free_attstatsslot(&hslot);
 181                         free_attstatsslot(&sslot);
 182                 }
 183                 else
 184                 {
 185                         /* No most-common-elements info, so do without */
 186                         if (useOr)
 187                                 selec = mcelem_array_contain_overlap_selec(NULL, 0,
 188                                                                                                                    NULL, 0,
 189                                                                                                                    &constval, 1,
 190                                                                                                                    OID_ARRAY_CONTAINS_OP,
 191                                                                                                                    typentry);
 192                         else
 193                                 selec = mcelem_array_contained_selec(NULL, 0,
 194                                                                                                          NULL, 0,
 195                                                                                                          &constval, 1,
 196                                                                                                          NULL, 0,
 197                                                                                                          OID_ARRAY_CONTAINED_OP,
 198                                                                                                          typentry);
 199                 }
 200
 201                 /*
 202                  * MCE stats count only non-null rows, so adjust for null rows.
 203                  */
 204                 selec *= (1.0 - stats->stanullfrac);
 205         }
 206         else
 207         {
 208                 /* No stats at all, so do without */
 209                 if (useOr)
 210                         selec = mcelem_array_contain_overlap_selec(NULL, 0,
 211                                                                                                            NULL, 0,
 212                                                                                                            &constval, 1,
 213                                                                                                            OID_ARRAY_CONTAINS_OP,
 214                                                                                                            typentry);
 215                 else
 216                         selec = mcelem_array_contained_selec(NULL, 0,
 217                                                                                                  NULL, 0,
 218                                                                                                  &constval, 1,
 219                                                                                                  NULL, 0,
 220                                                                                                  OID_ARRAY_CONTAINED_OP,
 221                                                                                                  typentry);
 222                 /* we assume no nulls here, so no stanullfrac correction */
 223         }
 224
 225         ReleaseVariableStats(vardata);
 226
 227         /*
 228          * If the operator is <>, invert the results.
 229          */
 230         if (!isEquality)
 231                 selec = 1.0 - selec;
 232
 233         CLAMP_PROBABILITY(selec);
 234
 235         return selec;
 236 }
 237
 238 /*
 239  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
 240  */
 241 Datum
 242 arraycontsel(PG_FUNCTION_ARGS)
 243 {
 244         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 245         Oid                     operator = PG_GETARG_OID(1);
 246         List       *args = (List *) PG_GETARG_POINTER(2);
 247         int                     varRelid = PG_GETARG_INT32(3);
 248         VariableStatData vardata;
 249         Node       *other;
 250         bool            varonleft;
 251         Selectivity selec;
 252         Oid                     element_typeid;
 253
 254         /*
 255          * If expression is not (variable op something) or (something op
 256          * variable), then punt and return a default estimate.
 257          */
 258         if (!get_restriction_variable(root, args, varRelid,
 259                                                                   &vardata, &other, &varonleft))
 260                 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
 261
 262         /*
 263          * Can't do anything useful if the something is not a constant, either.
 264          */
 265         if (!IsA(other, Const))
 266         {
 267                 ReleaseVariableStats(vardata);
 268                 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
 269         }
 270
 271         /*
 272          * The "&&", "@>" and "<@" operators are strict, so we can cope with a
 273          * NULL constant right away.
 274          */
 275         if (((Const *) other)->constisnull)
 276         {
 277                 ReleaseVariableStats(vardata);
 278                 PG_RETURN_FLOAT8(0.0);
 279         }
 280
 281         /*
 282          * If var is on the right, commute the operator, so that we can assume the
 283          * var is on the left in what follows.
 284          */
 285         if (!varonleft)
 286         {
 287                 if (operator == OID_ARRAY_CONTAINS_OP)
 288                         operator = OID_ARRAY_CONTAINED_OP;
 289                 else if (operator == OID_ARRAY_CONTAINED_OP)
 290                         operator = OID_ARRAY_CONTAINS_OP;
 291         }
 292
 293         /*
 294          * OK, there's a Var and a Const we're dealing with here.  We need the
 295          * Const to be an array with same element type as column, else we can't do
 296          * anything useful.  (Such cases will likely fail at runtime, but here
 297          * we'd rather just return a default estimate.)
 298          */
 299         element_typeid = get_base_element_type(((Const *) other)->consttype);
 300         if (element_typeid != InvalidOid &&
 301                 element_typeid == get_base_element_type(vardata.vartype))
 302         {
 303                 selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
 304                                                                   element_typeid, operator);
 305         }
 306         else
 307         {
 308                 selec = DEFAULT_SEL(operator);
 309         }
 310
 311         ReleaseVariableStats(vardata);
 312
 313         CLAMP_PROBABILITY(selec);
 314
 315         PG_RETURN_FLOAT8((float8) selec);
 316 }
 317
 318 /*
 319  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
 320  */
 321 Datum
 322 arraycontjoinsel(PG_FUNCTION_ARGS)
 323 {
 324         /* For the moment this is just a stub */
 325         Oid                     operator = PG_GETARG_OID(1);
 326
 327         PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
 328 }
 329
 330 /*
 331  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
 332  * or "arraycolumn <@ const" based on the statistics
 333  *
 334  * This function is mainly responsible for extracting the pg_statistic data
 335  * to be used; we then pass the problem on to mcelem_array_selec().
 336  */
 337 static Selectivity
 338 calc_arraycontsel(VariableStatData *vardata, Datum constval,
 339                                   Oid elemtype, Oid operator)
 340 {
 341         Selectivity selec;
 342         TypeCacheEntry *typentry;
 343         FmgrInfo   *cmpfunc;
 344         ArrayType  *array;
 345
 346         /* Get element type's default comparison function */
 347         typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
 348         if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
 349                 return DEFAULT_SEL(operator);
 350         cmpfunc = &typentry->cmp_proc_finfo;
 351
 352         /*
 353          * The caller made sure the const is an array with same element type, so
 354          * get it now
 355          */
 356         array = DatumGetArrayTypeP(constval);
 357
 358         if (HeapTupleIsValid(vardata->statsTuple) &&
 359                 statistic_proc_security_check(vardata, cmpfunc->fn_oid))
 360         {
 361                 Form_pg_statistic stats;
 362                 AttStatsSlot sslot;
 363                 AttStatsSlot hslot;
 364
 365                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 366
 367                 /* MCELEM will be an array of same type as column */
 368                 if (get_attstatsslot(&sslot, vardata->statsTuple,
 369                                                          STATISTIC_KIND_MCELEM, InvalidOid,
 370                                                          ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
 371                 {
 372                         /*
 373                          * For "array <@ const" case we also need histogram of distinct
 374                          * element counts.
 375                          */
 376                         if (operator != OID_ARRAY_CONTAINED_OP ||
 377                                 !get_attstatsslot(&hslot, vardata->statsTuple,
 378                                                                   STATISTIC_KIND_DECHIST, InvalidOid,
 379                                                                   ATTSTATSSLOT_NUMBERS))
 380                                 memset(&hslot, 0, sizeof(hslot));
 381
 382                         /* Use the most-common-elements slot for the array Var. */
 383                         selec = mcelem_array_selec(array, typentry,
 384                                                                            sslot.values, sslot.nvalues,
 385                                                                            sslot.numbers, sslot.nnumbers,
 386                                                                            hslot.numbers, hslot.nnumbers,
 387                                                                            operator);
 388
 389                         free_attstatsslot(&hslot);
 390                         free_attstatsslot(&sslot);
 391                 }
 392                 else
 393                 {
 394                         /* No most-common-elements info, so do without */
 395                         selec = mcelem_array_selec(array, typentry,
 396                                                                            NULL, 0, NULL, 0, NULL, 0,
 397                                                                            operator);
 398                 }
 399
 400                 /*
 401                  * MCE stats count only non-null rows, so adjust for null rows.
 402                  */
 403                 selec *= (1.0 - stats->stanullfrac);
 404         }
 405         else
 406         {
 407                 /* No stats at all, so do without */
 408                 selec = mcelem_array_selec(array, typentry,
 409                                                                    NULL, 0, NULL, 0, NULL, 0,
 410                                                                    operator);
 411                 /* we assume no nulls here, so no stanullfrac correction */
 412         }
 413
 414         /* If constant was toasted, release the copy we made */
 415         if (PointerGetDatum(array) != constval)
 416                 pfree(array);
 417
 418         return selec;
 419 }
 420
 421 /*
 422  * Array selectivity estimation based on most common elements statistics
 423  *
 424  * This function just deconstructs and sorts the array constant's contents,
 425  * and then passes the problem on to mcelem_array_contain_overlap_selec or
 426  * mcelem_array_contained_selec depending on the operator.
 427  */
 428 static Selectivity
 429 mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
 430                                    Datum *mcelem, int nmcelem,
 431                                    float4 *numbers, int nnumbers,
 432                                    float4 *hist, int nhist,
 433                                    Oid operator)
 434 {
 435         Selectivity selec;
 436         int                     num_elems;
 437         Datum      *elem_values;
 438         bool       *elem_nulls;
 439         bool            null_present;
 440         int                     nonnull_nitems;
 441         int                     i;
 442
 443         /*
 444          * Prepare constant array data for sorting.  Sorting lets us find unique
 445          * elements and efficiently merge with the MCELEM array.
 446          */
 447         deconstruct_array(array,
 448                                           typentry->type_id,
 449                                           typentry->typlen,
 450                                           typentry->typbyval,
 451                                           typentry->typalign,
 452                                           &elem_values, &elem_nulls, &num_elems);
 453
 454         /* Collapse out any null elements */
 455         nonnull_nitems = 0;
 456         null_present = false;
 457         for (i = 0; i < num_elems; i++)
 458         {
 459                 if (elem_nulls[i])
 460                         null_present = true;
 461                 else
 462                         elem_values[nonnull_nitems++] = elem_values[i];
 463         }
 464
 465         /*
 466          * Query "column @> '{anything, null}'" matches nothing.  For the other
 467          * two operators, presence of a null in the constant can be ignored.
 468          */
 469         if (null_present && operator == OID_ARRAY_CONTAINS_OP)
 470         {
 471                 pfree(elem_values);
 472                 pfree(elem_nulls);
 473                 return (Selectivity) 0.0;
 474         }
 475
 476         /* Sort extracted elements using their default comparison function. */
 477         qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
 478                           element_compare, typentry);
 479
 480         /* Separate cases according to operator */
 481         if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
 482                 selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
 483                                                                                                    numbers, nnumbers,
 484                                                                                                    elem_values, nonnull_nitems,
 485                                                                                                    operator, typentry);
 486         else if (operator == OID_ARRAY_CONTAINED_OP)
 487                 selec = mcelem_array_contained_selec(mcelem, nmcelem,
 488                                                                                          numbers, nnumbers,
 489                                                                                          elem_values, nonnull_nitems,
 490                                                                                          hist, nhist,
 491                                                                                          operator, typentry);
 492         else
 493         {
 494                 elog(ERROR, "arraycontsel called for unrecognized operator %u",
 495                          operator);
 496                 selec = 0.0;                    /* keep compiler quiet */
 497         }
 498
 499         pfree(elem_values);
 500         pfree(elem_nulls);
 501         return selec;
 502 }
 503
 504 /*
 505  * Estimate selectivity of "column @> const" and "column && const" based on
 506  * most common element statistics.  This estimation assumes element
 507  * occurrences are independent.
 508  *
 509  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
 510  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
 511  * not available.  array_data (of length nitems) is the constant's elements.
 512  *
 513  * Both the mcelem and array_data arrays are assumed presorted according
 514  * to the element type's cmpfunc.  Null elements are not present.
 515  *
 516  * TODO: this estimate probably could be improved by using the distinct
 517  * elements count histogram.  For example, excepting the special case of
 518  * "column @> '{}'", we can multiply the calculated selectivity by the
 519  * fraction of nonempty arrays in the column.
 520  */
 521 static Selectivity
 522 mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
 523                                                                    float4 *numbers, int nnumbers,
 524                                                                    Datum *array_data, int nitems,
 525                                                                    Oid operator, TypeCacheEntry *typentry)
 526 {
 527         Selectivity selec,
 528                                 elem_selec;
 529         int                     mcelem_index,
 530                                 i;
 531         bool            use_bsearch;
 532         float4          minfreq;
 533
 534         /*
 535          * There should be three more Numbers than Values, because the last three
 536          * cells should hold minimal and maximal frequency among the non-null
 537          * elements, and then the frequency of null elements.  Ignore the Numbers
 538          * if not right.
 539          */
 540         if (nnumbers != nmcelem + 3)
 541         {
 542                 numbers = NULL;
 543                 nnumbers = 0;
 544         }
 545
 546         if (numbers)
 547         {
 548                 /* Grab the lowest observed frequency */
 549                 minfreq = numbers[nmcelem];
 550         }
 551         else
 552         {
 553                 /* Without statistics make some default assumptions */
 554                 minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
 555         }
 556
 557         /* Decide whether it is faster to use binary search or not. */
 558         if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
 559                 use_bsearch = true;
 560         else
 561                 use_bsearch = false;
 562
 563         if (operator == OID_ARRAY_CONTAINS_OP)
 564         {
 565                 /*
 566                  * Initial selectivity for "column @> const" query is 1.0, and it will
 567                  * be decreased with each element of constant array.
 568                  */
 569                 selec = 1.0;
 570         }
 571         else
 572         {
 573                 /*
 574                  * Initial selectivity for "column && const" query is 0.0, and it will
 575                  * be increased with each element of constant array.
 576                  */
 577                 selec = 0.0;
 578         }
 579
 580         /* Scan mcelem and array in parallel. */
 581         mcelem_index = 0;
 582         for (i = 0; i < nitems; i++)
 583         {
 584                 bool            match = false;
 585
 586                 /* Ignore any duplicates in the array data. */
 587                 if (i > 0 &&
 588                         element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
 589                         continue;
 590
 591                 /* Find the smallest MCELEM >= this array item. */
 592                 if (use_bsearch)
 593                 {
 594                         match = find_next_mcelem(mcelem, nmcelem, array_data[i],
 595                                                                          &mcelem_index, typentry);
 596                 }
 597                 else
 598                 {
 599                         while (mcelem_index < nmcelem)
 600                         {
 601                                 int                     cmp = element_compare(&mcelem[mcelem_index],
 602                                                                                                   &array_data[i],
 603                                                                                                   typentry);
 604
 605                                 if (cmp < 0)
 606                                         mcelem_index++;
 607                                 else
 608                                 {
 609                                         if (cmp == 0)
 610                                                 match = true;   /* mcelem is found */
 611                                         break;
 612                                 }
 613                         }
 614                 }
 615
 616                 if (match && numbers)
 617                 {
 618                         /* MCELEM matches the array item; use its frequency. */
 619                         elem_selec = numbers[mcelem_index];
 620                         mcelem_index++;
 621                 }
 622                 else
 623                 {
 624                         /*
 625                          * The element is not in MCELEM.  Punt, but assume that the
 626                          * selectivity cannot be more than minfreq / 2.
 627                          */
 628                         elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
 629                 }
 630
 631                 /*
 632                  * Update overall selectivity using the current element's selectivity
 633                  * and an assumption of element occurrence independence.
 634                  */
 635                 if (operator == OID_ARRAY_CONTAINS_OP)
 636                         selec *= elem_selec;
 637                 else
 638                         selec = selec + elem_selec - selec * elem_selec;
 639
 640                 /* Clamp intermediate results to stay sane despite roundoff error */
 641                 CLAMP_PROBABILITY(selec);
 642         }
 643
 644         return selec;
 645 }
 646
 647 /*
 648  * Estimate selectivity of "column <@ const" based on most common element
 649  * statistics.
 650  *
 651  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
 652  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
 653  * not available.  array_data (of length nitems) is the constant's elements.
 654  * hist (of length nhist) is from the array column's DECHIST statistics slot,
 655  * or is NULL/0 if those stats are not available.
 656  *
 657  * Both the mcelem and array_data arrays are assumed presorted according
 658  * to the element type's cmpfunc.  Null elements are not present.
 659  *
 660  * Independent element occurrence would imply a particular distribution of
 661  * distinct element counts among matching rows.  Real data usually falsifies
 662  * that assumption.  For example, in a set of 11-element integer arrays having
 663  * elements in the range [0..10], element occurrences are typically not
 664  * independent.  If they were, a sufficiently-large set would include all
 665  * distinct element counts 0 through 11.  We correct for this using the
 666  * histogram of distinct element counts.
 667  *
 668  * In the "column @> const" and "column && const" cases, we usually have a
 669  * "const" with low number of elements (otherwise we have selectivity close
 670  * to 0 or 1 respectively).  That's why the effect of dependence related
 671  * to distinct element count distribution is negligible there.  In the
 672  * "column <@ const" case, number of elements is usually high (otherwise we
 673  * have selectivity close to 0).  That's why we should do a correction with
 674  * the array distinct element count distribution here.
 675  *
 676  * Using the histogram of distinct element counts produces a different
 677  * distribution law than independent occurrences of elements.  This
 678  * distribution law can be described as follows:
 679  *
 680  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
 681  *        (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
 682  *
 683  * where:
 684  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
 685  *              (1 - occurrence, 0 - no occurrence) in row
 686  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
 687  *              (scalar values in [0..1]) according to collected statistics
 688  * m = o1 + o2 + ... + on = total number of distinct elements in row
 689  * hist[m] - histogram data for occurrence of m elements.
 690  * ind[m] - probability of m occurrences from n events assuming their
 691  *        probabilities to be equal to frequencies of array elements.
 692  *
 693  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
 694  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
 695  */
 696 static Selectivity
 697 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
 698                                                          float4 *numbers, int nnumbers,
 699                                                          Datum *array_data, int nitems,
 700                                                          float4 *hist, int nhist,
 701                                                          Oid operator, TypeCacheEntry *typentry)
 702 {
 703         int                     mcelem_index,
 704                                 i,
 705                                 unique_nitems = 0;
 706         float           selec,
 707                                 minfreq,
 708                                 nullelem_freq;
 709         float      *dist,
 710                            *mcelem_dist,
 711                            *hist_part;
 712         float           avg_count,
 713                                 mult,
 714                                 rest;
 715         float      *elem_selec;
 716
 717         /*
 718          * There should be three more Numbers than Values in the MCELEM slot,
 719          * because the last three cells should hold minimal and maximal frequency
 720          * among the non-null elements, and then the frequency of null elements.
 721          * Punt if not right, because we can't do much without the element freqs.
 722          */
 723         if (numbers == NULL || nnumbers != nmcelem + 3)
 724                 return DEFAULT_CONTAIN_SEL;
 725
 726         /* Can't do much without a count histogram, either */
 727         if (hist == NULL || nhist < 3)
 728                 return DEFAULT_CONTAIN_SEL;
 729
 730         /*
 731          * Grab some of the summary statistics that compute_array_stats() stores:
 732          * lowest frequency, frequency of null elements, and average distinct
 733          * element count.
 734          */
 735         minfreq = numbers[nmcelem];
 736         nullelem_freq = numbers[nmcelem + 2];
 737         avg_count = hist[nhist - 1];
 738
 739         /*
 740          * "rest" will be the sum of the frequencies of all elements not
 741          * represented in MCELEM.  The average distinct element count is the sum
 742          * of the frequencies of *all* elements.  Begin with that; we will proceed
 743          * to subtract the MCELEM frequencies.
 744          */
 745         rest = avg_count;
 746
 747         /*
 748          * mult is a multiplier representing estimate of probability that each
 749          * mcelem that is not present in constant doesn't occur.
 750          */
 751         mult = 1.0f;
 752
 753         /*
 754          * elem_selec is array of estimated frequencies for elements in the
 755          * constant.
 756          */
 757         elem_selec = (float *) palloc(sizeof(float) * nitems);
 758
 759         /* Scan mcelem and array in parallel. */
 760         mcelem_index = 0;
 761         for (i = 0; i < nitems; i++)
 762         {
 763                 bool            match = false;
 764
 765                 /* Ignore any duplicates in the array data. */
 766                 if (i > 0 &&
 767                         element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
 768                         continue;
 769
 770                 /*
 771                  * Iterate over MCELEM until we find an entry greater than or equal to
 772                  * this element of the constant.  Update "rest" and "mult" for mcelem
 773                  * entries skipped over.
 774                  */
 775                 while (mcelem_index < nmcelem)
 776                 {
 777                         int                     cmp = element_compare(&mcelem[mcelem_index],
 778                                                                                           &array_data[i],
 779                                                                                           typentry);
 780
 781                         if (cmp < 0)
 782                         {
 783                                 mult *= (1.0f - numbers[mcelem_index]);
 784                                 rest -= numbers[mcelem_index];
 785                                 mcelem_index++;
 786                         }
 787                         else
 788                         {
 789                                 if (cmp == 0)
 790                                         match = true;   /* mcelem is found */
 791                                 break;
 792                         }
 793                 }
 794
 795                 if (match)
 796                 {
 797                         /* MCELEM matches the array item. */
 798                         elem_selec[unique_nitems] = numbers[mcelem_index];
 799                         /* "rest" is decremented for all mcelems, matched or not */
 800                         rest -= numbers[mcelem_index];
 801                         mcelem_index++;
 802                 }
 803                 else
 804                 {
 805                         /*
 806                          * The element is not in MCELEM.  Punt, but assume that the
 807                          * selectivity cannot be more than minfreq / 2.
 808                          */
 809                         elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
 810                                                                                         minfreq / 2);
 811                 }
 812
 813                 unique_nitems++;
 814         }
 815
 816         /*
 817          * If we handled all constant elements without exhausting the MCELEM
 818          * array, finish walking it to complete calculation of "rest" and "mult".
 819          */
 820         while (mcelem_index < nmcelem)
 821         {
 822                 mult *= (1.0f - numbers[mcelem_index]);
 823                 rest -= numbers[mcelem_index];
 824                 mcelem_index++;
 825         }
 826
 827         /*
 828          * The presence of many distinct rare elements materially decreases
 829          * selectivity.  Use the Poisson distribution to estimate the probability
 830          * of a column value having zero occurrences of such elements.  See above
 831          * for the definition of "rest".
 832          */
 833         mult *= exp(-rest);
 834
 835         /*----------
 836          * Using the distinct element count histogram requires
 837          *              O(unique_nitems * (nmcelem + unique_nitems))
 838          * operations.  Beyond a certain computational cost threshold, it's
 839          * reasonable to sacrifice accuracy for decreased planning time.  We limit
 840          * the number of operations to EFFORT * nmcelem; since nmcelem is limited
 841          * by the column's statistics target, the work done is user-controllable.
 842          *
 843          * If the number of operations would be too large, we can reduce it
 844          * without losing all accuracy by reducing unique_nitems and considering
 845          * only the most-common elements of the constant array.  To make the
 846          * results exactly match what we would have gotten with only those
 847          * elements to start with, we'd have to remove any discarded elements'
 848          * frequencies from "mult", but since this is only an approximation
 849          * anyway, we don't bother with that.  Therefore it's sufficient to qsort
 850          * elem_selec[] and take the largest elements.  (They will no longer match
 851          * up with the elements of array_data[], but we don't care.)
 852          *----------
 853          */
 854 #define EFFORT 100
 855
 856         if ((nmcelem + unique_nitems) > 0 &&
 857                 unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
 858         {
 859                 /*
 860                  * Use the quadratic formula to solve for largest allowable N.  We
 861                  * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
 862                  */
 863                 double          b = (double) nmcelem;
 864                 int                     n;
 865
 866                 n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
 867
 868                 /* Sort, then take just the first n elements */
 869                 qsort(elem_selec, unique_nitems, sizeof(float),
 870                           float_compare_desc);
 871                 unique_nitems = n;
 872         }
 873
 874         /*
 875          * Calculate probabilities of each distinct element count for both mcelems
 876          * and constant elements.  At this point, assume independent element
 877          * occurrence.
 878          */
 879         dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
 880         mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
 881
 882         /* ignore hist[nhist-1], which is the average not a histogram member */
 883         hist_part = calc_hist(hist, nhist - 1, unique_nitems);
 884
 885         selec = 0.0f;
 886         for (i = 0; i <= unique_nitems; i++)
 887         {
 888                 /*
 889                  * mult * dist[i] / mcelem_dist[i] gives us probability of qual
 890                  * matching from assumption of independent element occurrence with the
 891                  * condition that distinct element count = i.
 892                  */
 893                 if (mcelem_dist[i] > 0)
 894                         selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
 895         }
 896
 897         pfree(dist);
 898         pfree(mcelem_dist);
 899         pfree(hist_part);
 900         pfree(elem_selec);
 901
 902         /* Take into account occurrence of NULL element. */
 903         selec *= (1.0f - nullelem_freq);
 904
 905         CLAMP_PROBABILITY(selec);
 906
 907         return selec;
 908 }
 909
 910 /*
 911  * Calculate the first n distinct element count probabilities from a
 912  * histogram of distinct element counts.
 913  *
 914  * Returns a palloc'd array of n+1 entries, with array[k] being the
 915  * probability of element count k, k in [0..n].
 916  *
 917  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
 918  * (nhist - 1)) probability to each value in (a,b) and an additional half of
 919  * that to a and b themselves.
 920  */
 921 static float *
 922 calc_hist(const float4 *hist, int nhist, int n)
 923 {
 924         float      *hist_part;
 925         int                     k,
 926                                 i = 0;
 927         float           prev_interval = 0,
 928                                 next_interval;
 929         float           frac;
 930
 931         hist_part = (float *) palloc((n + 1) * sizeof(float));
 932
 933         /*
 934          * frac is a probability contribution for each interval between histogram
 935          * values.  We have nhist - 1 intervals, so contribution of each one will
 936          * be 1 / (nhist - 1).
 937          */
 938         frac = 1.0f / ((float) (nhist - 1));
 939
 940         for (k = 0; k <= n; k++)
 941         {
 942                 int                     count = 0;
 943
 944                 /*
 945                  * Count the histogram boundaries equal to k.  (Although the histogram
 946                  * should theoretically contain only exact integers, entries are
 947                  * floats so there could be roundoff error in large values.  Treat any
 948                  * fractional value as equal to the next larger k.)
 949                  */
 950                 while (i < nhist && hist[i] <= k)
 951                 {
 952                         count++;
 953                         i++;
 954                 }
 955
 956                 if (count > 0)
 957                 {
 958                         /* k is an exact bound for at least one histogram box. */
 959                         float           val;
 960
 961                         /* Find length between current histogram value and the next one */
 962                         if (i < nhist)
 963                                 next_interval = hist[i] - hist[i - 1];
 964                         else
 965                                 next_interval = 0;
 966
 967                         /*
 968                          * count - 1 histogram boxes contain k exclusively.  They
 969                          * contribute a total of (count - 1) * frac probability.  Also
 970                          * factor in the partial histogram boxes on either side.
 971                          */
 972                         val = (float) (count - 1);
 973                         if (next_interval > 0)
 974                                 val += 0.5f / next_interval;
 975                         if (prev_interval > 0)
 976                                 val += 0.5f / prev_interval;
 977                         hist_part[k] = frac * val;
 978
 979                         prev_interval = next_interval;
 980                 }
 981                 else
 982                 {
 983                         /* k does not appear as an exact histogram bound. */
 984                         if (prev_interval > 0)
 985                                 hist_part[k] = frac / prev_interval;
 986                         else
 987                                 hist_part[k] = 0.0f;
 988                 }
 989         }
 990
 991         return hist_part;
 992 }
 993
 994 /*
 995  * Consider n independent events with probabilities p[].  This function
 996  * calculates probabilities of exact k of events occurrence for k in [0..m].
 997  * Returns a palloc'd array of size m+1.
 998  *
 999  * "rest" is the sum of the probabilities of all low-probability events not
1000  * included in p.
1001  *
1002  * Imagine matrix M of size (n + 1) x (m + 1).  Element M[i,j] denotes the
1003  * probability that exactly j of first i events occur.  Obviously M[0,0] = 1.
1004  * For any constant j, each increment of i increases the probability iff the
1005  * event occurs.  So, by the law of total probability:
1006  *      M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1007  *              for i > 0, j > 0.
1008  *      M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1009  */
1010 static float *
1011 calc_distr(const float *p, int n, int m, float rest)
1012 {
1013         float      *row,
1014                            *prev_row,
1015                            *tmp;
1016         int                     i,
1017                                 j;
1018
1019         /*
1020          * Since we return only the last row of the matrix and need only the
1021          * current and previous row for calculations, allocate two rows.
1022          */
1023         row = (float *) palloc((m + 1) * sizeof(float));
1024         prev_row = (float *) palloc((m + 1) * sizeof(float));
1025
1026         /* M[0,0] = 1 */
1027         row[0] = 1.0f;
1028         for (i = 1; i <= n; i++)
1029         {
1030                 float           t = p[i - 1];
1031
1032                 /* Swap rows */
1033                 tmp = row;
1034                 row = prev_row;
1035                 prev_row = tmp;
1036
1037                 /* Calculate next row */
1038                 for (j = 0; j <= i && j <= m; j++)
1039                 {
1040                         float           val = 0.0f;
1041
1042                         if (j < i)
1043                                 val += prev_row[j] * (1.0f - t);
1044                         if (j > 0)
1045                                 val += prev_row[j - 1] * t;
1046                         row[j] = val;
1047                 }
1048         }
1049
1050         /*
1051          * The presence of many distinct rare (not in "p") elements materially
1052          * decreases selectivity.  Model their collective occurrence with the
1053          * Poisson distribution.
1054          */
1055         if (rest > DEFAULT_CONTAIN_SEL)
1056         {
1057                 float           t;
1058
1059                 /* Swap rows */
1060                 tmp = row;
1061                 row = prev_row;
1062                 prev_row = tmp;
1063
1064                 for (i = 0; i <= m; i++)
1065                         row[i] = 0.0f;
1066
1067                 /* Value of Poisson distribution for 0 occurrences */
1068                 t = exp(-rest);
1069
1070                 /*
1071                  * Calculate convolution of previously computed distribution and the
1072                  * Poisson distribution.
1073                  */
1074                 for (i = 0; i <= m; i++)
1075                 {
1076                         for (j = 0; j <= m - i; j++)
1077                                 row[j + i] += prev_row[j] * t;
1078
1079                         /* Get Poisson distribution value for (i + 1) occurrences */
1080                         t *= rest / (float) (i + 1);
1081                 }
1082         }
1083
1084         pfree(prev_row);
1085         return row;
1086 }
1087
1088 /* Fast function for floor value of 2 based logarithm calculation. */
1089 static int
1090 floor_log2(uint32 n)
1091 {
1092         int                     logval = 0;
1093
1094         if (n == 0)
1095                 return -1;
1096         if (n >= (1 << 16))
1097         {
1098                 n >>= 16;
1099                 logval += 16;
1100         }
1101         if (n >= (1 << 8))
1102         {
1103                 n >>= 8;
1104                 logval += 8;
1105         }
1106         if (n >= (1 << 4))
1107         {
1108                 n >>= 4;
1109                 logval += 4;
1110         }
1111         if (n >= (1 << 2))
1112         {
1113                 n >>= 2;
1114                 logval += 2;
1115         }
1116         if (n >= (1 << 1))
1117         {
1118                 logval += 1;
1119         }
1120         return logval;
1121 }
1122
1123 /*
1124  * find_next_mcelem binary-searches a most common elements array, starting
1125  * from *index, for the first member >= value.  It saves the position of the
1126  * match into *index and returns true if it's an exact match.  (Note: we
1127  * assume the mcelem elements are distinct so there can't be more than one
1128  * exact match.)
1129  */
1130 static bool
1131 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1132                                  TypeCacheEntry *typentry)
1133 {
1134         int                     l = *index,
1135                                 r = nmcelem - 1,
1136                                 i,
1137                                 res;
1138
1139         while (l <= r)
1140         {
1141                 i = (l + r) / 2;
1142                 res = element_compare(&mcelem[i], &value, typentry);
1143                 if (res == 0)
1144                 {
1145                         *index = i;
1146                         return true;
1147                 }
1148                 else if (res < 0)
1149                         l = i + 1;
1150                 else
1151                         r = i - 1;
1152         }
1153         *index = l;
1154         return false;
1155 }
1156
1157 /*
1158  * Comparison function for elements.
1159  *
1160  * We use the element type's default btree opclass, and its default collation
1161  * if the type is collation-sensitive.
1162  *
1163  * XXX consider using SortSupport infrastructure
1164  */
1165 static int
1166 element_compare(const void *key1, const void *key2, void *arg)
1167 {
1168         Datum           d1 = *((const Datum *) key1);
1169         Datum           d2 = *((const Datum *) key2);
1170         TypeCacheEntry *typentry = (TypeCacheEntry *) arg;
1171         FmgrInfo   *cmpfunc = &typentry->cmp_proc_finfo;
1172         Datum           c;
1173
1174         c = FunctionCall2Coll(cmpfunc, typentry->typcollation, d1, d2);
1175         return DatumGetInt32(c);
1176 }
1177
1178 /*
1179  * Comparison function for sorting floats into descending order.
1180  */
1181 static int
1182 float_compare_desc(const void *key1, const void *key2)
1183 {
1184         float           d1 = *((const float *) key1);
1185         float           d2 = *((const float *) key2);
1186
1187         if (d1 > d2)
1188                 return -1;
1189         else if (d1 < d2)
1190                 return 1;
1191         else
1192                 return 0;
1193 }