granicus.if.org Git - postgresql/commitdiff
Avoid out-of-memory in a hash join with many duplicate inner keys.
authorTom Lane <tgl@sss.pgh.pa.us>
Tue, 15 Aug 2017 18:05:46 +0000 (14:05 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Tue, 15 Aug 2017 18:05:53 +0000 (14:05 -0400)
The executor is capable of splitting buckets during a hash join if
too much memory is being used by a small number of buckets.  However,
this only helps if a bucket's population is actually divisible; if
all the hash keys are alike, the tuples still end up in the same
new bucket.  This can result in an OOM failure if there are enough
inner keys with identical hash values.  The planner's cost estimates
will bias it against choosing a hash join in such situations, but not
by so much that it will never do so.  To mitigate the OOM hazard,
explicitly estimate the hash bucket space needed by just the inner
side's most common value, and if that would exceed work_mem then
add disable_cost to the hash cost estimate.

This approach doesn't account for the possibility that two or more
common values would share the same hash value.  On the other hand,
work_mem is normally a fairly conservative bound, so that eating
two or more times that much space is probably not going to kill us.

If we have no stats about the inner side, ignore this consideration.
There was some discussion of making a conservative assumption, but that
would effectively result in disabling hash join whenever we lack stats,
which seems like an overreaction given how seldom the problem manifests
in the field.

Per a complaint from David Hinkle.  Although this could be viewed
as a bug fix, the lack of similar complaints weighs against back-
patching; indeed we waited for v11 because it seemed already rather
late in the v10 cycle to be making plan choice changes like this one.

Discussion: https://postgr.es/m/32013.1487271761@sss.pgh.pa.us

src/backend/nodes/copyfuncs.c
src/backend/optimizer/path/costsize.c
src/backend/optimizer/prep/prepunion.c
src/backend/optimizer/util/restrictinfo.c
src/backend/utils/adt/selfuncs.c
src/include/nodes/relation.h
src/include/utils/selfuncs.h

index 45a04b0b2753a294c16c54edf20c5d2ab9bf0d8d..72041693dfda4fcce78a670989468f1d67caa5fb 100644 (file)
@@ -2185,6 +2185,8 @@ _copyRestrictInfo(const RestrictInfo *from)
        COPY_SCALAR_FIELD(hashjoinoperator);
        COPY_SCALAR_FIELD(left_bucketsize);
        COPY_SCALAR_FIELD(right_bucketsize);
+       COPY_SCALAR_FIELD(left_mcvfreq);
+       COPY_SCALAR_FIELD(right_mcvfreq);
 
        return newnode;
 }
index b35acb7bdcf17d2ba3baa728827f55f1449a8905..051a8544b0cb98f131c2dd083c4e1e0961ec0986 100644 (file)
@@ -3028,6 +3028,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
        double          hashjointuples;
        double          virtualbuckets;
        Selectivity innerbucketsize;
+       Selectivity innermcvfreq;
        ListCell   *hcl;
 
        /* Mark the path with the correct row estimate */
@@ -3060,9 +3061,9 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
        virtualbuckets = (double) numbuckets * (double) numbatches;
 
        /*
-        * Determine bucketsize fraction for inner relation.  We use the smallest
-        * bucketsize estimated for any individual hashclause; this is undoubtedly
-        * conservative.
+        * Determine bucketsize fraction and MCV frequency for the inner relation.
+        * We use the smallest bucketsize or MCV frequency estimated for any
+        * individual hashclause; this is undoubtedly conservative.
         *
         * BUT: if inner relation has been unique-ified, we can assume it's good
         * for hashing.  This is important both because it's the right answer, and
@@ -3070,22 +3071,27 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
         * non-unique-ified paths.
         */
        if (IsA(inner_path, UniquePath))
+       {
                innerbucketsize = 1.0 / virtualbuckets;
+               innermcvfreq = 0.0;
+       }
        else
        {
                innerbucketsize = 1.0;
+               innermcvfreq = 1.0;
                foreach(hcl, hashclauses)
                {
                        RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
                        Selectivity thisbucketsize;
+                       Selectivity thismcvfreq;
 
                        /*
                         * First we have to figure out which side of the hashjoin clause
                         * is the inner side.
                         *
                         * Since we tend to visit the same clauses over and over when
-                        * planning a large query, we cache the bucketsize estimate in the
-                        * RestrictInfo node to avoid repeated lookups of statistics.
+                        * planning a large query, we cache the bucket stats estimates in
+                        * the RestrictInfo node to avoid repeated lookups of statistics.
                         */
                        if (bms_is_subset(restrictinfo->right_relids,
                                                          inner_path->parent->relids))
@@ -3095,12 +3101,14 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
                                if (thisbucketsize < 0)
                                {
                                        /* not cached yet */
-                                       thisbucketsize =
-                                               estimate_hash_bucketsize(root,
-                                                                                                get_rightop(restrictinfo->clause),
-                                                                                                virtualbuckets);
-                                       restrictinfo->right_bucketsize = thisbucketsize;
+                                       estimate_hash_bucket_stats(root,
+                                                                                          get_rightop(restrictinfo->clause),
+                                                                                          virtualbuckets,
+                                                                                          &restrictinfo->right_mcvfreq,
+                                                                                          &restrictinfo->right_bucketsize);
+                                       thisbucketsize = restrictinfo->right_bucketsize;
                                }
+                               thismcvfreq = restrictinfo->right_mcvfreq;
                        }
                        else
                        {
@@ -3111,19 +3119,36 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
                                if (thisbucketsize < 0)
                                {
                                        /* not cached yet */
-                                       thisbucketsize =
-                                               estimate_hash_bucketsize(root,
-                                                                                                get_leftop(restrictinfo->clause),
-                                                                                                virtualbuckets);
-                                       restrictinfo->left_bucketsize = thisbucketsize;
+                                       estimate_hash_bucket_stats(root,
+                                                                                          get_leftop(restrictinfo->clause),
+                                                                                          virtualbuckets,
+                                                                                          &restrictinfo->left_mcvfreq,
+                                                                                          &restrictinfo->left_bucketsize);
+                                       thisbucketsize = restrictinfo->left_bucketsize;
                                }
+                               thismcvfreq = restrictinfo->left_mcvfreq;
                        }
 
                        if (innerbucketsize > thisbucketsize)
                                innerbucketsize = thisbucketsize;
+                       if (innermcvfreq > thismcvfreq)
+                               innermcvfreq = thismcvfreq;
                }
        }
 
+       /*
+        * If the bucket holding the inner MCV would exceed work_mem, we don't
+        * want to hash unless there is really no other alternative, so apply
+        * disable_cost.  (The executor normally copes with excessive memory usage
+        * by splitting batches, but obviously it cannot separate equal values
+        * that way, so it will be unable to drive the batch size below work_mem
+        * when this is true.)
+        */
+       if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
+                                                  inner_path->pathtarget->width) >
+               (work_mem * 1024L))
+               startup_cost += disable_cost;
+
        /*
         * Compute cost of the hashquals and qpquals (other restriction clauses)
         * separately.
index 9c6c47a1b9bb8ae3573ecda9963ee01ce152eb02..6d8f8938b2f833d296b428d1162084ebf992d447 100644 (file)
@@ -2067,6 +2067,8 @@ adjust_appendrel_attrs_mutator(Node *node,
                newinfo->scansel_cache = NIL;
                newinfo->left_bucketsize = -1;
                newinfo->right_bucketsize = -1;
+               newinfo->left_mcvfreq = -1;
+               newinfo->right_mcvfreq = -1;
 
                return (Node *) newinfo;
        }
index ebae0cd8ce0345a7e4a38e5db287229bd5fb303e..39b52aecc53a276468b172da320eb6afa147cbc5 100644 (file)
@@ -199,6 +199,8 @@ make_restrictinfo_internal(Expr *clause,
 
        restrictinfo->left_bucketsize = -1;
        restrictinfo->right_bucketsize = -1;
+       restrictinfo->left_mcvfreq = -1;
+       restrictinfo->right_mcvfreq = -1;
 
        return restrictinfo;
 }
index e103f5ef16c4d0b47de25248af73c01cacd07041..a7a06146a06cfd007d77aa4658e2a4e0c080e339 100644 (file)
@@ -3559,9 +3559,16 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 }
 
 /*
- * Estimate hash bucketsize fraction (ie, number of entries in a bucket
- * divided by total tuples in relation) if the specified expression is used
- * as a hash key.
+ * Estimate hash bucket statistics when the specified expression is used
+ * as a hash key for the given number of buckets.
+ *
+ * This attempts to determine two values:
+ *
+ * 1. The frequency of the most common value of the expression (returns
+ * zero into *mcv_freq if we can't get that).
+ *
+ * 2. The "bucketsize fraction", ie, average number of entries in a bucket
+ * divided by total tuples in relation.
  *
  * XXX This is really pretty bogus since we're effectively assuming that the
  * distribution of hash keys will be the same after applying restriction
@@ -3587,29 +3594,58 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
  * discourage use of a hash rather strongly if the inner relation is large,
  * which is what we want.  We do not want to hash unless we know that the
  * inner rel is well-dispersed (or the alternatives seem much worse).
+ *
+ * The caller should also check that the mcv_freq is not so large that the
+ * most common value would by itself require an impractically large bucket.
+ * In a hash join, the executor can split buckets if they get too big, but
+ * obviously that doesn't help for a bucket that contains many duplicates of
+ * the same value.
  */
-Selectivity
-estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
+void
+estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets,
+                                                  Selectivity *mcv_freq,
+                                                  Selectivity *bucketsize_frac)
 {
        VariableStatData vardata;
        double          estfract,
                                ndistinct,
                                stanullfrac,
-                               mcvfreq,
                                avgfreq;
        bool            isdefault;
        AttStatsSlot sslot;
 
        examine_variable(root, hashkey, 0, &vardata);
 
+       /* Look up the frequency of the most common value, if available */
+       *mcv_freq = 0.0;
+
+       if (HeapTupleIsValid(vardata.statsTuple))
+       {
+               if (get_attstatsslot(&sslot, vardata.statsTuple,
+                                                        STATISTIC_KIND_MCV, InvalidOid,
+                                                        ATTSTATSSLOT_NUMBERS))
+               {
+                       /*
+                        * The first MCV stat is for the most common value.
+                        */
+                       if (sslot.nnumbers > 0)
+                               *mcv_freq = sslot.numbers[0];
+                       free_attstatsslot(&sslot);
+               }
+       }
+
        /* Get number of distinct values */
        ndistinct = get_variable_numdistinct(&vardata, &isdefault);
 
-       /* If ndistinct isn't real, punt and return 0.1, per comments above */
+       /*
+        * If ndistinct isn't real, punt.  We normally return 0.1, but if the
+        * mcv_freq is known to be even higher than that, use it instead.
+        */
        if (isdefault)
        {
+               *bucketsize_frac = (Selectivity) Max(0.1, *mcv_freq);
                ReleaseVariableStats(vardata);
-               return (Selectivity) 0.1;
+               return;
        }
 
        /* Get fraction that are null */
@@ -3650,31 +3686,11 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
        else
                estfract = 1.0 / ndistinct;
 
-       /*
-        * Look up the frequency of the most common value, if available.
-        */
-       mcvfreq = 0.0;
-
-       if (HeapTupleIsValid(vardata.statsTuple))
-       {
-               if (get_attstatsslot(&sslot, vardata.statsTuple,
-                                                        STATISTIC_KIND_MCV, InvalidOid,
-                                                        ATTSTATSSLOT_NUMBERS))
-               {
-                       /*
-                        * The first MCV stat is for the most common value.
-                        */
-                       if (sslot.nnumbers > 0)
-                               mcvfreq = sslot.numbers[0];
-                       free_attstatsslot(&sslot);
-               }
-       }
-
        /*
         * Adjust estimated bucketsize upward to account for skewed distribution.
         */
-       if (avgfreq > 0.0 && mcvfreq > avgfreq)
-               estfract *= mcvfreq / avgfreq;
+       if (avgfreq > 0.0 && *mcv_freq > avgfreq)
+               estfract *= *mcv_freq / avgfreq;
 
        /*
         * Clamp bucketsize to sane range (the above adjustment could easily
@@ -3686,9 +3702,9 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
        else if (estfract > 1.0)
                estfract = 1.0;
 
-       ReleaseVariableStats(vardata);
+       *bucketsize_frac = (Selectivity) estfract;
 
-       return (Selectivity) estfract;
+       ReleaseVariableStats(vardata);
 }
 
 
index 9bae3c6ab98e75bebadc0db8ad4009731c092f17..be2028867a85d01f4b07ab0fe03362e14c6c56a0 100644 (file)
@@ -1807,6 +1807,8 @@ typedef struct RestrictInfo
        /* cache space for hashclause processing; -1 if not yet set */
        Selectivity left_bucketsize;    /* avg bucketsize of left side */
        Selectivity right_bucketsize;   /* avg bucketsize of right side */
+       Selectivity left_mcvfreq;       /* left side's most common val's freq */
+       Selectivity right_mcvfreq;      /* right side's most common val's freq */
 } RestrictInfo;
 
 /*
index c7fdd540e8417a200c590d4853957796cf66b603..dc6069d43556286b6abb481e6f12f0a5462d6774 100644 (file)
@@ -206,8 +206,10 @@ extern void mergejoinscansel(PlannerInfo *root, Node *clause,
 extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
                                        double input_rows, List **pgset);
 
-extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey,
-                                                double nbuckets);
+extern void estimate_hash_bucket_stats(PlannerInfo *root,
+                                                  Node *hashkey, double nbuckets,
+                                                  Selectivity *mcv_freq,
+                                                  Selectivity *bucketsize_frac);
 
 extern List *deconstruct_indexquals(IndexPath *path);
 extern void genericcostestimate(PlannerInfo *root, IndexPath *path,