From 84c7cef5eb0f0d08d40dbfcf396d4c830c1f1030 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 18 Sep 2004 19:39:50 +0000 Subject: [PATCH] Fix estimate_num_groups to be able to use expression-index statistics when there is an expressional index matching a GROUP BY item. --- src/backend/utils/adt/selfuncs.c | 184 +++++++++++++++++++------------ 1 file changed, 112 insertions(+), 72 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 412a0a3231..24759bf5c0 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.165 2004/08/30 02:54:39 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.166 2004/09/18 19:39:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1869,6 +1869,71 @@ fail: ReleaseVariableStats(rightvar); } + +/* + * Helper routine for estimate_num_groups: add an item to a list of + * GroupVarInfos, but only if it's not known equal to any of the existing + * entries. + */ +typedef struct +{ + Node *var; /* might be an expression, not just a Var */ + RelOptInfo *rel; /* relation it belongs to */ + double ndistinct; /* # distinct values */ +} GroupVarInfo; + +static List * +add_unique_group_var(Query *root, List *varinfos, + Node *var, VariableStatData *vardata) +{ + GroupVarInfo *varinfo; + double ndistinct; + ListCell *lc; + + ndistinct = get_variable_numdistinct(vardata); + + /* cannot use foreach here because of possible list_delete */ + lc = list_head(varinfos); + while (lc) + { + varinfo = (GroupVarInfo *) lfirst(lc); + + /* must advance lc before list_delete possibly pfree's it */ + lc = lnext(lc); + + /* Drop exact duplicates */ + if (equal(var, varinfo->var)) + return varinfos; + + /* + * Drop known-equal vars, but only if they belong to different + * relations (see comments for estimate_num_groups) + */ + if (vardata->rel != varinfo->rel && + exprs_known_equal(root, var, varinfo->var)) + { + if (varinfo->ndistinct <= ndistinct) + { + /* Keep older item, forget new one */ + return varinfos; + } + else + { + /* Delete the older item */ + varinfos = list_delete_ptr(varinfos, varinfo); + } + } + } + + varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo)); + + varinfo->var = var; + varinfo->rel = vardata->rel; + varinfo->ndistinct = ndistinct; + varinfos = lappend(varinfos, varinfo); + return varinfos; +} + /* * estimate_num_groups - Estimate number of groups in a grouped query * @@ -1900,6 +1965,9 @@ fail: * increase the number of distinct values (unless it is volatile, * which we consider unlikely for grouping), but it probably won't * reduce the number of distinct values much either. + * As a special case, if a GROUP BY expression can be matched to an + * expressional index for which we have statistics, then we treat the + * whole expression as though it were just a Var. * 2. If the list contains Vars of different relations that are known equal * due to equijoin clauses, then drop all but one of the Vars from each * known-equal set, keeping the one with smallest estimated # of values @@ -1926,25 +1994,44 @@ fail: double estimate_num_groups(Query *root, List *groupExprs, double input_rows) { - List *allvars = NIL; List *varinfos = NIL; double numdistinct; ListCell *l; - typedef struct - { /* varinfos is a List of these */ - Var *var; - double ndistinct; - } MyVarInfo; /* We should not be called unless query has GROUP BY (or DISTINCT) */ Assert(groupExprs != NIL); - /* Step 1: get the unique Vars used */ + /* + * Steps 1/2: find the unique Vars used, treating an expression as a Var + * if we can find stats for it. For each one, record the statistical + * estimate of number of distinct values (total in its table, without + * regard for filtering). + */ foreach(l, groupExprs) { Node *groupexpr = (Node *) lfirst(l); + VariableStatData vardata; List *varshere; + ListCell *l2; + + /* + * If examine_variable is able to deduce anything about the GROUP BY + * expression, treat it as a single variable even if it's really more + * complicated. + */ + examine_variable(root, groupexpr, 0, &vardata); + if (vardata.statsTuple != NULL || vardata.isunique) + { + varinfos = add_unique_group_var(root, varinfos, + groupexpr, &vardata); + ReleaseVariableStats(vardata); + continue; + } + ReleaseVariableStats(vardata); + /* + * Else pull out the component Vars + */ varshere = pull_var_clause(groupexpr, false); /* @@ -1959,70 +2046,24 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) return input_rows; continue; } - allvars = list_concat(allvars, varshere); - } - - /* If now no Vars, we must have an all-constant GROUP BY list. */ - if (allvars == NIL) - return 1.0; - - /* Use list_union() to discard duplicates */ - allvars = list_union(NIL, allvars); - - /* - * Step 2: acquire statistical estimate of number of distinct values - * of each Var (total in its table, without regard for filtering). - * Also, detect known-equal Vars and discard the ones we don't want. - */ - foreach(l, allvars) - { - Var *var = (Var *) lfirst(l); - VariableStatData vardata; - double ndistinct; - bool keep = true; - ListCell *l2; - - examine_variable(root, (Node *) var, 0, &vardata); - ndistinct = get_variable_numdistinct(&vardata); - ReleaseVariableStats(vardata); - - /* cannot use foreach here because of possible list_delete */ - l2 = list_head(varinfos); - while (l2) - { - MyVarInfo *varinfo = (MyVarInfo *) lfirst(l2); - - /* must advance l2 before list_delete possibly pfree's it */ - l2 = lnext(l2); - - if (var->varno != varinfo->var->varno && - exprs_known_equal(root, (Node *) var, (Node *) varinfo->var)) - { - /* Found a match */ - if (varinfo->ndistinct <= ndistinct) - { - /* Keep older item, forget new one */ - keep = false; - break; - } - else - { - /* Delete the older item */ - varinfos = list_delete_ptr(varinfos, varinfo); - } - } - } - if (keep) + /* + * Else add variables to varinfos list + */ + foreach(l2, varshere) { - MyVarInfo *varinfo = (MyVarInfo *) palloc(sizeof(MyVarInfo)); + Node *var = (Node *) lfirst(l2); - varinfo->var = var; - varinfo->ndistinct = ndistinct; - varinfos = lcons(varinfo, varinfos); + examine_variable(root, var, 0, &vardata); + varinfos = add_unique_group_var(root, varinfos, var, &vardata); + ReleaseVariableStats(vardata); } } + /* If now no Vars, we must have an all-constant GROUP BY list. */ + if (varinfos == NIL) + return 1.0; + /* * Steps 3/4: group Vars by relation and estimate total numdistinct. * @@ -2031,25 +2072,24 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) * these Vars from the newvarinfos list for the next iteration. This * is the easiest way to group Vars of same rel together. */ - Assert(varinfos != NIL); numdistinct = 1.0; do { - MyVarInfo *varinfo1 = (MyVarInfo *) linitial(varinfos); - RelOptInfo *rel = find_base_rel(root, varinfo1->var->varno); + GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos); + RelOptInfo *rel = varinfo1->rel; double reldistinct = varinfo1->ndistinct; List *newvarinfos = NIL; /* - * Get the largest numdistinct estimate of the Vars for this rel. + * Get the product of numdistinct estimates of the Vars for this rel. * Also, construct new varinfos list of remaining Vars. */ for_each_cell(l, lnext(list_head(varinfos))) { - MyVarInfo *varinfo2 = (MyVarInfo *) lfirst(l); + GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l); - if (varinfo2->var->varno == varinfo1->var->varno) + if (varinfo2->rel == varinfo1->rel) reldistinct *= varinfo2->ndistinct; else { -- 2.40.0