]> granicus.if.org Git - postgresql/blob - src/backend/optimizer/plan/planagg.c
Create the planner mechanism for optimizing simple MIN and MAX queries
[postgresql] / src / backend / optimizer / plan / planagg.c
1 /*-------------------------------------------------------------------------
2  *
3  * planagg.c
4  *        Special planning for aggregate queries.
5  *
6  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/optimizer/plan/planagg.c,v 1.1 2005/04/11 23:06:55 tgl Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include "access/skey.h"
18 #include "catalog/pg_aggregate.h"
19 #include "catalog/pg_type.h"
20 #include "nodes/makefuncs.h"
21 #include "optimizer/clauses.h"
22 #include "optimizer/cost.h"
23 #include "optimizer/pathnode.h"
24 #include "optimizer/paths.h"
25 #include "optimizer/planmain.h"
26 #include "optimizer/subselect.h"
27 #include "parser/parsetree.h"
28 #include "parser/parse_clause.h"
29 #include "parser/parse_expr.h"
30 #include "utils/lsyscache.h"
31 #include "utils/syscache.h"
32
33
34 typedef struct
35 {
36         Oid                     aggfnoid;               /* pg_proc Oid of the aggregate */
37         Oid                     aggsortop;              /* Oid of its sort operator */
38         Expr       *target;                     /* expression we are aggregating on */
39         IndexPath  *path;                       /* access path for index scan */
40         Cost            pathcost;               /* estimated cost to fetch first row */
41         Param      *param;                      /* param for subplan's output */
42 } MinMaxAggInfo;
43
44 static bool find_minmax_aggs_walker(Node *node, List **context);
45 static bool build_minmax_path(Query *root, RelOptInfo *rel,
46                                                           MinMaxAggInfo *info);
47 static ScanDirection match_agg_to_index_col(MinMaxAggInfo *info,
48                                                                                         IndexOptInfo *index, int indexcol);
49 static void make_agg_subplan(Query *root, MinMaxAggInfo *info,
50                                                          List *constant_quals);
51 static Node *replace_aggs_with_params_mutator(Node *node,  List **context);
52 static Oid      fetch_agg_sort_op(Oid aggfnoid);
53
54
55 /*
56  * optimize_minmax_aggregates - check for optimizing MIN/MAX via indexes
57  *
58  * This checks to see if we can replace MIN/MAX aggregate functions by
59  * subqueries of the form
60  *              (SELECT col FROM tab WHERE ... ORDER BY col ASC/DESC LIMIT 1)
61  * Given a suitable index on tab.col, this can be much faster than the
62  * generic scan-all-the-rows plan.
63  *
64  * We are passed the Query, the preprocessed tlist, and the best path
65  * devised for computing the input of a standard Agg node.  If we are able
66  * to optimize all the aggregates, and the result is estimated to be cheaper
67  * than the generic aggregate method, then generate and return a Plan that
68  * does it that way.  Otherwise, return NULL.
69  */
70 Plan *
71 optimize_minmax_aggregates(Query *root, List *tlist, Path *best_path)
72 {
73         RangeTblRef *rtr;
74         RangeTblEntry *rte;
75         RelOptInfo *rel;
76         List       *aggs_list;
77         ListCell   *l;
78         Cost            total_cost;
79         Path            agg_p;
80         Plan       *plan;
81         Node       *hqual;
82         QualCost        tlist_cost;
83         List       *constant_quals;
84
85         /* Nothing to do if query has no aggregates */
86         if (!root->hasAggs)
87                 return NULL;
88
89         Assert(!root->setOperations); /* shouldn't get here if a setop */
90         Assert(root->rowMarks == NIL); /* nor if FOR UPDATE */
91
92         /*
93          * Reject unoptimizable cases.
94          *
95          * We don't handle GROUP BY, because our current implementations of
96          * grouping require looking at all the rows anyway, and so there's not
97          * much point in optimizing MIN/MAX.
98          */
99         if (root->groupClause)
100                 return NULL;
101
102         /*
103          * We also restrict the query to reference exactly one table, since
104          * join conditions can't be handled reasonably.  (We could perhaps
105          * handle a query containing cartesian-product joins, but it hardly
106          * seems worth the trouble.)
107          */
108         Assert(root->jointree != NULL && IsA(root->jointree, FromExpr));
109         if (list_length(root->jointree->fromlist) != 1)
110                 return NULL;
111         rtr = (RangeTblRef *) linitial(root->jointree->fromlist);
112         if (!IsA(rtr, RangeTblRef))
113                 return NULL;
114         rte = rt_fetch(rtr->rtindex, root->rtable);
115         if (rte->rtekind != RTE_RELATION)
116                 return NULL;
117         rel = find_base_rel(root, rtr->rtindex);
118
119         /*
120          * Also reject cases with subplans or volatile functions in WHERE.
121          * This may be overly paranoid, but it's not entirely clear if the
122          * transformation is safe then.
123          */
124         if (contain_subplans(root->jointree->quals) ||
125                 contain_volatile_functions(root->jointree->quals))
126                 return NULL;
127
128         /*
129          * Since this optimization is not applicable all that often, we want
130          * to fall out before doing very much work if possible.  Therefore
131          * we do the work in several passes.  The first pass scans the tlist
132          * and HAVING qual to find all the aggregates and verify that
133          * each of them is a MIN/MAX aggregate.  If that succeeds, the second
134          * pass looks at each aggregate to see if it is optimizable; if so
135          * we make an IndexPath describing how we would scan it.  (We do not
136          * try to optimize if only some aggs are optimizable, since that means
137          * we'll have to scan all the rows anyway.)  If that succeeds, we have
138          * enough info to compare costs against the generic implementation.
139          * Only if that test passes do we build a Plan.
140          */
141
142         /* Pass 1: find all the aggregates */
143         aggs_list = NIL;
144         if (find_minmax_aggs_walker((Node *) tlist, &aggs_list))
145                 return NULL;
146         if (find_minmax_aggs_walker(root->havingQual, &aggs_list))
147                 return NULL;
148
149         /* Pass 2: see if each one is optimizable */
150         total_cost = 0;
151         foreach(l, aggs_list)
152         {
153                 MinMaxAggInfo *info = (MinMaxAggInfo *) lfirst(l);
154
155                 if (!build_minmax_path(root, rel, info))
156                         return NULL;
157                 total_cost += info->pathcost;
158         }
159
160         /*
161          * Make the cost comparison.
162          *
163          * Note that we don't include evaluation cost of the tlist here;
164          * this is OK since it isn't included in best_path's cost either,
165          * and should be the same in either case.
166          */
167         cost_agg(&agg_p, root, AGG_PLAIN, list_length(aggs_list),
168                          0, 0,
169                          best_path->startup_cost, best_path->total_cost,
170                          best_path->parent->rows);
171
172         if (total_cost > agg_p.total_cost)
173                 return NULL;                    /* too expensive */
174
175         /*
176          * OK, we are going to generate an optimized plan.  The first thing we
177          * need to do is look for any non-variable WHERE clauses that query_planner
178          * might have removed from the basic plan.  (Normal WHERE clauses will
179          * be properly incorporated into the sub-plans by create_plan.)  If there
180          * are any, they will be in a gating Result node atop the best_path.
181          * They have to be incorporated into a gating Result in each sub-plan
182          * in order to produce the semantically correct result.
183          */
184         if (IsA(best_path, ResultPath))
185         {
186                 Assert(((ResultPath *) best_path)->subpath != NULL);
187                 constant_quals = ((ResultPath *) best_path)->constantqual;
188         }
189         else
190                 constant_quals = NIL;
191
192         /* Pass 3: generate subplans and output Param nodes */
193         foreach(l, aggs_list)
194         {
195                 make_agg_subplan(root, (MinMaxAggInfo *) lfirst(l), constant_quals);
196         }
197
198         /*
199          * Modify the targetlist and HAVING qual to reference subquery outputs
200          */
201         tlist = (List *) replace_aggs_with_params_mutator((Node *) tlist,
202                                                                                                           &aggs_list);
203         hqual = replace_aggs_with_params_mutator(root->havingQual,
204                                                                                          &aggs_list);
205
206         /*
207          * Generate the output plan --- basically just a Result
208          */
209         plan = (Plan *) make_result(tlist, hqual, NULL);
210
211         /* Account for evaluation cost of the tlist (make_result did the rest) */
212         cost_qual_eval(&tlist_cost, tlist);
213         plan->startup_cost += tlist_cost.startup;
214         plan->total_cost += tlist_cost.startup + tlist_cost.per_tuple;
215
216         return plan;
217 }
218
219 /*
220  * find_minmax_aggs_walker
221  *              Recursively scan the Aggref nodes in an expression tree, and check
222  *              that each one is a MIN/MAX aggregate.  If so, build a list of the
223  *              distinct aggregate calls in the tree.
224  *
225  * Returns TRUE if a non-MIN/MAX aggregate is found, FALSE otherwise.
226  * (This seemingly-backward definition is used because expression_tree_walker
227  * aborts the scan on TRUE return, which is what we want.)
228  *
229  * Found aggregates are added to the list at *context; it's up to the caller
230  * to initialize the list to NIL.
231  *
232  * This does not descend into subqueries, and so should be used only after
233  * reduction of sublinks to subplans.  There mustn't be outer-aggregate
234  * references either.
235  */
236 static bool
237 find_minmax_aggs_walker(Node *node, List **context)
238 {
239         if (node == NULL)
240                 return false;
241         if (IsA(node, Aggref))
242         {
243                 Aggref     *aggref = (Aggref *) node;
244                 Oid                     aggsortop;
245                 MinMaxAggInfo *info;
246                 ListCell   *l;
247
248                 Assert(aggref->agglevelsup == 0);
249                 if (aggref->aggstar)
250                         return true;            /* foo(*) is surely not optimizable */
251                 /* note: we do not care if DISTINCT is mentioned ... */
252
253                 aggsortop = fetch_agg_sort_op(aggref->aggfnoid);
254                 if (!OidIsValid(aggsortop))
255                         return true;            /* not a MIN/MAX aggregate */
256
257                 /*
258                  * Check whether it's already in the list, and add it if not.
259                  */
260                 foreach(l, *context)
261                 {
262                         info = (MinMaxAggInfo *) lfirst(l);
263                         if (info->aggfnoid == aggref->aggfnoid &&
264                                 equal(info->target, aggref->target))
265                                 return false;
266                 }
267
268                 info = (MinMaxAggInfo *) palloc0(sizeof(MinMaxAggInfo));
269                 info->aggfnoid = aggref->aggfnoid;
270                 info->aggsortop = aggsortop;
271                 info->target = aggref->target;
272
273                 *context = lappend(*context, info);
274
275                 /*
276                  * We need not recurse into the argument, since it can't contain
277                  * any aggregates.
278                  */
279                 return false;
280         }
281         Assert(!IsA(node, SubLink));
282         return expression_tree_walker(node, find_minmax_aggs_walker,
283                                                                   (void *) context);
284 }
285
286 /*
287  * build_minmax_path
288  *              Given a MIN/MAX aggregate, try to find an index it can be optimized
289  *              with.  Build a Path describing the best such index path.
290  *
291  * Returns TRUE if successful, FALSE if not.  In the TRUE case, info->path
292  * is filled in.
293  *
294  * XXX look at sharing more code with indxpath.c.
295  *
296  * Note: check_partial_indexes() must have been run previously.
297  */
298 static bool
299 build_minmax_path(Query *root, RelOptInfo *rel, MinMaxAggInfo *info)
300 {
301         IndexPath  *best_path = NULL;
302         Cost            best_cost = 0;
303         ListCell   *l;
304
305         foreach(l, rel->indexlist)
306         {
307                 IndexOptInfo *index = (IndexOptInfo *) lfirst(l);
308                 ScanDirection indexscandir = NoMovementScanDirection;
309                 int                     indexcol;
310                 int                     prevcol;
311                 List       *restrictclauses;
312                 IndexPath  *new_path;
313                 Cost            new_cost;
314
315                 /* Ignore non-btree indexes */
316                 if (index->relam != BTREE_AM_OID)
317                         continue;
318
319                 /* Ignore partial indexes that do not match the query */
320                 if (index->indpred != NIL && !index->predOK)
321                         continue;
322
323                 /*
324                  * Look for a match to one of the index columns.  (In a stupidly
325                  * designed index, there could be multiple matches, but we only
326                  * care about the first one.)
327                  */
328                 for (indexcol = 0; indexcol < index->ncolumns; indexcol++)
329                 {
330                         indexscandir = match_agg_to_index_col(info, index, indexcol);
331                         if (!ScanDirectionIsNoMovement(indexscandir))
332                                 break;
333                 }
334                 if (ScanDirectionIsNoMovement(indexscandir))
335                         continue;
336
337                 /*
338                  * If the match is not at the first index column, we have to verify
339                  * that there are "x = something" restrictions on all the earlier
340                  * index columns.  Since we'll need the restrictclauses list anyway
341                  * to build the path, it's convenient to extract that first and then
342                  * look through it for the equality restrictions.
343                  */
344                 restrictclauses = group_clauses_by_indexkey(index);
345
346                 if (list_length(restrictclauses) < indexcol)
347                         continue;                       /* definitely haven't got enough */
348                 for (prevcol = 0; prevcol < indexcol; prevcol++)
349                 {
350                         List   *rinfos = (List *) list_nth(restrictclauses, prevcol);
351                         ListCell *ll;
352
353                         foreach(ll, rinfos)
354                         {
355                                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(ll);
356                                 int                     strategy;
357
358                                 Assert(is_opclause(rinfo->clause));
359                                 strategy =
360                                         get_op_opclass_strategy(((OpExpr *) rinfo->clause)->opno,
361                                                                                         index->classlist[prevcol]);
362                                 if (strategy == BTEqualStrategyNumber)
363                                         break;
364                         }
365                         if (ll == NULL)
366                                 break;                  /* none are Equal for this index col */
367                 }
368                 if (prevcol < indexcol)
369                         continue;                       /* didn't find all Equal clauses */
370
371                 /*
372                  * Build the access path.  We don't bother marking it with pathkeys.
373                  */
374                 new_path = create_index_path(root, index,
375                                                                          restrictclauses,
376                                                                          NIL,
377                                                                          indexscandir);
378
379                 /*
380                  * Estimate actual cost of fetching just one row.
381                  */
382                 if (new_path->rows > 1.0)
383                         new_cost = new_path->path.startup_cost +
384                                 (new_path->path.total_cost - new_path->path.startup_cost)
385                                 * 1.0 / new_path->rows;
386                 else
387                         new_cost = new_path->path.total_cost;
388
389                 /*
390                  * Keep if first or if cheaper than previous best.
391                  */
392                 if (best_path == NULL || new_cost < best_cost)
393                 {
394                         best_path = new_path;
395                         best_cost = new_cost;
396                 }
397         }
398
399         info->path = best_path;
400         info->pathcost = best_cost;
401         return (best_path != NULL);
402 }
403
404 /*
405  * match_agg_to_index_col
406  *              Does an aggregate match an index column?
407  *
408  * It matches if its argument is equal to the index column's data and its
409  * sortop is either the LessThan or GreaterThan member of the column's opclass.
410  *
411  * We return ForwardScanDirection if match the LessThan member,
412  * BackwardScanDirection if match the GreaterThan member,
413  * and NoMovementScanDirection if there's no match.
414  */
415 static ScanDirection
416 match_agg_to_index_col(MinMaxAggInfo *info, IndexOptInfo *index, int indexcol)
417 {
418         int                     strategy;
419
420         /* Check for data match */
421         if (!match_index_to_operand((Node *) info->target, indexcol, index))
422                 return NoMovementScanDirection;
423
424         /* Look up the operator in the opclass */
425         strategy = get_op_opclass_strategy(info->aggsortop,
426                                                                            index->classlist[indexcol]);
427         if (strategy == BTLessStrategyNumber)
428                 return ForwardScanDirection;
429         if (strategy == BTGreaterStrategyNumber)
430                 return BackwardScanDirection;
431         return NoMovementScanDirection;
432 }
433
434 /*
435  * Construct a suitable plan for a converted aggregate query
436  */
437 static void
438 make_agg_subplan(Query *root, MinMaxAggInfo *info, List *constant_quals)
439 {
440         Query      *subquery;
441         Path       *path;
442         Plan       *plan;
443         TargetEntry *tle;
444         SortClause *sortcl;
445
446         /*
447          * Generate a suitably modified Query node.  Much of the work here is
448          * probably unnecessary in the normal case, but we want to make it look
449          * good if someone tries to EXPLAIN the result.
450          */
451         subquery = (Query *) copyObject(root);
452         subquery->commandType = CMD_SELECT;
453         subquery->resultRelation = 0;
454         subquery->resultRelations = NIL;
455         subquery->into = NULL;
456         subquery->hasAggs = false;
457         subquery->groupClause = NIL;
458         subquery->havingQual = NULL;
459         subquery->hasHavingQual = false;
460         subquery->distinctClause = NIL;
461
462         /* single tlist entry that is the aggregate target */
463         tle = makeTargetEntry(copyObject(info->target),
464                                                   1,
465                                                   pstrdup("agg_target"),
466                                                   false);
467         subquery->targetList = list_make1(tle);
468
469         /* set up the appropriate ORDER BY entry */
470         sortcl = makeNode(SortClause);
471         sortcl->tleSortGroupRef = assignSortGroupRef(tle, subquery->targetList);
472         sortcl->sortop = info->aggsortop;
473         subquery->sortClause = list_make1(sortcl);
474
475         /* set up LIMIT 1 */
476         subquery->limitOffset = NULL;
477         subquery->limitCount = (Node *) makeConst(INT4OID, sizeof(int4),
478                                                                                           Int32GetDatum(1),
479                                                                                           false, true);
480
481         /*
482          * Generate the plan for the subquery.  We already have a Path for
483          * the basic indexscan, but we have to convert it to a Plan and
484          * attach a LIMIT node above it.  We might need a gating Result, too,
485          * which is most easily added at the Path stage.
486          */
487         path = (Path *) info->path;
488
489         if (constant_quals)
490                 path = (Path *) create_result_path(NULL,
491                                                                                    path,
492                                                                                    copyObject(constant_quals));
493
494         plan = create_plan(subquery, path);
495
496         plan->targetlist = copyObject(subquery->targetList);
497
498         plan = (Plan *) make_limit(plan, 
499                                                            subquery->limitOffset,
500                                                            subquery->limitCount);
501
502         /*
503          * Convert the plan into an InitPlan, and make a Param for its result.
504          */
505         info->param = SS_make_initplan_from_plan(subquery, plan,
506                                                                                          exprType((Node *) tle->expr),
507                                                                                          -1);
508 }
509
510 /*
511  * Replace original aggregate calls with subplan output Params
512  */
513 static Node *
514 replace_aggs_with_params_mutator(Node *node,  List **context)
515 {
516         if (node == NULL)
517                 return NULL;
518         if (IsA(node, Aggref))
519         {
520                 Aggref     *aggref = (Aggref *) node;
521                 ListCell   *l;
522
523                 foreach(l, *context)
524                 {
525                         MinMaxAggInfo *info = (MinMaxAggInfo *) lfirst(l);
526
527                         if (info->aggfnoid == aggref->aggfnoid &&
528                                 equal(info->target, aggref->target))
529                                 return (Node *) info->param;
530                 }
531                 elog(ERROR, "failed to re-find aggregate info record");
532         }
533         Assert(!IsA(node, SubLink));
534         return expression_tree_mutator(node, replace_aggs_with_params_mutator,
535                                                                    (void *) context);
536 }
537
538 /*
539  * Get the OID of the sort operator, if any, associated with an aggregate.
540  * Returns InvalidOid if there is no such operator.
541  */
542 static Oid
543 fetch_agg_sort_op(Oid aggfnoid)
544 {
545 #ifdef NOT_YET
546         HeapTuple       aggTuple;
547         Form_pg_aggregate aggform;
548         Oid                     aggsortop;
549
550         /* fetch aggregate entry from pg_aggregate */
551         aggTuple = SearchSysCache(AGGFNOID,
552                                                           ObjectIdGetDatum(aggfnoid),
553                                                           0, 0, 0);
554         if (!HeapTupleIsValid(aggTuple))
555                 return InvalidOid;
556         aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
557         aggsortop = aggform->aggsortop;
558         ReleaseSysCache(aggTuple);
559
560         return aggsortop;
561 #else
562         /*
563          * XXX stub implementation for testing: hardwire a few cases.
564          */
565         if (aggfnoid == 2132)           /* min(int4) -> int4lt */
566                 return 97;
567         if (aggfnoid == 2116)           /* max(int4) -> int4gt */
568                 return 521;
569         if (aggfnoid == 2145)           /* min(text) -> text_lt */
570                 return 664;
571         if (aggfnoid == 2129)           /* max(text) -> text_gt */
572                 return 666;
573         return InvalidOid;
574 #endif
575 }