Tweak GROUP BY so that it will still accept result-column names, but only

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 15 Mar 2000 23:31:19 +0000 (23:31 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 15 Mar 2000 23:31:19 +0000 (23:31 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 15 Mar 2000 23:31:19 +0000 (23:31 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 15 Mar 2000 23:31:19 +0000 (23:31 +0000)
diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml

index 674234d8e335a90cb2351548110e5c024b99eaa0..c88d9b54f920ecc113f4a40fd38991eab2012f2a 100644 (file)
--- a/doc/src/sgml/ref/select.sgml
+++ b/doc/src/sgml/ref/select.sgml
@@ -1,5 +1,5 @@
  <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/ref/select.sgml,v 1.25 2000/02/21 01:13:52 tgl Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/ref/select.sgml,v 1.26 2000/03/15 23:31:19 tgl Exp $
  Postgres documentation
  -->
  
@@ -37,7 +37,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
    
    <refsect2 id="R2-SQL-SELECT-1">
     <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
     </refsect2info>
     <title>
      Inputs
@@ -59,10 +59,12 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
        <listitem>
         <para>
         Specifies another name for a column or an expression using
-       the AS clause.  This name is primarily used to label the output
-       column.  The <replaceable class="PARAMETER">name</replaceable>
-       cannot be used in the WHERE, GROUP BY, or HAVING clauses.
-       It can, however, be referenced in ORDER BY clauses.
+       the AS clause.  This name is primarily used to label the column
+       for display.  It can also be used to refer to the column's value in
+       ORDER BY and GROUP BY clauses.  But the
+       <replaceable class="PARAMETER">name</replaceable>
+       cannot be used in the WHERE or HAVING clauses; write out the
+       expression instead.
         </para>
        </listitem>
       </varlistentry>
@@ -72,7 +74,8 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
       <term>TEMP</term>
       <listitem>
        <para>
-       The table is created unique to this session, and is
+       If TEMPORARY or TEMP is specified,
+       the table is created unique to this session, and is
         automatically dropped on session exit.
        </para>
       </listitem>
@@ -83,10 +86,10 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
        <listitem>
         <para>
         If the INTO TABLE clause is specified, the result of the
-       query will be stored in another table with the indicated
+       query will be stored in a new table with the indicated
         name.
         The target table (<replaceable class="PARAMETER">new_table</replaceable>) will
-       be created automatically and should not exist before this command.
+       be created automatically and must not exist before this command.
          Refer to <command>SELECT INTO</command> for more information.
  
         <note>
@@ -143,7 +146,8 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
        <term><replaceable class="PARAMETER">select</replaceable></term>
        <listitem>
         <para>
-       A select statement with all features except the ORDER BY clause.
+       A select statement with all features except the ORDER BY and
+       LIMIT clauses.
         </para>
        </listitem>
       </varlistentry>
@@ -188,7 +192,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
  
   <refsect1 id="R1-SQL-SELECT-1">
    <refsect1info>
-   <date>1998-09-24</date>
+   <date>2000-03-15</date>
    </refsect1info>
    <title>
     Description
@@ -210,7 +214,9 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
    <para>
     <command>DISTINCT ON</command> eliminates rows that match on all the
     specified expressions, keeping only the first row of each set of
-   duplicates.  Note that "the first row" of each set is unpredictable
+   duplicates.  The DISTINCT ON expressions are interpreted using the
+   same rules as for ORDER BY items; see below.
+   Note that "the first row" of each set is unpredictable
     unless <command>ORDER BY</command> is used to ensure that the desired
     row appears first.  For example,
     <programlisting>
@@ -226,21 +232,20 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
  
    <para>
     The GROUP BY clause allows a user to divide a table
-   conceptually into groups.
+   into groups of rows that match on one or more values.
     (See <xref linkend="sql-groupby" endterm="sql-groupby-title">.)
    </para>
  
    <para>
-   The HAVING clause specifies a grouped table derived by the
-   elimination of groups from the result of the previously
-   specified clause.
+   The HAVING clause allows selection of only those groups of rows
+   meeting the specified condition.
     (See <xref linkend="sql-having" endterm="sql-having-title">.)
    </para>
     
    <para>
-   The ORDER BY clause allows a user to specify that he/she
-   wishes the rows sorted according to the ASCending or 
-   DESCending mode operator.
+   The ORDER BY clause causes the returned rows to be sorted in a specified
+   order.  If ORDER BY is not given, the rows are returned in whatever order
+   the system finds cheapest to produce.
     (See <xref linkend="sql-orderby-title" endterm="sql-orderby-title">.)
    </para>
     
@@ -279,7 +284,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="PARAMETER">expression</replac
     
    <refsect2 id="SQL-WHERE">
     <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
     </refsect2info>
     <title id="sql-where-title">
      WHERE Clause
@@ -312,15 +317,14 @@ WHERE <replaceable class="PARAMETER">boolean_expr</replaceable>
      locally-defined operator, 
      and <replaceable class="PARAMETER">log_op</replaceable> can be one 
      of: AND, OR, NOT.
-    The comparison returns either TRUE or FALSE and all
-    instances will be discarded
-    if the expression evaluates to FALSE.
+    SELECT will ignore all rows for which the WHERE condition does not return
+    TRUE.
     </para>
    </refsect2>
    
    <refsect2 id="SQL-GROUPBY">
     <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
     </refsect2info>
     <title id="sql-groupby-title">
      GROUP BY Clause
@@ -334,20 +338,28 @@ GROUP BY <replaceable class="PARAMETER">column</replaceable> [, ...]
     </para>
  
     <para>
-    GROUP BY will condense into a single row all rows that share the
+    GROUP BY will condense into a single row all selected rows that share the
      same values for the grouped columns.  Aggregate functions, if any,
      are computed across all rows making up each group, producing a
      separate value for each group (whereas without GROUP BY, an
      aggregate produces a single value computed across all the selected
-    rows).  When GROUP BY is present, it is not valid to refer to
+    rows).  When GROUP BY is present, it is not valid for the SELECT
+    output expression(s) to refer to
      ungrouped columns except within aggregate functions, since there
      would be more than one possible value to return for an ungrouped column.
     </para>
+
+   <para>
+    An item in GROUP BY can also be the name or ordinal number of an output
+    column (SELECT expression), or it can be an arbitrary expression formed
+    from input-column values.  In case of ambiguity, a GROUP BY name will
+    be interpreted as an input-column name rather than an output column name.
+   </para>
    </refsect2>
  
    <refsect2 id="SQL-HAVING">
     <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
     </refsect2info>
     <title id="sql-having-title">
      HAVING Clause
@@ -365,8 +377,12 @@ HAVING <replaceable class="PARAMETER">cond_expr</replaceable>
      
     <para>
      HAVING specifies a grouped table derived by the elimination
-    of groups from the result of the previously specified clause
-    that do not meet the <replaceable class="PARAMETER">cond_expr</replaceable>.</para>
+    of group rows that do not satisfy the
+    <replaceable class="PARAMETER">cond_expr</replaceable>.
+    HAVING is different from WHERE:
+    WHERE filters individual rows before application of GROUP BY,
+    while HAVING filters group rows created by GROUP BY.
+   </para>
  
     <para>
      Each column referenced in 
@@ -378,7 +394,7 @@ HAVING <replaceable class="PARAMETER">cond_expr</replaceable>
    
    <refsect2 id="SQL-ORDERBY">
     <refsect2info>
-    <date>1998-09-24</date>
+    <date>2000-03-15</date>
     </refsect2info>
     <title id="sql-orderby-title">
      ORDER BY Clause
@@ -389,15 +405,15 @@ ORDER BY <replaceable class="PARAMETER">column</replaceable> [ ASC | DESC ] [, .
      </synopsis></para>
      
     <para>
-    <replaceable class="PARAMETER">column</replaceable> can be either a column
-    name or an ordinal number.
+    <replaceable class="PARAMETER">column</replaceable> can be either a
+    result column name or an ordinal number.
     </para>
     <para>
      The ordinal numbers refers to the ordinal (left-to-right) position
-    of the column. This feature makes it possible to define an ordering
+    of the result column. This feature makes it possible to define an ordering
      on the basis of a column that does not have a proper name.
      This is never absolutely necessary because it is always possible
-    to assign a name to a calculated column using the AS clause, e.g.:
+    to assign a name to a result column using the AS clause, e.g.:
      <programlisting>
  SELECT title, date_prod + 1 AS newlen FROM films ORDER BY newlen;
      </programlisting></para>
@@ -410,6 +426,11 @@ SELECT title, date_prod + 1 AS newlen FROM films ORDER BY newlen;
      <programlisting>
  SELECT name FROM distributors ORDER BY code;
      </programlisting>
+    Note that if an ORDER BY item is a simple name that matches both
+    a result column name and an input column name, ORDER BY will interpret
+    it as the result column name.  This is the opposite of the choice that
+    GROUP BY will make in the same situation.  This inconsistency is
+    mandated by the SQL92 standard.
     </para>
      
     <para>
@@ -436,7 +457,7 @@ SELECT name FROM distributors ORDER BY code;
  
      where
      <replaceable class="PARAMETER">table_query</replaceable>
-    specifies any select expression without an ORDER BY clause.
+    specifies any select expression without an ORDER BY or LIMIT clause.
     </para>
      
     <para>
@@ -476,7 +497,7 @@ SELECT name FROM distributors ORDER BY code;
      
      where
      <replaceable class="PARAMETER">table_query</replaceable>
-    specifies any select expression without an ORDER BY clause.
+    specifies any select expression without an ORDER BY or LIMIT clause.
     </para>
  
     <para>
@@ -507,7 +528,7 @@ SELECT name FROM distributors ORDER BY code;
      
      where
      <replaceable class="PARAMETER">table_query</replaceable>
-    specifies any select expression without an ORDER BY clause.
+    specifies any select expression without an ORDER BY or LIMIT clause.
     </para>
  
     <para>
@@ -560,7 +581,7 @@ SELECT name FROM distributors ORDER BY code;
     </para>
  
     <para>
-    As of Postgres 7.0, the
+    As of PostgreSQL 7.0, the
      query optimizer takes LIMIT into account when generating a query plan,
      so you are very likely to get different plans (yielding different row
      orders) depending on what you give for LIMIT and OFFSET.  Thus, using
@@ -765,6 +786,18 @@ SELECT distributors.* WHERE name = 'Westwood';
       The DISTINCT ON phrase is not part of <acronym>SQL92</acronym>.
       Nor are LIMIT and OFFSET.
      </para>
+     
+    <para>
+     In <acronym>SQL92</acronym>, an ORDER BY clause may only use result
+     column names or numbers, while a GROUP BY clause may only use input
+     column names.
+     <productname>Postgres</productname> extends each of these clauses to
+     allow the other choice as well (but it uses the standard's interpretation
+     if there is ambiguity).
+     <productname>Postgres</productname> also allows both clauses to specify
+     arbitrary expressions.  Note that names appearing in an expression will
+     always be taken as input-column names, not as result-column names.
+    </para>
     </refsect3>
  
     <refsect3 id="R3-SQL-UNION-1">
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c

index 53d9b25f11b278d1c3eb6a98d923e73334030118..96a005ff0d919a29f17392f3de60928f00313d35 100644 (file)
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/parser/parse_clause.c,v 1.56 2000/03/14 23:06:32 thomas Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/parser/parse_clause.c,v 1.57 2000/03/15 23:31:04 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -719,9 +719,9 @@ parseFromClause(ParseState *pstate, List *frmList)
   *       list as a "resjunk" node.
   *
   * node                the ORDER BY, GROUP BY, or DISTINCT ON expression to be matched
- * tlist       the existing target list (NB: this cannot be NIL, which is a
- *                     good thing since we'd be unable to append to it...)
- * clause      identifies clause type (mainly for error messages).
+ * tlist       the existing target list (NB: this will never be NIL, which is a
+ *                     good thing since we'd be unable to append to it if it were...)
+ * clause      identifies clause type being processed.
   */
  static TargetEntry *
  findTargetlistEntry(ParseState *pstate, Node *node, List *tlist, int clause)
@@ -733,7 +733,7 @@ findTargetlistEntry(ParseState *pstate, Node *node, List *tlist, int clause)
         /*----------
          * Handle two special cases as mandated by the SQL92 spec:
          *
-        * 1. ORDER BY ColumnName
+        * 1. Bare ColumnName (no qualifier or subscripts)
          *    For a bare identifier, we search for a matching column name
          *        in the existing target list.  Multiple matches are an error
          *        unless they refer to identical values; for example,
@@ -741,49 +741,76 @@ findTargetlistEntry(ParseState *pstate, Node *node, List *tlist, int clause)
          *        but not   SELECT a AS b, b FROM table ORDER BY b
          *        If no match is found, we fall through and treat the identifier
          *        as an expression.
-        *        We do NOT attempt this match for GROUP BY, since it is clearly
-        *        contrary to the spec to use an output column name in preference
-        *        to an underlying column name in GROUP BY.  DISTINCT ON isn't in
-        *        the standard, so we can do what we like there; we choose to make
-        *        it work like GROUP BY.
+        *        For GROUP BY, it is incorrect to match the grouping item against
+        *        targetlist entries: according to SQL92, an identifier in GROUP BY
+        *        is a reference to a column name exposed by FROM, not to a target
+        *        list column.  However, many implementations (including pre-7.0
+        *        PostgreSQL) accept this anyway.  So for GROUP BY, we look first
+        *        to see if the identifier matches any FROM column name, and only
+        *        try for a targetlist name if it doesn't.  This ensures that we
+        *        adhere to the spec in the case where the name could be both.
+        *        DISTINCT ON isn't in the standard, so we can do what we like there;
+        *        we choose to make it work like ORDER BY, on the rather flimsy
+        *        grounds that ordinary DISTINCT works on targetlist entries.
          *
-        * 2. ORDER BY/GROUP BY/DISTINCT ON IntegerConstant
+        * 2. IntegerConstant
          *        This means to use the n'th item in the existing target list.
          *        Note that it would make no sense to order/group/distinct by an
          *        actual constant, so this does not create a conflict with our
          *        extension to order/group by an expression.
-        *        I believe that GROUP BY column-number is not sanctioned by SQL92,
-        *        but since the standard has no other behavior defined for this
-        *        syntax, we may as well continue to support our past behavior.
+        *        GROUP BY column-number is not allowed by SQL92, but since
+        *        the standard has no other behavior defined for this syntax,
+        *        we may as well accept this common extension.
          *
-        * Note that pre-existing resjunk targets must not be used in either case.
+        * Note that pre-existing resjunk targets must not be used in either case,
+        * since the user didn't write them in his SELECT list.
+        *
+        * If neither special case applies, fall through to treat the item as
+        * an expression.
          *----------
          */
-       if (clause == ORDER_CLAUSE &&
-               IsA(node, Ident) && ((Ident *) node)->indirection == NIL)
+       if (IsA(node, Ident) && ((Ident *) node)->indirection == NIL)
         {
                 char       *name = ((Ident *) node)->name;
-               foreach(tl, tlist)
+
+               if (clause == GROUP_CLAUSE)
                 {
-                       TargetEntry *tle = (TargetEntry *) lfirst(tl);
-                       Resdom     *resnode = tle->resdom;
+                       /*
+                        * In GROUP BY, we must prefer a match against a FROM-clause
+                        * column to one against the targetlist.  Look to see if there is
+                        * a matching column.  If so, fall through to let transformExpr()
+                        * do the rest.  NOTE: if name could refer ambiguously to more
+                        * than one column name exposed by FROM, colnameRangeTableEntry
+                        * will elog(ERROR).  That's just what we want here.
+                        */
+                       if (colnameRangeTableEntry(pstate, name) != NULL)
+                               name = NULL;
+               }
  
-                       if (!resnode->resjunk &&
-                               strcmp(resnode->resname, name) == 0)
+               if (name != NULL)
+               {
+                       foreach(tl, tlist)
                         {
-                               if (target_result != NULL)
+                               TargetEntry *tle = (TargetEntry *) lfirst(tl);
+                               Resdom     *resnode = tle->resdom;
+
+                               if (!resnode->resjunk &&
+                                       strcmp(resnode->resname, name) == 0)
                                 {
-                                       if (! equal(target_result->expr, tle->expr))
-                                               elog(ERROR, "%s '%s' is ambiguous",
-                                                        clauseText[clause], name);
+                                       if (target_result != NULL)
+                                       {
+                                               if (! equal(target_result->expr, tle->expr))
+                                                       elog(ERROR, "%s '%s' is ambiguous",
+                                                                clauseText[clause], name);
+                                       }
+                                       else
+                                               target_result = tle;
+                                       /* Stay in loop to check for ambiguity */
                                 }
-                               else
-                                       target_result = tle;
-                               /* Stay in loop to check for ambiguity */
                         }
+                       if (target_result != NULL)
+                               return target_result; /* return the first match */
                 }
-               if (target_result != NULL)
-                       return target_result; /* return the first match */
         }
         if (IsA(node, A_Const))
         {
diff --git a/src/test/regress/expected/numerology.out b/src/test/regress/expected/numerology.out

index 8e13a9e6acbc94a120cc98cfc72f81aad88493d0..c5ad36fdd32df016b14ccd5ee49823117619660d 100644 (file)
--- a/src/test/regress/expected/numerology.out
+++ b/src/test/regress/expected/numerology.out
@@ -96,14 +96,18 @@ SELECT f1 AS two, max(f3) AS max_float, min(f3) as min_float
     2 |                    0 | -1.2345678901234e+200
  (2 rows)
  
--- Postgres used to accept this, but it is clearly against SQL92 to
--- interpret GROUP BY arguments as result column names; they should
--- be source column names *only*.  An error is expected.
+-- GROUP BY a result column name is not legal per SQL92, but we accept it
+-- anyway (if the name is not the name of any column exposed by FROM).
  SELECT f1 AS two, max(f3) AS max_float, min(f3) AS min_float
    FROM TEMP_GROUP
    GROUP BY two
    ORDER BY two, max_float, min_float;
-ERROR:  Attribute 'two' not found
+ two |      max_float       |       min_float       
+-----+----------------------+-----------------------
+   1 | 1.2345678901234e+200 |                     0
+   2 |                    0 | -1.2345678901234e+200
+(2 rows)
+
  SELECT f1 AS two, (max(f3) + 1) AS max_plus_1, (min(f3) - 1) AS min_minus_1
    FROM TEMP_GROUP
    GROUP BY f1
diff --git a/src/test/regress/sql/numerology.sql b/src/test/regress/sql/numerology.sql

index b30f008bffb0ff21b6488d96820761ba46758966..2220fdba38587ea1599124ac01b3a2cc8ddf0dc1 100644 (file)
--- a/src/test/regress/sql/numerology.sql
+++ b/src/test/regress/sql/numerology.sql
@@ -70,9 +70,8 @@ SELECT f1 AS two, max(f3) AS max_float, min(f3) as min_float
    GROUP BY f1
    ORDER BY two, max_float, min_float;
  
--- Postgres used to accept this, but it is clearly against SQL92 to
--- interpret GROUP BY arguments as result column names; they should
--- be source column names *only*.  An error is expected.
+-- GROUP BY a result column name is not legal per SQL92, but we accept it
+-- anyway (if the name is not the name of any column exposed by FROM).
  SELECT f1 AS two, max(f3) AS max_float, min(f3) AS min_float
    FROM TEMP_GROUP
    GROUP BY two
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 15 Mar 2000 23:31:19 +0000 (23:31 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 15 Mar 2000 23:31:19 +0000 (23:31 +0000)
doc/src/sgml/ref/select.sgml		patch \| blob \| history
src/backend/parser/parse_clause.c		patch \| blob \| history
src/test/regress/expected/numerology.out		patch \| blob \| history
src/test/regress/sql/numerology.sql		patch \| blob \| history