From: Robert Haas Date: Wed, 11 Mar 2015 14:44:04 +0000 (-0400) Subject: Suggest to the user the column they may have meant to reference. X-Git-Tag: REL9_5_ALPHA1~639 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e529cd4ffa605c6f;p=postgresql Suggest to the user the column they may have meant to reference. Error messages informing the user that no such column exists can sometimes provoke a perplexed response. This often happens due to a subtle typo in the column name or, perhaps less likely, in the alias name. To speed discovery of what the real issue is in such cases, we'll now search the range table for approximate matches. If there are one or two such matches that are good enough to think that they might be what the user intended to type, and better than all other approximate matches, we'll issue a hint suggesting that the user might have intended to reference those columns. Peter Geoghegan and Robert Haas --- diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 7829bcbac1..130e52b26c 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -556,7 +556,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field2); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + 0, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ @@ -601,7 +602,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field3); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + 0, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ @@ -659,7 +661,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field4); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + 0, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index a20080400e..53bbaecac3 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -1779,7 +1779,7 @@ ParseComplexProjection(ParseState *pstate, char *funcname, Node *first_arg, ((Var *) first_arg)->varno, ((Var *) first_arg)->varlevelsup); /* Return a Var if funcname matches a column, else NULL */ - return scanRTEForColumn(pstate, rte, funcname, location); + return scanRTEForColumn(pstate, rte, funcname, location, 0, NULL); } /* diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index f416fc2a45..80daeb9dc4 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -33,6 +33,8 @@ #include "utils/syscache.h" +#define MAX_FUZZY_DISTANCE 3 + static RangeTblEntry *scanNameSpaceForRefname(ParseState *pstate, const char *refname, int location); static RangeTblEntry *scanNameSpaceForRelid(ParseState *pstate, Oid relid, @@ -519,6 +521,101 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup) return NULL; /* keep compiler quiet */ } +/* + * updateFuzzyAttrMatchState + * Using Levenshtein distance, consider if column is best fuzzy match. + */ +static void +updateFuzzyAttrMatchState(int fuzzy_rte_penalty, + FuzzyAttrMatchState *fuzzystate, RangeTblEntry *rte, + const char *actual, const char *match, int attnum) +{ + int columndistance; + int matchlen; + + /* Bail before computing the Levenshtein distance if there's no hope. */ + if (fuzzy_rte_penalty > fuzzystate->distance) + return; + + /* + * Outright reject dropped columns, which can appear here with apparent + * empty actual names, per remarks within scanRTEForColumn(). + */ + if (actual[0] == '\0') + return; + + /* Use Levenshtein to compute match distance. */ + matchlen = strlen(match); + columndistance = + varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen, + 1, 1, 1, + fuzzystate->distance + 1 + - fuzzy_rte_penalty); + + /* + * If more than half the characters are different, don't treat it as a + * match, to avoid making ridiculous suggestions. + */ + if (columndistance > matchlen / 2) + return; + + /* + * From this point on, we can ignore the distinction between the + * RTE-name distance and the column-name distance. + */ + columndistance += fuzzy_rte_penalty; + + /* + * If the new distance is less than or equal to that of the best match + * found so far, update fuzzystate. + */ + if (columndistance < fuzzystate->distance) + { + /* Store new lowest observed distance for RTE */ + fuzzystate->distance = columndistance; + fuzzystate->rfirst = rte; + fuzzystate->first = attnum; + fuzzystate->rsecond = NULL; + fuzzystate->second = InvalidAttrNumber; + } + else if (columndistance == fuzzystate->distance) + { + /* + * This match distance may equal a prior match within this same + * range table. When that happens, the prior match may also be + * given, but only if there is no more than two equally distant + * matches from the RTE (in turn, our caller will only accept + * two equally distant matches overall). + */ + if (AttributeNumberIsValid(fuzzystate->second)) + { + /* Too many RTE-level matches */ + fuzzystate->rfirst = NULL; + fuzzystate->first = InvalidAttrNumber; + fuzzystate->rsecond = NULL; + fuzzystate->second = InvalidAttrNumber; + /* Clearly, distance is too low a bar (for *any* RTE) */ + fuzzystate->distance = columndistance - 1; + } + else if (AttributeNumberIsValid(fuzzystate->first)) + { + /* Record as provisional second match for RTE */ + fuzzystate->rsecond = rte; + fuzzystate->second = attnum; + } + else if (fuzzystate->distance <= MAX_FUZZY_DISTANCE) + { + /* + * Record as provisional first match (this can occasionally + * occur because previous lowest distance was "too low a + * bar", rather than being associated with a real match) + */ + fuzzystate->rfirst = rte; + fuzzystate->first = attnum; + } + } +} + /* * scanRTEForColumn * Search the column names of a single RTE for the given name. @@ -527,10 +624,14 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup) * * Side effect: if we find a match, mark the RTE as requiring read access * for the column. + * + * Additional side effect: if fuzzystate is non-NULL, check non-system columns + * for an approximate match and update fuzzystate accordingly. */ Node * scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, - int location) + int location, int fuzzy_rte_penalty, + FuzzyAttrMatchState *fuzzystate) { Node *result = NULL; int attnum = 0; @@ -548,12 +649,16 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, * Should this somehow go wrong and we try to access a dropped column, * we'll still catch it by virtue of the checks in * get_rte_attribute_type(), which is called by make_var(). That routine - * has to do a cache lookup anyway, so the check there is cheap. + * has to do a cache lookup anyway, so the check there is cheap. Callers + * interested in finding match with shortest distance need to defend + * against this directly, though. */ foreach(c, rte->eref->colnames) { + const char *attcolname = strVal(lfirst(c)); + attnum++; - if (strcmp(strVal(lfirst(c)), colname) == 0) + if (strcmp(attcolname, colname) == 0) { if (result) ereport(ERROR, @@ -566,6 +671,11 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, markVarForSelectPriv(pstate, var, rte); result = (Node *) var; } + + /* Updating fuzzy match state, if provided. */ + if (fuzzystate != NULL) + updateFuzzyAttrMatchState(fuzzy_rte_penalty, fuzzystate, + rte, attcolname, colname, attnum); } /* @@ -642,7 +752,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, continue; /* use orig_pstate here to get the right sublevels_up */ - newresult = scanRTEForColumn(orig_pstate, rte, colname, location); + newresult = scanRTEForColumn(orig_pstate, rte, colname, location, + 0, NULL); if (newresult) { @@ -668,8 +779,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, /* * searchRangeTableForCol - * See if any RangeTblEntry could possibly provide the given column name. - * If so, return a pointer to the RangeTblEntry; else return NULL. + * See if any RangeTblEntry could possibly provide the given column name (or + * find the best match available). Returns state with relevant details. * * This is different from colNameToVar in that it considers every entry in * the ParseState's rangetable(s), not only those that are currently visible @@ -677,11 +788,31 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, * and it may give ambiguous results (there might be multiple equally valid * matches, but only one will be returned). This must be used ONLY as a * heuristic in giving suitable error messages. See errorMissingColumn. + * + * This function is also different in that it will consider approximate + * matches -- if the user entered an alias/column pair that is only slightly + * different from a valid pair, we may be able to infer what they meant to + * type and provide a reasonable hint. + * + * The FuzzyAttrMatchState will have 'rfirst' pointing to the best RTE + * containing the most promising match for the alias and column name. If + * the alias and column names match exactly, 'first' will be InvalidAttrNumber; + * otherwise, it will be the attribute number for the match. In the latter + * case, 'rsecond' may point to a second, equally close approximate match, + * and 'second' will contain the attribute number for the second match. */ -static RangeTblEntry * -searchRangeTableForCol(ParseState *pstate, char *colname, int location) +static FuzzyAttrMatchState * +searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname, + int location) { ParseState *orig_pstate = pstate; + FuzzyAttrMatchState *fuzzystate = palloc(sizeof(FuzzyAttrMatchState)); + + fuzzystate->distance = MAX_FUZZY_DISTANCE + 1; + fuzzystate->rfirst = NULL; + fuzzystate->rsecond = NULL; + fuzzystate->first = InvalidAttrNumber; + fuzzystate->second = InvalidAttrNumber; while (pstate != NULL) { @@ -689,15 +820,51 @@ searchRangeTableForCol(ParseState *pstate, char *colname, int location) foreach(l, pstate->p_rtable) { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + int fuzzy_rte_penalty = 0; - if (scanRTEForColumn(orig_pstate, rte, colname, location)) - return rte; + /* + * Typically, it is not useful to look for matches within join + * RTEs; they effectively duplicate other RTEs for our purposes, + * and if a match is chosen from a join RTE, an unhelpful alias is + * displayed in the final diagnostic message. + */ + if (rte->rtekind == RTE_JOIN) + continue; + + /* + * If the user didn't specify an alias, then matches against one + * RTE are as good as another. But if the user did specify an + * alias, then we want at least a fuzzy - and preferably an exact + * - match for the range table entry. + */ + if (alias != NULL) + fuzzy_rte_penalty = + varstr_levenshtein(alias, strlen(alias), + rte->eref->aliasname, + strlen(rte->eref->aliasname), + 1, 1, 1); + + /* + * Scan for a matching column; if we find an exact match, we're + * done. Otherwise, update fuzzystate. + */ + if (scanRTEForColumn(orig_pstate, rte, colname, location, + fuzzy_rte_penalty, fuzzystate) + && fuzzy_rte_penalty == 0) + { + fuzzystate->rfirst = rte; + fuzzystate->first = InvalidAttrNumber; + fuzzystate->rsecond = NULL; + fuzzystate->second = InvalidAttrNumber; + return fuzzystate; + } } pstate = pstate->parentParseState; } - return NULL; + + return fuzzystate; } /* @@ -2860,34 +3027,67 @@ void errorMissingColumn(ParseState *pstate, char *relname, char *colname, int location) { - RangeTblEntry *rte; + FuzzyAttrMatchState *state; + char *closestfirst = NULL; /* - * If relname was given, just play dumb and report it. (In practice, a - * bad qualification name should end up at errorMissingRTE, not here, so - * no need to work hard on this case.) + * Search the entire rtable looking for possible matches. If we find one, + * emit a hint about it. + * + * TODO: improve this code (and also errorMissingRTE) to mention using + * LATERAL if appropriate. */ - if (relname) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column %s.%s does not exist", relname, colname), - parser_errposition(pstate, location))); + state = searchRangeTableForCol(pstate, relname, colname, location); /* - * Otherwise, search the entire rtable looking for possible matches. If - * we find one, emit a hint about it. + * Extract closest col string for best match, if any. * - * TODO: improve this code (and also errorMissingRTE) to mention using - * LATERAL if appropriate. + * Infer an exact match referenced despite not being visible from the fact + * that an attribute number was not present in state passed back -- this is + * what is reported when !closestfirst. There might also be an exact match + * that was qualified with an incorrect alias, in which case closestfirst + * will be set (so hint is the same as generic fuzzy case). */ - rte = searchRangeTableForCol(pstate, colname, location); - - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" does not exist", colname), - rte ? errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.", - colname, rte->eref->aliasname) : 0, - parser_errposition(pstate, location))); + if (state->rfirst && AttributeNumberIsValid(state->first)) + closestfirst = strVal(list_nth(state->rfirst->eref->colnames, + state->first - 1)); + + if (!state->rsecond) + { + /* + * Handle case where there is zero or one column suggestions to hint, + * including exact matches referenced but not visible. + */ + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + relname ? + errmsg("column %s.%s does not exist", relname, colname): + errmsg("column \"%s\" does not exist", colname), + state->rfirst ? closestfirst ? + errhint("Perhaps you meant to reference the column \"%s\".\"%s\".", + state->rfirst->eref->aliasname, closestfirst): + errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.", + colname, state->rfirst->eref->aliasname): 0, + parser_errposition(pstate, location))); + } + else + { + /* Handle case where there are two equally useful column hints */ + char *closestsecond; + + closestsecond = strVal(list_nth(state->rsecond->eref->colnames, + state->second - 1)); + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + relname ? + errmsg("column %s.%s does not exist", relname, colname): + errmsg("column \"%s\" does not exist", colname), + errhint("Perhaps you meant to reference the column \"%s\".\"%s\" or the column \"%s\".\"%s\".", + state->rfirst->eref->aliasname, closestfirst, + state->rsecond->eref->aliasname, closestsecond), + parser_errposition(pstate, location))); + } } diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c index 3669adc18a..f6e2ca6452 100644 --- a/src/backend/utils/adt/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -95,6 +95,15 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen, #define STOP_COLUMN m #endif + /* + * A common use for Levenshtein distance is to match attributes when building + * diagnostic, user-visible messages. Restrict the size of + * MAX_LEVENSHTEIN_STRLEN at compile time so that this is guaranteed to + * work. + */ + StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN, + "Levenshtein hinting mechanism restricts NAMEDATALEN"); + m = pg_mbstrlen_with_len(source, slen); n = pg_mbstrlen_with_len(target, tlen); diff --git a/src/include/parser/parse_relation.h b/src/include/parser/parse_relation.h index fc0a2577f3..9dc0d5846b 100644 --- a/src/include/parser/parse_relation.h +++ b/src/include/parser/parse_relation.h @@ -16,6 +16,24 @@ #include "parser/parse_node.h" + +/* + * Support for fuzzily matching column. + * + * This is for building diagnostic messages, where non-exact matching + * attributes are suggested to the user. The struct's fields may be facets of + * a particular RTE, or of an entire range table, depending on context. + */ +typedef struct +{ + int distance; /* Weighted distance (lowest so far) */ + RangeTblEntry *rfirst; /* RTE of first */ + AttrNumber first; /* Closest attribute so far */ + RangeTblEntry *rsecond; /* RTE of second */ + AttrNumber second; /* Second closest attribute so far */ +} FuzzyAttrMatchState; + + extern RangeTblEntry *refnameRangeTblEntry(ParseState *pstate, const char *schemaname, const char *refname, @@ -35,7 +53,8 @@ extern RangeTblEntry *GetRTEByRangeTablePosn(ParseState *pstate, extern CommonTableExpr *GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup); extern Node *scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, - char *colname, int location); + char *colname, int location, + int fuzzy_rte_penalty, FuzzyAttrMatchState *fuzzystate); extern Node *colNameToVar(ParseState *pstate, char *colname, bool localonly, int location); extern void markVarForSelectPriv(ParseState *pstate, Var *var, diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index d233710871..51db1b6792 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -536,6 +536,7 @@ create table atacc1 ( test int ); -- add a check constraint (fails) alter table atacc1 add constraint atacc_test1 check (test1>3); ERROR: column "test1" does not exist +HINT: Perhaps you meant to reference the column "atacc1"."test". drop table atacc1; -- something a little more complicated create table atacc1 ( test int, test2 int, test3 int); @@ -1342,6 +1343,7 @@ select f1 from c1; ERROR: column "f1" does not exist LINE 1: select f1 from c1; ^ +HINT: Perhaps you meant to reference the column "c1"."f2". drop table p1 cascade; NOTICE: drop cascades to table c1 create table p1 (f1 int, f2 int); @@ -1355,6 +1357,7 @@ select f1 from c1; ERROR: column "f1" does not exist LINE 1: select f1 from c1; ^ +HINT: Perhaps you meant to reference the column "c1"."f2". drop table p1 cascade; NOTICE: drop cascades to table c1 create table p1 (f1 int, f2 int); diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index ca3a17b283..1e3fe07350 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2222,6 +2222,12 @@ select * from t1 left join t2 on (t1.a = t2.a); 200 | 1000 | 200 | 2001 (5 rows) +-- Test matching of column name with wrong alias +select t1.x from t1 join t3 on (t1.a = t3.x); +ERROR: column t1.x does not exist +LINE 1: select t1.x from t1 join t3 on (t1.a = t3.x); + ^ +HINT: Perhaps you meant to reference the column "t3"."x". -- -- regression test for 8.1 merge right join bug -- @@ -3433,6 +3439,38 @@ select * from ----+----+----+---- (0 rows) +-- +-- Test hints given on incorrect column references are useful +-- +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn +ERROR: column t1.uunique1 does not exist +LINE 1: select t1.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1"."unique1". +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +ERROR: column t2.uunique1 does not exist +LINE 1: select t2.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t2"."unique1". +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once +ERROR: column "uunique1" does not exist +LINE 1: select uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1"."unique1" or the column "t2"."unique1". +-- +-- Take care to reference the correct RTE +-- +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +ERROR: column atts.relid does not exist +LINE 1: select atts.relid::regclass, s.* from pg_stats s join + ^ -- -- Test LATERAL -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 6005476c16..7a08bdff7b 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -397,6 +397,10 @@ insert into t2a values (200, 2001); select * from t1 left join t2 on (t1.a = t2.a); +-- Test matching of column name with wrong alias + +select t1.x from t1 join t3 on (t1.a = t3.x); + -- -- regression test for 8.1 merge right join bug -- @@ -1060,6 +1064,26 @@ select * from int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok -- +-- Test hints given on incorrect column references are useful +-- + +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once + +-- +-- Take care to reference the correct RTE +-- + +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +-- -- Test LATERAL --