From: Tom Lane Date: Thu, 14 Feb 2019 15:51:59 +0000 (-0500) Subject: Move pattern selectivity code from selfuncs.c to like_support.c. X-Git-Tag: REL_12_BETA1~743 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=49fa99e54ec0ded180b52a4a14e543702d53e8c9;p=postgresql Move pattern selectivity code from selfuncs.c to like_support.c. While at it, refactor patternsel() a bit so that it can be used from the LIKE/regex planner support functions as well. This makes the planner able to deal equally well with either operator or function syntax for these operations. I'm not excited about that as a feature in itself, but it provides a nice model for extensions to follow if they want such behavior for their operations. This change localizes the use of pattern_fixed_prefix() and make_greater_string() so that they no longer need be exported. (We might get pushback from extensions about that, perhaps, in which case I'd be inclined to re-export them in a new header file like_support.h.) This reduces the bulk of selfuncs.c a fair amount, removing ~1370 lines or about one-sixth of that file; it's still too big, but this is progress. Discussion: https://postgr.es/m/24537.1550093915@sss.pgh.pa.us --- diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index b001dde5fc..69509811ef 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -34,17 +34,39 @@ */ #include "postgres.h" +#include + +#include "access/htup_details.h" #include "access/stratnum.h" +#include "catalog/pg_collation.h" #include "catalog/pg_opfamily.h" +#include "catalog/pg_statistic.h" #include "catalog/pg_type.h" +#include "mb/pg_wchar.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "nodes/supportnodes.h" #include "utils/builtins.h" +#include "utils/datum.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" #include "utils/selfuncs.h" +#include "utils/varlena.h" + + +typedef enum +{ + Pattern_Type_Like, + Pattern_Type_Like_IC, + Pattern_Type_Regex, + Pattern_Type_Regex_IC, + Pattern_Type_Prefix +} Pattern_Type; +typedef enum +{ + Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact +} Pattern_Prefix_Status; static Node *like_regex_support(Node *rawreq, Pattern_Type ptype); static List *match_pattern_prefix(Node *leftop, @@ -53,6 +75,34 @@ static List *match_pattern_prefix(Node *leftop, Oid expr_coll, Oid opfamily, Oid indexcollation); +static double patternsel_common(PlannerInfo *root, + Oid oprid, + Oid opfuncid, + List *args, + int varRelid, + Oid collation, + Pattern_Type ptype, + bool negate); +static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, + Pattern_Type ptype, + Oid collation, + Const **prefix, + Selectivity *rest_selec); +static Selectivity prefix_selectivity(PlannerInfo *root, + VariableStatData *vardata, + Oid vartype, Oid opfamily, Const *prefixcon); +static Selectivity like_selectivity(const char *patt, int pattlen, + bool case_insensitive); +static Selectivity regex_selectivity(const char *patt, int pattlen, + bool case_insensitive, + int fixed_prefix_len); +static int pattern_char_isalpha(char c, bool is_multibyte, + pg_locale_t locale, bool locale_is_c); +static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc, + Oid collation); +static Datum string_to_datum(const char *str, Oid datatype); +static Const *string_to_const(const char *str, Oid datatype); +static Const *string_to_bytea_const(const char *str, size_t str_len); /* @@ -96,7 +146,39 @@ like_regex_support(Node *rawreq, Pattern_Type ptype) { Node *ret = NULL; - if (IsA(rawreq, SupportRequestIndexCondition)) + if (IsA(rawreq, SupportRequestSelectivity)) + { + /* + * Make a selectivity estimate for a function call, just as we'd do if + * the call was via the corresponding operator. + */ + SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq; + Selectivity s1; + + if (req->is_join) + { + /* + * For the moment we just punt. If patternjoinsel is ever + * improved to do better, this should be made to call it. + */ + s1 = DEFAULT_MATCH_SEL; + } + else + { + /* Share code with operator restriction selectivity functions */ + s1 = patternsel_common(req->root, + InvalidOid, + req->funcid, + req->args, + req->varRelid, + req->inputcollid, + ptype, + false); + } + req->selectivity = s1; + ret = (Node *) req; + } + else if (IsA(rawreq, SupportRequestIndexCondition)) { /* Try to convert operator/function call to index conditions */ SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq; @@ -311,3 +393,1333 @@ match_pattern_prefix(Node *leftop, return result; } + + +/* + * patternsel_common - generic code for pattern-match restriction selectivity. + * + * To support using this from either the operator or function paths, caller + * may pass either operator OID or underlying function OID; we look up the + * latter from the former if needed. (We could just have patternsel() call + * get_opcode(), but the work would be wasted if we don't have a need to + * compare a fixed prefix to the pg_statistic data.) + * + * Note that oprid and/or opfuncid should be for the positive-match operator + * even when negate is true. + */ +static double +patternsel_common(PlannerInfo *root, + Oid oprid, + Oid opfuncid, + List *args, + int varRelid, + Oid collation, + Pattern_Type ptype, + bool negate) +{ + VariableStatData vardata; + Node *other; + bool varonleft; + Datum constval; + Oid consttype; + Oid vartype; + Oid opfamily; + Pattern_Prefix_Status pstatus; + Const *patt; + Const *prefix = NULL; + Selectivity rest_selec = 0; + double nullfrac = 0.0; + double result; + + /* + * Initialize result to the appropriate default estimate depending on + * whether it's a match or not-match operator. + */ + if (negate) + result = 1.0 - DEFAULT_MATCH_SEL; + else + result = DEFAULT_MATCH_SEL; + + /* + * If expression is not variable op constant, then punt and return the + * default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + return result; + if (!varonleft || !IsA(other, Const)) + { + ReleaseVariableStats(vardata); + return result; + } + + /* + * If the constant is NULL, assume operator is strict and return zero, ie, + * operator will never return TRUE. (It's zero even for a negator op.) + */ + if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); + return 0.0; + } + constval = ((Const *) other)->constvalue; + consttype = ((Const *) other)->consttype; + + /* + * The right-hand const is type text or bytea for all supported operators. + * We do not expect to see binary-compatible types here, since + * const-folding should have relabeled the const to exactly match the + * operator's declared type. + */ + if (consttype != TEXTOID && consttype != BYTEAOID) + { + ReleaseVariableStats(vardata); + return result; + } + + /* + * Similarly, the exposed type of the left-hand side should be one of + * those we know. (Do not look at vardata.atttype, which might be + * something binary-compatible but different.) We can use it to choose + * the index opfamily from which we must draw the comparison operators. + * + * NOTE: It would be more correct to use the PATTERN opfamilies than the + * simple ones, but at the moment ANALYZE will not generate statistics for + * the PATTERN operators. But our results are so approximate anyway that + * it probably hardly matters. + */ + vartype = vardata.vartype; + + switch (vartype) + { + case TEXTOID: + case NAMEOID: + opfamily = TEXT_BTREE_FAM_OID; + break; + case BPCHAROID: + opfamily = BPCHAR_BTREE_FAM_OID; + break; + case BYTEAOID: + opfamily = BYTEA_BTREE_FAM_OID; + break; + default: + ReleaseVariableStats(vardata); + return result; + } + + /* + * Grab the nullfrac for use below. + */ + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + nullfrac = stats->stanullfrac; + } + + /* + * Pull out any fixed prefix implied by the pattern, and estimate the + * fractional selectivity of the remainder of the pattern. Unlike many + * other selectivity estimators, we use the pattern operator's actual + * collation for this step. This is not because we expect the collation + * to make a big difference in the selectivity estimate (it seldom would), + * but because we want to be sure we cache compiled regexps under the + * right cache key, so that they can be re-used at runtime. + */ + patt = (Const *) other; + pstatus = pattern_fixed_prefix(patt, ptype, collation, + &prefix, &rest_selec); + + /* + * If necessary, coerce the prefix constant to the right type. + */ + if (prefix && prefix->consttype != vartype) + { + char *prefixstr; + + switch (prefix->consttype) + { + case TEXTOID: + prefixstr = TextDatumGetCString(prefix->constvalue); + break; + case BYTEAOID: + prefixstr = DatumGetCString(DirectFunctionCall1(byteaout, + prefix->constvalue)); + break; + default: + elog(ERROR, "unrecognized consttype: %u", + prefix->consttype); + ReleaseVariableStats(vardata); + return result; + } + prefix = string_to_const(prefixstr, vartype); + pfree(prefixstr); + } + + if (pstatus == Pattern_Prefix_Exact) + { + /* + * Pattern specifies an exact match, so pretend operator is '=' + */ + Oid eqopr = get_opfamily_member(opfamily, vartype, vartype, + BTEqualStrategyNumber); + + if (eqopr == InvalidOid) + elog(ERROR, "no = operator for opfamily %u", opfamily); + result = var_eq_const(&vardata, eqopr, prefix->constvalue, + false, true, false); + } + else + { + /* + * Not exact-match pattern. If we have a sufficiently large + * histogram, estimate selectivity for the histogram part of the + * population by counting matches in the histogram. If not, estimate + * selectivity of the fixed prefix and remainder of pattern + * separately, then combine the two to get an estimate of the + * selectivity for the part of the column population represented by + * the histogram. (For small histograms, we combine these + * approaches.) + * + * We then add up data for any most-common-values values; these are + * not in the histogram population, and we can get exact answers for + * them by applying the pattern operator, so there's no reason to + * approximate. (If the MCVs cover a significant part of the total + * population, this gives us a big leg up in accuracy.) + */ + Selectivity selec; + int hist_size; + FmgrInfo opproc; + double mcv_selec, + sumcommon; + + /* Try to use the histogram entries to get selectivity */ + if (!OidIsValid(opfuncid)) + opfuncid = get_opcode(oprid); + fmgr_info(opfuncid, &opproc); + + selec = histogram_selectivity(&vardata, &opproc, constval, true, + 10, 1, &hist_size); + + /* If not at least 100 entries, use the heuristic method */ + if (hist_size < 100) + { + Selectivity heursel; + Selectivity prefixsel; + + if (pstatus == Pattern_Prefix_Partial) + prefixsel = prefix_selectivity(root, &vardata, vartype, + opfamily, prefix); + else + prefixsel = 1.0; + heursel = prefixsel * rest_selec; + + if (selec < 0) /* fewer than 10 histogram entries? */ + selec = heursel; + else + { + /* + * For histogram sizes from 10 to 100, we combine the + * histogram and heuristic selectivities, putting increasingly + * more trust in the histogram for larger sizes. + */ + double hist_weight = hist_size / 100.0; + + selec = selec * hist_weight + heursel * (1.0 - hist_weight); + } + } + + /* In any case, don't believe extremely small or large estimates. */ + if (selec < 0.0001) + selec = 0.0001; + else if (selec > 0.9999) + selec = 0.9999; + + /* + * If we have most-common-values info, add up the fractions of the MCV + * entries that satisfy MCV OP PATTERN. These fractions contribute + * directly to the result selectivity. Also add up the total fraction + * represented by MCV entries. + */ + mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true, + &sumcommon); + + /* + * Now merge the results from the MCV and histogram calculations, + * realizing that the histogram covers only the non-null values that + * are not listed in MCV. + */ + selec *= 1.0 - nullfrac - sumcommon; + selec += mcv_selec; + result = selec; + } + + /* now adjust if we wanted not-match rather than match */ + if (negate) + result = 1.0 - result - nullfrac; + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(result); + + if (prefix) + { + pfree(DatumGetPointer(prefix->constvalue)); + pfree(prefix); + } + + ReleaseVariableStats(vardata); + + return result; +} + +/* + * Fix impedance mismatch between SQL-callable functions and patternsel_common + */ +static double +patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + Oid collation = PG_GET_COLLATION(); + + /* + * If this is for a NOT LIKE or similar operator, get the corresponding + * positive-match operator and work with that. + */ + if (negate) + { + operator = get_negator(operator); + if (!OidIsValid(operator)) + elog(ERROR, "patternsel called for operator without a negator"); + } + + return patternsel_common(root, + operator, + InvalidOid, + args, + varRelid, + collation, + ptype, + negate); +} + +/* + * regexeqsel - Selectivity of regular-expression pattern match. + */ +Datum +regexeqsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false)); +} + +/* + * icregexeqsel - Selectivity of case-insensitive regex match. + */ +Datum +icregexeqsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false)); +} + +/* + * likesel - Selectivity of LIKE pattern match. + */ +Datum +likesel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false)); +} + +/* + * prefixsel - selectivity of prefix operator + */ +Datum +prefixsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false)); +} + +/* + * + * iclikesel - Selectivity of ILIKE pattern match. + */ +Datum +iclikesel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false)); +} + +/* + * regexnesel - Selectivity of regular-expression pattern non-match. + */ +Datum +regexnesel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true)); +} + +/* + * icregexnesel - Selectivity of case-insensitive regex non-match. + */ +Datum +icregexnesel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true)); +} + +/* + * nlikesel - Selectivity of LIKE pattern non-match. + */ +Datum +nlikesel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true)); +} + +/* + * icnlikesel - Selectivity of ILIKE pattern non-match. + */ +Datum +icnlikesel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true)); +} + +/* + * patternjoinsel - Generic code for pattern-match join selectivity. + */ +static double +patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) +{ + /* For the moment we just punt. */ + return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL; +} + +/* + * regexeqjoinsel - Join selectivity of regular-expression pattern match. + */ +Datum +regexeqjoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false)); +} + +/* + * icregexeqjoinsel - Join selectivity of case-insensitive regex match. + */ +Datum +icregexeqjoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false)); +} + +/* + * likejoinsel - Join selectivity of LIKE pattern match. + */ +Datum +likejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false)); +} + +/* + * prefixjoinsel - Join selectivity of prefix operator + */ +Datum +prefixjoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false)); +} + +/* + * iclikejoinsel - Join selectivity of ILIKE pattern match. + */ +Datum +iclikejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false)); +} + +/* + * regexnejoinsel - Join selectivity of regex non-match. + */ +Datum +regexnejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true)); +} + +/* + * icregexnejoinsel - Join selectivity of case-insensitive regex non-match. + */ +Datum +icregexnejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true)); +} + +/* + * nlikejoinsel - Join selectivity of LIKE pattern non-match. + */ +Datum +nlikejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true)); +} + +/* + * icnlikejoinsel - Join selectivity of ILIKE pattern non-match. + */ +Datum +icnlikejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true)); +} + + +/*------------------------------------------------------------------------- + * + * Pattern analysis functions + * + * These routines support analysis of LIKE and regular-expression patterns + * by the planner/optimizer. It's important that they agree with the + * regular-expression code in backend/regex/ and the LIKE code in + * backend/utils/adt/like.c. Also, the computation of the fixed prefix + * must be conservative: if we report a string longer than the true fixed + * prefix, the query may produce actually wrong answers, rather than just + * getting a bad selectivity estimate! + * + *------------------------------------------------------------------------- + */ + +/* + * Extract the fixed prefix, if any, for a pattern. + * + * *prefix is set to a palloc'd prefix string (in the form of a Const node), + * or to NULL if no fixed prefix exists for the pattern. + * If rest_selec is not NULL, *rest_selec is set to an estimate of the + * selectivity of the remainder of the pattern (without any fixed prefix). + * The prefix Const has the same type (TEXT or BYTEA) as the input pattern. + * + * The return value distinguishes no fixed prefix, a partial prefix, + * or an exact-match-only pattern. + */ + +static Pattern_Prefix_Status +like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, + Const **prefix_const, Selectivity *rest_selec) +{ + char *match; + char *patt; + int pattlen; + Oid typeid = patt_const->consttype; + int pos, + match_pos; + bool is_multibyte = (pg_database_encoding_max_length() > 1); + pg_locale_t locale = 0; + bool locale_is_c = false; + + /* the right-hand const is type text or bytea */ + Assert(typeid == BYTEAOID || typeid == TEXTOID); + + if (case_insensitive) + { + if (typeid == BYTEAOID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("case insensitive matching not supported on type bytea"))); + + /* If case-insensitive, we need locale info */ + if (lc_ctype_is_c(collation)) + locale_is_c = true; + else if (collation != DEFAULT_COLLATION_OID) + { + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a + * conflict of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for ILIKE"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + locale = pg_newlocale_from_collation(collation); + } + } + + if (typeid != BYTEAOID) + { + patt = TextDatumGetCString(patt_const->constvalue); + pattlen = strlen(patt); + } + else + { + bytea *bstr = DatumGetByteaPP(patt_const->constvalue); + + pattlen = VARSIZE_ANY_EXHDR(bstr); + patt = (char *) palloc(pattlen); + memcpy(patt, VARDATA_ANY(bstr), pattlen); + Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue)); + } + + match = palloc(pattlen + 1); + match_pos = 0; + for (pos = 0; pos < pattlen; pos++) + { + /* % and _ are wildcard characters in LIKE */ + if (patt[pos] == '%' || + patt[pos] == '_') + break; + + /* Backslash escapes the next character */ + if (patt[pos] == '\\') + { + pos++; + if (pos >= pattlen) + break; + } + + /* Stop if case-varying character (it's sort of a wildcard) */ + if (case_insensitive && + pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) + break; + + match[match_pos++] = patt[pos]; + } + + match[match_pos] = '\0'; + + if (typeid != BYTEAOID) + *prefix_const = string_to_const(match, typeid); + else + *prefix_const = string_to_bytea_const(match, match_pos); + + if (rest_selec != NULL) + *rest_selec = like_selectivity(&patt[pos], pattlen - pos, + case_insensitive); + + pfree(patt); + pfree(match); + + /* in LIKE, an empty pattern is an exact match! */ + if (pos == pattlen) + return Pattern_Prefix_Exact; /* reached end of pattern, so exact */ + + if (match_pos > 0) + return Pattern_Prefix_Partial; + + return Pattern_Prefix_None; +} + +static Pattern_Prefix_Status +regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, + Const **prefix_const, Selectivity *rest_selec) +{ + Oid typeid = patt_const->consttype; + char *prefix; + bool exact; + + /* + * Should be unnecessary, there are no bytea regex operators defined. As + * such, it should be noted that the rest of this function has *not* been + * made safe for binary (possibly NULL containing) strings. + */ + if (typeid == BYTEAOID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("regular-expression matching not supported on type bytea"))); + + /* Use the regexp machinery to extract the prefix, if any */ + prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue), + case_insensitive, collation, + &exact); + + if (prefix == NULL) + { + *prefix_const = NULL; + + if (rest_selec != NULL) + { + char *patt = TextDatumGetCString(patt_const->constvalue); + + *rest_selec = regex_selectivity(patt, strlen(patt), + case_insensitive, + 0); + pfree(patt); + } + + return Pattern_Prefix_None; + } + + *prefix_const = string_to_const(prefix, typeid); + + if (rest_selec != NULL) + { + if (exact) + { + /* Exact match, so there's no additional selectivity */ + *rest_selec = 1.0; + } + else + { + char *patt = TextDatumGetCString(patt_const->constvalue); + + *rest_selec = regex_selectivity(patt, strlen(patt), + case_insensitive, + strlen(prefix)); + pfree(patt); + } + } + + pfree(prefix); + + if (exact) + return Pattern_Prefix_Exact; /* pattern specifies exact match */ + else + return Pattern_Prefix_Partial; +} + +static Pattern_Prefix_Status +pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, + Const **prefix, Selectivity *rest_selec) +{ + Pattern_Prefix_Status result; + + switch (ptype) + { + case Pattern_Type_Like: + result = like_fixed_prefix(patt, false, collation, + prefix, rest_selec); + break; + case Pattern_Type_Like_IC: + result = like_fixed_prefix(patt, true, collation, + prefix, rest_selec); + break; + case Pattern_Type_Regex: + result = regex_fixed_prefix(patt, false, collation, + prefix, rest_selec); + break; + case Pattern_Type_Regex_IC: + result = regex_fixed_prefix(patt, true, collation, + prefix, rest_selec); + break; + case Pattern_Type_Prefix: + /* Prefix type work is trivial. */ + result = Pattern_Prefix_Partial; + *rest_selec = 1.0; /* all */ + *prefix = makeConst(patt->consttype, + patt->consttypmod, + patt->constcollid, + patt->constlen, + datumCopy(patt->constvalue, + patt->constbyval, + patt->constlen), + patt->constisnull, + patt->constbyval); + break; + default: + elog(ERROR, "unrecognized ptype: %d", (int) ptype); + result = Pattern_Prefix_None; /* keep compiler quiet */ + break; + } + return result; +} + +/* + * Estimate the selectivity of a fixed prefix for a pattern match. + * + * A fixed prefix "foo" is estimated as the selectivity of the expression + * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c). + * + * The selectivity estimate is with respect to the portion of the column + * population represented by the histogram --- the caller must fold this + * together with info about MCVs and NULLs. + * + * We use the >= and < operators from the specified btree opfamily to do the + * estimation. The given variable and Const must be of the associated + * datatype. + * + * XXX Note: we make use of the upper bound to estimate operator selectivity + * even if the locale is such that we cannot rely on the upper-bound string. + * The selectivity only needs to be approximately right anyway, so it seems + * more useful to use the upper-bound code than not. + */ +static Selectivity +prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, + Oid vartype, Oid opfamily, Const *prefixcon) +{ + Selectivity prefixsel; + Oid cmpopr; + FmgrInfo opproc; + AttStatsSlot sslot; + Const *greaterstrcon; + Selectivity eq_sel; + + cmpopr = get_opfamily_member(opfamily, vartype, vartype, + BTGreaterEqualStrategyNumber); + if (cmpopr == InvalidOid) + elog(ERROR, "no >= operator for opfamily %u", opfamily); + fmgr_info(get_opcode(cmpopr), &opproc); + + prefixsel = ineq_histogram_selectivity(root, vardata, + &opproc, true, true, + prefixcon->constvalue, + prefixcon->consttype); + + if (prefixsel < 0.0) + { + /* No histogram is present ... return a suitable default estimate */ + return DEFAULT_MATCH_SEL; + } + + /*------- + * If we can create a string larger than the prefix, say + * "x < greaterstr". We try to generate the string referencing the + * collation of the var's statistics, but if that's not available, + * use DEFAULT_COLLATION_OID. + *------- + */ + if (HeapTupleIsValid(vardata->statsTuple) && + get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_HISTOGRAM, InvalidOid, 0)) + /* sslot.stacoll is set up */ ; + else + sslot.stacoll = DEFAULT_COLLATION_OID; + cmpopr = get_opfamily_member(opfamily, vartype, vartype, + BTLessStrategyNumber); + if (cmpopr == InvalidOid) + elog(ERROR, "no < operator for opfamily %u", opfamily); + fmgr_info(get_opcode(cmpopr), &opproc); + greaterstrcon = make_greater_string(prefixcon, &opproc, sslot.stacoll); + if (greaterstrcon) + { + Selectivity topsel; + + topsel = ineq_histogram_selectivity(root, vardata, + &opproc, false, false, + greaterstrcon->constvalue, + greaterstrcon->consttype); + + /* ineq_histogram_selectivity worked before, it shouldn't fail now */ + Assert(topsel >= 0.0); + + /* + * Merge the two selectivities in the same way as for a range query + * (see clauselist_selectivity()). Note that we don't need to worry + * about double-exclusion of nulls, since ineq_histogram_selectivity + * doesn't count those anyway. + */ + prefixsel = topsel + prefixsel - 1.0; + } + + /* + * If the prefix is long then the two bounding values might be too close + * together for the histogram to distinguish them usefully, resulting in a + * zero estimate (plus or minus roundoff error). To avoid returning a + * ridiculously small estimate, compute the estimated selectivity for + * "variable = 'foo'", and clamp to that. (Obviously, the resultant + * estimate should be at least that.) + * + * We apply this even if we couldn't make a greater string. That case + * suggests that the prefix is near the maximum possible, and thus + * probably off the end of the histogram, and thus we probably got a very + * small estimate from the >= condition; so we still need to clamp. + */ + cmpopr = get_opfamily_member(opfamily, vartype, vartype, + BTEqualStrategyNumber); + if (cmpopr == InvalidOid) + elog(ERROR, "no = operator for opfamily %u", opfamily); + eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue, + false, true, false); + + prefixsel = Max(prefixsel, eq_sel); + + return prefixsel; +} + + +/* + * Estimate the selectivity of a pattern of the specified type. + * Note that any fixed prefix of the pattern will have been removed already, + * so actually we may be looking at just a fragment of the pattern. + * + * For now, we use a very simplistic approach: fixed characters reduce the + * selectivity a good deal, character ranges reduce it a little, + * wildcards (such as % for LIKE or .* for regex) increase it. + */ + +#define FIXED_CHAR_SEL 0.20 /* about 1/5 */ +#define CHAR_RANGE_SEL 0.25 +#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */ +#define FULL_WILDCARD_SEL 5.0 +#define PARTIAL_WILDCARD_SEL 2.0 + +static Selectivity +like_selectivity(const char *patt, int pattlen, bool case_insensitive) +{ + Selectivity sel = 1.0; + int pos; + + /* Skip any leading wildcard; it's already factored into initial sel */ + for (pos = 0; pos < pattlen; pos++) + { + if (patt[pos] != '%' && patt[pos] != '_') + break; + } + + for (; pos < pattlen; pos++) + { + /* % and _ are wildcard characters in LIKE */ + if (patt[pos] == '%') + sel *= FULL_WILDCARD_SEL; + else if (patt[pos] == '_') + sel *= ANY_CHAR_SEL; + else if (patt[pos] == '\\') + { + /* Backslash quotes the next character */ + pos++; + if (pos >= pattlen) + break; + sel *= FIXED_CHAR_SEL; + } + else + sel *= FIXED_CHAR_SEL; + } + /* Could get sel > 1 if multiple wildcards */ + if (sel > 1.0) + sel = 1.0; + return sel; +} + +static Selectivity +regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive) +{ + Selectivity sel = 1.0; + int paren_depth = 0; + int paren_pos = 0; /* dummy init to keep compiler quiet */ + int pos; + + for (pos = 0; pos < pattlen; pos++) + { + if (patt[pos] == '(') + { + if (paren_depth == 0) + paren_pos = pos; /* remember start of parenthesized item */ + paren_depth++; + } + else if (patt[pos] == ')' && paren_depth > 0) + { + paren_depth--; + if (paren_depth == 0) + sel *= regex_selectivity_sub(patt + (paren_pos + 1), + pos - (paren_pos + 1), + case_insensitive); + } + else if (patt[pos] == '|' && paren_depth == 0) + { + /* + * If unquoted | is present at paren level 0 in pattern, we have + * multiple alternatives; sum their probabilities. + */ + sel += regex_selectivity_sub(patt + (pos + 1), + pattlen - (pos + 1), + case_insensitive); + break; /* rest of pattern is now processed */ + } + else if (patt[pos] == '[') + { + bool negclass = false; + + if (patt[++pos] == '^') + { + negclass = true; + pos++; + } + if (patt[pos] == ']') /* ']' at start of class is not special */ + pos++; + while (pos < pattlen && patt[pos] != ']') + pos++; + if (paren_depth == 0) + sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL); + } + else if (patt[pos] == '.') + { + if (paren_depth == 0) + sel *= ANY_CHAR_SEL; + } + else if (patt[pos] == '*' || + patt[pos] == '?' || + patt[pos] == '+') + { + /* Ought to be smarter about quantifiers... */ + if (paren_depth == 0) + sel *= PARTIAL_WILDCARD_SEL; + } + else if (patt[pos] == '{') + { + while (pos < pattlen && patt[pos] != '}') + pos++; + if (paren_depth == 0) + sel *= PARTIAL_WILDCARD_SEL; + } + else if (patt[pos] == '\\') + { + /* backslash quotes the next character */ + pos++; + if (pos >= pattlen) + break; + if (paren_depth == 0) + sel *= FIXED_CHAR_SEL; + } + else + { + if (paren_depth == 0) + sel *= FIXED_CHAR_SEL; + } + } + /* Could get sel > 1 if multiple wildcards */ + if (sel > 1.0) + sel = 1.0; + return sel; +} + +static Selectivity +regex_selectivity(const char *patt, int pattlen, bool case_insensitive, + int fixed_prefix_len) +{ + Selectivity sel; + + /* If patt doesn't end with $, consider it to have a trailing wildcard */ + if (pattlen > 0 && patt[pattlen - 1] == '$' && + (pattlen == 1 || patt[pattlen - 2] != '\\')) + { + /* has trailing $ */ + sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive); + } + else + { + /* no trailing $ */ + sel = regex_selectivity_sub(patt, pattlen, case_insensitive); + sel *= FULL_WILDCARD_SEL; + } + + /* If there's a fixed prefix, discount its selectivity */ + if (fixed_prefix_len > 0) + sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len); + + /* Make sure result stays in range */ + CLAMP_PROBABILITY(sel); + return sel; +} + +/* + * Check whether char is a letter (and, hence, subject to case-folding) + * + * In multibyte character sets or with ICU, we can't use isalpha, and it does + * not seem worth trying to convert to wchar_t to use iswalpha. Instead, just + * assume any multibyte char is potentially case-varying. + */ +static int +pattern_char_isalpha(char c, bool is_multibyte, + pg_locale_t locale, bool locale_is_c) +{ + if (locale_is_c) + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); + else if (is_multibyte && IS_HIGHBIT_SET(c)) + return true; + else if (locale && locale->provider == COLLPROVIDER_ICU) + return IS_HIGHBIT_SET(c) ? true : false; +#ifdef HAVE_LOCALE_T + else if (locale && locale->provider == COLLPROVIDER_LIBC) + return isalpha_l((unsigned char) c, locale->info.lt); +#endif + else + return isalpha((unsigned char) c); +} + + +/* + * For bytea, the increment function need only increment the current byte + * (there are no multibyte characters to worry about). + */ +static bool +byte_increment(unsigned char *ptr, int len) +{ + if (*ptr >= 255) + return false; + (*ptr)++; + return true; +} + +/* + * Try to generate a string greater than the given string or any + * string it is a prefix of. If successful, return a palloc'd string + * in the form of a Const node; else return NULL. + * + * The caller must provide the appropriate "less than" comparison function + * for testing the strings, along with the collation to use. + * + * The key requirement here is that given a prefix string, say "foo", + * we must be able to generate another string "fop" that is greater than + * all strings "foobar" starting with "foo". We can test that we have + * generated a string greater than the prefix string, but in non-C collations + * that is not a bulletproof guarantee that an extension of the string might + * not sort after it; an example is that "foo " is less than "foo!", but it + * is not clear that a "dictionary" sort ordering will consider "foo!" less + * than "foo bar". CAUTION: Therefore, this function should be used only for + * estimation purposes when working in a non-C collation. + * + * To try to catch most cases where an extended string might otherwise sort + * before the result value, we determine which of the strings "Z", "z", "y", + * and "9" is seen as largest by the collation, and append that to the given + * prefix before trying to find a string that compares as larger. + * + * To search for a greater string, we repeatedly "increment" the rightmost + * character, using an encoding-specific character incrementer function. + * When it's no longer possible to increment the last character, we truncate + * off that character and start incrementing the next-to-rightmost. + * For example, if "z" were the last character in the sort order, then we + * could produce "foo" as a string greater than "fonz". + * + * This could be rather slow in the worst case, but in most cases we + * won't have to try more than one or two strings before succeeding. + * + * Note that it's important for the character incrementer not to be too anal + * about producing every possible character code, since in some cases the only + * way to get a larger string is to increment a previous character position. + * So we don't want to spend too much time trying every possible character + * code at the last position. A good rule of thumb is to be sure that we + * don't try more than 256*K values for a K-byte character (and definitely + * not 256^K, which is what an exhaustive search would approach). + */ +static Const * +make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) +{ + Oid datatype = str_const->consttype; + char *workstr; + int len; + Datum cmpstr; + char *cmptxt = NULL; + mbcharacter_incrementer charinc; + + /* + * Get a modifiable copy of the prefix string in C-string format, and set + * up the string we will compare to as a Datum. In C locale this can just + * be the given prefix string, otherwise we need to add a suffix. Type + * BYTEA sorts bytewise so it never needs a suffix either. + */ + if (datatype == BYTEAOID) + { + bytea *bstr = DatumGetByteaPP(str_const->constvalue); + + len = VARSIZE_ANY_EXHDR(bstr); + workstr = (char *) palloc(len); + memcpy(workstr, VARDATA_ANY(bstr), len); + Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue)); + cmpstr = str_const->constvalue; + } + else + { + if (datatype == NAMEOID) + workstr = DatumGetCString(DirectFunctionCall1(nameout, + str_const->constvalue)); + else + workstr = TextDatumGetCString(str_const->constvalue); + len = strlen(workstr); + if (lc_collate_is_c(collation) || len == 0) + cmpstr = str_const->constvalue; + else + { + /* If first time through, determine the suffix to use */ + static char suffixchar = 0; + static Oid suffixcollation = 0; + + if (!suffixchar || suffixcollation != collation) + { + char *best; + + best = "Z"; + if (varstr_cmp(best, 1, "z", 1, collation) < 0) + best = "z"; + if (varstr_cmp(best, 1, "y", 1, collation) < 0) + best = "y"; + if (varstr_cmp(best, 1, "9", 1, collation) < 0) + best = "9"; + suffixchar = *best; + suffixcollation = collation; + } + + /* And build the string to compare to */ + if (datatype == NAMEOID) + { + cmptxt = palloc(len + 2); + memcpy(cmptxt, workstr, len); + cmptxt[len] = suffixchar; + cmptxt[len + 1] = '\0'; + cmpstr = PointerGetDatum(cmptxt); + } + else + { + cmptxt = palloc(VARHDRSZ + len + 1); + SET_VARSIZE(cmptxt, VARHDRSZ + len + 1); + memcpy(VARDATA(cmptxt), workstr, len); + *(VARDATA(cmptxt) + len) = suffixchar; + cmpstr = PointerGetDatum(cmptxt); + } + } + } + + /* Select appropriate character-incrementer function */ + if (datatype == BYTEAOID) + charinc = byte_increment; + else + charinc = pg_database_encoding_character_incrementer(); + + /* And search ... */ + while (len > 0) + { + int charlen; + unsigned char *lastchar; + + /* Identify the last character --- for bytea, just the last byte */ + if (datatype == BYTEAOID) + charlen = 1; + else + charlen = len - pg_mbcliplen(workstr, len, len - 1); + lastchar = (unsigned char *) (workstr + len - charlen); + + /* + * Try to generate a larger string by incrementing the last character + * (for BYTEA, we treat each byte as a character). + * + * Note: the incrementer function is expected to return true if it's + * generated a valid-per-the-encoding new character, otherwise false. + * The contents of the character on false return are unspecified. + */ + while (charinc(lastchar, charlen)) + { + Const *workstr_const; + + if (datatype == BYTEAOID) + workstr_const = string_to_bytea_const(workstr, len); + else + workstr_const = string_to_const(workstr, datatype); + + if (DatumGetBool(FunctionCall2Coll(ltproc, + collation, + cmpstr, + workstr_const->constvalue))) + { + /* Successfully made a string larger than cmpstr */ + if (cmptxt) + pfree(cmptxt); + pfree(workstr); + return workstr_const; + } + + /* No good, release unusable value and try again */ + pfree(DatumGetPointer(workstr_const->constvalue)); + pfree(workstr_const); + } + + /* + * No luck here, so truncate off the last character and try to + * increment the next one. + */ + len -= charlen; + workstr[len] = '\0'; + } + + /* Failed... */ + if (cmptxt) + pfree(cmptxt); + pfree(workstr); + + return NULL; +} + +/* + * Generate a Datum of the appropriate type from a C string. + * Note that all of the supported types are pass-by-ref, so the + * returned value should be pfree'd if no longer needed. + */ +static Datum +string_to_datum(const char *str, Oid datatype) +{ + Assert(str != NULL); + + /* + * We cheat a little by assuming that CStringGetTextDatum() will do for + * bpchar and varchar constants too... + */ + if (datatype == NAMEOID) + return DirectFunctionCall1(namein, CStringGetDatum(str)); + else if (datatype == BYTEAOID) + return DirectFunctionCall1(byteain, CStringGetDatum(str)); + else + return CStringGetTextDatum(str); +} + +/* + * Generate a Const node of the appropriate type from a C string. + */ +static Const * +string_to_const(const char *str, Oid datatype) +{ + Datum conval = string_to_datum(str, datatype); + Oid collation; + int constlen; + + /* + * We only need to support a few datatypes here, so hard-wire properties + * instead of incurring the expense of catalog lookups. + */ + switch (datatype) + { + case TEXTOID: + case VARCHAROID: + case BPCHAROID: + collation = DEFAULT_COLLATION_OID; + constlen = -1; + break; + + case NAMEOID: + collation = C_COLLATION_OID; + constlen = NAMEDATALEN; + break; + + case BYTEAOID: + collation = InvalidOid; + constlen = -1; + break; + + default: + elog(ERROR, "unexpected datatype in string_to_const: %u", + datatype); + return NULL; + } + + return makeConst(datatype, -1, collation, constlen, + conval, false, false); +} + +/* + * Generate a Const node of bytea type from a binary C string and a length. + */ +static Const * +string_to_bytea_const(const char *str, size_t str_len) +{ + bytea *bstr = palloc(VARHDRSZ + str_len); + Datum conval; + + memcpy(VARDATA(bstr), str, str_len); + SET_VARSIZE(bstr, VARHDRSZ + str_len); + conval = PointerGetDatum(bstr); + + return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false); +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index b9f99fa050..c25357a008 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -110,12 +110,9 @@ #include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" -#include "catalog/pg_opfamily.h" #include "catalog/pg_statistic.h" #include "catalog/pg_statistic_ext.h" -#include "catalog/pg_type.h" #include "executor/executor.h" -#include "mb/pg_wchar.h" #include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" @@ -125,14 +122,10 @@ #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/plancat.h" -#include "optimizer/restrictinfo.h" #include "parser/parse_clause.h" -#include "parser/parse_coerce.h" #include "parser/parsetree.h" #include "statistics/statistics.h" -#include "utils/acl.h" #include "utils/builtins.h" -#include "utils/bytea.h" #include "utils/date.h" #include "utils/datum.h" #include "utils/fmgroids.h" @@ -146,7 +139,6 @@ #include "utils/syscache.h" #include "utils/timestamp.h" #include "utils/typcache.h" -#include "utils/varlena.h" /* Hooks for plugins to get control when we ask for stats */ @@ -154,16 +146,6 @@ get_relation_stats_hook_type get_relation_stats_hook = NULL; get_index_stats_hook_type get_index_stats_hook = NULL; static double eqsel_internal(PG_FUNCTION_ARGS, bool negate); -static double var_eq_const(VariableStatData *vardata, Oid operator, - Datum constval, bool constisnull, - bool varonleft, bool negate); -static double var_eq_non_const(VariableStatData *vardata, Oid operator, - Node *other, - bool varonleft, bool negate); -static double ineq_histogram_selectivity(PlannerInfo *root, - VariableStatData *vardata, - FmgrInfo *opproc, bool isgt, bool iseq, - Datum constval, Oid consttype); static double eqjoinsel_inner(Oid opfuncoid, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, @@ -215,17 +197,6 @@ static bool get_actual_variable_range(PlannerInfo *root, Oid sortop, Datum *min, Datum *max); static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids); -static Selectivity prefix_selectivity(PlannerInfo *root, - VariableStatData *vardata, - Oid vartype, Oid opfamily, Const *prefixcon); -static Selectivity like_selectivity(const char *patt, int pattlen, - bool case_insensitive); -static Selectivity regex_selectivity(const char *patt, int pattlen, - bool case_insensitive, - int fixed_prefix_len); -static Datum string_to_datum(const char *str, Oid datatype); -static Const *string_to_const(const char *str, Oid datatype); -static Const *string_to_bytea_const(const char *str, size_t str_len); static IndexQualInfo *deconstruct_indexqual(RestrictInfo *rinfo, IndexOptInfo *index, int indexcol); static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals); @@ -304,9 +275,9 @@ eqsel_internal(PG_FUNCTION_ARGS, bool negate) /* * var_eq_const --- eqsel for var = const case * - * This is split out so that some other estimation functions can use it. + * This is exported so that some other estimation functions can use it. */ -static double +double var_eq_const(VariableStatData *vardata, Oid operator, Datum constval, bool constisnull, bool varonleft, bool negate) @@ -457,8 +428,10 @@ var_eq_const(VariableStatData *vardata, Oid operator, /* * var_eq_non_const --- eqsel for var = something-other-than-const case + * + * This is exported so that some other estimation functions can use it. */ -static double +double var_eq_non_const(VariableStatData *vardata, Oid operator, Node *other, bool varonleft, bool negate) @@ -784,8 +757,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, * Note that the result disregards both the most-common-values (if any) and * null entries. The caller is expected to combine this result with * statistics for those portions of the column population. + * + * This is exported so that some other estimation functions can use it. */ -static double +double ineq_histogram_selectivity(PlannerInfo *root, VariableStatData *vardata, FmgrInfo *opproc, bool isgt, bool iseq, @@ -1198,361 +1173,6 @@ scalargesel(PG_FUNCTION_ARGS) return scalarineqsel_wrapper(fcinfo, true, true); } -/* - * patternsel - Generic code for pattern-match selectivity. - */ -static double -patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) -{ - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Oid operator = PG_GETARG_OID(1); - List *args = (List *) PG_GETARG_POINTER(2); - int varRelid = PG_GETARG_INT32(3); - Oid collation = PG_GET_COLLATION(); - VariableStatData vardata; - Node *other; - bool varonleft; - Datum constval; - Oid consttype; - Oid vartype; - Oid opfamily; - Pattern_Prefix_Status pstatus; - Const *patt; - Const *prefix = NULL; - Selectivity rest_selec = 0; - double nullfrac = 0.0; - double result; - - /* - * If this is for a NOT LIKE or similar operator, get the corresponding - * positive-match operator and work with that. Set result to the correct - * default estimate, too. - */ - if (negate) - { - operator = get_negator(operator); - if (!OidIsValid(operator)) - elog(ERROR, "patternsel called for operator without a negator"); - result = 1.0 - DEFAULT_MATCH_SEL; - } - else - { - result = DEFAULT_MATCH_SEL; - } - - /* - * If expression is not variable op constant, then punt and return a - * default estimate. - */ - if (!get_restriction_variable(root, args, varRelid, - &vardata, &other, &varonleft)) - return result; - if (!varonleft || !IsA(other, Const)) - { - ReleaseVariableStats(vardata); - return result; - } - - /* - * If the constant is NULL, assume operator is strict and return zero, ie, - * operator will never return TRUE. (It's zero even for a negator op.) - */ - if (((Const *) other)->constisnull) - { - ReleaseVariableStats(vardata); - return 0.0; - } - constval = ((Const *) other)->constvalue; - consttype = ((Const *) other)->consttype; - - /* - * The right-hand const is type text or bytea for all supported operators. - * We do not expect to see binary-compatible types here, since - * const-folding should have relabeled the const to exactly match the - * operator's declared type. - */ - if (consttype != TEXTOID && consttype != BYTEAOID) - { - ReleaseVariableStats(vardata); - return result; - } - - /* - * Similarly, the exposed type of the left-hand side should be one of - * those we know. (Do not look at vardata.atttype, which might be - * something binary-compatible but different.) We can use it to choose - * the index opfamily from which we must draw the comparison operators. - * - * NOTE: It would be more correct to use the PATTERN opfamilies than the - * simple ones, but at the moment ANALYZE will not generate statistics for - * the PATTERN operators. But our results are so approximate anyway that - * it probably hardly matters. - */ - vartype = vardata.vartype; - - switch (vartype) - { - case TEXTOID: - case NAMEOID: - opfamily = TEXT_BTREE_FAM_OID; - break; - case BPCHAROID: - opfamily = BPCHAR_BTREE_FAM_OID; - break; - case BYTEAOID: - opfamily = BYTEA_BTREE_FAM_OID; - break; - default: - ReleaseVariableStats(vardata); - return result; - } - - /* - * Grab the nullfrac for use below. - */ - if (HeapTupleIsValid(vardata.statsTuple)) - { - Form_pg_statistic stats; - - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); - nullfrac = stats->stanullfrac; - } - - /* - * Pull out any fixed prefix implied by the pattern, and estimate the - * fractional selectivity of the remainder of the pattern. Unlike many of - * the other functions in this file, we use the pattern operator's actual - * collation for this step. This is not because we expect the collation - * to make a big difference in the selectivity estimate (it seldom would), - * but because we want to be sure we cache compiled regexps under the - * right cache key, so that they can be re-used at runtime. - */ - patt = (Const *) other; - pstatus = pattern_fixed_prefix(patt, ptype, collation, - &prefix, &rest_selec); - - /* - * If necessary, coerce the prefix constant to the right type. - */ - if (prefix && prefix->consttype != vartype) - { - char *prefixstr; - - switch (prefix->consttype) - { - case TEXTOID: - prefixstr = TextDatumGetCString(prefix->constvalue); - break; - case BYTEAOID: - prefixstr = DatumGetCString(DirectFunctionCall1(byteaout, - prefix->constvalue)); - break; - default: - elog(ERROR, "unrecognized consttype: %u", - prefix->consttype); - ReleaseVariableStats(vardata); - return result; - } - prefix = string_to_const(prefixstr, vartype); - pfree(prefixstr); - } - - if (pstatus == Pattern_Prefix_Exact) - { - /* - * Pattern specifies an exact match, so pretend operator is '=' - */ - Oid eqopr = get_opfamily_member(opfamily, vartype, vartype, - BTEqualStrategyNumber); - - if (eqopr == InvalidOid) - elog(ERROR, "no = operator for opfamily %u", opfamily); - result = var_eq_const(&vardata, eqopr, prefix->constvalue, - false, true, false); - } - else - { - /* - * Not exact-match pattern. If we have a sufficiently large - * histogram, estimate selectivity for the histogram part of the - * population by counting matches in the histogram. If not, estimate - * selectivity of the fixed prefix and remainder of pattern - * separately, then combine the two to get an estimate of the - * selectivity for the part of the column population represented by - * the histogram. (For small histograms, we combine these - * approaches.) - * - * We then add up data for any most-common-values values; these are - * not in the histogram population, and we can get exact answers for - * them by applying the pattern operator, so there's no reason to - * approximate. (If the MCVs cover a significant part of the total - * population, this gives us a big leg up in accuracy.) - */ - Selectivity selec; - int hist_size; - FmgrInfo opproc; - double mcv_selec, - sumcommon; - - /* Try to use the histogram entries to get selectivity */ - fmgr_info(get_opcode(operator), &opproc); - - selec = histogram_selectivity(&vardata, &opproc, constval, true, - 10, 1, &hist_size); - - /* If not at least 100 entries, use the heuristic method */ - if (hist_size < 100) - { - Selectivity heursel; - Selectivity prefixsel; - - if (pstatus == Pattern_Prefix_Partial) - prefixsel = prefix_selectivity(root, &vardata, vartype, - opfamily, prefix); - else - prefixsel = 1.0; - heursel = prefixsel * rest_selec; - - if (selec < 0) /* fewer than 10 histogram entries? */ - selec = heursel; - else - { - /* - * For histogram sizes from 10 to 100, we combine the - * histogram and heuristic selectivities, putting increasingly - * more trust in the histogram for larger sizes. - */ - double hist_weight = hist_size / 100.0; - - selec = selec * hist_weight + heursel * (1.0 - hist_weight); - } - } - - /* In any case, don't believe extremely small or large estimates. */ - if (selec < 0.0001) - selec = 0.0001; - else if (selec > 0.9999) - selec = 0.9999; - - /* - * If we have most-common-values info, add up the fractions of the MCV - * entries that satisfy MCV OP PATTERN. These fractions contribute - * directly to the result selectivity. Also add up the total fraction - * represented by MCV entries. - */ - mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true, - &sumcommon); - - /* - * Now merge the results from the MCV and histogram calculations, - * realizing that the histogram covers only the non-null values that - * are not listed in MCV. - */ - selec *= 1.0 - nullfrac - sumcommon; - selec += mcv_selec; - result = selec; - } - - /* now adjust if we wanted not-match rather than match */ - if (negate) - result = 1.0 - result - nullfrac; - - /* result should be in range, but make sure... */ - CLAMP_PROBABILITY(result); - - if (prefix) - { - pfree(DatumGetPointer(prefix->constvalue)); - pfree(prefix); - } - - ReleaseVariableStats(vardata); - - return result; -} - -/* - * regexeqsel - Selectivity of regular-expression pattern match. - */ -Datum -regexeqsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false)); -} - -/* - * icregexeqsel - Selectivity of case-insensitive regex match. - */ -Datum -icregexeqsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false)); -} - -/* - * likesel - Selectivity of LIKE pattern match. - */ -Datum -likesel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false)); -} - -/* - * prefixsel - selectivity of prefix operator - */ -Datum -prefixsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false)); -} - -/* - * - * iclikesel - Selectivity of ILIKE pattern match. - */ -Datum -iclikesel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false)); -} - -/* - * regexnesel - Selectivity of regular-expression pattern non-match. - */ -Datum -regexnesel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true)); -} - -/* - * icregexnesel - Selectivity of case-insensitive regex non-match. - */ -Datum -icregexnesel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true)); -} - -/* - * nlikesel - Selectivity of LIKE pattern non-match. - */ -Datum -nlikesel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true)); -} - -/* - * icnlikesel - Selectivity of ILIKE pattern non-match. - */ -Datum -icnlikesel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true)); -} - /* * boolvarsel - Selectivity of Boolean variable. * @@ -2896,123 +2516,33 @@ scalargejoinsel(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); } -/* - * patternjoinsel - Generic code for pattern-match join selectivity. - */ -static double -patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) -{ - /* For the moment we just punt. */ - return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL; -} - -/* - * regexeqjoinsel - Join selectivity of regular-expression pattern match. - */ -Datum -regexeqjoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false)); -} - -/* - * icregexeqjoinsel - Join selectivity of case-insensitive regex match. - */ -Datum -icregexeqjoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false)); -} - -/* - * likejoinsel - Join selectivity of LIKE pattern match. - */ -Datum -likejoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false)); -} /* - * prefixjoinsel - Join selectivity of prefix operator + * mergejoinscansel - Scan selectivity of merge join. + * + * A merge join will stop as soon as it exhausts either input stream. + * Therefore, if we can estimate the ranges of both input variables, + * we can estimate how much of the input will actually be read. This + * can have a considerable impact on the cost when using indexscans. + * + * Also, we can estimate how much of each input has to be read before the + * first join pair is found, which will affect the join's startup time. + * + * clause should be a clause already known to be mergejoinable. opfamily, + * strategy, and nulls_first specify the sort ordering being used. + * + * The outputs are: + * *leftstart is set to the fraction of the left-hand variable expected + * to be scanned before the first join pair is found (0 to 1). + * *leftend is set to the fraction of the left-hand variable expected + * to be scanned before the join terminates (0 to 1). + * *rightstart, *rightend similarly for the right-hand variable. */ -Datum -prefixjoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false)); -} - -/* - * iclikejoinsel - Join selectivity of ILIKE pattern match. - */ -Datum -iclikejoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false)); -} - -/* - * regexnejoinsel - Join selectivity of regex non-match. - */ -Datum -regexnejoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true)); -} - -/* - * icregexnejoinsel - Join selectivity of case-insensitive regex non-match. - */ -Datum -icregexnejoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true)); -} - -/* - * nlikejoinsel - Join selectivity of LIKE pattern non-match. - */ -Datum -nlikejoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true)); -} - -/* - * icnlikejoinsel - Join selectivity of ILIKE pattern non-match. - */ -Datum -icnlikejoinsel(PG_FUNCTION_ARGS) -{ - PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true)); -} - -/* - * mergejoinscansel - Scan selectivity of merge join. - * - * A merge join will stop as soon as it exhausts either input stream. - * Therefore, if we can estimate the ranges of both input variables, - * we can estimate how much of the input will actually be read. This - * can have a considerable impact on the cost when using indexscans. - * - * Also, we can estimate how much of each input has to be read before the - * first join pair is found, which will affect the join's startup time. - * - * clause should be a clause already known to be mergejoinable. opfamily, - * strategy, and nulls_first specify the sort ordering being used. - * - * The outputs are: - * *leftstart is set to the fraction of the left-hand variable expected - * to be scanned before the first join pair is found (0 to 1). - * *leftend is set to the fraction of the left-hand variable expected - * to be scanned before the join terminates (0 to 1). - * *rightstart, *rightend similarly for the right-hand variable. - */ -void -mergejoinscansel(PlannerInfo *root, Node *clause, - Oid opfamily, int strategy, bool nulls_first, - Selectivity *leftstart, Selectivity *leftend, - Selectivity *rightstart, Selectivity *rightend) +void +mergejoinscansel(PlannerInfo *root, Node *clause, + Oid opfamily, int strategy, bool nulls_first, + Selectivity *leftstart, Selectivity *leftend, + Selectivity *rightstart, Selectivity *rightend) { Node *left, *right; @@ -5716,853 +5246,6 @@ find_join_input_rel(PlannerInfo *root, Relids relids) } -/*------------------------------------------------------------------------- - * - * Pattern analysis functions - * - * These routines support analysis of LIKE and regular-expression patterns - * by the planner/optimizer. It's important that they agree with the - * regular-expression code in backend/regex/ and the LIKE code in - * backend/utils/adt/like.c. Also, the computation of the fixed prefix - * must be conservative: if we report a string longer than the true fixed - * prefix, the query may produce actually wrong answers, rather than just - * getting a bad selectivity estimate! - * - * Note that the prefix-analysis functions are called from - * backend/optimizer/path/indxpath.c as well as from routines in this file. - * - *------------------------------------------------------------------------- - */ - -/* - * Check whether char is a letter (and, hence, subject to case-folding) - * - * In multibyte character sets or with ICU, we can't use isalpha, and it does not seem - * worth trying to convert to wchar_t to use iswalpha. Instead, just assume - * any multibyte char is potentially case-varying. - */ -static int -pattern_char_isalpha(char c, bool is_multibyte, - pg_locale_t locale, bool locale_is_c) -{ - if (locale_is_c) - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else if (is_multibyte && IS_HIGHBIT_SET(c)) - return true; - else if (locale && locale->provider == COLLPROVIDER_ICU) - return IS_HIGHBIT_SET(c) ? true : false; -#ifdef HAVE_LOCALE_T - else if (locale && locale->provider == COLLPROVIDER_LIBC) - return isalpha_l((unsigned char) c, locale->info.lt); -#endif - else - return isalpha((unsigned char) c); -} - -/* - * Extract the fixed prefix, if any, for a pattern. - * - * *prefix is set to a palloc'd prefix string (in the form of a Const node), - * or to NULL if no fixed prefix exists for the pattern. - * If rest_selec is not NULL, *rest_selec is set to an estimate of the - * selectivity of the remainder of the pattern (without any fixed prefix). - * The prefix Const has the same type (TEXT or BYTEA) as the input pattern. - * - * The return value distinguishes no fixed prefix, a partial prefix, - * or an exact-match-only pattern. - */ - -static Pattern_Prefix_Status -like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, - Const **prefix_const, Selectivity *rest_selec) -{ - char *match; - char *patt; - int pattlen; - Oid typeid = patt_const->consttype; - int pos, - match_pos; - bool is_multibyte = (pg_database_encoding_max_length() > 1); - pg_locale_t locale = 0; - bool locale_is_c = false; - - /* the right-hand const is type text or bytea */ - Assert(typeid == BYTEAOID || typeid == TEXTOID); - - if (case_insensitive) - { - if (typeid == BYTEAOID) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("case insensitive matching not supported on type bytea"))); - - /* If case-insensitive, we need locale info */ - if (lc_ctype_is_c(collation)) - locale_is_c = true; - else if (collation != DEFAULT_COLLATION_OID) - { - if (!OidIsValid(collation)) - { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. - */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for ILIKE"), - errhint("Use the COLLATE clause to set the collation explicitly."))); - } - locale = pg_newlocale_from_collation(collation); - } - } - - if (typeid != BYTEAOID) - { - patt = TextDatumGetCString(patt_const->constvalue); - pattlen = strlen(patt); - } - else - { - bytea *bstr = DatumGetByteaPP(patt_const->constvalue); - - pattlen = VARSIZE_ANY_EXHDR(bstr); - patt = (char *) palloc(pattlen); - memcpy(patt, VARDATA_ANY(bstr), pattlen); - Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue)); - } - - match = palloc(pattlen + 1); - match_pos = 0; - for (pos = 0; pos < pattlen; pos++) - { - /* % and _ are wildcard characters in LIKE */ - if (patt[pos] == '%' || - patt[pos] == '_') - break; - - /* Backslash escapes the next character */ - if (patt[pos] == '\\') - { - pos++; - if (pos >= pattlen) - break; - } - - /* Stop if case-varying character (it's sort of a wildcard) */ - if (case_insensitive && - pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) - break; - - match[match_pos++] = patt[pos]; - } - - match[match_pos] = '\0'; - - if (typeid != BYTEAOID) - *prefix_const = string_to_const(match, typeid); - else - *prefix_const = string_to_bytea_const(match, match_pos); - - if (rest_selec != NULL) - *rest_selec = like_selectivity(&patt[pos], pattlen - pos, - case_insensitive); - - pfree(patt); - pfree(match); - - /* in LIKE, an empty pattern is an exact match! */ - if (pos == pattlen) - return Pattern_Prefix_Exact; /* reached end of pattern, so exact */ - - if (match_pos > 0) - return Pattern_Prefix_Partial; - - return Pattern_Prefix_None; -} - -static Pattern_Prefix_Status -regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, - Const **prefix_const, Selectivity *rest_selec) -{ - Oid typeid = patt_const->consttype; - char *prefix; - bool exact; - - /* - * Should be unnecessary, there are no bytea regex operators defined. As - * such, it should be noted that the rest of this function has *not* been - * made safe for binary (possibly NULL containing) strings. - */ - if (typeid == BYTEAOID) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("regular-expression matching not supported on type bytea"))); - - /* Use the regexp machinery to extract the prefix, if any */ - prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue), - case_insensitive, collation, - &exact); - - if (prefix == NULL) - { - *prefix_const = NULL; - - if (rest_selec != NULL) - { - char *patt = TextDatumGetCString(patt_const->constvalue); - - *rest_selec = regex_selectivity(patt, strlen(patt), - case_insensitive, - 0); - pfree(patt); - } - - return Pattern_Prefix_None; - } - - *prefix_const = string_to_const(prefix, typeid); - - if (rest_selec != NULL) - { - if (exact) - { - /* Exact match, so there's no additional selectivity */ - *rest_selec = 1.0; - } - else - { - char *patt = TextDatumGetCString(patt_const->constvalue); - - *rest_selec = regex_selectivity(patt, strlen(patt), - case_insensitive, - strlen(prefix)); - pfree(patt); - } - } - - pfree(prefix); - - if (exact) - return Pattern_Prefix_Exact; /* pattern specifies exact match */ - else - return Pattern_Prefix_Partial; -} - -Pattern_Prefix_Status -pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, - Const **prefix, Selectivity *rest_selec) -{ - Pattern_Prefix_Status result; - - switch (ptype) - { - case Pattern_Type_Like: - result = like_fixed_prefix(patt, false, collation, - prefix, rest_selec); - break; - case Pattern_Type_Like_IC: - result = like_fixed_prefix(patt, true, collation, - prefix, rest_selec); - break; - case Pattern_Type_Regex: - result = regex_fixed_prefix(patt, false, collation, - prefix, rest_selec); - break; - case Pattern_Type_Regex_IC: - result = regex_fixed_prefix(patt, true, collation, - prefix, rest_selec); - break; - case Pattern_Type_Prefix: - /* Prefix type work is trivial. */ - result = Pattern_Prefix_Partial; - *rest_selec = 1.0; /* all */ - *prefix = makeConst(patt->consttype, - patt->consttypmod, - patt->constcollid, - patt->constlen, - datumCopy(patt->constvalue, - patt->constbyval, - patt->constlen), - patt->constisnull, - patt->constbyval); - break; - default: - elog(ERROR, "unrecognized ptype: %d", (int) ptype); - result = Pattern_Prefix_None; /* keep compiler quiet */ - break; - } - return result; -} - -/* - * Estimate the selectivity of a fixed prefix for a pattern match. - * - * A fixed prefix "foo" is estimated as the selectivity of the expression - * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c). - * - * The selectivity estimate is with respect to the portion of the column - * population represented by the histogram --- the caller must fold this - * together with info about MCVs and NULLs. - * - * We use the >= and < operators from the specified btree opfamily to do the - * estimation. The given variable and Const must be of the associated - * datatype. - * - * XXX Note: we make use of the upper bound to estimate operator selectivity - * even if the locale is such that we cannot rely on the upper-bound string. - * The selectivity only needs to be approximately right anyway, so it seems - * more useful to use the upper-bound code than not. - */ -static Selectivity -prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, - Oid vartype, Oid opfamily, Const *prefixcon) -{ - Selectivity prefixsel; - Oid cmpopr; - FmgrInfo opproc; - AttStatsSlot sslot; - Const *greaterstrcon; - Selectivity eq_sel; - - cmpopr = get_opfamily_member(opfamily, vartype, vartype, - BTGreaterEqualStrategyNumber); - if (cmpopr == InvalidOid) - elog(ERROR, "no >= operator for opfamily %u", opfamily); - fmgr_info(get_opcode(cmpopr), &opproc); - - prefixsel = ineq_histogram_selectivity(root, vardata, - &opproc, true, true, - prefixcon->constvalue, - prefixcon->consttype); - - if (prefixsel < 0.0) - { - /* No histogram is present ... return a suitable default estimate */ - return DEFAULT_MATCH_SEL; - } - - /*------- - * If we can create a string larger than the prefix, say - * "x < greaterstr". We try to generate the string referencing the - * collation of the var's statistics, but if that's not available, - * use DEFAULT_COLLATION_OID. - *------- - */ - if (HeapTupleIsValid(vardata->statsTuple) && - get_attstatsslot(&sslot, vardata->statsTuple, - STATISTIC_KIND_HISTOGRAM, InvalidOid, 0)) - /* sslot.stacoll is set up */ ; - else - sslot.stacoll = DEFAULT_COLLATION_OID; - cmpopr = get_opfamily_member(opfamily, vartype, vartype, - BTLessStrategyNumber); - if (cmpopr == InvalidOid) - elog(ERROR, "no < operator for opfamily %u", opfamily); - fmgr_info(get_opcode(cmpopr), &opproc); - greaterstrcon = make_greater_string(prefixcon, &opproc, sslot.stacoll); - if (greaterstrcon) - { - Selectivity topsel; - - topsel = ineq_histogram_selectivity(root, vardata, - &opproc, false, false, - greaterstrcon->constvalue, - greaterstrcon->consttype); - - /* ineq_histogram_selectivity worked before, it shouldn't fail now */ - Assert(topsel >= 0.0); - - /* - * Merge the two selectivities in the same way as for a range query - * (see clauselist_selectivity()). Note that we don't need to worry - * about double-exclusion of nulls, since ineq_histogram_selectivity - * doesn't count those anyway. - */ - prefixsel = topsel + prefixsel - 1.0; - } - - /* - * If the prefix is long then the two bounding values might be too close - * together for the histogram to distinguish them usefully, resulting in a - * zero estimate (plus or minus roundoff error). To avoid returning a - * ridiculously small estimate, compute the estimated selectivity for - * "variable = 'foo'", and clamp to that. (Obviously, the resultant - * estimate should be at least that.) - * - * We apply this even if we couldn't make a greater string. That case - * suggests that the prefix is near the maximum possible, and thus - * probably off the end of the histogram, and thus we probably got a very - * small estimate from the >= condition; so we still need to clamp. - */ - cmpopr = get_opfamily_member(opfamily, vartype, vartype, - BTEqualStrategyNumber); - if (cmpopr == InvalidOid) - elog(ERROR, "no = operator for opfamily %u", opfamily); - eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue, - false, true, false); - - prefixsel = Max(prefixsel, eq_sel); - - return prefixsel; -} - - -/* - * Estimate the selectivity of a pattern of the specified type. - * Note that any fixed prefix of the pattern will have been removed already, - * so actually we may be looking at just a fragment of the pattern. - * - * For now, we use a very simplistic approach: fixed characters reduce the - * selectivity a good deal, character ranges reduce it a little, - * wildcards (such as % for LIKE or .* for regex) increase it. - */ - -#define FIXED_CHAR_SEL 0.20 /* about 1/5 */ -#define CHAR_RANGE_SEL 0.25 -#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */ -#define FULL_WILDCARD_SEL 5.0 -#define PARTIAL_WILDCARD_SEL 2.0 - -static Selectivity -like_selectivity(const char *patt, int pattlen, bool case_insensitive) -{ - Selectivity sel = 1.0; - int pos; - - /* Skip any leading wildcard; it's already factored into initial sel */ - for (pos = 0; pos < pattlen; pos++) - { - if (patt[pos] != '%' && patt[pos] != '_') - break; - } - - for (; pos < pattlen; pos++) - { - /* % and _ are wildcard characters in LIKE */ - if (patt[pos] == '%') - sel *= FULL_WILDCARD_SEL; - else if (patt[pos] == '_') - sel *= ANY_CHAR_SEL; - else if (patt[pos] == '\\') - { - /* Backslash quotes the next character */ - pos++; - if (pos >= pattlen) - break; - sel *= FIXED_CHAR_SEL; - } - else - sel *= FIXED_CHAR_SEL; - } - /* Could get sel > 1 if multiple wildcards */ - if (sel > 1.0) - sel = 1.0; - return sel; -} - -static Selectivity -regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive) -{ - Selectivity sel = 1.0; - int paren_depth = 0; - int paren_pos = 0; /* dummy init to keep compiler quiet */ - int pos; - - for (pos = 0; pos < pattlen; pos++) - { - if (patt[pos] == '(') - { - if (paren_depth == 0) - paren_pos = pos; /* remember start of parenthesized item */ - paren_depth++; - } - else if (patt[pos] == ')' && paren_depth > 0) - { - paren_depth--; - if (paren_depth == 0) - sel *= regex_selectivity_sub(patt + (paren_pos + 1), - pos - (paren_pos + 1), - case_insensitive); - } - else if (patt[pos] == '|' && paren_depth == 0) - { - /* - * If unquoted | is present at paren level 0 in pattern, we have - * multiple alternatives; sum their probabilities. - */ - sel += regex_selectivity_sub(patt + (pos + 1), - pattlen - (pos + 1), - case_insensitive); - break; /* rest of pattern is now processed */ - } - else if (patt[pos] == '[') - { - bool negclass = false; - - if (patt[++pos] == '^') - { - negclass = true; - pos++; - } - if (patt[pos] == ']') /* ']' at start of class is not special */ - pos++; - while (pos < pattlen && patt[pos] != ']') - pos++; - if (paren_depth == 0) - sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL); - } - else if (patt[pos] == '.') - { - if (paren_depth == 0) - sel *= ANY_CHAR_SEL; - } - else if (patt[pos] == '*' || - patt[pos] == '?' || - patt[pos] == '+') - { - /* Ought to be smarter about quantifiers... */ - if (paren_depth == 0) - sel *= PARTIAL_WILDCARD_SEL; - } - else if (patt[pos] == '{') - { - while (pos < pattlen && patt[pos] != '}') - pos++; - if (paren_depth == 0) - sel *= PARTIAL_WILDCARD_SEL; - } - else if (patt[pos] == '\\') - { - /* backslash quotes the next character */ - pos++; - if (pos >= pattlen) - break; - if (paren_depth == 0) - sel *= FIXED_CHAR_SEL; - } - else - { - if (paren_depth == 0) - sel *= FIXED_CHAR_SEL; - } - } - /* Could get sel > 1 if multiple wildcards */ - if (sel > 1.0) - sel = 1.0; - return sel; -} - -static Selectivity -regex_selectivity(const char *patt, int pattlen, bool case_insensitive, - int fixed_prefix_len) -{ - Selectivity sel; - - /* If patt doesn't end with $, consider it to have a trailing wildcard */ - if (pattlen > 0 && patt[pattlen - 1] == '$' && - (pattlen == 1 || patt[pattlen - 2] != '\\')) - { - /* has trailing $ */ - sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive); - } - else - { - /* no trailing $ */ - sel = regex_selectivity_sub(patt, pattlen, case_insensitive); - sel *= FULL_WILDCARD_SEL; - } - - /* If there's a fixed prefix, discount its selectivity */ - if (fixed_prefix_len > 0) - sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len); - - /* Make sure result stays in range */ - CLAMP_PROBABILITY(sel); - return sel; -} - - -/* - * For bytea, the increment function need only increment the current byte - * (there are no multibyte characters to worry about). - */ -static bool -byte_increment(unsigned char *ptr, int len) -{ - if (*ptr >= 255) - return false; - (*ptr)++; - return true; -} - -/* - * Try to generate a string greater than the given string or any - * string it is a prefix of. If successful, return a palloc'd string - * in the form of a Const node; else return NULL. - * - * The caller must provide the appropriate "less than" comparison function - * for testing the strings, along with the collation to use. - * - * The key requirement here is that given a prefix string, say "foo", - * we must be able to generate another string "fop" that is greater than - * all strings "foobar" starting with "foo". We can test that we have - * generated a string greater than the prefix string, but in non-C collations - * that is not a bulletproof guarantee that an extension of the string might - * not sort after it; an example is that "foo " is less than "foo!", but it - * is not clear that a "dictionary" sort ordering will consider "foo!" less - * than "foo bar". CAUTION: Therefore, this function should be used only for - * estimation purposes when working in a non-C collation. - * - * To try to catch most cases where an extended string might otherwise sort - * before the result value, we determine which of the strings "Z", "z", "y", - * and "9" is seen as largest by the collation, and append that to the given - * prefix before trying to find a string that compares as larger. - * - * To search for a greater string, we repeatedly "increment" the rightmost - * character, using an encoding-specific character incrementer function. - * When it's no longer possible to increment the last character, we truncate - * off that character and start incrementing the next-to-rightmost. - * For example, if "z" were the last character in the sort order, then we - * could produce "foo" as a string greater than "fonz". - * - * This could be rather slow in the worst case, but in most cases we - * won't have to try more than one or two strings before succeeding. - * - * Note that it's important for the character incrementer not to be too anal - * about producing every possible character code, since in some cases the only - * way to get a larger string is to increment a previous character position. - * So we don't want to spend too much time trying every possible character - * code at the last position. A good rule of thumb is to be sure that we - * don't try more than 256*K values for a K-byte character (and definitely - * not 256^K, which is what an exhaustive search would approach). - */ -Const * -make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) -{ - Oid datatype = str_const->consttype; - char *workstr; - int len; - Datum cmpstr; - char *cmptxt = NULL; - mbcharacter_incrementer charinc; - - /* - * Get a modifiable copy of the prefix string in C-string format, and set - * up the string we will compare to as a Datum. In C locale this can just - * be the given prefix string, otherwise we need to add a suffix. Type - * BYTEA sorts bytewise so it never needs a suffix either. - */ - if (datatype == BYTEAOID) - { - bytea *bstr = DatumGetByteaPP(str_const->constvalue); - - len = VARSIZE_ANY_EXHDR(bstr); - workstr = (char *) palloc(len); - memcpy(workstr, VARDATA_ANY(bstr), len); - Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue)); - cmpstr = str_const->constvalue; - } - else - { - if (datatype == NAMEOID) - workstr = DatumGetCString(DirectFunctionCall1(nameout, - str_const->constvalue)); - else - workstr = TextDatumGetCString(str_const->constvalue); - len = strlen(workstr); - if (lc_collate_is_c(collation) || len == 0) - cmpstr = str_const->constvalue; - else - { - /* If first time through, determine the suffix to use */ - static char suffixchar = 0; - static Oid suffixcollation = 0; - - if (!suffixchar || suffixcollation != collation) - { - char *best; - - best = "Z"; - if (varstr_cmp(best, 1, "z", 1, collation) < 0) - best = "z"; - if (varstr_cmp(best, 1, "y", 1, collation) < 0) - best = "y"; - if (varstr_cmp(best, 1, "9", 1, collation) < 0) - best = "9"; - suffixchar = *best; - suffixcollation = collation; - } - - /* And build the string to compare to */ - if (datatype == NAMEOID) - { - cmptxt = palloc(len + 2); - memcpy(cmptxt, workstr, len); - cmptxt[len] = suffixchar; - cmptxt[len + 1] = '\0'; - cmpstr = PointerGetDatum(cmptxt); - } - else - { - cmptxt = palloc(VARHDRSZ + len + 1); - SET_VARSIZE(cmptxt, VARHDRSZ + len + 1); - memcpy(VARDATA(cmptxt), workstr, len); - *(VARDATA(cmptxt) + len) = suffixchar; - cmpstr = PointerGetDatum(cmptxt); - } - } - } - - /* Select appropriate character-incrementer function */ - if (datatype == BYTEAOID) - charinc = byte_increment; - else - charinc = pg_database_encoding_character_incrementer(); - - /* And search ... */ - while (len > 0) - { - int charlen; - unsigned char *lastchar; - - /* Identify the last character --- for bytea, just the last byte */ - if (datatype == BYTEAOID) - charlen = 1; - else - charlen = len - pg_mbcliplen(workstr, len, len - 1); - lastchar = (unsigned char *) (workstr + len - charlen); - - /* - * Try to generate a larger string by incrementing the last character - * (for BYTEA, we treat each byte as a character). - * - * Note: the incrementer function is expected to return true if it's - * generated a valid-per-the-encoding new character, otherwise false. - * The contents of the character on false return are unspecified. - */ - while (charinc(lastchar, charlen)) - { - Const *workstr_const; - - if (datatype == BYTEAOID) - workstr_const = string_to_bytea_const(workstr, len); - else - workstr_const = string_to_const(workstr, datatype); - - if (DatumGetBool(FunctionCall2Coll(ltproc, - collation, - cmpstr, - workstr_const->constvalue))) - { - /* Successfully made a string larger than cmpstr */ - if (cmptxt) - pfree(cmptxt); - pfree(workstr); - return workstr_const; - } - - /* No good, release unusable value and try again */ - pfree(DatumGetPointer(workstr_const->constvalue)); - pfree(workstr_const); - } - - /* - * No luck here, so truncate off the last character and try to - * increment the next one. - */ - len -= charlen; - workstr[len] = '\0'; - } - - /* Failed... */ - if (cmptxt) - pfree(cmptxt); - pfree(workstr); - - return NULL; -} - -/* - * Generate a Datum of the appropriate type from a C string. - * Note that all of the supported types are pass-by-ref, so the - * returned value should be pfree'd if no longer needed. - */ -static Datum -string_to_datum(const char *str, Oid datatype) -{ - Assert(str != NULL); - - /* - * We cheat a little by assuming that CStringGetTextDatum() will do for - * bpchar and varchar constants too... - */ - if (datatype == NAMEOID) - return DirectFunctionCall1(namein, CStringGetDatum(str)); - else if (datatype == BYTEAOID) - return DirectFunctionCall1(byteain, CStringGetDatum(str)); - else - return CStringGetTextDatum(str); -} - -/* - * Generate a Const node of the appropriate type from a C string. - */ -static Const * -string_to_const(const char *str, Oid datatype) -{ - Datum conval = string_to_datum(str, datatype); - Oid collation; - int constlen; - - /* - * We only need to support a few datatypes here, so hard-wire properties - * instead of incurring the expense of catalog lookups. - */ - switch (datatype) - { - case TEXTOID: - case VARCHAROID: - case BPCHAROID: - collation = DEFAULT_COLLATION_OID; - constlen = -1; - break; - - case NAMEOID: - collation = C_COLLATION_OID; - constlen = NAMEDATALEN; - break; - - case BYTEAOID: - collation = InvalidOid; - constlen = -1; - break; - - default: - elog(ERROR, "unexpected datatype in string_to_const: %u", - datatype); - return NULL; - } - - return makeConst(datatype, -1, collation, constlen, - conval, false, false); -} - -/* - * Generate a Const node of bytea type from a binary C string and a length. - */ -static Const * -string_to_bytea_const(const char *str, size_t str_len) -{ - bytea *bstr = palloc(VARHDRSZ + str_len); - Datum conval; - - memcpy(VARDATA(bstr), str, str_len); - SET_VARSIZE(bstr, VARHDRSZ + str_len); - conval = PointerGetDatum(bstr); - - return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false); -} - /*------------------------------------------------------------------------- * * Index cost estimation functions diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 087b56f917..8829889f9f 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -15,7 +15,6 @@ #ifndef SELFUNCS_H #define SELFUNCS_H -#include "fmgr.h" #include "access/htup.h" #include "nodes/pathnodes.h" @@ -85,20 +84,6 @@ typedef struct VariableStatData } while(0) -typedef enum -{ - Pattern_Type_Like, - Pattern_Type_Like_IC, - Pattern_Type_Regex, - Pattern_Type_Regex_IC, - Pattern_Type_Prefix -} Pattern_Type; - -typedef enum -{ - Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact -} Pattern_Prefix_Status; - /* * deconstruct_indexquals is a simple function to examine the indexquals * attached to a proposed IndexPath. It returns a list of IndexQualInfo @@ -175,14 +160,16 @@ extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval, bool varonleft, int min_hist_size, int n_skip, int *hist_size); - -extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, - Pattern_Type ptype, - Oid collation, - Const **prefix, - Selectivity *rest_selec); -extern Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc, - Oid collation); +extern double ineq_histogram_selectivity(PlannerInfo *root, + VariableStatData *vardata, + FmgrInfo *opproc, bool isgt, bool iseq, + Datum constval, Oid consttype); +extern double var_eq_const(VariableStatData *vardata, Oid oproid, + Datum constval, bool constisnull, + bool varonleft, bool negate); +extern double var_eq_non_const(VariableStatData *vardata, Oid oproid, + Node *other, + bool varonleft, bool negate); extern Selectivity boolvarsel(PlannerInfo *root, Node *arg, int varRelid); extern Selectivity booltestsel(PlannerInfo *root, BoolTestType booltesttype,