granicus.if.org Git - postgresql/commitdiff
Teach pattern_fixed_prefix() about collations.
author: Tom Lane <tgl@sss.pgh.pa.us>
Mon, 11 Apr 2011 16:28:28 +0000 (12:28 -0400)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Mon, 11 Apr 2011 16:28:28 +0000 (12:28 -0400)
This is necessary, not optional, now that ILIKE and regexes are collation
aware --- else we might derive a wrong comparison constant for index
optimized pattern matches.

src/backend/optimizer/path/indxpath.c
src/backend/utils/adt/selfuncs.c
src/include/utils/selfuncs.h

index ef65cf222442e4a920f1947f54aaff4838062548..c7ed1b6ee9d67848cfc997e830fea7cd6cf27e87 100644 (file)
@@ -2446,6 +2446,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
        bool            isIndexable = false;
        Node       *rightop;
        Oid                     expr_op;
+       Oid                     expr_coll;
        Const      *patt;
        Const      *prefix = NULL;
        Const      *rest = NULL;
@@ -2462,6 +2463,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
        /* we know these will succeed */
        rightop = get_rightop(clause);
        expr_op = ((OpExpr *) clause)->opno;
+       expr_coll = ((OpExpr *) clause)->inputcollid;
 
        /* again, required for all current special ops: */
        if (!IsA(rightop, Const) ||
@@ -2475,13 +2477,13 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
                case OID_BPCHAR_LIKE_OP:
                case OID_NAME_LIKE_OP:
                        /* the right-hand const is type text for all of these */
-                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like,
+                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll,
                                                                                   &prefix, &rest);
                        isIndexable = (pstatus != Pattern_Prefix_None);
                        break;
 
                case OID_BYTEA_LIKE_OP:
-                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like,
+                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll,
                                                                                   &prefix, &rest);
                        isIndexable = (pstatus != Pattern_Prefix_None);
                        break;
@@ -2490,7 +2492,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
                case OID_BPCHAR_ICLIKE_OP:
                case OID_NAME_ICLIKE_OP:
                        /* the right-hand const is type text for all of these */
-                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC,
+                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, expr_coll,
                                                                                   &prefix, &rest);
                        isIndexable = (pstatus != Pattern_Prefix_None);
                        break;
@@ -2499,7 +2501,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
                case OID_BPCHAR_REGEXEQ_OP:
                case OID_NAME_REGEXEQ_OP:
                        /* the right-hand const is type text for all of these */
-                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex,
+                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, expr_coll,
                                                                                   &prefix, &rest);
                        isIndexable = (pstatus != Pattern_Prefix_None);
                        break;
@@ -2508,7 +2510,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
                case OID_BPCHAR_ICREGEXEQ_OP:
                case OID_NAME_ICREGEXEQ_OP:
                        /* the right-hand const is type text for all of these */
-                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC,
+                       pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, expr_coll,
                                                                                   &prefix, &rest);
                        isIndexable = (pstatus != Pattern_Prefix_None);
                        break;
@@ -2544,10 +2546,9 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
         *
         * The non-pattern opclasses will not sort the way we need in most non-C
         * locales.  We can use such an index anyway for an exact match (simple
-        * equality), but not for prefix-match cases.  Note that we are looking at
-        * the index's collation, not the expression's collation -- this test is
-        * not dependent on the LIKE/regex operator's collation (which would only
-        * affect case folding behavior of ILIKE, anyway).
+        * equality), but not for prefix-match cases.  Note that here we are
+        * looking at the index's collation, not the expression's collation --
+        * this test is *not* dependent on the LIKE/regex operator's collation.
         */
        switch (expr_op)
        {
@@ -2558,7 +2559,8 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
                        isIndexable =
                                (opfamily == TEXT_PATTERN_BTREE_FAM_OID) ||
                                (opfamily == TEXT_BTREE_FAM_OID &&
-                                (pstatus == Pattern_Prefix_Exact || lc_collate_is_c(idxcollation)));
+                                (pstatus == Pattern_Prefix_Exact ||
+                                 lc_collate_is_c(idxcollation)));
                        break;
 
                case OID_BPCHAR_LIKE_OP:
@@ -2568,7 +2570,8 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
                        isIndexable =
                                (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID) ||
                                (opfamily == BPCHAR_BTREE_FAM_OID &&
-                                (pstatus == Pattern_Prefix_Exact || lc_collate_is_c(idxcollation)));
+                                (pstatus == Pattern_Prefix_Exact ||
+                                 lc_collate_is_c(idxcollation)));
                        break;
 
                case OID_NAME_LIKE_OP:
@@ -2770,6 +2773,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
        Node       *leftop = get_leftop(clause);
        Node       *rightop = get_rightop(clause);
        Oid                     expr_op = ((OpExpr *) clause)->opno;
+       Oid                     expr_coll = ((OpExpr *) clause)->inputcollid;
        Const      *patt = (Const *) rightop;
        Const      *prefix = NULL;
        Const      *rest = NULL;
@@ -2791,7 +2795,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
                case OID_BYTEA_LIKE_OP:
                        if (!op_in_opfamily(expr_op, opfamily))
                        {
-                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like,
+                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll,
                                                                                           &prefix, &rest);
                                return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
                        }
@@ -2803,7 +2807,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
                        if (!op_in_opfamily(expr_op, opfamily))
                        {
                                /* the right-hand const is type text for all of these */
-                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC,
+                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, expr_coll,
                                                                                           &prefix, &rest);
                                return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
                        }
@@ -2815,7 +2819,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
                        if (!op_in_opfamily(expr_op, opfamily))
                        {
                                /* the right-hand const is type text for all of these */
-                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex,
+                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, expr_coll,
                                                                                           &prefix, &rest);
                                return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
                        }
@@ -2827,7 +2831,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
                        if (!op_in_opfamily(expr_op, opfamily))
                        {
                                /* the right-hand const is type text for all of these */
-                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC,
+                               pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, expr_coll,
                                                                                           &prefix, &rest);
                                return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
                        }
index 41c5202146d4e41bf1870fdbabab6e1b9a2844a4..534425a6b5304d19f24081aaf170d1e44455a2d6 100644 (file)
@@ -1181,9 +1181,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
                        return result;
        }
 
-       /* divide pattern into fixed prefix and remainder */
+       /*
+        * Divide pattern into fixed prefix and remainder.  XXX we have to assume
+        * default collation here, because we don't have access to the actual
+        * input collation for the operator.  FIXME ...
+        */
        patt = (Const *) other;
-       pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
+       pstatus = pattern_fixed_prefix(patt, ptype, DEFAULT_COLLATION_OID,
+                                                                  &prefix, &rest);
 
        /*
         * If necessary, coerce the prefix constant to the right type. (The "rest"
@@ -4755,6 +4760,29 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
  *-------------------------------------------------------------------------
  */
 
+/*
+ * Check whether char is a letter (and, hence, subject to case-folding)
+ *
+ * In multibyte character sets, we can't use isalpha, and it does not seem
+ * worth trying to convert to wchar_t to use iswalpha.  Instead, just assume
+ * any multibyte char is potentially case-varying.
+ */
+static int
+pattern_char_isalpha(char c, bool is_multibyte,
+                                        pg_locale_t locale, bool locale_is_c)
+{
+       if (locale_is_c)
+               return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+       else if (is_multibyte && IS_HIGHBIT_SET(c))
+               return true;
+#ifdef HAVE_LOCALE_T
+       else if (locale)
+               return isalpha_l((unsigned char) c, locale);
+#endif
+       else
+               return isalpha((unsigned char) c);
+}
+
 /*
  * Extract the fixed prefix, if any, for a pattern.
  *
@@ -4769,7 +4797,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
  */
 
 static Pattern_Prefix_Status
-like_fixed_prefix(Const *patt_const, bool case_insensitive,
+like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
                                  Const **prefix_const, Const **rest_const)
 {
        char       *match;
@@ -4780,15 +4808,39 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
        int                     pos,
                                match_pos;
        bool            is_multibyte = (pg_database_encoding_max_length() > 1);
+       pg_locale_t     locale = 0;
+       bool            locale_is_c = false;
 
        /* the right-hand const is type text or bytea */
        Assert(typeid == BYTEAOID || typeid == TEXTOID);
 
-       if (typeid == BYTEAOID && case_insensitive)
-               ereport(ERROR,
-                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+       if (case_insensitive)
+       {
+               if (typeid == BYTEAOID)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                   errmsg("case insensitive matching not supported on type bytea")));
 
+               /* If case-insensitive, we need locale info */
+               if (lc_ctype_is_c(collation))
+                       locale_is_c = true;
+               else if (collation != DEFAULT_COLLATION_OID)
+               {
+                       if (!OidIsValid(collation))
+                       {
+                               /*
+                                * This typically means that the parser could not resolve a
+                                * conflict of implicit collations, so report it that way.
+                                */
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INDETERMINATE_COLLATION),
+                                                errmsg("could not determine which collation to use for ILIKE"),
+                                                errhint("Use the COLLATE clause to set the collation explicitly.")));
+                       }
+                       locale = pg_newlocale_from_collation(collation);
+               }
+       }
+
        if (typeid != BYTEAOID)
        {
                patt = TextDatumGetCString(patt_const->constvalue);
@@ -4822,23 +4874,11 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
                                break;
                }
 
-               /*
-                * XXX In multibyte character sets, we can't trust isalpha, so assume
-                * any multibyte char is potentially case-varying.
-                */
-               if (case_insensitive)
-               {
-                       if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
-                               break;
-                       if (isalpha((unsigned char) patt[pos]))
-                               break;
-               }
+               /* Stop if case-varying character (it's sort of a wildcard) */
+               if (case_insensitive &&
+                       pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
+                       break;
 
-               /*
-                * NOTE: this code used to think that %% meant a literal %, but
-                * textlike() itself does not think that, and the SQL92 spec doesn't
-                * say any such thing either.
-                */
                match[match_pos++] = patt[pos];
        }
 
@@ -4870,7 +4910,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 }
 
 static Pattern_Prefix_Status
-regex_fixed_prefix(Const *patt_const, bool case_insensitive,
+regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
                                   Const **prefix_const, Const **rest_const)
 {
        char       *match;
@@ -4883,6 +4923,8 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
        char       *rest;
        Oid                     typeid = patt_const->consttype;
        bool            is_multibyte = (pg_database_encoding_max_length() > 1);
+       pg_locale_t     locale = 0;
+       bool            locale_is_c = false;
 
        /*
         * Should be unnecessary, there are no bytea regex operators defined. As
@@ -4894,6 +4936,28 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("regular-expression matching not supported on type bytea")));
 
+       if (case_insensitive)
+       {
+               /* If case-insensitive, we need locale info */
+               if (lc_ctype_is_c(collation))
+                       locale_is_c = true;
+               else if (collation != DEFAULT_COLLATION_OID)
+               {
+                       if (!OidIsValid(collation))
+                       {
+                               /*
+                                * This typically means that the parser could not resolve a
+                                * conflict of implicit collations, so report it that way.
+                                */
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INDETERMINATE_COLLATION),
+                                                errmsg("could not determine which collation to use for regular expression"),
+                                                errhint("Use the COLLATE clause to set the collation explicitly.")));
+                       }
+                       locale = pg_newlocale_from_collation(collation);
+               }
+       }
+
        /* the right-hand const is type text for all of these */
        patt = TextDatumGetCString(patt_const->constvalue);
 
@@ -4969,17 +5033,10 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
                        patt[pos] == '$')
                        break;
 
-               /*
-                * XXX In multibyte character sets, we can't trust isalpha, so assume
-                * any multibyte char is potentially case-varying.
-                */
-               if (case_insensitive)
-               {
-                       if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
-                               break;
-                       if (isalpha((unsigned char) patt[pos]))
-                               break;
-               }
+               /* Stop if case-varying character (it's sort of a wildcard) */
+               if (case_insensitive &&
+                       pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
+                       break;
 
                /*
                 * Check for quantifiers.  Except for +, this means the preceding
@@ -5004,7 +5061,7 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
                 * backslash followed by alphanumeric is an escape, not a quoted
                 * character.  Must treat it as having multiple possible matches.
                 * Note: since only ASCII alphanumerics are escapes, we don't have to
-                * be paranoid about multibyte here.
+                * be paranoid about multibyte or collations here.
                 */
                if (patt[pos] == '\\')
                {
@@ -5056,7 +5113,7 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 }
 
 Pattern_Prefix_Status
-pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
+pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
                                         Const **prefix, Const **rest)
 {
        Pattern_Prefix_Status result;
@@ -5064,16 +5121,16 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
        switch (ptype)
        {
                case Pattern_Type_Like:
-                       result = like_fixed_prefix(patt, false, prefix, rest);
+                       result = like_fixed_prefix(patt, false, collation, prefix, rest);
                        break;
                case Pattern_Type_Like_IC:
-                       result = like_fixed_prefix(patt, true, prefix, rest);
+                       result = like_fixed_prefix(patt, true, collation, prefix, rest);
                        break;
                case Pattern_Type_Regex:
-                       result = regex_fixed_prefix(patt, false, prefix, rest);
+                       result = regex_fixed_prefix(patt, false, collation, prefix, rest);
                        break;
                case Pattern_Type_Regex_IC:
-                       result = regex_fixed_prefix(patt, true, prefix, rest);
+                       result = regex_fixed_prefix(patt, true, collation, prefix, rest);
                        break;
                default:
                        elog(ERROR, "unrecognized ptype: %d", (int) ptype);
index e9913aa049f28da92efe296e9925e21cf7719f21..c1b417ad8fbd3be553afcd529862c139b1e8c75f 100644 (file)
@@ -132,6 +132,7 @@ extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 
 extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
                                         Pattern_Type ptype,
+                                        Oid collation,
                                         Const **prefix,
                                         Const **rest);
 extern Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc);