granicus.if.org Git - postgresql/commitdiff
Back-patch fix for extraction of fixed prefixes from regular expressions.
author: Tom Lane <tgl@sss.pgh.pa.us>
Tue, 10 Jul 2012 22:00:44 +0000 (18:00 -0400)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Tue, 10 Jul 2012 22:00:44 +0000 (18:00 -0400)
Back-patch of commits 628cbb50ba80c83917b07a7609ddec12cda172d0 and
c6aae3042be5249e672b731ebeb21875b5343010.  This has been broken since
7.3, so back-patch to all supported branches.

src/backend/regex/Makefile
src/backend/regex/regc_color.c
src/backend/regex/regc_nfa.c
src/backend/regex/regcomp.c
src/backend/regex/rege_dfa.c
src/backend/regex/regprefix.c [new file with mode: 0644]
src/backend/utils/adt/regexp.c
src/backend/utils/adt/selfuncs.c
src/include/regex/regex.h
src/include/regex/regguts.h
src/include/utils/builtins.h

index 21e7fa5329b9384333d6a8c9912be81dd24a4867..74a4c0c89d8efedcb8699dffac465def523431a9 100644 (file)
@@ -12,7 +12,7 @@ subdir = src/backend/regex
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = regcomp.o regerror.o regexec.o regfree.o
+OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
 
 include $(top_srcdir)/src/backend/common.mk
 
index 2aeb861d9762f0d9f5ec667007ce1b00d22fccde..1c60566fbf57458a1f43a4116432e0636ebcd92d 100644 (file)
@@ -66,8 +66,9 @@ initcm(struct vars * v,
        cd = cm->cd;                            /* cm->cd[WHITE] */
        cd->sub = NOSUB;
        cd->arcs = NULL;
-       cd->flags = 0;
+       cd->firstchr = CHR_MIN;
        cd->nchrs = CHR_MAX - CHR_MIN + 1;
+       cd->flags = 0;
 
        /* upper levels of tree */
        for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--)
@@ -272,6 +273,7 @@ newcolor(struct colormap * cm)
        cd->nchrs = 0;
        cd->sub = NOSUB;
        cd->arcs = NULL;
+       cd->firstchr = CHR_MIN;         /* in case never set otherwise */
        cd->flags = 0;
        cd->block = NULL;
 
@@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c)
        if (co == sco)                          /* already in an open subcolor */
                return co;                              /* rest is redundant */
        cm->cd[co].nchrs--;
+       if (cm->cd[sco].nchrs == 0)
+               cm->cd[sco].firstchr = c;
        cm->cd[sco].nchrs++;
        setcolor(cm, c, sco);
        return sco;
@@ -438,6 +442,11 @@ subrange(struct vars * v,
 
 /*
  * subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ *
+ * Note: subcolors that are created during execution of this function
+ * will not be given a useful value of firstchr; it'll be left as CHR_MIN.
+ * For the current usage of firstchr in pg_regprefix, this does not matter
+ * because such subcolors won't occur in the common prefix of a regex.
  */
 static void
 subblock(struct vars * v,
index 66a361ee2ffe7d61c16ac609a9d9be4d249b8497..085842c92b70d74ea969488e0cc50d1466f2e53e 100644 (file)
@@ -1330,14 +1330,16 @@ compact(struct nfa * nfa,
        for (s = nfa->states; s != NULL; s = s->next)
        {
                nstates++;
-               narcs += 1 + s->nouts + 1;
-               /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+               narcs += s->nouts + 1;          /* need one extra for endmarker */
        }
 
+       cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
        cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
        cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
-       if (cnfa->states == NULL || cnfa->arcs == NULL)
+       if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
        {
+               if (cnfa->stflags != NULL)
+                       FREE(cnfa->stflags);
                if (cnfa->states != NULL)
                        FREE(cnfa->states);
                if (cnfa->arcs != NULL)
@@ -1359,9 +1361,8 @@ compact(struct nfa * nfa,
        for (s = nfa->states; s != NULL; s = s->next)
        {
                assert((size_t) s->no < nstates);
+               cnfa->stflags[s->no] = 0;
                cnfa->states[s->no] = ca;
-               ca->co = 0;                             /* clear and skip flags "arc" */
-               ca++;
                first = ca;
                for (a = s->outs; a != NULL; a = a->outchain)
                        switch (a->type)
@@ -1392,8 +1393,8 @@ compact(struct nfa * nfa,
 
        /* mark no-progress states */
        for (a = nfa->pre->outs; a != NULL; a = a->outchain)
-               cnfa->states[a->to->no]->co = 1;
-       cnfa->states[nfa->pre->no]->co = 1;
+               cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
+       cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
 }
 
 /*
@@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa)
 {
        assert(cnfa->nstates != 0); /* not empty already */
        cnfa->nstates = 0;
+       FREE(cnfa->stflags);
        FREE(cnfa->states);
        FREE(cnfa->arcs);
 }
@@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa,
                fprintf(f, ", haslacons");
        fprintf(f, "\n");
        for (st = 0; st < cnfa->nstates; st++)
-               dumpcstate(st, cnfa->states[st], cnfa, f);
+               dumpcstate(st, cnfa, f);
        fflush(f);
 }
 #endif
@@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa,
  */
 static void
 dumpcstate(int st,
-                  struct carc * ca,
                   struct cnfa * cnfa,
                   FILE *f)
 {
-       int                     i;
+       struct carc * ca;
        int                     pos;
 
-       fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+       fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
        pos = 1;
-       for (i = 1; ca[i].co != COLORLESS; i++)
+       for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
        {
-               if (ca[i].co < cnfa->ncolors)
-                       fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
+               if (ca->co < cnfa->ncolors)
+                       fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
                else
-                       fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
-                                       ca[i].to);
+                       fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
                if (pos == 5)
                {
                        fprintf(f, "\n");
@@ -1653,7 +1653,7 @@ dumpcstate(int st,
                else
                        pos++;
        }
-       if (i == 1 || pos != 1)
+       if (ca == cnfa->states[st] || pos != 1)
                fprintf(f, "\n");
        fflush(f);
 }
index 4dac8f5d3c23cedd8bf4be415c6d7713020f5c4f..3f7d57c163a8492f04de1f63183c16ca6d30ddde 100644 (file)
@@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *);
 static int     dumprarcs(struct arc *, struct state *, FILE *, int);
 static void dumparc(struct arc *, struct state *, FILE *);
 static void dumpcnfa(struct cnfa *, FILE *);
-static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
 #endif
 /* === regc_cvec.c === */
 static struct cvec *newcvec(int, int);
index e521261a57192fd08e1482f8ccdd99eda3e4ff70..5efb761d6d7207b34715f9760a0f991d50de644f 100644 (file)
@@ -458,14 +458,14 @@ miss(struct vars * v,                     /* used only for debug flags */
        gotstate = 0;
        for (i = 0; i < d->nstates; i++)
                if (ISBSET(css->states, i))
-                       for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
+                       for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
                                if (ca->co == co)
                                {
                                        BSET(d->work, ca->to);
                                        gotstate = 1;
                                        if (ca->to == cnfa->post)
                                                ispost = 1;
-                                       if (!cnfa->states[ca->to]->co)
+                                       if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
                                                noprogress = 0;
                                        FDEBUG(("%d -> %d\n", i, ca->to));
                                }
@@ -476,10 +476,9 @@ miss(struct vars * v,                      /* used only for debug flags */
                dolacons = 0;
                for (i = 0; i < d->nstates; i++)
                        if (ISBSET(d->work, i))
-                               for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
-                                        ca++)
+                               for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
                                {
-                                       if (ca->co <= cnfa->ncolors)
+                                       if (ca->co < cnfa->ncolors)
                                                continue;               /* NOTE CONTINUE */
                                        sawlacons = 1;
                                        if (ISBSET(d->work, ca->to))
@@ -490,7 +489,7 @@ miss(struct vars * v,                       /* used only for debug flags */
                                        dolacons = 1;
                                        if (ca->to == cnfa->post)
                                                ispost = 1;
-                                       if (!cnfa->states[ca->to]->co)
+                                       if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
                                                noprogress = 0;
                                        FDEBUG(("%d :> %d\n", i, ca->to));
                                }
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
new file mode 100644 (file)
index 0000000..6f91288
--- /dev/null
@@ -0,0 +1,259 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ *       Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ *       src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int findprefix(struct cnfa * cnfa, struct colormap * cm,
+                                         chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - get common prefix for regular expression
+ *
+ * Returns one of:
+ *     REG_NOMATCH: there is no common prefix of strings matching the regex
+ *     REG_PREFIX: there is a common prefix of strings matching the regex
+ *     REG_EXACT: all strings satisfying the regex must match the same string
+ *     or a REG_XXX error code
+ *
+ * For REG_PREFIX and REG_EXACT, *string is set to a malloc'd chr array (to be
+ * freed by the caller) holding the common prefix or exact value; its length,
+ * *slength, is in chrs not bytes.  Otherwise *string is NULL, *slength zero.
+ *
+ * This function does not analyze all complex cases (such as lookahead
+ * constraints) exactly.  Therefore it is possible that some strings matching
+ * the reported prefix or exact-match string do not satisfy the regex.  But
+ * it should never be the case that a string satisfying the regex does not
+ * match the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+                        chr **string,
+                        size_t *slength)
+{
+       struct guts *g;
+       struct cnfa *cnfa;
+       int                     st;
+
+       /* sanity checks: output pointers first, so they can be reset below */
+       if (string == NULL || slength == NULL)
+               return REG_INVARG;
+       *string = NULL;                         /* initialize for failure cases */
+       *slength = 0;
+       if (re == NULL || re->re_magic != REMAGIC)
+               return REG_INVARG;
+       if (re->re_csize != sizeof(chr))
+               return REG_MIXED;
+
+       /* Initialize locale-dependent support */
+       pg_set_regex_collation(re->re_collation);
+
+       /* setup */
+       g = (struct guts *) re->re_guts;
+       if (g->info & REG_UIMPOSSIBLE)
+               return REG_NOMATCH;
+
+       /*
+        * This implementation considers only the search NFA for the topmost regex
+        * tree node.  Therefore, constraints such as backrefs are not fully
+        * applied, which is allowed per the function's API spec.
+        */
+       assert(g->tree != NULL);
+       cnfa = &g->tree->cnfa;
+
+       /*
+        * Since a correct NFA should never contain any exit-free loops, it should
+        * not be possible for our traversal to return to a previously visited
+        * NFA state.  Hence we need at most nstates chrs in the output string.
+        */
+       *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
+       if (*string == NULL)
+               return REG_ESPACE;
+
+       /* do it; findprefix appends chrs to *string and counts them in *slength */
+       st = findprefix(cnfa, &g->cmap, *string, slength);
+
+       assert(*slength <= cnfa->nstates);      /* buffer holds nstates chrs */
+
+       /* nothing to hand back: release the buffer and reset both outputs */
+       if (st != REG_PREFIX && st != REG_EXACT)
+       {
+               FREE(*string);
+               *string = NULL;
+               *slength = 0;
+       }
+
+       return st;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * Chrs are stored into the preallocated array string[], incrementing *slength
+ * (caller must preset it to 0).  Returns REG_NOMATCH, REG_PREFIX or REG_EXACT.
+ */
+static int                                             /* regprefix return code */
+findprefix(struct cnfa * cnfa,
+                  struct colormap * cm,
+                  chr *string,
+                  size_t *slength)
+{
+       int                     st;
+       int                     nextst;
+       color           thiscolor;
+       chr                     c;
+       struct carc *ca;
+
+       /*
+        * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+        * anchored left.  If we have both BOS and BOL, they must go to the
+        * same next state.
+        */
+       st = cnfa->pre;
+       nextst = -1;                            /* -1: no successor state chosen yet */
+       for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+       {
+               if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+               {
+                       if (nextst == -1)
+                               nextst = ca->to;
+                       else if (nextst != ca->to)
+                               return REG_NOMATCH;
+               }
+               else
+                       return REG_NOMATCH;
+       }
+       if (nextst == -1)
+               return REG_NOMATCH;
+
+       /*
+        * Scan through successive states, stopping as soon as we find one with
+        * more than one acceptable transition character (either multiple colors
+        * on out-arcs, or a color with more than one member chr).
+        *
+        * We could find a state with multiple out-arcs that are all labeled with
+        * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+        * In that case we add the chr "c" to the output string but then exit the
+        * loop with nextst == -1.  This leaves a little bit on the table: if the
+        * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+        * to the prefix.  But chasing multiple parallel state chains doesn't seem
+        * worth the trouble.
+        */
+       do
+       {
+               st = nextst;
+               nextst = -1;
+               thiscolor = COLORLESS;
+               for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+               {
+                       /* We ignore lookahead constraints */
+                       if (ca->co >= cnfa->ncolors)
+                               continue;
+                       /* We can also ignore BOS/BOL arcs */
+                       if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+                               continue;
+                       /* ... but EOS/EOL arcs terminate the search */
+                       if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+                       {
+                               thiscolor = COLORLESS;
+                               break;
+                       }
+                       if (thiscolor == COLORLESS)
+                       {
+                               /* First plain outarc */
+                               thiscolor = ca->co;
+                               nextst = ca->to;
+                       }
+                       else if (thiscolor == ca->co)
+                       {
+                               /* Another plain outarc for same color */
+                               nextst = -1;
+                       }
+                       else
+                       {
+                               /* More than one plain outarc color terminates the search */
+                               thiscolor = COLORLESS;
+                               break;
+                       }
+               }
+               /* Done if we didn't find exactly one color on plain outarcs */
+               if (thiscolor == COLORLESS)
+                       break;
+               /* The color must be a singleton */
+               if (cm->cd[thiscolor].nchrs != 1)
+                       break;
+
+               /*
+                * Identify the color's sole member chr and add it to the prefix
+                * string.  In general the colormap data structure doesn't provide a
+                * way to find color member chrs, except by trying GETCOLOR() on each
+                * possible chr value, which won't do at all.  However, for the cases
+                * we care about it should be sufficient to test the "firstchr" value,
+                * that is the first chr ever added to the color.  There are cases
+                * where this might no longer be a member of the color (so we do need
+                * to test), but none of them are likely to arise for a character that
+                * is a member of a common prefix.  If we do hit such a corner case,
+                * we just fall out without adding anything to the prefix string.
+                */
+               c = cm->cd[thiscolor].firstchr;
+               if (GETCOLOR(cm, c) != thiscolor)
+                       break;
+
+               string[(*slength)++] = c;
+
+               /* Advance to next state, but only if we have a unique next state */
+       } while (nextst != -1);
+
+       /*
+        * If we ended at a state that only has EOS/EOL outarcs leading to the
+        * "post" state, then we have an exact-match string.  Note this is true
+        * even if the string is of zero length.
+        */
+       nextst = -1;                            /* reused: unique EOS/EOL arc target, if any */
+       for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+       {
+               if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+               {
+                       if (nextst == -1)
+                               nextst = ca->to;
+                       else if (nextst != ca->to)
+                       {
+                               nextst = -1;
+                               break;
+                       }
+               }
+               else
+               {
+                       nextst = -1;
+                       break;
+               }
+       }
+       if (nextst == cnfa->post)
+               return REG_EXACT;
+
+       /*
+        * Otherwise, if we were unable to identify any prefix characters, say
+        * NOMATCH --- the pattern is anchored left, but doesn't specify any
+        * particular first character.
+        */
+       if (*slength > 0)
+               return REG_PREFIX;
+
+       return REG_NOMATCH;
+}
index 0dbbd6715c98b2c8b8ba598e73cd72915bb70036..c590f7e950ec6047fbebbf67b6d8e61e875508c8 100644 (file)
@@ -1170,3 +1170,68 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
                                                                   Int32GetDatum(startpos + 1));
        }
 }
+
+/*
+ * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
+ *
+ * Returns NULL if there is no fixed prefix, else a palloc'd string in the
+ * database encoding.  *exact is set TRUE if it is an exact match, else FALSE.
+ */
+char *
+regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
+                                       bool *exact)
+{
+       char       *result;
+       regex_t    *re;
+       int                     cflags;
+       int                     re_result;
+       pg_wchar   *str;
+       size_t          slen;
+       size_t          maxlen;
+       char            errMsg[100];
+
+       *exact = false;                         /* default result */
+
+       /* Compile RE */
+       cflags = REG_ADVANCED;
+       if (case_insensitive)
+               cflags |= REG_ICASE;
+
+       re = RE_compile_and_cache(text_re, cflags, collation);
+
+       /* Examine it to see if there's a fixed prefix */
+       re_result = pg_regprefix(re, &str, &slen);
+
+       switch (re_result)
+       {
+               case REG_NOMATCH:
+                       return NULL;
+
+               case REG_PREFIX:
+                       /* continue with wchar conversion */
+                       break;
+
+               case REG_EXACT:
+                       *exact = true;
+                       /* continue with wchar conversion */
+                       break;
+
+               default:
+                       /* pg_regprefix returned an error code; report it */
+                       pg_regerror(re_result, re, errMsg, sizeof(errMsg));
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+                                        errmsg("regular expression failed: %s", errMsg)));
+                       break;
+       }
+
+       /* Convert pg_wchar result back to database encoding */
+       maxlen = pg_database_encoding_max_length() * slen + 1;
+       result = (char *) palloc(maxlen);
+       slen = pg_wchar2mb_with_len(str, result, slen);
+       Assert(slen < maxlen);
+
+       free(str);                                      /* pg_regprefix's buffer is malloc'd, not palloc'd */
+
+       return result;
+}
index 0c52a6ed5503bcfcf5aae23d818ac7cda66d3f91..abeef769baa3c920b2e30f843e237dd246798625 100644 (file)
@@ -189,7 +189,8 @@ static Selectivity prefix_selectivity(PlannerInfo *root,
 static Selectivity like_selectivity(const char *patt, int pattlen,
                                                                        bool case_insensitive);
 static Selectivity regex_selectivity(const char *patt, int pattlen,
-                                                                        bool case_insensitive);
+                                                                        bool case_insensitive,
+                                                                        int fixed_prefix_len);
 static Datum string_to_datum(const char *str, Oid datatype);
 static Const *string_to_const(const char *str, Oid datatype);
 static Const *string_to_bytea_const(const char *str, size_t str_len);
@@ -5013,18 +5014,9 @@ static Pattern_Prefix_Status
 regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
                                   Const **prefix_const, Selectivity *rest_selec)
 {
-       char       *match;
-       int                     pos,
-                               match_pos,
-                               prev_pos,
-                               prev_match_pos;
-       bool            have_leading_paren;
-       char       *patt;
-       char       *rest;
        Oid                     typeid = patt_const->consttype;
-       bool            is_multibyte = (pg_database_encoding_max_length() > 1);
-       pg_locale_t locale = 0;
-       bool            locale_is_c = false;
+       char       *prefix;
+       bool            exact;
 
        /*
         * Should be unnecessary, there are no bytea regex operators defined. As
@@ -5036,185 +5028,54 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("regular-expression matching not supported on type bytea")));
 
-       if (case_insensitive)
-       {
-               /* If case-insensitive, we need locale info */
-               if (lc_ctype_is_c(collation))
-                       locale_is_c = true;
-               else if (collation != DEFAULT_COLLATION_OID)
-               {
-                       if (!OidIsValid(collation))
-                       {
-                               /*
-                                * This typically means that the parser could not resolve a
-                                * conflict of implicit collations, so report it that way.
-                                */
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_INDETERMINATE_COLLATION),
-                                                errmsg("could not determine which collation to use for regular expression"),
-                                                errhint("Use the COLLATE clause to set the collation explicitly.")));
-                       }
-                       locale = pg_newlocale_from_collation(collation);
-               }
-       }
-
-       /* the right-hand const is type text for all of these */
-       patt = TextDatumGetCString(patt_const->constvalue);
-
-       /*
-        * Check for ARE director prefix.  It's worth our trouble to recognize
-        * this because similar_escape() used to use it, and some other code might
-        * still use it, to force ARE mode.
-        */
-       pos = 0;
-       if (strncmp(patt, "***:", 4) == 0)
-               pos = 4;
+       /* Use the regexp machinery to extract the prefix, if any */
+       prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
+                                                                case_insensitive, collation,
+                                                                &exact);
 
-       /* Pattern must be anchored left */
-       if (patt[pos] != '^')
+       if (prefix == NULL)
        {
                *prefix_const = NULL;
 
                if (rest_selec != NULL)
-                       *rest_selec = regex_selectivity(patt, strlen(patt),
-                                                                                       case_insensitive);
-
-               return Pattern_Prefix_None;
-       }
-       pos++;
-
-       /*
-        * If '|' is present in pattern, then there may be multiple alternatives
-        * for the start of the string.  (There are cases where this isn't so, for
-        * instance if the '|' is inside parens, but detecting that reliably is
-        * too hard.)
-        */
-       if (strchr(patt + pos, '|') != NULL)
-       {
-               *prefix_const = NULL;
+               {
+                       char   *patt = TextDatumGetCString(patt_const->constvalue);
 
-               if (rest_selec != NULL)
                        *rest_selec = regex_selectivity(patt, strlen(patt),
-                                                                                       case_insensitive);
+                                                                                       case_insensitive,
+                                                                                       0);
+                       pfree(patt);
+               }
 
                return Pattern_Prefix_None;
        }
 
-       /* OK, allocate space for pattern */
-       match = palloc(strlen(patt) + 1);
-       prev_match_pos = match_pos = 0;
+       *prefix_const = string_to_const(prefix, typeid);
 
-       /*
-        * We special-case the syntax '^(...)$' because psql uses it.  But beware:
-        * sequences beginning "(?" are not what they seem, unless they're "(?:".
-        * (We must recognize that because of similar_escape().)
-        */
-       have_leading_paren = false;
-       if (patt[pos] == '(' &&
-               (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
-       {
-               have_leading_paren = true;
-               pos += (patt[pos + 1] != '?' ? 1 : 3);
-       }
-
-       /* Scan remainder of pattern */
-       prev_pos = pos;
-       while (patt[pos])
+       if (rest_selec != NULL)
        {
-               int                     len;
-
-               /*
-                * Check for characters that indicate multiple possible matches here.
-                * Also, drop out at ')' or '$' so the termination test works right.
-                */
-               if (patt[pos] == '.' ||
-                       patt[pos] == '(' ||
-                       patt[pos] == ')' ||
-                       patt[pos] == '[' ||
-                       patt[pos] == '^' ||
-                       patt[pos] == '$')
-                       break;
-
-               /* Stop if case-varying character (it's sort of a wildcard) */
-               if (case_insensitive &&
-                 pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
-                       break;
-
-               /*
-                * Check for quantifiers.  Except for +, this means the preceding
-                * character is optional, so we must remove it from the prefix too!
-                */
-               if (patt[pos] == '*' ||
-                       patt[pos] == '?' ||
-                       patt[pos] == '{')
+               if (exact)
                {
-                       match_pos = prev_match_pos;
-                       pos = prev_pos;
-                       break;
+                       /* Exact match, so there's no additional selectivity */
+                       *rest_selec = 1.0;
                }
-               if (patt[pos] == '+')
+               else
                {
-                       pos = prev_pos;
-                       break;
-               }
+                       char   *patt = TextDatumGetCString(patt_const->constvalue);
 
-               /*
-                * Normally, backslash quotes the next character.  But in AREs,
-                * backslash followed by alphanumeric is an escape, not a quoted
-                * character.  Must treat it as having multiple possible matches.
-                * Note: since only ASCII alphanumerics are escapes, we don't have to
-                * be paranoid about multibyte or collations here.
-                */
-               if (patt[pos] == '\\')
-               {
-                       if (isalnum((unsigned char) patt[pos + 1]))
-                               break;
-                       pos++;
-                       if (patt[pos] == '\0')
-                               break;
+                       *rest_selec = regex_selectivity(patt, strlen(patt),
+                                                                                       case_insensitive,
+                                                                                       strlen(prefix));
+                       pfree(patt);
                }
-               /* save position in case we need to back up on next loop cycle */
-               prev_match_pos = match_pos;
-               prev_pos = pos;
-               /* must use encoding-aware processing here */
-               len = pg_mblen(&patt[pos]);
-               memcpy(&match[match_pos], &patt[pos], len);
-               match_pos += len;
-               pos += len;
        }
 
-       match[match_pos] = '\0';
-       rest = &patt[pos];
-
-       if (have_leading_paren && patt[pos] == ')')
-               pos++;
-
-       if (patt[pos] == '$' && patt[pos + 1] == '\0')
-       {
-               *prefix_const = string_to_const(match, typeid);
-
-               if (rest_selec != NULL)
-                       *rest_selec = 1.0;
-
-               pfree(patt);
-               pfree(match);
+       pfree(prefix);
 
+       if (exact)
                return Pattern_Prefix_Exact;    /* pattern specifies exact match */
-       }
-
-       *prefix_const = string_to_const(match, typeid);
-
-       if (rest_selec != NULL)
-               *rest_selec = regex_selectivity(rest, strlen(rest),
-                                                                               case_insensitive);
-
-       pfree(patt);
-       pfree(match);
-
-       if (match_pos > 0)
+       else
                return Pattern_Prefix_Partial;
-
-       return Pattern_Prefix_None;
 }
 
 Pattern_Prefix_Status
@@ -5499,7 +5360,8 @@ regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
 }
 
 static Selectivity
-regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
+regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
+                                 int fixed_prefix_len)
 {
        Selectivity sel;
 
@@ -5515,9 +5377,14 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
                /* no trailing $ */
                sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
                sel *= FULL_WILDCARD_SEL;
-               if (sel > 1.0)
-                       sel = 1.0;
        }
+
+       /* If there's a fixed prefix, discount its selectivity */
+       if (fixed_prefix_len > 0)
+               sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
+
+       /* Make sure result stays in range */
+       CLAMP_PROBABILITY(sel);
        return sel;
 }
 
index cec4b837cd15665da8e800ac1729245c21f1b22d..616c2c6450d80443361ca7e4660f52aaa266c9ad 100644 (file)
@@ -156,6 +156,9 @@ typedef struct
 /* two specials for debugging and testing */
 #define REG_ATOI       101                     /* convert error-code name to number */
 #define REG_ITOA       102                     /* convert error-code number to name */
+/* non-error result codes for pg_regprefix */
+#define REG_PREFIX     (-1)            /* identified a common prefix */
+#define REG_EXACT      (-2)            /* identified an exact match */
 
 
 
@@ -164,6 +167,7 @@ typedef struct
  */
 extern int     pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
 extern int     pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
+extern int     pg_regprefix(regex_t *, pg_wchar **, size_t *);
 extern void pg_regfree(regex_t *);
 extern size_t pg_regerror(int, const regex_t *, char *, size_t);
 extern void pg_set_regex_collation(Oid collation);
index 0cced701dbdc84578ff3d8df2dcbd91a65107064..81442a231e37436cfac33cb3bc9802a6e2b9cdae 100644 (file)
@@ -188,6 +188,7 @@ struct colordesc
        color           sub;                    /* open subcolor (if any); free chain ptr */
 #define  NOSUB  COLORLESS
        struct arc *arcs;                       /* color chain */
+       chr                     firstchr;               /* char first assigned to this color */
        int                     flags;
 #define  FREECOL 01                            /* currently free */
 #define  PSEUDO  02                            /* pseudocolor, no real chars */
@@ -255,15 +256,14 @@ struct state;
 
 struct arc
 {
-       int                     type;
-#define  ARCFREE '\0'
+       int                     type;                   /* 0 if free, else an NFA arc type code */
        color           co;
        struct state *from;                     /* where it's from (and contained within) */
        struct state *to;                       /* where it's to */
-       struct arc *outchain;           /* *from's outs chain or free chain */
+       struct arc *outchain;           /* link in *from's outs chain or free chain */
 #define  freechain      outchain
-       struct arc *inchain;            /* *to's ins chain */
-       struct arc *colorchain;         /* color's arc chain */
+       struct arc *inchain;            /* link in *to's ins chain */
+       struct arc *colorchain;         /* link in color's arc chain */
        struct arc *colorchainRev;      /* back-link in color's arc chain */
 };
 
@@ -315,24 +315,38 @@ struct nfa
 
 /*
  * definitions for compacted NFA
+ *
+ * The main space savings in a compacted NFA is from making the arcs as small
+ * as possible.  We store only the transition color and next-state number for
+ * each arc.  The list of out arcs for each state is an array beginning at
+ * cnfa.states[statenumber], and terminated by a dummy carc struct with
+ * co == COLORLESS.
+ *
+ * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
+ * Plain arcs just store the transition color number as "co".  LACON arcs
+ * store the lookahead constraint number plus cnfa.ncolors as "co".  LACON
+ * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
  */
 struct carc
 {
        color           co;                             /* COLORLESS is list terminator */
-       int                     to;                             /* state number */
+       int                     to;                             /* next-state number */
 };
 
 struct cnfa
 {
        int                     nstates;                /* number of states */
-       int                     ncolors;                /* number of colors */
+       int                     ncolors;                /* number of colors (max color in use + 1) */
        int                     flags;
-#define  HASLACONS      01                     /* uses lookahead constraints */
+#define  HASLACONS     01                      /* uses lookahead constraints */
        int                     pre;                    /* setup state number */
        int                     post;                   /* teardown state number */
        color           bos[2];                 /* colors, if any, assigned to BOS and BOL */
        color           eos[2];                 /* colors, if any, assigned to EOS and EOL */
+       char       *stflags;            /* vector of per-state flags bytes */
+#define  CNFA_NOPROGRESS       01      /* flag bit for a no-progress state */
        struct carc **states;           /* vector of pointers to outarc lists */
+       /* states[n] are pointers into a single malloc'd array of arcs */
        struct carc *arcs;                      /* the area for the lists */
 };
 
index 14215db1b4f04faa85c95f43eb5cbdf1c5baf590..82804dbe8527caa37fc367b39d54ba55ef5643af 100644 (file)
@@ -544,6 +544,8 @@ extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
 extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
 extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
 extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS);
+extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
+                                                                Oid collation, bool *exact);
 
 /* regproc.c */
 extern Datum regprocin(PG_FUNCTION_ARGS);