From: Tom Lane Date: Tue, 10 Jul 2012 22:00:44 +0000 (-0400) Subject: Back-patch fix for extraction of fixed prefixes from regular expressions. X-Git-Tag: REL9_1_5~31 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a9287de1760450e7fe3b4309ee1ba7ea2af39217;p=postgresql Back-patch fix for extraction of fixed prefixes from regular expressions. Back-patch of commits 628cbb50ba80c83917b07a7609ddec12cda172d0 and c6aae3042be5249e672b731ebeb21875b5343010. This has been broken since 7.3, so back-patch to all supported branches. --- diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile index 21e7fa5329..74a4c0c89d 100644 --- a/src/backend/regex/Makefile +++ b/src/backend/regex/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/regex top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = regcomp.o regerror.o regexec.o regfree.o +OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c index 2aeb861d97..1c60566fbf 100644 --- a/src/backend/regex/regc_color.c +++ b/src/backend/regex/regc_color.c @@ -66,8 +66,9 @@ initcm(struct vars * v, cd = cm->cd; /* cm->cd[WHITE] */ cd->sub = NOSUB; cd->arcs = NULL; - cd->flags = 0; + cd->firstchr = CHR_MIN; cd->nchrs = CHR_MAX - CHR_MIN + 1; + cd->flags = 0; /* upper levels of tree */ for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--) @@ -272,6 +273,7 @@ newcolor(struct colormap * cm) cd->nchrs = 0; cd->sub = NOSUB; cd->arcs = NULL; + cd->firstchr = CHR_MIN; /* in case never set otherwise */ cd->flags = 0; cd->block = NULL; @@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c) if (co == sco) /* already in an open subcolor */ return co; /* rest is redundant */ cm->cd[co].nchrs--; + if (cm->cd[sco].nchrs == 0) + cm->cd[sco].firstchr = c; cm->cd[sco].nchrs++; setcolor(cm, c, sco); return sco; @@ -438,6 +442,11 @@ subrange(struct vars * v, /* * subblock - allocate new subcolors for one tree block of chrs, fill in arcs + * + * Note: subcolors that are created during execution of this function + * will not be given a useful value of firstchr; it'll be left as CHR_MIN. + * For the current usage of firstchr in pg_regprefix, this does not matter + * because such subcolors won't occur in the common prefix of a regex. */ static void subblock(struct vars * v, diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 66a361ee2f..085842c92b 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -1330,14 +1330,16 @@ compact(struct nfa * nfa, for (s = nfa->states; s != NULL; s = s->next) { nstates++; - narcs += 1 + s->nouts + 1; - /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */ + narcs += s->nouts + 1; /* need one extra for endmarker */ } + cnfa->stflags = (char *) MALLOC(nstates * sizeof(char)); cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *)); cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc)); - if (cnfa->states == NULL || cnfa->arcs == NULL) + if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL) { + if (cnfa->stflags != NULL) + FREE(cnfa->stflags); if (cnfa->states != NULL) FREE(cnfa->states); if (cnfa->arcs != NULL) @@ -1359,9 +1361,8 @@ compact(struct nfa * nfa, for (s = nfa->states; s != NULL; s = s->next) { assert((size_t) s->no < nstates); + cnfa->stflags[s->no] = 0; cnfa->states[s->no] = ca; - ca->co = 0; /* clear and skip flags "arc" */ - ca++; first = ca; for (a = s->outs; a != NULL; a = a->outchain) switch (a->type) @@ -1392,8 +1393,8 @@ compact(struct nfa * nfa, /* mark no-progress states */ for (a = nfa->pre->outs; a != NULL; a = a->outchain) - cnfa->states[a->to->no]->co = 1; - cnfa->states[nfa->pre->no]->co = 1; + cnfa->stflags[a->to->no] = CNFA_NOPROGRESS; + cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS; } /* @@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa) { assert(cnfa->nstates != 0); /* not empty already */ cnfa->nstates = 0; + FREE(cnfa->stflags); FREE(cnfa->states); FREE(cnfa->arcs); } @@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa, fprintf(f, ", haslacons"); fprintf(f, "\n"); for (st = 0; st < cnfa->nstates; st++) - dumpcstate(st, cnfa->states[st], cnfa, f); + dumpcstate(st, cnfa, f); fflush(f); } #endif @@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa, */ static void dumpcstate(int st, - struct carc * ca, struct cnfa * cnfa, FILE *f) { - int i; + struct carc * ca; int pos; - fprintf(f, "%d%s", st, (ca[0].co) ? ":" : "."); + fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : "."); pos = 1; - for (i = 1; ca[i].co != COLORLESS; i++) + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { - if (ca[i].co < cnfa->ncolors) - fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to); + if (ca->co < cnfa->ncolors) + fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to); else - fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors, - ca[i].to); + fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to); if (pos == 5) { fprintf(f, "\n"); @@ -1653,7 +1653,7 @@ dumpcstate(int st, else pos++; } - if (i == 1 || pos != 1) + if (ca == cnfa->states[st] || pos != 1) fprintf(f, "\n"); fflush(f); } diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 4dac8f5d3c..3f7d57c163 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *); static int dumprarcs(struct arc *, struct state *, FILE *, int); static void dumparc(struct arc *, struct state *, FILE *); static void dumpcnfa(struct cnfa *, FILE *); -static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); +static void dumpcstate(int, struct cnfa *, FILE *); #endif /* === regc_cvec.c === */ static struct cvec *newcvec(int, int); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index e521261a57..5efb761d6d 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -458,14 +458,14 @@ miss(struct vars * v, /* used only for debug flags */ gotstate = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(css->states, i)) - for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) if (ca->co == co) { BSET(d->work, ca->to); gotstate = 1; if (ca->to == cnfa->post) ispost = 1; - if (!cnfa->states[ca->to]->co) + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) noprogress = 0; FDEBUG(("%d -> %d\n", i, ca->to)); } @@ -476,10 +476,9 @@ miss(struct vars * v, /* used only for debug flags */ dolacons = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(d->work, i)) - for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; - ca++) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) { - if (ca->co <= cnfa->ncolors) + if (ca->co < cnfa->ncolors) continue; /* NOTE CONTINUE */ sawlacons = 1; if (ISBSET(d->work, ca->to)) @@ -490,7 +489,7 @@ miss(struct vars * v, /* used only for debug flags */ dolacons = 1; if (ca->to == cnfa->post) ispost = 1; - if (!cnfa->states[ca->to]->co) + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) noprogress = 0; FDEBUG(("%d :> %d\n", i, ca->to)); } diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c new file mode 100644 index 0000000000..6f91288e9d --- /dev/null +++ b/src/backend/regex/regprefix.c @@ -0,0 +1,259 @@ +/*------------------------------------------------------------------------- + * + * regprefix.c + * Extract a common prefix, if any, from a compiled regex. + * + * + * Portions Copyright (c) 2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regprefix.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + + +/* + * forward declarations + */ +static int findprefix(struct cnfa * cnfa, struct colormap * cm, + chr *string, size_t *slength); + + +/* + * pg_regprefix - get common prefix for regular expression + * + * Returns one of: + * REG_NOMATCH: there is no common prefix of strings matching the regex + * REG_PREFIX: there is a common prefix of strings matching the regex + * REG_EXACT: all strings satisfying the regex must match the same string + * or a REG_XXX error code + * + * In the non-failure cases, *string is set to a malloc'd string containing + * the common prefix or exact value, of length *slength (measured in chrs + * not bytes!). + * + * This function does not analyze all complex cases (such as lookahead + * constraints) exactly. Therefore it is possible that some strings matching + * the reported prefix or exact-match string do not satisfy the regex. But + * it should never be the case that a string satisfying the regex does not + * match the reported prefix or exact-match string. + */ +int +pg_regprefix(regex_t *re, + chr **string, + size_t *slength) +{ + struct guts *g; + struct cnfa *cnfa; + int st; + + /* sanity checks */ + if (string == NULL || slength == NULL) + return REG_INVARG; + *string = NULL; /* initialize for failure cases */ + *slength = 0; + if (re == NULL || re->re_magic != REMAGIC) + return REG_INVARG; + if (re->re_csize != sizeof(chr)) + return REG_MIXED; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(re->re_collation); + + /* setup */ + g = (struct guts *) re->re_guts; + if (g->info & REG_UIMPOSSIBLE) + return REG_NOMATCH; + + /* + * This implementation considers only the search NFA for the topmost regex + * tree node. Therefore, constraints such as backrefs are not fully + * applied, which is allowed per the function's API spec. + */ + assert(g->tree != NULL); + cnfa = &g->tree->cnfa; + + /* + * Since a correct NFA should never contain any exit-free loops, it should + * not be possible for our traversal to return to a previously visited + * NFA state. Hence we need at most nstates chrs in the output string. + */ + *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); + if (*string == NULL) + return REG_ESPACE; + + /* do it */ + st = findprefix(cnfa, &g->cmap, *string, slength); + + assert(*slength <= cnfa->nstates); + + /* clean up */ + if (st != REG_PREFIX && st != REG_EXACT) + { + FREE(*string); + *string = NULL; + *slength = 0; + } + + return st; +} + +/* + * findprefix - extract common prefix from cNFA + * + * Results are returned into the preallocated chr array string[], with + * *slength (which must be preset to zero) incremented for each chr. + */ +static int /* regprefix return code */ +findprefix(struct cnfa * cnfa, + struct colormap * cm, + chr *string, + size_t *slength) +{ + int st; + int nextst; + color thiscolor; + chr c; + struct carc *ca; + + /* + * The "pre" state must have only BOS/BOL outarcs, else pattern isn't + * anchored left. If we have both BOS and BOL, they must go to the + * same next state. + */ + st = cnfa->pre; + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + return REG_NOMATCH; + } + else + return REG_NOMATCH; + } + if (nextst == -1) + return REG_NOMATCH; + + /* + * Scan through successive states, stopping as soon as we find one with + * more than one acceptable transition character (either multiple colors + * on out-arcs, or a color with more than one member chr). + * + * We could find a state with multiple out-arcs that are all labeled with + * the same singleton color; this comes from patterns like "^ab(cde|cxy)". + * In that case we add the chr "c" to the output string but then exit the + * loop with nextst == -1. This leaves a little bit on the table: if the + * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added + * to the prefix. But chasing multiple parallel state chains doesn't seem + * worth the trouble. + */ + do + { + st = nextst; + nextst = -1; + thiscolor = COLORLESS; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + /* We ignore lookahead constraints */ + if (ca->co >= cnfa->ncolors) + continue; + /* We can also ignore BOS/BOL arcs */ + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + continue; + /* ... but EOS/EOL arcs terminate the search */ + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) + { + thiscolor = COLORLESS; + break; + } + if (thiscolor == COLORLESS) + { + /* First plain outarc */ + thiscolor = ca->co; + nextst = ca->to; + } + else if (thiscolor == ca->co) + { + /* Another plain outarc for same color */ + nextst = -1; + } + else + { + /* More than one plain outarc color terminates the search */ + thiscolor = COLORLESS; + break; + } + } + /* Done if we didn't find exactly one color on plain outarcs */ + if (thiscolor == COLORLESS) + break; + /* The color must be a singleton */ + if (cm->cd[thiscolor].nchrs != 1) + break; + + /* + * Identify the color's sole member chr and add it to the prefix + * string. In general the colormap data structure doesn't provide a + * way to find color member chrs, except by trying GETCOLOR() on each + * possible chr value, which won't do at all. However, for the cases + * we care about it should be sufficient to test the "firstchr" value, + * that is the first chr ever added to the color. There are cases + * where this might no longer be a member of the color (so we do need + * to test), but none of them are likely to arise for a character that + * is a member of a common prefix. If we do hit such a corner case, + * we just fall out without adding anything to the prefix string. + */ + c = cm->cd[thiscolor].firstchr; + if (GETCOLOR(cm, c) != thiscolor) + break; + + string[(*slength)++] = c; + + /* Advance to next state, but only if we have a unique next state */ + } while (nextst != -1); + + /* + * If we ended at a state that only has EOS/EOL outarcs leading to the + * "post" state, then we have an exact-match string. Note this is true + * even if the string is of zero length. + */ + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + { + nextst = -1; + break; + } + } + else + { + nextst = -1; + break; + } + } + if (nextst == cnfa->post) + return REG_EXACT; + + /* + * Otherwise, if we were unable to identify any prefix characters, say + * NOMATCH --- the pattern is anchored left, but doesn't specify any + * particular first character. + */ + if (*slength > 0) + return REG_PREFIX; + + return REG_NOMATCH; +} diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 0dbbd6715c..c590f7e950 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -1170,3 +1170,68 @@ build_regexp_split_result(regexp_matches_ctx *splitctx) Int32GetDatum(startpos + 1)); } } + +/* + * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp + * + * The result is NULL if there is no fixed prefix, else a palloc'd string. + * If it is an exact match, not just a prefix, *exact is returned as TRUE. + */ +char * +regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation, + bool *exact) +{ + char *result; + regex_t *re; + int cflags; + int re_result; + pg_wchar *str; + size_t slen; + size_t maxlen; + char errMsg[100]; + + *exact = false; /* default result */ + + /* Compile RE */ + cflags = REG_ADVANCED; + if (case_insensitive) + cflags |= REG_ICASE; + + re = RE_compile_and_cache(text_re, cflags, collation); + + /* Examine it to see if there's a fixed prefix */ + re_result = pg_regprefix(re, &str, &slen); + + switch (re_result) + { + case REG_NOMATCH: + return NULL; + + case REG_PREFIX: + /* continue with wchar conversion */ + break; + + case REG_EXACT: + *exact = true; + /* continue with wchar conversion */ + break; + + default: + /* re failed??? */ + pg_regerror(re_result, re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("regular expression failed: %s", errMsg))); + break; + } + + /* Convert pg_wchar result back to database encoding */ + maxlen = pg_database_encoding_max_length() * slen + 1; + result = (char *) palloc(maxlen); + slen = pg_wchar2mb_with_len(str, result, slen); + Assert(slen < maxlen); + + free(str); + + return result; +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 0c52a6ed55..abeef769ba 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -189,7 +189,8 @@ static Selectivity prefix_selectivity(PlannerInfo *root, static Selectivity like_selectivity(const char *patt, int pattlen, bool case_insensitive); static Selectivity regex_selectivity(const char *patt, int pattlen, - bool case_insensitive); + bool case_insensitive, + int fixed_prefix_len); static Datum string_to_datum(const char *str, Oid datatype); static Const *string_to_const(const char *str, Oid datatype); static Const *string_to_bytea_const(const char *str, size_t str_len); @@ -5013,18 +5014,9 @@ static Pattern_Prefix_Status regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec) { - char *match; - int pos, - match_pos, - prev_pos, - prev_match_pos; - bool have_leading_paren; - char *patt; - char *rest; Oid typeid = patt_const->consttype; - bool is_multibyte = (pg_database_encoding_max_length() > 1); - pg_locale_t locale = 0; - bool locale_is_c = false; + char *prefix; + bool exact; /* * Should be unnecessary, there are no bytea regex operators defined. As @@ -5036,185 +5028,54 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("regular-expression matching not supported on type bytea"))); - if (case_insensitive) - { - /* If case-insensitive, we need locale info */ - if (lc_ctype_is_c(collation)) - locale_is_c = true; - else if (collation != DEFAULT_COLLATION_OID) - { - if (!OidIsValid(collation)) - { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. - */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for regular expression"), - errhint("Use the COLLATE clause to set the collation explicitly."))); - } - locale = pg_newlocale_from_collation(collation); - } - } - - /* the right-hand const is type text for all of these */ - patt = TextDatumGetCString(patt_const->constvalue); - - /* - * Check for ARE director prefix. It's worth our trouble to recognize - * this because similar_escape() used to use it, and some other code might - * still use it, to force ARE mode. - */ - pos = 0; - if (strncmp(patt, "***:", 4) == 0) - pos = 4; + /* Use the regexp machinery to extract the prefix, if any */ + prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue), + case_insensitive, collation, + &exact); - /* Pattern must be anchored left */ - if (patt[pos] != '^') + if (prefix == NULL) { *prefix_const = NULL; if (rest_selec != NULL) - *rest_selec = regex_selectivity(patt, strlen(patt), - case_insensitive); - - return Pattern_Prefix_None; - } - pos++; - - /* - * If '|' is present in pattern, then there may be multiple alternatives - * for the start of the string. (There are cases where this isn't so, for - * instance if the '|' is inside parens, but detecting that reliably is - * too hard.) - */ - if (strchr(patt + pos, '|') != NULL) - { - *prefix_const = NULL; + { + char *patt = TextDatumGetCString(patt_const->constvalue); - if (rest_selec != NULL) *rest_selec = regex_selectivity(patt, strlen(patt), - case_insensitive); + case_insensitive, + 0); + pfree(patt); + } return Pattern_Prefix_None; } - /* OK, allocate space for pattern */ - match = palloc(strlen(patt) + 1); - prev_match_pos = match_pos = 0; + *prefix_const = string_to_const(prefix, typeid); - /* - * We special-case the syntax '^(...)$' because psql uses it. But beware: - * sequences beginning "(?" are not what they seem, unless they're "(?:". - * (We must recognize that because of similar_escape().) - */ - have_leading_paren = false; - if (patt[pos] == '(' && - (patt[pos + 1] != '?' || patt[pos + 2] == ':')) - { - have_leading_paren = true; - pos += (patt[pos + 1] != '?' ? 1 : 3); - } - - /* Scan remainder of pattern */ - prev_pos = pos; - while (patt[pos]) + if (rest_selec != NULL) { - int len; - - /* - * Check for characters that indicate multiple possible matches here. - * Also, drop out at ')' or '$' so the termination test works right. - */ - if (patt[pos] == '.' || - patt[pos] == '(' || - patt[pos] == ')' || - patt[pos] == '[' || - patt[pos] == '^' || - patt[pos] == '$') - break; - - /* Stop if case-varying character (it's sort of a wildcard) */ - if (case_insensitive && - pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) - break; - - /* - * Check for quantifiers. Except for +, this means the preceding - * character is optional, so we must remove it from the prefix too! - */ - if (patt[pos] == '*' || - patt[pos] == '?' || - patt[pos] == '{') + if (exact) { - match_pos = prev_match_pos; - pos = prev_pos; - break; + /* Exact match, so there's no additional selectivity */ + *rest_selec = 1.0; } - if (patt[pos] == '+') + else { - pos = prev_pos; - break; - } + char *patt = TextDatumGetCString(patt_const->constvalue); - /* - * Normally, backslash quotes the next character. But in AREs, - * backslash followed by alphanumeric is an escape, not a quoted - * character. Must treat it as having multiple possible matches. - * Note: since only ASCII alphanumerics are escapes, we don't have to - * be paranoid about multibyte or collations here. - */ - if (patt[pos] == '\\') - { - if (isalnum((unsigned char) patt[pos + 1])) - break; - pos++; - if (patt[pos] == '\0') - break; + *rest_selec = regex_selectivity(patt, strlen(patt), + case_insensitive, + strlen(prefix)); + pfree(patt); } - /* save position in case we need to back up on next loop cycle */ - prev_match_pos = match_pos; - prev_pos = pos; - /* must use encoding-aware processing here */ - len = pg_mblen(&patt[pos]); - memcpy(&match[match_pos], &patt[pos], len); - match_pos += len; - pos += len; } - match[match_pos] = '\0'; - rest = &patt[pos]; - - if (have_leading_paren && patt[pos] == ')') - pos++; - - if (patt[pos] == '$' && patt[pos + 1] == '\0') - { - *prefix_const = string_to_const(match, typeid); - - if (rest_selec != NULL) - *rest_selec = 1.0; - - pfree(patt); - pfree(match); + pfree(prefix); + if (exact) return Pattern_Prefix_Exact; /* pattern specifies exact match */ - } - - *prefix_const = string_to_const(match, typeid); - - if (rest_selec != NULL) - *rest_selec = regex_selectivity(rest, strlen(rest), - case_insensitive); - - pfree(patt); - pfree(match); - - if (match_pos > 0) + else return Pattern_Prefix_Partial; - - return Pattern_Prefix_None; } Pattern_Prefix_Status @@ -5499,7 +5360,8 @@ regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive) } static Selectivity -regex_selectivity(const char *patt, int pattlen, bool case_insensitive) +regex_selectivity(const char *patt, int pattlen, bool case_insensitive, + int fixed_prefix_len) { Selectivity sel; @@ -5515,9 +5377,14 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive) /* no trailing $ */ sel = regex_selectivity_sub(patt, pattlen, case_insensitive); sel *= FULL_WILDCARD_SEL; - if (sel > 1.0) - sel = 1.0; } + + /* If there's a fixed prefix, discount its selectivity */ + if (fixed_prefix_len > 0) + sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len); + + /* Make sure result stays in range */ + CLAMP_PROBABILITY(sel); return sel; } diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index cec4b837cd..616c2c6450 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -156,6 +156,9 @@ typedef struct /* two specials for debugging and testing */ #define REG_ATOI 101 /* convert error-code name to number */ #define REG_ITOA 102 /* convert error-code number to name */ +/* non-error result codes for pg_regprefix */ +#define REG_PREFIX (-1) /* identified a common prefix */ +#define REG_EXACT (-2) /* identified an exact match */ @@ -164,6 +167,7 @@ typedef struct */ extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid); extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int); +extern int pg_regprefix(regex_t *, pg_wchar **, size_t *); extern void pg_regfree(regex_t *); extern size_t pg_regerror(int, const regex_t *, char *, size_t); extern void pg_set_regex_collation(Oid collation); diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 0cced701db..81442a231e 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -188,6 +188,7 @@ struct colordesc color sub; /* open subcolor (if any); free chain ptr */ #define NOSUB COLORLESS struct arc *arcs; /* color chain */ + chr firstchr; /* char first assigned to this color */ int flags; #define FREECOL 01 /* currently free */ #define PSEUDO 02 /* pseudocolor, no real chars */ @@ -255,15 +256,14 @@ struct state; struct arc { - int type; -#define ARCFREE '\0' + int type; /* 0 if free, else an NFA arc type code */ color co; struct state *from; /* where it's from (and contained within) */ struct state *to; /* where it's to */ - struct arc *outchain; /* *from's outs chain or free chain */ + struct arc *outchain; /* link in *from's outs chain or free chain */ #define freechain outchain - struct arc *inchain; /* *to's ins chain */ - struct arc *colorchain; /* color's arc chain */ + struct arc *inchain; /* link in *to's ins chain */ + struct arc *colorchain; /* link in color's arc chain */ struct arc *colorchainRev; /* back-link in color's arc chain */ }; @@ -315,24 +315,38 @@ struct nfa /* * definitions for compacted NFA + * + * The main space savings in a compacted NFA is from making the arcs as small + * as possible. We store only the transition color and next-state number for + * each arc. The list of out arcs for each state is an array beginning at + * cnfa.states[statenumber], and terminated by a dummy carc struct with + * co == COLORLESS. + * + * The non-dummy carc structs are of two types: plain arcs and LACON arcs. + * Plain arcs just store the transition color number as "co". LACON arcs + * store the lookahead constraint number plus cnfa.ncolors as "co". LACON + * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. */ struct carc { color co; /* COLORLESS is list terminator */ - int to; /* state number */ + int to; /* next-state number */ }; struct cnfa { int nstates; /* number of states */ - int ncolors; /* number of colors */ + int ncolors; /* number of colors (max color in use + 1) */ int flags; -#define HASLACONS 01 /* uses lookahead constraints */ +#define HASLACONS 01 /* uses lookahead constraints */ int pre; /* setup state number */ int post; /* teardown state number */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ color eos[2]; /* colors, if any, assigned to EOS and EOL */ + char *stflags; /* vector of per-state flags bytes */ +#define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */ struct carc **states; /* vector of pointers to outarc lists */ + /* states[n] are pointers into a single malloc'd array of arcs */ struct carc *arcs; /* the area for the lists */ }; diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 14215db1b4..82804dbe85 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -544,6 +544,8 @@ extern Datum regexp_split_to_table(PG_FUNCTION_ARGS); extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS); extern Datum regexp_split_to_array(PG_FUNCTION_ARGS); extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS); +extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive, + Oid collation, bool *exact); /* regproc.c */ extern Datum regprocin(PG_FUNCTION_ARGS);