top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = regcomp.o regerror.o regexec.o regfree.o
+OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
include $(top_srcdir)/src/backend/common.mk
cd = cm->cd; /* cm->cd[WHITE] */
cd->sub = NOSUB;
cd->arcs = NULL;
- cd->flags = 0;
+ cd->firstchr = CHR_MIN;
cd->nchrs = CHR_MAX - CHR_MIN + 1;
+ cd->flags = 0;
/* upper levels of tree */
for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--)
cd->nchrs = 0;
cd->sub = NOSUB;
cd->arcs = NULL;
+ cd->firstchr = CHR_MIN; /* in case never set otherwise */
cd->flags = 0;
cd->block = NULL;
if (co == sco) /* already in an open subcolor */
return co; /* rest is redundant */
cm->cd[co].nchrs--;
+ if (cm->cd[sco].nchrs == 0)
+ cm->cd[sco].firstchr = c;
cm->cd[sco].nchrs++;
setcolor(cm, c, sco);
return sco;
/*
* subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ *
+ * Note: subcolors that are created during execution of this function
+ * will not be given a useful value of firstchr; it'll be left as CHR_MIN.
+ * For the current usage of firstchr in pg_regprefix, this does not matter
+ * because such subcolors won't occur in the common prefix of a regex.
*/
static void
subblock(struct vars * v,
for (s = nfa->states; s != NULL; s = s->next)
{
nstates++;
- narcs += 1 + s->nouts + 1;
- /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+ narcs += s->nouts + 1; /* need one extra for endmarker */
}
+ cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
- if (cnfa->states == NULL || cnfa->arcs == NULL)
+ if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
{
+ if (cnfa->stflags != NULL)
+ FREE(cnfa->stflags);
if (cnfa->states != NULL)
FREE(cnfa->states);
if (cnfa->arcs != NULL)
for (s = nfa->states; s != NULL; s = s->next)
{
assert((size_t) s->no < nstates);
+ cnfa->stflags[s->no] = 0;
cnfa->states[s->no] = ca;
- ca->co = 0; /* clear and skip flags "arc" */
- ca++;
first = ca;
for (a = s->outs; a != NULL; a = a->outchain)
switch (a->type)
/* mark no-progress states */
for (a = nfa->pre->outs; a != NULL; a = a->outchain)
- cnfa->states[a->to->no]->co = 1;
- cnfa->states[nfa->pre->no]->co = 1;
+ cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
+ cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
}
/*
{
assert(cnfa->nstates != 0); /* not empty already */
cnfa->nstates = 0;
+ FREE(cnfa->stflags);
FREE(cnfa->states);
FREE(cnfa->arcs);
}
fprintf(f, ", haslacons");
fprintf(f, "\n");
for (st = 0; st < cnfa->nstates; st++)
- dumpcstate(st, cnfa->states[st], cnfa, f);
+ dumpcstate(st, cnfa, f);
fflush(f);
}
#endif
*/
static void
dumpcstate(int st,
- struct carc * ca,
struct cnfa * cnfa,
FILE *f)
{
- int i;
+ struct carc * ca;
int pos;
- fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+ fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
pos = 1;
- for (i = 1; ca[i].co != COLORLESS; i++)
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
{
- if (ca[i].co < cnfa->ncolors)
- fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
+ if (ca->co < cnfa->ncolors)
+ fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
else
- fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
- ca[i].to);
+ fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
if (pos == 5)
{
fprintf(f, "\n");
else
pos++;
}
- if (i == 1 || pos != 1)
+ if (ca == cnfa->states[st] || pos != 1)
fprintf(f, "\n");
fflush(f);
}
static int dumprarcs(struct arc *, struct state *, FILE *, int);
static void dumparc(struct arc *, struct state *, FILE *);
static void dumpcnfa(struct cnfa *, FILE *);
-static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
#endif
/* === regc_cvec.c === */
static struct cvec *newcvec(int, int);
gotstate = 0;
for (i = 0; i < d->nstates; i++)
if (ISBSET(css->states, i))
- for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
+ for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
if (ca->co == co)
{
BSET(d->work, ca->to);
gotstate = 1;
if (ca->to == cnfa->post)
ispost = 1;
- if (!cnfa->states[ca->to]->co)
+ if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
noprogress = 0;
FDEBUG(("%d -> %d\n", i, ca->to));
}
dolacons = 0;
for (i = 0; i < d->nstates; i++)
if (ISBSET(d->work, i))
- for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
- ca++)
+ for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
{
- if (ca->co <= cnfa->ncolors)
+ if (ca->co < cnfa->ncolors)
continue; /* NOTE CONTINUE */
sawlacons = 1;
if (ISBSET(d->work, ca->to))
dolacons = 1;
if (ca->to == cnfa->post)
ispost = 1;
- if (!cnfa->states[ca->to]->co)
+ if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
noprogress = 0;
FDEBUG(("%d :> %d\n", i, ca->to));
}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ * Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int findprefix(struct cnfa * cnfa, struct colormap * cm,
+ chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - report the prefix shared by all strings a regex can match
+ *
+ * The result is one of:
+ *	REG_NOMATCH: strings matching the regex have no common prefix
+ *	REG_PREFIX: strings matching the regex all start with the returned prefix
+ *	REG_EXACT: only the returned string itself can satisfy the regex
+ *	or a REG_XXX error code (REG_INVARG, REG_MIXED or REG_ESPACE here)
+ *
+ * For REG_PREFIX and REG_EXACT, *string is set to a malloc'd array of chrs
+ * holding the prefix or exact value, and *slength to its length (counted in
+ * chrs, not bytes!).  For every other result *string is NULL and *slength
+ * is zero, provided those two pointers were themselves non-NULL.
+ *
+ * Complex cases (such as lookahead constraints) are not analyzed exactly.
+ * So a string matching the reported prefix or exact-match string might
+ * still fail to satisfy the regex.  The converse must never happen: any
+ * string that satisfies the regex has to match what we report here.
+ */
+int
+pg_regprefix(regex_t *re,
+             chr **string,
+             size_t *slength)
+{
+    struct guts *guts;
+    struct cnfa *search;
+    int         rc;
+
+    /* the output pointers must be usable before we can report anything */
+    if (string == NULL || slength == NULL)
+        return REG_INVARG;
+
+    /* start from the "nothing found" state used by all failure exits */
+    *slength = 0;
+    *string = NULL;
+
+    /* validate the compiled regex itself */
+    if (re == NULL)
+        return REG_INVARG;
+    if (re->re_magic != REMAGIC)
+        return REG_INVARG;
+    if (re->re_csize != sizeof(chr))
+        return REG_MIXED;
+
+    /* Initialize locale-dependent support */
+    pg_set_regex_collation(re->re_collation);
+
+    /* a regex flagged as unable to match anything has no prefix */
+    guts = (struct guts *) re->re_guts;
+    if (guts->info & REG_UIMPOSSIBLE)
+        return REG_NOMATCH;
+
+    /*
+     * Only the search NFA of the topmost regex tree node is examined.
+     * Constraints such as backrefs are therefore not fully applied, which
+     * the API spec above explicitly permits.
+     */
+    assert(guts->tree != NULL);
+    search = &guts->tree->cnfa;
+
+    /*
+     * A correct NFA should never contain an exit-free loop, so the traversal
+     * should not be able to come back to an NFA state it already visited.
+     * That bounds the output at nstates chrs.
+     */
+    *string = (chr *) MALLOC(search->nstates * sizeof(chr));
+    if (*string == NULL)
+        return REG_ESPACE;
+
+    /* walk the NFA, collecting prefix chrs */
+    rc = findprefix(search, &guts->cmap, *string, slength);
+
+    assert(*slength <= search->nstates);
+
+    /* on success the caller takes ownership of the malloc'd array */
+    if (rc == REG_PREFIX || rc == REG_EXACT)
+        return rc;
+
+    /* otherwise release it and restore the failure outputs */
+    FREE(*string);
+    *string = NULL;
+    *slength = 0;
+
+    return rc;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * cnfa is the compacted NFA to walk; cm is the colormap used to turn an arc
+ * color back into its member chr.
+ *
+ * Results are returned into the preallocated chr array string[], with
+ * *slength (which must be preset to zero) incremented for each chr.
+ *
+ * Returns REG_NOMATCH, REG_PREFIX or REG_EXACT.
+ */
+static int                      /* regprefix return code */
+findprefix(struct cnfa * cnfa,
+           struct colormap * cm,
+           chr *string,
+           size_t *slength)
+{
+    int         st;
+    int         nextst;
+    color       thiscolor;
+    chr         c;
+    struct carc *ca;
+
+    /*
+     * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+     * anchored left.  If we have both BOS and BOL, they must go to the
+     * same next state.
+     */
+    st = cnfa->pre;
+    nextst = -1;
+    for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+    {
+        if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+        {
+            if (nextst == -1)
+                nextst = ca->to;
+            else if (nextst != ca->to)
+                return REG_NOMATCH;
+        }
+        else
+            return REG_NOMATCH;
+    }
+    if (nextst == -1)
+        return REG_NOMATCH;
+
+    /*
+     * Scan through successive states, stopping as soon as we find one with
+     * more than one acceptable transition character (either multiple colors
+     * on out-arcs, or a color with more than one member chr).
+     *
+     * We could find a state with multiple out-arcs that are all labeled with
+     * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+     * In that case we add the chr "c" to the output string but then exit the
+     * loop with nextst == -1.  This leaves a little bit on the table: if the
+     * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+     * to the prefix.  But chasing multiple parallel state chains doesn't seem
+     * worth the trouble.
+     */
+    do
+    {
+        st = nextst;
+        nextst = -1;
+        thiscolor = COLORLESS;
+        for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+        {
+            /*
+             * A lookahead-constraint arc is another way out of this state,
+             * and we do not follow it.  If we merely skipped it, we would go
+             * on extending the prefix along the plain arcs even though a
+             * string matching by way of the LACON arc need not contain those
+             * chrs.  So a LACON arc ends the search at this state.
+             */
+            if (ca->co >= cnfa->ncolors)
+            {
+                thiscolor = COLORLESS;
+                break;
+            }
+            /* We can ignore BOS/BOL arcs */
+            if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+                continue;
+            /* ... but EOS/EOL arcs terminate the search */
+            if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+            {
+                thiscolor = COLORLESS;
+                break;
+            }
+            if (thiscolor == COLORLESS)
+            {
+                /* First plain outarc */
+                thiscolor = ca->co;
+                nextst = ca->to;
+            }
+            else if (thiscolor == ca->co)
+            {
+                /* Another plain outarc for same color */
+                nextst = -1;
+            }
+            else
+            {
+                /* More than one plain outarc color terminates the search */
+                thiscolor = COLORLESS;
+                break;
+            }
+        }
+        /* Done if we didn't find exactly one color on plain outarcs */
+        if (thiscolor == COLORLESS)
+            break;
+        /* The color must be a singleton */
+        if (cm->cd[thiscolor].nchrs != 1)
+            break;
+
+        /*
+         * Identify the color's sole member chr and add it to the prefix
+         * string.  In general the colormap data structure doesn't provide a
+         * way to find color member chrs, except by trying GETCOLOR() on each
+         * possible chr value, which won't do at all.  However, for the cases
+         * we care about it should be sufficient to test the "firstchr" value,
+         * that is the first chr ever added to the color.  There are cases
+         * where this might no longer be a member of the color (so we do need
+         * to test), but none of them are likely to arise for a character that
+         * is a member of a common prefix.  If we do hit such a corner case,
+         * we just fall out without adding anything to the prefix string.
+         */
+        c = cm->cd[thiscolor].firstchr;
+        if (GETCOLOR(cm, c) != thiscolor)
+            break;
+
+        string[(*slength)++] = c;
+
+        /* Advance to next state, but only if we have a unique next state */
+    } while (nextst != -1);
+
+    /*
+     * If we ended at a state that only has EOS/EOL outarcs leading to the
+     * "post" state, then we have an exact-match string.  Note this is true
+     * even if the string is of zero length.  Any other kind of outarc
+     * (plain or lookahead constraint) on that state rules out an exact match.
+     */
+    nextst = -1;
+    for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+    {
+        if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+        {
+            if (nextst == -1)
+                nextst = ca->to;
+            else if (nextst != ca->to)
+            {
+                nextst = -1;
+                break;
+            }
+        }
+        else
+        {
+            nextst = -1;
+            break;
+        }
+    }
+    if (nextst == cnfa->post)
+        return REG_EXACT;
+
+    /*
+     * Otherwise, if we were unable to identify any prefix characters, say
+     * NOMATCH --- the pattern is anchored left, but doesn't specify any
+     * particular first character.
+     */
+    if (*slength > 0)
+        return REG_PREFIX;
+
+    return REG_NOMATCH;
+}
Int32GetDatum(startpos + 1));
}
}
+
+/*
+ * regexp_fixed_prefix - find the fixed prefix, if any, of a regexp
+ *
+ * Returns NULL when the regexp has no fixed prefix, otherwise a palloc'd
+ * string converted back to the database encoding.  *exact is set to true
+ * when the regexp machinery reports an exact match rather than just a
+ * prefix, and to false in every other case.
+ */
+char *
+regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
+                    bool *exact)
+{
+    regex_t    *re;
+    pg_wchar   *wprefix;
+    size_t      wlen;
+    size_t      bufsize;
+    size_t      nbytes;
+    char       *result;
+    int         status;
+    int         cflags = REG_ADVANCED;
+
+    /* assume "not exact" until the regex machinery says otherwise */
+    *exact = false;
+
+    /* Compile RE, folding case if the caller asked for that */
+    if (case_insensitive)
+        cflags |= REG_ICASE;
+    re = RE_compile_and_cache(text_re, cflags, collation);
+
+    /* Ask the regex machinery whether there's a fixed prefix */
+    status = pg_regprefix(re, &wprefix, &wlen);
+
+    if (status == REG_NOMATCH)
+        return NULL;
+
+    if (status == REG_EXACT)
+        *exact = true;
+    else if (status != REG_PREFIX)
+    {
+        /* re failed??? */
+        char        errMsg[100];
+
+        pg_regerror(status, re, errMsg, sizeof(errMsg));
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+                 errmsg("regular expression failed: %s", errMsg)));
+    }
+
+    /*
+     * Convert pg_wchar result back to database encoding.  The buffer is
+     * sized for the maximum encoded length of each chr plus one extra byte.
+     */
+    bufsize = pg_database_encoding_max_length() * wlen + 1;
+    result = (char *) palloc(bufsize);
+    nbytes = pg_wchar2mb_with_len(wprefix, result, wlen);
+    Assert(nbytes < bufsize);
+
+    /* the wide-char array came from pg_regprefix, so free() it, not pfree() */
+    free(wprefix);
+
+    return result;
+}
static Selectivity like_selectivity(const char *patt, int pattlen,
bool case_insensitive);
static Selectivity regex_selectivity(const char *patt, int pattlen,
- bool case_insensitive);
+ bool case_insensitive,
+ int fixed_prefix_len);
static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype);
static Const *string_to_bytea_const(const char *str, size_t str_len);
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
Const **prefix_const, Selectivity *rest_selec)
{
- char *match;
- int pos,
- match_pos,
- prev_pos,
- prev_match_pos;
- bool have_leading_paren;
- char *patt;
- char *rest;
Oid typeid = patt_const->consttype;
- bool is_multibyte = (pg_database_encoding_max_length() > 1);
- pg_locale_t locale = 0;
- bool locale_is_c = false;
+ char *prefix;
+ bool exact;
/*
* Should be unnecessary, there are no bytea regex operators defined. As
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("regular-expression matching not supported on type bytea")));
- if (case_insensitive)
- {
- /* If case-insensitive, we need locale info */
- if (lc_ctype_is_c(collation))
- locale_is_c = true;
- else if (collation != DEFAULT_COLLATION_OID)
- {
- if (!OidIsValid(collation))
- {
- /*
- * This typically means that the parser could not resolve a
- * conflict of implicit collations, so report it that way.
- */
- ereport(ERROR,
- (errcode(ERRCODE_INDETERMINATE_COLLATION),
- errmsg("could not determine which collation to use for regular expression"),
- errhint("Use the COLLATE clause to set the collation explicitly.")));
- }
- locale = pg_newlocale_from_collation(collation);
- }
- }
-
- /* the right-hand const is type text for all of these */
- patt = TextDatumGetCString(patt_const->constvalue);
-
- /*
- * Check for ARE director prefix. It's worth our trouble to recognize
- * this because similar_escape() used to use it, and some other code might
- * still use it, to force ARE mode.
- */
- pos = 0;
- if (strncmp(patt, "***:", 4) == 0)
- pos = 4;
+ /* Use the regexp machinery to extract the prefix, if any */
+ prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
+ case_insensitive, collation,
+ &exact);
- /* Pattern must be anchored left */
- if (patt[pos] != '^')
+ if (prefix == NULL)
{
*prefix_const = NULL;
if (rest_selec != NULL)
- *rest_selec = regex_selectivity(patt, strlen(patt),
- case_insensitive);
-
- return Pattern_Prefix_None;
- }
- pos++;
-
- /*
- * If '|' is present in pattern, then there may be multiple alternatives
- * for the start of the string. (There are cases where this isn't so, for
- * instance if the '|' is inside parens, but detecting that reliably is
- * too hard.)
- */
- if (strchr(patt + pos, '|') != NULL)
- {
- *prefix_const = NULL;
+ {
+ char *patt = TextDatumGetCString(patt_const->constvalue);
- if (rest_selec != NULL)
*rest_selec = regex_selectivity(patt, strlen(patt),
- case_insensitive);
+ case_insensitive,
+ 0);
+ pfree(patt);
+ }
return Pattern_Prefix_None;
}
- /* OK, allocate space for pattern */
- match = palloc(strlen(patt) + 1);
- prev_match_pos = match_pos = 0;
+ *prefix_const = string_to_const(prefix, typeid);
- /*
- * We special-case the syntax '^(...)$' because psql uses it. But beware:
- * sequences beginning "(?" are not what they seem, unless they're "(?:".
- * (We must recognize that because of similar_escape().)
- */
- have_leading_paren = false;
- if (patt[pos] == '(' &&
- (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
- {
- have_leading_paren = true;
- pos += (patt[pos + 1] != '?' ? 1 : 3);
- }
-
- /* Scan remainder of pattern */
- prev_pos = pos;
- while (patt[pos])
+ if (rest_selec != NULL)
{
- int len;
-
- /*
- * Check for characters that indicate multiple possible matches here.
- * Also, drop out at ')' or '$' so the termination test works right.
- */
- if (patt[pos] == '.' ||
- patt[pos] == '(' ||
- patt[pos] == ')' ||
- patt[pos] == '[' ||
- patt[pos] == '^' ||
- patt[pos] == '$')
- break;
-
- /* Stop if case-varying character (it's sort of a wildcard) */
- if (case_insensitive &&
- pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
- break;
-
- /*
- * Check for quantifiers. Except for +, this means the preceding
- * character is optional, so we must remove it from the prefix too!
- */
- if (patt[pos] == '*' ||
- patt[pos] == '?' ||
- patt[pos] == '{')
+ if (exact)
{
- match_pos = prev_match_pos;
- pos = prev_pos;
- break;
+ /* Exact match, so there's no additional selectivity */
+ *rest_selec = 1.0;
}
- if (patt[pos] == '+')
+ else
{
- pos = prev_pos;
- break;
- }
+ char *patt = TextDatumGetCString(patt_const->constvalue);
- /*
- * Normally, backslash quotes the next character. But in AREs,
- * backslash followed by alphanumeric is an escape, not a quoted
- * character. Must treat it as having multiple possible matches.
- * Note: since only ASCII alphanumerics are escapes, we don't have to
- * be paranoid about multibyte or collations here.
- */
- if (patt[pos] == '\\')
- {
- if (isalnum((unsigned char) patt[pos + 1]))
- break;
- pos++;
- if (patt[pos] == '\0')
- break;
+ *rest_selec = regex_selectivity(patt, strlen(patt),
+ case_insensitive,
+ strlen(prefix));
+ pfree(patt);
}
- /* save position in case we need to back up on next loop cycle */
- prev_match_pos = match_pos;
- prev_pos = pos;
- /* must use encoding-aware processing here */
- len = pg_mblen(&patt[pos]);
- memcpy(&match[match_pos], &patt[pos], len);
- match_pos += len;
- pos += len;
}
- match[match_pos] = '\0';
- rest = &patt[pos];
-
- if (have_leading_paren && patt[pos] == ')')
- pos++;
-
- if (patt[pos] == '$' && patt[pos + 1] == '\0')
- {
- *prefix_const = string_to_const(match, typeid);
-
- if (rest_selec != NULL)
- *rest_selec = 1.0;
-
- pfree(patt);
- pfree(match);
+ pfree(prefix);
+ if (exact)
return Pattern_Prefix_Exact; /* pattern specifies exact match */
- }
-
- *prefix_const = string_to_const(match, typeid);
-
- if (rest_selec != NULL)
- *rest_selec = regex_selectivity(rest, strlen(rest),
- case_insensitive);
-
- pfree(patt);
- pfree(match);
-
- if (match_pos > 0)
+ else
return Pattern_Prefix_Partial;
-
- return Pattern_Prefix_None;
}
Pattern_Prefix_Status
}
static Selectivity
-regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
+regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
+ int fixed_prefix_len)
{
Selectivity sel;
/* no trailing $ */
sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
sel *= FULL_WILDCARD_SEL;
- if (sel > 1.0)
- sel = 1.0;
}
+
+ /* If there's a fixed prefix, discount its selectivity */
+ if (fixed_prefix_len > 0)
+ sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
+
+ /* Make sure result stays in range */
+ CLAMP_PROBABILITY(sel);
return sel;
}
/* two specials for debugging and testing */
#define REG_ATOI 101 /* convert error-code name to number */
#define REG_ITOA 102 /* convert error-code number to name */
+/* non-error result codes for pg_regprefix */
+#define REG_PREFIX (-1) /* identified a common prefix */
+#define REG_EXACT (-2) /* identified an exact match */
*/
extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
+extern int pg_regprefix(regex_t *, pg_wchar **, size_t *);
extern void pg_regfree(regex_t *);
extern size_t pg_regerror(int, const regex_t *, char *, size_t);
extern void pg_set_regex_collation(Oid collation);
color sub; /* open subcolor (if any); free chain ptr */
#define NOSUB COLORLESS
struct arc *arcs; /* color chain */
+ chr firstchr; /* char first assigned to this color */
int flags;
#define FREECOL 01 /* currently free */
#define PSEUDO 02 /* pseudocolor, no real chars */
struct arc
{
- int type;
-#define ARCFREE '\0'
+ int type; /* 0 if free, else an NFA arc type code */
color co;
struct state *from; /* where it's from (and contained within) */
struct state *to; /* where it's to */
- struct arc *outchain; /* *from's outs chain or free chain */
+ struct arc *outchain; /* link in *from's outs chain or free chain */
#define freechain outchain
- struct arc *inchain; /* *to's ins chain */
- struct arc *colorchain; /* color's arc chain */
+ struct arc *inchain; /* link in *to's ins chain */
+ struct arc *colorchain; /* link in color's arc chain */
struct arc *colorchainRev; /* back-link in color's arc chain */
};
/*
* definitions for compacted NFA
+ *
+ * The main space savings in a compacted NFA is from making the arcs as small
+ * as possible. We store only the transition color and next-state number for
+ * each arc. The list of out arcs for each state is an array beginning at
+ * cnfa.states[statenumber], and terminated by a dummy carc struct with
+ * co == COLORLESS.
+ *
+ * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
+ * Plain arcs just store the transition color number as "co". LACON arcs
+ * store the lookahead constraint number plus cnfa.ncolors as "co". LACON
+ * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
*/
struct carc
{
color co; /* COLORLESS is list terminator */
- int to; /* state number */
+ int to; /* next-state number */
};
struct cnfa
{
int nstates; /* number of states */
- int ncolors; /* number of colors */
+ int ncolors; /* number of colors (max color in use + 1) */
int flags;
-#define HASLACONS 01 /* uses lookahead constraints */
+#define HASLACONS 01 /* uses lookahead constraints */
int pre; /* setup state number */
int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
color eos[2]; /* colors, if any, assigned to EOS and EOL */
+ char *stflags; /* vector of per-state flags bytes */
+#define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */
struct carc **states; /* vector of pointers to outarc lists */
+ /* states[n] are pointers into a single malloc'd array of arcs */
struct carc *arcs; /* the area for the lists */
};
extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS);
+extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
+ Oid collation, bool *exact);
/* regproc.c */
extern Datum regprocin(PG_FUNCTION_ARGS);