1 /*-------------------------------------------------------------------------
4 * Extract a common prefix, if any, from a compiled regex.
7 * Portions Copyright (c) 2012-2013, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1998, 1999 Henry Spencer
11 * src/backend/regex/regprefix.c
13 *-------------------------------------------------------------------------
16 #include "regex/regguts.h"
20 * forward declarations
22 static int findprefix(struct cnfa * cnfa, struct colormap * cm,
23 chr *string, size_t *slength);
27 * pg_regprefix - get common prefix for regular expression
30 * REG_NOMATCH: there is no common prefix of strings matching the regex
31 * REG_PREFIX: there is a common prefix of strings matching the regex
32 * REG_EXACT: all strings satisfying the regex must match the same string
33 * or a REG_XXX error code
35 * In the non-failure cases, *string is set to a malloc'd string containing
36 * the common prefix or exact value, of length *slength (measured in chrs
39 * This function does not analyze all complex cases (such as lookahead
40 * constraints) exactly. Therefore it is possible that some strings matching
41 * the reported prefix or exact-match string do not satisfy the regex. But
42 * it should never be the case that a string satisfying the regex does not
43 * match the reported prefix or exact-match string.
46 pg_regprefix(regex_t *re,
55 if (string == NULL || slength == NULL)
57 *string = NULL; /* initialize for failure cases */
59 if (re == NULL || re->re_magic != REMAGIC)
61 if (re->re_csize != sizeof(chr))
64 /* Initialize locale-dependent support */
65 pg_set_regex_collation(re->re_collation);
68 g = (struct guts *) re->re_guts;
69 if (g->info & REG_UIMPOSSIBLE)
73 * This implementation considers only the search NFA for the topmost regex
74 * tree node. Therefore, constraints such as backrefs are not fully
75 * applied, which is allowed per the function's API spec.
77 assert(g->tree != NULL);
78 cnfa = &g->tree->cnfa;
81 * Since a correct NFA should never contain any exit-free loops, it should
82 * not be possible for our traversal to return to a previously visited
83 * NFA state. Hence we need at most nstates chrs in the output string.
85 *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
90 st = findprefix(cnfa, &g->cmap, *string, slength);
92 assert(*slength <= cnfa->nstates);
95 if (st != REG_PREFIX && st != REG_EXACT)
106 * findprefix - extract common prefix from cNFA
108 * Results are returned into the preallocated chr array string[], with
109 * *slength (which must be preset to zero) incremented for each chr.
111 static int /* regprefix return code */
112 findprefix(struct cnfa * cnfa,
113 struct colormap * cm,
124 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
125 * anchored left. If we have both BOS and BOL, they must go to the
130 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
132 if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
136 else if (nextst != ca->to)
146 * Scan through successive states, stopping as soon as we find one with
147 * more than one acceptable transition character (either multiple colors
148 * on out-arcs, or a color with more than one member chr).
150 * We could find a state with multiple out-arcs that are all labeled with
151 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
152 * In that case we add the chr "c" to the output string but then exit the
153 * loop with nextst == -1. This leaves a little bit on the table: if the
154 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
155 * to the prefix. But chasing multiple parallel state chains doesn't seem
162 thiscolor = COLORLESS;
163 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
165 /* We ignore lookahead constraints */
166 if (ca->co >= cnfa->ncolors)
168 /* We can also ignore BOS/BOL arcs */
169 if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
171 /* ... but EOS/EOL arcs terminate the search */
172 if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
174 thiscolor = COLORLESS;
177 if (thiscolor == COLORLESS)
179 /* First plain outarc */
183 else if (thiscolor == ca->co)
185 /* Another plain outarc for same color */
190 /* More than one plain outarc color terminates the search */
191 thiscolor = COLORLESS;
195 /* Done if we didn't find exactly one color on plain outarcs */
196 if (thiscolor == COLORLESS)
198 /* The color must be a singleton */
199 if (cm->cd[thiscolor].nchrs != 1)
203 * Identify the color's sole member chr and add it to the prefix
204 * string. In general the colormap data structure doesn't provide a
205 * way to find color member chrs, except by trying GETCOLOR() on each
206 * possible chr value, which won't do at all. However, for the cases
207 * we care about it should be sufficient to test the "firstchr" value,
208 * that is the first chr ever added to the color. There are cases
209 * where this might no longer be a member of the color (so we do need
210 * to test), but none of them are likely to arise for a character that
211 * is a member of a common prefix. If we do hit such a corner case,
212 * we just fall out without adding anything to the prefix string.
214 c = cm->cd[thiscolor].firstchr;
215 if (GETCOLOR(cm, c) != thiscolor)
218 string[(*slength)++] = c;
220 /* Advance to next state, but only if we have a unique next state */
221 } while (nextst != -1);
224 * If we ended at a state that only has EOS/EOL outarcs leading to the
225 * "post" state, then we have an exact-match string. Note this is true
226 * even if the string is of zero length.
229 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
231 if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
235 else if (nextst != ca->to)
247 if (nextst == cnfa->post)
251 * Otherwise, if we were unable to identify any prefix characters, say
252 * NOMATCH --- the pattern is anchored left, but doesn't specify any
253 * particular first character.