1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2000 University of Cambridge
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
36 /* Define DEBUG to get debugging output on stdout. */
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
45 #define DPRINTF(p) printf p
47 #define DPRINTF(p) /*nothing*/
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
56 /* Allow compilation as C++ source code, should anybody want to do that. */
59 #define class pcre_class
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
66 #define BRASTACK_SIZE 200
69 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
71 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
74 /* Text forms of OP_ values and things, for debugging (not all used) */
77 static const char *OP_names[] = {
78 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80 "Opt", "^", "$", "Any", "chars", "not",
81 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84 "*", "*?", "+", "+?", "?", "??", "{", "{",
85 "class", "Ref", "Recurse",
86 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 "Brazero", "Braminzero", "Bra"
92 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93 are simple data values; negative values are for special things like \d and so
94 on. Zero means further processing is needed (for things like \x), or the escape
97 static const short int escapes[] = {
98 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 0, 0, -ESC_z /* x - z */
110 /* Tables of names of POSIX character classes and their lengths. The list is
111 terminated by a zero length entry. The first three must be alpha, upper, lower,
112 as this is assumed for handling case independence. */
114 static const char *posix_names[] = {
115 "alpha", "lower", "upper",
116 "alnum", "ascii", "cntrl", "digit", "graph",
117 "print", "punct", "space", "word", "xdigit" };
119 static const uschar posix_name_lengths[] = {
120 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
122 /* Table of class bit maps for each POSIX class; up to three may be combined
123 to form the class. */
125 static const int posix_class_maps[] = {
126 cbit_lower, cbit_upper, -1, /* alpha */
127 cbit_lower, -1, -1, /* lower */
128 cbit_upper, -1, -1, /* upper */
129 cbit_digit, cbit_lower, cbit_upper, /* alnum */
130 cbit_print, cbit_cntrl, -1, /* ascii */
131 cbit_cntrl, -1, -1, /* cntrl */
132 cbit_digit, -1, -1, /* digit */
133 cbit_graph, -1, -1, /* graph */
134 cbit_print, -1, -1, /* print */
135 cbit_punct, -1, -1, /* punct */
136 cbit_space, -1, -1, /* space */
137 cbit_word, -1, -1, /* word */
138 cbit_xdigit,-1, -1 /* xdigit */
142 /* Definition to allow mutual recursion */
145 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146 BOOL, int, int *, int *, compile_data *);
150 /*************************************************
152 *************************************************/
154 /* PCRE is thread-clean and doesn't use any global variables in the normal
155 sense. However, it calls memory allocation and free functions via the two
156 indirections below, which are can be changed by the caller, but are shared
157 between all threads. */
159 void *(*pcre_malloc)(size_t) = malloc;
160 void (*pcre_free)(void *) = free;
165 /*************************************************
166 * Default character tables *
167 *************************************************/
169 /* A default set of character tables is included in the PCRE binary. Its source
170 is built by the maketables auxiliary program, which uses the default C ctypes
171 functions, and put in the file chartables.c. These tables are used by PCRE
172 whenever the caller of pcre_compile() does not provide an alternate set of
175 #include "chartables.c"
179 /*************************************************
180 * Return version string *
181 *************************************************/
183 #define STRING(a) # a
184 #define XSTRING(s) STRING(s)
189 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
195 /*************************************************
196 * (Obsolete) Return info about compiled pattern *
197 *************************************************/
199 /* This is the original "info" function. It picks potentially useful data out
200 of the private structure, but its interface was too rigid. It remains for
201 backwards compatibility. The public options are passed back in an int - though
202 the re->options field has been expanded to a long int, all the public options
203 at the low end of it, and so even on 16-bit systems this will still be OK.
204 Therefore, I haven't changed the API for pcre_info().
207 external_re points to compiled code
208 optptr where to pass back the options
209 first_char where to pass back the first character,
210 or -1 if multiline and all branches start ^,
213 Returns: number of capturing subpatterns
214 or negative values on error
218 pcre_info(const pcre *external_re, int *optptr, int *first_char)
220 const real_pcre *re = (const real_pcre *)external_re;
221 if (re == NULL) return PCRE_ERROR_NULL;
222 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
223 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
224 if (first_char != NULL)
225 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
226 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
227 return re->top_bracket;
232 /*************************************************
233 * Return info about compiled pattern *
234 *************************************************/
236 /* This is a newer "info" function which has an extensible interface so
237 that additional items can be added compatibly.
240 external_re points to compiled code
241 external_study points to study data, or NULL
242 what what information is required
243 where where to put the information
245 Returns: 0 if data returned, negative on error
249 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
252 const real_pcre *re = (const real_pcre *)external_re;
253 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
255 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
256 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
260 case PCRE_INFO_OPTIONS:
261 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
265 *((size_t *)where) = re->size;
268 case PCRE_INFO_CAPTURECOUNT:
269 *((int *)where) = re->top_bracket;
272 case PCRE_INFO_BACKREFMAX:
273 *((int *)where) = re->top_backref;
276 case PCRE_INFO_FIRSTCHAR:
278 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
279 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
282 case PCRE_INFO_FIRSTTABLE:
283 *((const uschar **)where) =
284 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
285 study->start_bits : NULL;
288 case PCRE_INFO_LASTLITERAL:
290 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
293 default: return PCRE_ERROR_BADOPTION;
302 /*************************************************
303 * Debugging function to print chars *
304 *************************************************/
306 /* Print a sequence of chars in printable format, stopping at the end of the
307 subject if the requested.
310 p points to characters
311 length number to print
312 is_subject TRUE if printing from within md->start_subject
313 md pointer to matching data block, if is_subject is TRUE
319 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
322 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
324 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
331 /*************************************************
333 *************************************************/
335 /* This function is called when a \ has been encountered. It either returns a
336 positive value for a simple escape such as \n, or a negative value which
337 encodes one of the more complicated things such as \d. On entry, ptr is
338 pointing at the \. On exit, it is on the final character of the escape
342 ptrptr points to the pattern position pointer
343 errorptr points to the pointer to the error message
344 bracount number of previous extracting brackets
345 options the options bits
346 isclass TRUE if inside a character class
347 cd pointer to char tables block
349 Returns: zero or positive => a data character
350 negative => a special escape sequence
351 on error, errorptr is set
355 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
356 int options, BOOL isclass, compile_data *cd)
358 const uschar *ptr = *ptrptr;
361 c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
362 if (c == 0) *errorptr = ERR1;
364 /* Digits or letters may have special meaning; all others are literals. */
366 else if (c < '0' || c > 'z') {}
368 /* Do an initial lookup in a table. A non-zero result is something that can be
369 returned immediately. Otherwise further processing may be required. */
371 else if ((i = escapes[c - '0']) != 0) c = i;
373 /* Escapes that need further processing, or are illegal. */
377 const uschar *oldptr;
380 /* The handling of escape sequences consisting of a string of digits
381 starting with one that is not zero is not straightforward. By experiment,
382 the way Perl works seems to be as follows:
384 Outside a character class, the digits are read as a decimal number. If the
385 number is less than 10, or if there are that many previous extracting
386 left brackets, then it is a back reference. Otherwise, up to three octal
387 digits are read to form an escaped byte. Thus \123 is likely to be octal
388 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
389 value is greater than 377, the least significant 8 bits are taken. Inside a
390 character class, \ followed by a digit is always an octal number. */
392 case '1': case '2': case '3': case '4': case '5':
393 case '6': case '7': case '8': case '9':
399 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
400 c = c * 10 + *(++ptr) - '0';
401 if (c < 10 || c <= bracount)
406 ptr = oldptr; /* Put the pointer back and fall through */
409 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
410 generates a binary zero byte and treats the digit as a following literal.
411 Thus we have to pull back the pointer by one. */
413 if ((c = *ptr) >= '8')
420 /* \0 always starts an octal number, but we may drop through to here with a
421 larger first octal digit */
425 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
426 ptr[1] != '8' && ptr[1] != '9')
427 c = c * 8 + *(++ptr) - '0';
430 /* Special escapes not starting with a digit are straightforward */
434 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
437 c = c * 16 + cd->lcc[*ptr] -
438 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
450 /* A letter is upper-cased; then the 0x40 bit is flipped */
452 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
456 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
457 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
458 for Perl compatibility, it is a literal. This code looks a bit odd, but
459 there used to be some cases other than the default, and there may be again
460 in future, so I haven't "optimized" it. */
463 if ((options & PCRE_EXTRA) != 0) switch(c)
479 /*************************************************
480 * Check for counted repeat *
481 *************************************************/
483 /* This function is called when a '{' is encountered in a place where it might
484 start a quantifier. It looks ahead to see if it really is a quantifier or not.
485 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
486 where the ddds are digits.
489 p pointer to the first char after '{'
490 cd pointer to char tables block
492 Returns: TRUE or FALSE
496 is_counted_repeat(const uschar *p, compile_data *cd)
498 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
499 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
500 if (*p == '}') return TRUE;
502 if (*p++ != ',') return FALSE;
503 if (*p == '}') return TRUE;
505 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
506 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
512 /*************************************************
513 * Read repeat counts *
514 *************************************************/
516 /* Read an item of the form {n,m} and return the values. This is called only
517 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
518 so the syntax is guaranteed to be correct, but we need to check the values.
521 p pointer to first char after '{'
522 minp pointer to int for min
523 maxp pointer to int for max
524 returned as -1 if no max
525 errorptr points to pointer to error message
526 cd pointer to character tables clock
528 Returns: pointer to '}' on success;
529 current ptr on error, with errorptr set
532 static const uschar *
533 read_repeat_counts(const uschar *p, int *minp, int *maxp,
534 const char **errorptr, compile_data *cd)
539 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
541 if (*p == '}') max = min; else
546 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
555 /* Do paranoid checks, then fill in the required variables, and pass back the
556 pointer to the terminating '}'. */
558 if (min > 65535 || max > 65535)
570 /*************************************************
571 * Find the fixed length of a pattern *
572 *************************************************/
574 /* Scan a pattern and compute the fixed length of subject that will match it,
575 if the length is fixed. This is needed for dealing with backward assertions.
578 code points to the start of the pattern (the bracket)
580 Returns: the fixed length, or -1 if there is no fixed length
584 find_fixedlength(uschar *code)
588 register int branchlength = 0;
589 register uschar *cc = code + 3;
591 /* Scan along the opcodes for this branch. If we get to the end of the
592 branch, check the length against that of the other branches. */
597 register int op = *cc;
598 if (op >= OP_BRA) op = OP_BRA;
605 d = find_fixedlength(cc);
606 if (d < 0) return -1;
608 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
612 /* Reached end of a branch; if it's a ket it is the end of a nested
613 call. If it's ALT it is an alternation in a nested call. If it is
614 END it's the end of the outer call. All can be handled by the same code. */
621 if (length < 0) length = branchlength;
622 else if (length != branchlength) return -1;
623 if (*cc != OP_ALT) return length;
628 /* Skip over assertive subpatterns */
633 case OP_ASSERTBACK_NOT:
634 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
638 /* Skip over things that don't match chars */
654 case OP_NOT_WORD_BOUNDARY:
655 case OP_WORD_BOUNDARY:
659 /* Handle char strings */
662 branchlength += *(++cc);
666 /* Handle exact repetitions */
670 branchlength += (cc[1] << 8) + cc[2];
674 /* Handle single-char matchers */
678 case OP_NOT_WHITESPACE:
680 case OP_NOT_WORDCHAR:
688 /* Check a class for variable quantification */
691 cc += (*cc == OP_REF)? 2 : 33;
703 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
704 branchlength += (cc[1] << 8) + cc[2];
713 /* Anything else is variable length */
719 /* Control never gets here */
725 /*************************************************
726 * Check for POSIX class syntax *
727 *************************************************/
729 /* This function is called when the sequence "[:" or "[." or "[=" is
730 encountered in a character class. It checks whether this is followed by an
731 optional ^ and then a sequence of letters, terminated by a matching ":]" or
735 ptr pointer to the initial [
736 endptr where to return the end pointer
737 cd pointer to compile data
739 Returns: TRUE or FALSE
743 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
745 int terminator; /* Don't combine these lines; the Solaris cc */
746 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
747 if (*(++ptr) == '^') ptr++;
748 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
749 if (*ptr == terminator && ptr[1] == ']')
760 /*************************************************
761 * Check POSIX class name *
762 *************************************************/
764 /* This function is called to check the name given in a POSIX-style class entry
768 ptr points to the first letter
769 len the length of the name
771 Returns: a value representing the name, or -1 if unknown
775 check_posix_name(const uschar *ptr, int len)
777 register int yield = 0;
778 while (posix_name_lengths[yield] != 0)
780 if (len == posix_name_lengths[yield] &&
781 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
790 /*************************************************
791 * Compile one branch *
792 *************************************************/
794 /* Scan the pattern, compiling it into the code vector.
797 options the option bits
798 brackets points to number of brackets used
799 code points to the pointer to the current code point
800 ptrptr points to the current pattern pointer
801 errorptr points to pointer to error message
802 optchanged set to the value of the last OP_OPT item compiled
803 reqchar set to the last literal character required, else -1
804 countlits set to count of mandatory literal characters
805 cd contains pointers to tables
807 Returns: TRUE on success
808 FALSE, with *errorptr set on error
812 compile_branch(int options, int *brackets, uschar **codeptr,
813 const uschar **ptrptr, const char **errorptr, int *optchanged,
814 int *reqchar, int *countlits, compile_data *cd)
816 int repeat_type, op_type;
817 int repeat_min, repeat_max;
818 int bravalue, length;
819 int greedy_default, greedy_non_default;
822 int subcountlits = 0;
824 register uschar *code = *codeptr;
826 const uschar *ptr = *ptrptr;
827 const uschar *tempptr;
828 uschar *previous = NULL;
831 /* Set up the default and non-default settings for greediness */
833 greedy_default = ((options & PCRE_UNGREEDY) != 0);
834 greedy_non_default = greedy_default ^ 1;
836 /* Initialize no required char, and count of literals */
838 *reqchar = prevreqchar = -1;
841 /* Switch on next character until the end of the branch */
853 if ((options & PCRE_EXTENDED) != 0)
855 if ((cd->ctypes[c] & ctype_space) != 0) continue;
858 while ((c = *(++ptr)) != 0 && c != '\n');
865 /* The branch terminates at end of string, |, or ). */
874 /* Handle single-character metacharacters */
891 /* Character classes. These always build a 32-byte bitmap of the permitted
892 characters, except in the special case where there is only one character.
893 For negated classes, we build the map as usual, then invert it at the end.
900 /* If the first character is '^', set the negation flag and skip it. */
902 if ((c = *(++ptr)) == '^')
907 else negate_class = FALSE;
909 /* Keep a count of chars so that we can optimize the case of just a single
915 /* Initialize the 32-char bit map to all zeros. We have to build the
916 map in a temporary bit of store, in case the class contains only 1
917 character, because in that case the compiled code doesn't use the
920 memset(class, 0, 32 * sizeof(uschar));
922 /* Process characters until ] is reached. By writing this as a "do" it
923 means that an initial ] is taken as a data character. */
933 /* Handle POSIX class names. Perl allows a negation extension of the
934 form [:^name]. A square bracket that doesn't match the syntax is
935 treated as a literal. We also recognize the POSIX constructions
936 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
940 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
941 check_posix_syntax(ptr, &tempptr, cd))
943 BOOL local_negate = FALSE;
945 register const uschar *cbits = cd->cbits;
960 posix_class = check_posix_name(ptr, tempptr - ptr);
967 /* If matching is caseless, upper and lower are converted to
968 alpha. This relies on the fact that the class table starts with
969 alpha, lower, upper as the first 3 entries. */
971 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
974 /* Or into the map we are building up to 3 of the static class
975 tables, or their negations. */
978 for (i = 0; i < 3; i++)
980 int taboffset = posix_class_maps[posix_class + i];
981 if (taboffset < 0) break;
983 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
985 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
989 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
993 /* Backslash may introduce a single character, or it may introduce one
994 of the specials, which just set a flag. Escaped items are checked for
995 validity in the pre-compiling pass. The sequence \b is a special case.
996 Inside a class (and only there) it is treated as backspace. Elsewhere
997 it marks a word boundary. Other escapes have preset maps ready to
998 or into the one we are building. We assume they have more than one
999 character in them, so set class_count bigger than one. */
1003 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1004 if (-c == ESC_b) c = '\b';
1007 register const uschar *cbits = cd->cbits;
1008 class_charcount = 10;
1012 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1016 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1020 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1024 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1028 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1032 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1040 /* Fall through if single character */
1043 /* A single character may be followed by '-' to form a range. However,
1044 Perl does not permit ']' to be the end of the range. A '-' character
1045 here is treated as a literal. */
1047 if (ptr[1] == '-' && ptr[2] != ']')
1059 /* The second part of a range can be a single-character escape, but
1060 not any of the other escapes. */
1064 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1067 if (d == -ESC_b) d = '\b'; else
1083 class[c/8] |= (1 << (c&7));
1084 if ((options & PCRE_CASELESS) != 0)
1086 int uc = cd->fcc[c]; /* flip case */
1087 class[uc/8] |= (1 << (uc&7));
1089 class_charcount++; /* in case a one-char range */
1092 continue; /* Go get the next char in the class */
1095 /* Handle a lone single character - we can get here for a normal
1096 non-escape char, or after \ that introduces a single character. */
1098 class [c/8] |= (1 << (c&7));
1099 if ((options & PCRE_CASELESS) != 0)
1101 c = cd->fcc[c]; /* flip case */
1102 class[c/8] |= (1 << (c&7));
1108 /* Loop until ']' reached; the check for end of string happens inside the
1109 loop. This "while" is the end of the "do" above. */
1111 while ((c = *(++ptr)) != ']');
1113 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1114 precisely one character. This doesn't need the whole 32-byte bit map.
1115 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1118 if (class_charcount == 1 && class_lastchar >= 0)
1126 code[-1] = OP_CHARS;
1129 *code++ = class_lastchar;
1132 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1138 for (c = 0; c < 32; c++) code[c] = ~class[c];
1140 memcpy(code, class, 32);
1145 /* Various kinds of repeat */
1148 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1149 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1150 if (*errorptr != NULL) goto FAILED;
1168 if (previous == NULL)
1174 /* If the next character is '?' this is a minimizing repeat, by default,
1175 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1179 { repeat_type = greedy_non_default; ptr++; }
1180 else repeat_type = greedy_default;
1182 /* If previous was a string of characters, chop off the last one and use it
1183 as the subject of the repeat. If there was only one character, we can
1184 abolish the previous item altogether. A repeat with a zero minimum wipes
1185 out any reqchar setting, backing up to the previous value. We must also
1186 adjust the countlits value. */
1188 if (*previous == OP_CHARS)
1190 int len = previous[1];
1192 if (repeat_min == 0) *reqchar = prevreqchar;
1193 *countlits += repeat_min - 1;
1202 c = previous[len+1];
1206 op_type = 0; /* Use single-char op codes */
1207 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1210 /* If previous was a single negated character ([^a] or similar), we use
1211 one of the special opcodes, replacing it. The code is shared with single-
1212 character repeats by adding a suitable offset into repeat_type. */
1214 else if ((int)*previous == OP_NOT)
1216 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1219 goto OUTPUT_SINGLE_REPEAT;
1222 /* If previous was a character type match (\d or similar), abolish it and
1223 create a suitable repeat item. The code is shared with single-character
1224 repeats by adding a suitable offset into repeat_type. */
1226 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1228 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1232 OUTPUT_SINGLE_REPEAT:
1234 /* If the maximum is zero then the minimum must also be zero; Perl allows
1235 this case, so we do too - by simply omitting the item altogether. */
1237 if (repeat_max == 0) goto END_REPEAT;
1239 /* Combine the op_type with the repeat_type */
1241 repeat_type += op_type;
1243 /* A minimum of zero is handled either as the special case * or ?, or as
1244 an UPTO, with the maximum given. */
1246 if (repeat_min == 0)
1248 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1249 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1252 *code++ = OP_UPTO + repeat_type;
1253 *code++ = repeat_max >> 8;
1254 *code++ = (repeat_max & 255);
1258 /* The case {1,} is handled as the special case + */
1260 else if (repeat_min == 1 && repeat_max == -1)
1261 *code++ = OP_PLUS + repeat_type;
1263 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1264 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1268 if (repeat_min != 1)
1270 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1271 *code++ = repeat_min >> 8;
1272 *code++ = (repeat_min & 255);
1275 /* If the mininum is 1 and the previous item was a character string,
1276 we either have to put back the item that got cancelled if the string
1277 length was 1, or add the character back onto the end of a longer
1278 string. For a character type nothing need be done; it will just get
1279 put back naturally. Note that the final character is always going to
1282 else if (*previous == OP_CHARS)
1284 if (code == previous) code += 2; else previous[1]++;
1287 /* For a single negated character we also have to put back the
1288 item that got cancelled. */
1290 else if (*previous == OP_NOT) code++;
1292 /* If the maximum is unlimited, insert an OP_STAR. */
1297 *code++ = OP_STAR + repeat_type;
1300 /* Else insert an UPTO if the max is greater than the min. */
1302 else if (repeat_max != repeat_min)
1305 repeat_max -= repeat_min;
1306 *code++ = OP_UPTO + repeat_type;
1307 *code++ = repeat_max >> 8;
1308 *code++ = (repeat_max & 255);
1312 /* The character or character type itself comes last in all cases. */
1317 /* If previous was a character class or a back reference, we put the repeat
1318 stuff after it, but just skip the item if the repeat was {0,0}. */
1320 else if (*previous == OP_CLASS || *previous == OP_REF)
1322 if (repeat_max == 0)
1327 if (repeat_min == 0 && repeat_max == -1)
1328 *code++ = OP_CRSTAR + repeat_type;
1329 else if (repeat_min == 1 && repeat_max == -1)
1330 *code++ = OP_CRPLUS + repeat_type;
1331 else if (repeat_min == 0 && repeat_max == 1)
1332 *code++ = OP_CRQUERY + repeat_type;
1335 *code++ = OP_CRRANGE + repeat_type;
1336 *code++ = repeat_min >> 8;
1337 *code++ = repeat_min & 255;
1338 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1339 *code++ = repeat_max >> 8;
1340 *code++ = repeat_max & 255;
1344 /* If previous was a bracket group, we may have to replicate it in certain
1347 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1348 (int)*previous == OP_COND)
1352 int len = code - previous;
1353 uschar *bralink = NULL;
1355 /* If the maximum repeat count is unlimited, find the end of the bracket
1356 by scanning through from the start, and compute the offset back to it
1357 from the current code pointer. There may be an OP_OPT setting following
1358 the final KET, so we can't find the end just by going back from the code
1361 if (repeat_max == -1)
1363 register uschar *ket = previous;
1364 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1365 ketoffset = code - ket;
1368 /* The case of a zero minimum is special because of the need to stick
1369 OP_BRAZERO in front of it, and because the group appears once in the
1370 data, whereas in other cases it appears the minimum number of times. For
1371 this reason, it is simplest to treat this case separately, as otherwise
1372 the code gets far too mess. There are several special subcases when the
1375 if (repeat_min == 0)
1377 /* If we set up a required char from the bracket, we must back off
1378 to the previous value and reset the countlits value too. */
1380 if (subcountlits > 0)
1382 *reqchar = prevreqchar;
1383 *countlits -= subcountlits;
1386 /* If the maximum is also zero, we just omit the group from the output
1389 if (repeat_max == 0)
1395 /* If the maximum is 1 or unlimited, we just have to stick in the
1396 BRAZERO and do no more at this point. */
1398 if (repeat_max <= 1)
1400 memmove(previous+1, previous, len);
1402 *previous++ = OP_BRAZERO + repeat_type;
1405 /* If the maximum is greater than 1 and limited, we have to replicate
1406 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1407 The first one has to be handled carefully because it's the original
1408 copy, which has to be moved up. The remainder can be handled by code
1409 that is common with the non-zero minimum case below. We just have to
1410 adjust the value or repeat_max, since one less copy is required. */
1415 memmove(previous+4, previous, len);
1417 *previous++ = OP_BRAZERO + repeat_type;
1418 *previous++ = OP_BRA;
1420 /* We chain together the bracket offset fields that have to be
1421 filled in later when the ends of the brackets are reached. */
1423 offset = (bralink == NULL)? 0 : previous - bralink;
1425 *previous++ = offset >> 8;
1426 *previous++ = offset & 255;
1432 /* If the minimum is greater than zero, replicate the group as many
1433 times as necessary, and adjust the maximum to the number of subsequent
1434 copies that we need. */
1438 for (i = 1; i < repeat_min; i++)
1440 memcpy(code, previous, len);
1443 if (repeat_max > 0) repeat_max -= repeat_min;
1446 /* This code is common to both the zero and non-zero minimum cases. If
1447 the maximum is limited, it replicates the group in a nested fashion,
1448 remembering the bracket starts on a stack. In the case of a zero minimum,
1449 the first one was set up above. In all cases the repeat_max now specifies
1450 the number of additional copies needed. */
1452 if (repeat_max >= 0)
1454 for (i = repeat_max - 1; i >= 0; i--)
1456 *code++ = OP_BRAZERO + repeat_type;
1458 /* All but the final copy start a new nesting, maintaining the
1459 chain of brackets outstanding. */
1465 offset = (bralink == NULL)? 0 : code - bralink;
1467 *code++ = offset >> 8;
1468 *code++ = offset & 255;
1471 memcpy(code, previous, len);
1475 /* Now chain through the pending brackets, and fill in their length
1476 fields (which are holding the chain links pro tem). */
1478 while (bralink != NULL)
1481 int offset = code - bralink + 1;
1482 uschar *bra = code - offset;
1483 oldlinkoffset = (bra[1] << 8) + bra[2];
1484 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1486 *code++ = bra[1] = offset >> 8;
1487 *code++ = bra[2] = (offset & 255);
1491 /* If the maximum is unlimited, set a repeater in the final copy. We
1492 can't just offset backwards from the current code point, because we
1493 don't know if there's been an options resetting after the ket. The
1494 correct offset was computed above. */
1496 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1499 /* Else there's some kind of shambles */
1507 /* In all case we no longer have a previous item. */
1514 /* Start of nested bracket sub-expression, or comment or lookahead or
1515 lookbehind or option setting or condition. First deal with special things
1516 that can come after a bracket; all are introduced by ?, and the appearance
1517 of any of them means that this is not a referencing group. They were
1518 checked for validity in the first pass over the string, so we don't have to
1519 check for syntax errors here. */
1522 newoptions = options;
1525 if (*(++ptr) == '?')
1532 case '#': /* Comment; skip to ket */
1534 while (*ptr != ')') ptr++;
1537 case ':': /* Non-extracting bracket */
1543 bravalue = OP_COND; /* Conditional group */
1544 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1546 condref = *ptr - '0';
1547 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1553 case '=': /* Positive lookahead */
1554 bravalue = OP_ASSERT;
1558 case '!': /* Negative lookahead */
1559 bravalue = OP_ASSERT_NOT;
1563 case '<': /* Lookbehinds */
1566 case '=': /* Positive lookbehind */
1567 bravalue = OP_ASSERTBACK;
1571 case '!': /* Negative lookbehind */
1572 bravalue = OP_ASSERTBACK_NOT;
1576 default: /* Syntax error */
1582 case '>': /* One-time brackets */
1587 case 'R': /* Pattern recursion */
1588 *code++ = OP_RECURSE;
1592 default: /* Option setting */
1596 while (*ptr != ')' && *ptr != ':')
1600 case '-': optset = &unset; break;
1602 case 'i': *optset |= PCRE_CASELESS; break;
1603 case 'm': *optset |= PCRE_MULTILINE; break;
1604 case 's': *optset |= PCRE_DOTALL; break;
1605 case 'x': *optset |= PCRE_EXTENDED; break;
1606 case 'U': *optset |= PCRE_UNGREEDY; break;
1607 case 'X': *optset |= PCRE_EXTRA; break;
1615 /* Set up the changed option bits, but don't change anything yet. */
1617 newoptions = (options | set) & (~unset);
1619 /* If the options ended with ')' this is not the start of a nested
1620 group with option changes, so the options change at this level. At top
1621 level there is nothing else to be done (the options will in fact have
1622 been set from the start of compiling as a result of the first pass) but
1623 at an inner level we must compile code to change the ims options if
1624 necessary, and pass the new setting back so that it can be put at the
1625 start of any following branches, and when this group ends, a resetting
1626 item can be compiled. */
1630 if ((options & PCRE_INGROUP) != 0 &&
1631 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1634 *code++ = *optchanged = newoptions & PCRE_IMS;
1636 options = newoptions; /* Change options at this level */
1637 previous = NULL; /* This item can't be repeated */
1638 continue; /* It is complete */
1641 /* If the options ended with ':' we are heading into a nested group
1642 with possible change of options. Such groups are non-capturing and are
1643 not assertions of any kind. All we need to do is skip over the ':';
1644 the newoptions value is handled below. */
1651 /* Else we have a referencing group; adjust the opcode. */
1655 if (++(*brackets) > EXTRACT_MAX)
1660 bravalue = OP_BRA + *brackets;
1663 /* Process nested bracketed re. Assertions may not be repeated, but other
1664 kinds can be. We copy code into a non-register variable in order to be able
1665 to pass its address because some compilers complain otherwise. Pass in a
1666 new setting for the ims options if they have changed. */
1668 previous = (bravalue >= OP_ONCE)? code : NULL;
1673 options | PCRE_INGROUP, /* Set for all nested groups */
1674 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1675 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1676 brackets, /* Bracket level */
1677 &tempcode, /* Where to put code (updated) */
1678 &ptr, /* Input pointer (updated) */
1679 errorptr, /* Where to put an error message */
1680 (bravalue == OP_ASSERTBACK ||
1681 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1682 condref, /* Condition reference number */
1683 &subreqchar, /* For possible last char */
1684 &subcountlits, /* For literal count */
1685 cd)) /* Tables block */
1688 /* At the end of compiling, code is still pointing to the start of the
1689 group, while tempcode has been updated to point past the end of the group
1690 and any option resetting that may follow it. The pattern pointer (ptr)
1691 is on the bracket. */
1693 /* If this is a conditional bracket, check that there are no more than
1694 two branches in the group. */
1696 if (bravalue == OP_COND)
1703 tc += (tc[1] << 8) | tc[2];
1705 while (*tc != OP_KET);
1714 /* Handle updating of the required character. If the subpattern didn't
1715 set one, leave it as it was. Otherwise, update it for normal brackets of
1716 all kinds, forward assertions, and conditions with two branches. Don't
1717 update the literal count for forward assertions, however. If the bracket
1718 is followed by a quantifier with zero repeat, we have to back off. Hence
1719 the definition of prevreqchar and subcountlits outside the main loop so
1720 that they can be accessed for the back off. */
1722 if (subreqchar > 0 &&
1723 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1724 (bravalue == OP_COND && condcount == 2)))
1726 prevreqchar = *reqchar;
1727 *reqchar = subreqchar;
1728 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1731 /* Now update the main code pointer to the end of the group. */
1735 /* Error if hit end of pattern */
1744 /* Check \ for being a real metacharacter; if not, fall through and handle
1745 it as a data character at the start of a string. Escape items are checked
1746 for validity in the pre-compiling pass. */
1750 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1752 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1753 are arranged to be the negation of the corresponding OP_values. For the
1754 back references, the values are ESC_REF plus the reference number. Only
1755 back references and those types that consume a character may be repeated.
1756 We can test for values between ESC_b and ESC_Z for the latter; this may
1757 have to change if any new ones are ever created. */
1765 *code++ = -c - ESC_REF;
1769 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1775 /* Data character: reset and fall through */
1780 /* Handle a run of data characters until a metacharacter is encountered.
1781 The first character is guaranteed not to be whitespace or # when the
1782 extended flag is set. */
1793 if ((options & PCRE_EXTENDED) != 0)
1795 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1798 while ((c = *(++ptr)) != 0 && c != '\n');
1804 /* Backslash may introduce a data char or a metacharacter. Escaped items
1805 are checked for validity in the pre-compiling pass. Stop the string
1806 before a metaitem. */
1811 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1812 if (c < 0) { ptr = tempptr; break; }
1815 /* Ordinary character or single-char escape */
1821 /* This "while" is the end of the "do" above. */
1823 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1825 /* Update the last character and the count of literals */
1827 prevreqchar = (length > 1)? code[-2] : *reqchar;
1828 *reqchar = code[-1];
1829 *countlits += length;
1831 /* Compute the length and set it in the data vector, and advance to
1834 previous[1] = length;
1835 if (length < 255) ptr--;
1838 } /* end of big loop */
1840 /* Control never reaches here by falling through, only by a goto for all the
1841 error states. Pass back the position in the pattern so that it can be displayed
1842 to the user for diagnosing the error. */
1852 /*************************************************
1853 * Compile sequence of alternatives *
1854 *************************************************/
1856 /* On entry, ptr is pointing past the bracket character, but on return
1857 it points to the closing bracket, or vertical bar, or end of string.
1858 The code variable is pointing at the byte into which the BRA operator has been
1859 stored. If the ims options are changed at the start (for a (?ims: group) or
1860 during any branch, we need to insert an OP_OPT item at the start of every
1861 following branch to ensure they get set correctly at run time, and also pass
1862 the new options into every subsequent branch compile.
1865 options the option bits
1866 optchanged new ims options to set as if (?ims) were at the start, or -1
1868 brackets -> int containing the number of extracting brackets used
1869 codeptr -> the address of the current code pointer
1870 ptrptr -> the address of the current pattern pointer
1871 errorptr -> pointer to error message
1872 lookbehind TRUE if this is a lookbehind assertion
1873 condref > 0 for OPT_CREF setting at start of conditional group
1874 reqchar -> place to put the last required character, or a negative number
1875 countlits -> place to put the shortest literal count of any branch
1876 cd points to the data block with tables pointers
1878 Returns: TRUE on success
1882 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1883 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1884 int *reqchar, int *countlits, compile_data *cd)
1886 const uschar *ptr = *ptrptr;
1887 uschar *code = *codeptr;
1888 uschar *last_branch = code;
1889 uschar *start_bracket = code;
1890 uschar *reverse_count = NULL;
1891 int oldoptions = options & PCRE_IMS;
1892 int branchreqchar, branchcountlits;
1895 *countlits = INT_MAX;
1898 /* At the start of a reference-based conditional group, insert the reference
1899 number as an OP_CREF item. */
1907 /* Loop for each alternative branch */
1913 /* Handle change of options */
1915 if (optchanged >= 0)
1918 *code++ = optchanged;
1919 options = (options & ~PCRE_IMS) | optchanged;
1922 /* Set up dummy OP_REVERSE if lookbehind assertion */
1926 *code++ = OP_REVERSE;
1927 reverse_count = code;
1932 /* Now compile the branch */
1934 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1935 &branchreqchar, &branchcountlits, cd))
1941 /* Fill in the length of the last branch */
1943 length = code - last_branch;
1944 last_branch[1] = length >> 8;
1945 last_branch[2] = length & 255;
1947 /* Save the last required character if all branches have the same; a current
1948 value of -1 means unset, while -2 means "previous branch had no last required
1953 if (branchreqchar >= 0)
1955 if (*reqchar == -1) *reqchar = branchreqchar;
1956 else if (*reqchar != branchreqchar) *reqchar = -2;
1961 /* Keep the shortest literal count */
1963 if (branchcountlits < *countlits) *countlits = branchcountlits;
1964 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1966 /* If lookbehind, check that this branch matches a fixed-length string,
1967 and put the length into the OP_REVERSE item. Temporarily mark the end of
1968 the branch with OP_END. */
1973 length = find_fixedlength(last_branch);
1974 DPRINTF(("fixed length = %d\n", length));
1981 reverse_count[0] = (length >> 8);
1982 reverse_count[1] = length & 255;
1985 /* Reached end of expression, either ')' or end of pattern. Insert a
1986 terminating ket and the length of the whole bracketed item, and return,
1987 leaving the pointer at the terminating char. If any of the ims options
1988 were changed inside the group, compile a resetting op-code following. */
1992 length = code - start_bracket;
1994 *code++ = length >> 8;
1995 *code++ = length & 255;
1996 if (optchanged >= 0)
1999 *code++ = oldoptions;
2006 /* Another branch follows; insert an "or" node and advance the pointer. */
2013 /* Control never reaches here */
2019 /*************************************************
2020 * Find first significant op code *
2021 *************************************************/
2023 /* This is called by several functions that scan a compiled expression looking
2024 for a fixed first character, or an anchoring op code etc. It skips over things
2025 that do not influence this. For one application, a change of caseless option is
2029 code pointer to the start of the group
2030 options pointer to external options
2031 optbit the option bit whose changing is significant, or
2033 optstop TRUE to return on option change, otherwise change the options
2036 Returns: pointer to the first significant opcode
2039 static const uschar*
2040 first_significant_code(const uschar *code, int *options, int optbit,
2048 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2050 if (optstop) return code;
2051 *options = (int)code[1];
2060 case OP_WORD_BOUNDARY:
2061 case OP_NOT_WORD_BOUNDARY:
2067 case OP_ASSERTBACK_NOT:
2068 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2076 /* Control never reaches here */
2082 /*************************************************
2083 * Check for anchored expression *
2084 *************************************************/
2086 /* Try to find out if this is an anchored regular expression. Consider each
2087 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2088 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2089 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2090 counts, since OP_CIRC can match in the middle.
2092 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2093 because that will try the rest of the pattern at all possible matching points,
2094 so there is no point trying them again.
2097 code points to start of expression (the bracket)
2098 options points to the options setting
2100 Returns: TRUE or FALSE
2104 is_anchored(register const uschar *code, int *options)
2107 const uschar *scode = first_significant_code(code + 3, options,
2108 PCRE_MULTILINE, FALSE);
2109 register int op = *scode;
2110 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2111 { if (!is_anchored(scode, options)) return FALSE; }
2112 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2113 (*options & PCRE_DOTALL) != 0)
2114 { if (scode[1] != OP_ANY) return FALSE; }
2115 else if (op != OP_SOD &&
2116 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2118 code += (code[1] << 8) + code[2];
2120 while (*code == OP_ALT);
2126 /*************************************************
2127 * Check for starting with ^ or .* *
2128 *************************************************/
2130 /* This is called to find out if every branch starts with ^ or .* so that
2131 "first char" processing can be done to speed things up in multiline
2132 matching and for non-DOTALL patterns that start with .* (which must start at
2133 the beginning or after \n).
2135 Argument: points to start of expression (the bracket)
2136 Returns: TRUE or FALSE
2140 is_startline(const uschar *code)
2143 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2144 register int op = *scode;
2145 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2146 { if (!is_startline(scode)) return FALSE; }
2147 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2148 { if (scode[1] != OP_ANY) return FALSE; }
2149 else if (op != OP_CIRC) return FALSE;
2150 code += (code[1] << 8) + code[2];
2152 while (*code == OP_ALT);
2158 /*************************************************
2159 * Check for fixed first char *
2160 *************************************************/
2162 /* Try to find out if there is a fixed first character. This is called for
2163 unanchored expressions, as it speeds up their processing quite considerably.
2164 Consider each alternative branch. If they all start with the same char, or with
2165 a bracket all of whose alternatives start with the same char (recurse ad lib),
2166 then we return that char, otherwise -1.
2169 code points to start of expression (the bracket)
2170 options pointer to the options (used to check casing changes)
2172 Returns: -1 or the fixed first char
2176 find_firstchar(const uschar *code, int *options)
2178 register int c = -1;
2181 const uschar *scode = first_significant_code(code + 3, options,
2182 PCRE_CASELESS, TRUE);
2183 register int op = *scode;
2185 if (op >= OP_BRA) op = OP_BRA;
2196 if ((d = find_firstchar(scode, options)) < 0) return -1;
2197 if (c < 0) c = d; else if (c != d) return -1;
2200 case OP_EXACT: /* Fall through */
2203 case OP_CHARS: /* Fall through */
2208 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2212 code += (code[1] << 8) + code[2];
2214 while (*code == OP_ALT);
2222 /*************************************************
2223 * Compile a Regular Expression *
2224 *************************************************/
2226 /* This function takes a string and returns a pointer to a block of store
2227 holding a compiled version of the expression.
2230 pattern the regular expression
2231 options various option bits
2232 errorptr pointer to pointer to error text
2233 erroroffset ptr offset in pattern where error was detected
2234 tables pointer to character tables or NULL
2236 Returns: pointer to compiled data block, or NULL on error,
2237 with errorptr and erroroffset set
2241 pcre_compile(const char *pattern, int options, const char **errorptr,
2242 int *erroroffset, const unsigned char *tables)
2245 int length = 3; /* For initial BRA plus length */
2247 int c, reqchar, countlits;
2249 int top_backref = 0;
2250 int branch_extra = 0;
2251 int branch_newextra;
2252 unsigned int brastackptr = 0;
2256 compile_data compile_block;
2257 int brastack[BRASTACK_SIZE];
2258 uschar bralenstack[BRASTACK_SIZE];
2261 uschar *code_base, *code_end;
2264 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2265 can do is just return NULL. */
2267 if (errorptr == NULL) return NULL;
2270 /* However, we can give a message for this error */
2272 if (erroroffset == NULL)
2279 if ((options & ~PUBLIC_OPTIONS) != 0)
2285 /* Set up pointers to the individual character tables */
2287 if (tables == NULL) tables = pcre_default_tables;
2288 compile_block.lcc = tables + lcc_offset;
2289 compile_block.fcc = tables + fcc_offset;
2290 compile_block.cbits = tables + cbits_offset;
2291 compile_block.ctypes = tables + ctypes_offset;
2293 /* Reflect pattern for debugging output */
2295 DPRINTF(("------------------------------------------------------------------\n"));
2296 DPRINTF(("%s\n", pattern));
2298 /* The first thing to do is to make a pass over the pattern to compute the
2299 amount of store required to hold the compiled code. This does not have to be
2300 perfect as long as errors are overestimates. At the same time we can detect any
2301 internal flag settings. Make an attempt to correct for any counted white space
2302 if an "extended" flag setting appears late in the pattern. We can't be so
2303 clever for #-comments. */
2305 ptr = (const uschar *)(pattern - 1);
2306 while ((c = *(++ptr)) != 0)
2309 int class_charcount;
2311 if ((options & PCRE_EXTENDED) != 0)
2313 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2316 while ((c = *(++ptr)) != 0 && c != '\n');
2323 /* A backslashed item may be an escaped "normal" character or a
2324 character type. For a "normal" character, put the pointers and
2325 character back so that tests for whitespace etc. in the input
2326 are done correctly. */
2330 const uschar *save_ptr = ptr;
2331 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2332 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2342 /* A back reference needs an additional char, plus either one or 5
2343 bytes for a repeat. We also need to keep the value of the highest
2348 int refnum = -c - ESC_REF;
2349 if (refnum > top_backref) top_backref = refnum;
2350 length++; /* For single back reference */
2351 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2353 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2354 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2355 if ((min == 0 && (max == 1 || max == -1)) ||
2356 (min == 1 && max == -1))
2359 if (ptr[1] == '?') ptr++;
2367 case '*': /* These repeats won't be after brackets; */
2368 case '+': /* those are handled separately */
2373 /* This covers the cases of repeats after a single char, metachar, class,
2374 or back reference. */
2377 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2378 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2379 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2380 if ((min == 0 && (max == 1 || max == -1)) ||
2381 (min == 1 && max == -1))
2385 length--; /* Uncount the original char or metachar */
2386 if (min == 1) length++; else if (min > 0) length += 4;
2387 if (max > 0) length += 4; else length += 2;
2389 if (ptr[1] == '?') ptr++;
2392 /* An alternation contains an offset to the next branch or ket. If any ims
2393 options changed in the previous branch(es), and/or if we are in a
2394 lookbehind assertion, extra space will be needed at the start of the
2395 branch. This is handled by branch_extra. */
2398 length += 3 + branch_extra;
2401 /* A character class uses 33 characters. Don't worry about character types
2402 that aren't allowed in classes - they'll get picked up during the compile.
2403 A character class that contains only one character uses 2 or 3 bytes,
2404 depending on whether it is negated or not. Notice this where we can. */
2407 class_charcount = 0;
2408 if (*(++ptr) == '^') ptr++;
2413 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2415 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2416 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2418 else class_charcount++;
2421 while (*ptr != 0 && *ptr != ']');
2423 /* Repeats for negated single chars are handled by the general code */
2425 if (class_charcount == 1) length += 3; else
2429 /* A repeat needs either 1 or 5 bytes. */
2431 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2433 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2434 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2435 if ((min == 0 && (max == 1 || max == -1)) ||
2436 (min == 1 && max == -1))
2439 if (ptr[1] == '?') ptr++;
2444 /* Brackets may be genuine groups or special things */
2447 branch_newextra = 0;
2449 /* Handle special forms of bracket, which all start (? */
2458 /* Skip over comments entirely */
2461 while (*ptr != 0 && *ptr != ')') ptr++;
2465 goto PCRE_ERROR_RETURN;
2469 /* Non-referencing groups and lookaheads just move the pointer on, and
2470 then behave like a non-special bracket, except that they don't increment
2471 the count of extracting brackets. Ditto for the "once only" bracket,
2472 which is in Perl from version 5.005. */
2481 /* A recursive call to the regex is an extension, to provide the
2482 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2488 goto PCRE_ERROR_RETURN;
2494 /* Lookbehinds are in Perl from version 5.005 */
2497 if (ptr[3] == '=' || ptr[3] == '!')
2500 branch_newextra = 3;
2501 length += 3; /* For the first branch */
2505 goto PCRE_ERROR_RETURN;
2507 /* Conditionals are in Perl from version 5.005. The bracket must either
2508 be followed by a number (for bracket reference) or by an assertion
2512 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2516 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2520 goto PCRE_ERROR_RETURN;
2523 else /* An assertion must follow */
2525 ptr++; /* Can treat like ':' as far as spacing is concerned */
2527 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2529 ptr += 2; /* To get right offset in message */
2531 goto PCRE_ERROR_RETURN;
2536 /* Else loop checking valid options until ) is met. Anything else is an
2537 error. If we are without any brackets, i.e. at top level, the settings
2538 act as if specified in the options, so massage the options immediately.
2539 This is for backward compatibility with Perl 5.004. */
2552 *optset |= PCRE_CASELESS;
2556 *optset |= PCRE_MULTILINE;
2560 *optset |= PCRE_DOTALL;
2564 *optset |= PCRE_EXTENDED;
2568 *optset |= PCRE_EXTRA;
2572 *optset |= PCRE_UNGREEDY;
2579 /* A termination by ')' indicates an options-setting-only item;
2580 this is global at top level; otherwise nothing is done here and
2581 it is handled during the compiling process on a per-bracket-group
2585 if (brastackptr == 0)
2587 options = (options | set) & (~unset);
2588 set = unset = 0; /* To save length */
2592 /* A termination by ':' indicates the start of a nested group with
2593 the given options set. This is again handled at compile time, but
2594 we must allow for compiled space if any of the ims options are
2595 set. We also have to allow for resetting space at the end of
2596 the group, which is why 4 is added to the length and not just 2.
2597 If there are several changes of options within the same group, this
2598 will lead to an over-estimate on the length, but this shouldn't
2599 matter very much. We also have to allow for resetting options at
2600 the start of any alternations, which we do by setting
2601 branch_newextra to 2. Finally, we record whether the case-dependent
2602 flag ever changes within the regex. This is used by the "required
2606 if (((set|unset) & PCRE_IMS) != 0)
2609 branch_newextra = 2;
2610 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2614 /* Unrecognized option character */
2618 goto PCRE_ERROR_RETURN;
2622 /* If we hit a closing bracket, that's it - this is a freestanding
2623 option-setting. We need to ensure that branch_extra is updated if
2624 necessary. The only values branch_newextra can have here are 0 or 2.
2625 If the value is 2, then branch_extra must either be 2 or 5, depending
2626 on whether this is a lookbehind group or not. */
2631 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2632 branch_extra += branch_newextra;
2636 /* If options were terminated by ':' control comes here. Fall through
2637 to handle the group below. */
2641 /* Extracting brackets must be counted so we can process escapes in a
2646 /* Non-special forms of bracket. Save length for computing whole length
2647 at end if there's a repeat that requires duplication of the group. Also
2648 save the current value of branch_extra, and start the new group with
2649 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2650 for a lookbehind assertion. */
2652 if (brastackptr >= sizeof(brastack)/sizeof(int))
2655 goto PCRE_ERROR_RETURN;
2658 bralenstack[brastackptr] = branch_extra;
2659 branch_extra = branch_newextra;
2661 brastack[brastackptr++] = length;
2665 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2666 have to replicate this bracket up to that many times. If brastackptr is
2667 0 this is an unmatched bracket which will generate an error, but take care
2668 not to try to access brastack[-1] when computing the length and restoring
2669 the branch_extra value. */
2678 if (brastackptr > 0)
2680 duplength = length - brastack[--brastackptr];
2681 branch_extra = bralenstack[brastackptr];
2685 /* Leave ptr at the final char; for read_repeat_counts this happens
2686 automatically; for the others we need an increment. */
2688 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2690 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2692 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2694 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2695 else if (c == '+') { maxval = -1; ptr++; }
2696 else if (c == '?') { minval = 0; ptr++; }
2698 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2699 group, and if the maximum is greater than zero, we have to replicate
2700 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2701 bracket set - hence the 7. */
2706 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2709 /* When the minimum is greater than zero, 1 we have to replicate up to
2710 minval-1 times, with no additions required in the copies. Then, if
2711 there is a limited maximum we have to replicate up to maxval-1 times
2712 allowing for a BRAZERO item before each optional copy and nesting
2713 brackets for all but one of the optional copies. */
2717 length += (minval - 1) * duplength;
2718 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2719 length += (maxval - minval) * (duplength + 7) - 6;
2724 /* Non-special character. For a run of such characters the length required
2725 is the number of characters + 2, except that the maximum run length is 255.
2726 We won't get a skipped space or a non-data escape or the start of a #
2727 comment as the first character, so the length can't be zero. */
2735 if ((options & PCRE_EXTENDED) != 0)
2737 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2740 while ((c = *(++ptr)) != 0 && c != '\n');
2745 /* Backslash may introduce a data char or a metacharacter; stop the
2746 string before the latter. */
2750 const uschar *saveptr = ptr;
2751 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2753 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2754 if (c < 0) { ptr = saveptr; break; }
2757 /* Ordinary character or single-char escape */
2762 /* This "while" is the end of the "do" above. */
2764 while (runlength < 255 &&
2765 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2768 length += runlength;
2773 length += 4; /* For final KET and END */
2781 /* Compute the size of data block needed and get it, either from malloc or
2782 externally provided function. We specify "code[0]" in the offsetof() expression
2783 rather than just "code", because it has been reported that one broken compiler
2784 fails on "code" because it is also an independent variable. It should make no
2785 difference to the value of the offsetof(). */
2787 size = length + offsetof(real_pcre, code[0]);
2788 re = (real_pcre *)(pcre_malloc)(size);
2796 /* Put in the magic number, and save the size, options, and table pointer */
2798 re->magic_number = MAGIC_NUMBER;
2800 re->options = options;
2801 re->tables = tables;
2803 /* Set up a starting, non-extracting bracket, then compile the expression. On
2804 error, *errorptr will be set non-NULL, so we don't need to look at the result
2805 of the function here. */
2807 ptr = (const uschar *)pattern;
2811 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2812 &reqchar, &countlits, &compile_block);
2813 re->top_bracket = bracount;
2814 re->top_backref = top_backref;
2816 /* If not reached end of pattern on success, there's an excess bracket. */
2818 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2820 /* Fill in the terminating state and check for disastrous overflow, but
2821 if debugging, leave the test till after things are printed out. */
2826 if (code - re->code > length) *errorptr = ERR23;
2829 /* Give an error if there's back reference to a non-existent capturing
2832 if (top_backref > re->top_bracket) *errorptr = ERR15;
2834 /* Failed to compile */
2836 if (*errorptr != NULL)
2840 *erroroffset = ptr - (const uschar *)pattern;
2844 /* If the anchored option was not passed, set flag if we can determine that the
2845 pattern is anchored by virtue of ^ characters or \A or anything else (such as
2846 starting with .* when DOTALL is set).
2848 Otherwise, see if we can determine what the first character has to be, because
2849 that speeds up unanchored matches no end. If not, see if we can set the
2850 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2851 start with ^. and also when all branches start with .* for non-DOTALL matches.
2854 if ((options & PCRE_ANCHORED) == 0)
2856 int temp_options = options;
2857 if (is_anchored(re->code, &temp_options))
2858 re->options |= PCRE_ANCHORED;
2861 int ch = find_firstchar(re->code, &temp_options);
2864 re->first_char = ch;
2865 re->options |= PCRE_FIRSTSET;
2867 else if (is_startline(re->code))
2868 re->options |= PCRE_STARTLINE;
2872 /* Save the last required character if there are at least two literal
2873 characters on all paths, or if there is no first character setting. */
2875 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2877 re->req_char = reqchar;
2878 re->options |= PCRE_REQCHSET;
2881 /* Print out the compiled data for debugging */
2885 printf("Length = %d top_bracket = %d top_backref = %d\n",
2886 length, re->top_bracket, re->top_backref);
2888 if (re->options != 0)
2890 printf("%s%s%s%s%s%s%s%s%s\n",
2891 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2892 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2893 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2894 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2895 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2896 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2897 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2898 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2899 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2902 if ((re->options & PCRE_FIRSTSET) != 0)
2904 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2905 else printf("First char = \\x%02x\n", re->first_char);
2908 if ((re->options & PCRE_REQCHSET) != 0)
2910 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2911 else printf("Req char = \\x%02x\n", re->req_char);
2915 code_base = code = re->code;
2917 while (code < code_end)
2921 printf("%3d ", code - code_base);
2923 if (*code >= OP_BRA)
2925 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2932 printf(" %.2x %s", code[1], OP_names[*code]);
2937 printf("%3d Cond", (code[1] << 8) + code[2]);
2942 printf(" %.2d %s", code[1], OP_names[*code]);
2947 charlength = *(++code);
2948 printf("%3d ", charlength);
2949 while (charlength-- > 0)
2950 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2960 case OP_ASSERTBACK_NOT:
2962 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2967 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2978 case OP_TYPEMINSTAR:
2980 case OP_TYPEMINPLUS:
2982 case OP_TYPEMINQUERY:
2983 if (*code >= OP_TYPESTAR)
2984 printf(" %s", OP_names[code[1]]);
2985 else if (isprint(c = code[1])) printf(" %c", c);
2986 else printf(" \\x%02x", c);
2987 printf("%s", OP_names[*code++]);
2993 if (isprint(c = code[3])) printf(" %c{", c);
2994 else printf(" \\x%02x{", c);
2995 if (*code != OP_EXACT) printf("0,");
2996 printf("%d}", (code[1] << 8) + code[2]);
2997 if (*code == OP_MINUPTO) printf("?");
3003 case OP_TYPEMINUPTO:
3004 printf(" %s{", OP_names[code[3]]);
3005 if (*code != OP_TYPEEXACT) printf(",");
3006 printf("%d}", (code[1] << 8) + code[2]);
3007 if (*code == OP_TYPEMINUPTO) printf("?");
3012 if (isprint(c = *(++code))) printf(" [^%c]", c);
3013 else printf(" [^\\x%02x]", c);
3021 case OP_NOTMINQUERY:
3022 if (isprint(c = code[1])) printf(" [^%c]", c);
3023 else printf(" [^\\x%02x]", c);
3024 printf("%s", OP_names[*code++]);
3030 if (isprint(c = code[3])) printf(" [^%c]{", c);
3031 else printf(" [^\\x%02x]{", c);
3032 if (*code != OP_NOTEXACT) printf(",");
3033 printf("%d}", (code[1] << 8) + code[2]);
3034 if (*code == OP_NOTMINUPTO) printf("?");
3039 printf(" \\%d", *(++code));
3041 goto CLASS_REF_REPEAT;
3049 for (i = 0; i < 256; i++)
3051 if ((code[i/8] & (1 << (i&7))) != 0)
3054 for (j = i+1; j < 256; j++)
3055 if ((code[j/8] & (1 << (j&7))) == 0) break;
3056 if (i == '-' || i == ']') printf("\\");
3057 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3061 if (j == '-' || j == ']') printf("\\");
3062 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3080 printf("%s", OP_names[*code]);
3085 min = (code[1] << 8) + code[2];
3086 max = (code[3] << 8) + code[4];
3087 if (max == 0) printf("{%d,}", min);
3088 else printf("{%d,%d}", min, max);
3089 if (*code == OP_CRMINRANGE) printf("?");
3099 /* Anything else is just a one-node item */
3102 printf(" %s", OP_names[*code]);
3109 printf("------------------------------------------------------------------\n");
3111 /* This check is done here in the debugging case so that the code that
3112 was compiled can be seen. */
3114 if (code - re->code > length)
3118 *erroroffset = ptr - (uschar *)pattern;
3128 /*************************************************
3129 * Match a back-reference *
3130 *************************************************/
3132 /* If a back reference hasn't been set, the length that is passed is greater
3133 than the number of characters left in the string, so the match fails.
3136 offset index into the offset vector
3137 eptr points into the subject
3138 length length to be matched
3139 md points to match data block
3142 Returns: TRUE if matched
3146 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3147 unsigned long int ims)
3149 const uschar *p = md->start_subject + md->offset_vector[offset];
3152 if (eptr >= md->end_subject)
3153 printf("matching subject <null>");
3156 printf("matching subject ");
3157 pchars(eptr, length, TRUE, md);
3159 printf(" against backref ");
3160 pchars(p, length, FALSE, md);
3164 /* Always fail if not enough characters left */
3166 if (length > md->end_subject - eptr) return FALSE;
3168 /* Separate the caselesss case for speed */
3170 if ((ims & PCRE_CASELESS) != 0)
3172 while (length-- > 0)
3173 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3176 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3183 /*************************************************
3184 * Match from current position *
3185 *************************************************/
3187 /* On entry ecode points to the first opcode, and eptr to the first character
3188 in the subject string, while eptrb holds the value of eptr at the start of the
3189 last bracketed group - used for breaking infinite loops matching zero-length
3193 eptr pointer in subject
3194 ecode position in code
3195 offset_top current top pointer
3196 md pointer to "static" info for the match
3197 ims current /i, /m, and /s options
3198 condassert TRUE if called to check a condition assertion
3199 eptrb eptr at start of last bracket
3201 Returns: TRUE if matched
3205 match(register const uschar *eptr, register const uschar *ecode,
3206 int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
3207 const uschar *eptrb)
3209 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3213 int op = (int)*ecode;
3214 int min, max, ctype;
3217 BOOL minimize = FALSE;
3219 /* Opening capturing bracket. If there is space in the offset vector, save
3220 the current subject position in the working slot at the top of the vector. We
3221 mustn't change the current values of the data slot, because they may be set
3222 from a previous iteration of this group, and be referred to by a reference
3225 If the bracket fails to match, we need to restore this value and also the
3226 values of the final offsets, in case they were set by a previous iteration of
3229 If there isn't enough space in the offset vector, treat this as if it were a
3230 non-capturing bracket. Don't worry about setting the flag for the error case
3231 here; that is handled in the code for KET. */
3235 int number = op - OP_BRA;
3236 int offset = number << 1;
3239 printf("start bracket %d subject=", number);
3240 pchars(eptr, 16, TRUE, md);
3244 if (offset < md->offset_max)
3246 int save_offset1 = md->offset_vector[offset];
3247 int save_offset2 = md->offset_vector[offset+1];
3248 int save_offset3 = md->offset_vector[md->offset_end - number];
3250 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3251 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3255 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3256 ecode += (ecode[1] << 8) + ecode[2];
3258 while (*ecode == OP_ALT);
3260 DPRINTF(("bracket %d failed\n", number));
3262 md->offset_vector[offset] = save_offset1;
3263 md->offset_vector[offset+1] = save_offset2;
3264 md->offset_vector[md->offset_end - number] = save_offset3;
3268 /* Insufficient room for saving captured contents */
3273 /* Other types of node can be handled by a switch */
3277 case OP_BRA: /* Non-capturing bracket: optimized */
3278 DPRINTF(("start bracket 0\n"));
3281 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3282 ecode += (ecode[1] << 8) + ecode[2];
3284 while (*ecode == OP_ALT);
3285 DPRINTF(("bracket 0 failed\n"));
3288 /* Conditional group: compilation checked that there are no more than
3289 two branches. If the condition is false, skipping the first branch takes us
3290 past the end if there is only one branch, but that's OK because that is
3291 exactly what going to the ket would do. */
3294 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3296 int offset = ecode[4] << 1; /* Doubled reference number */
3298 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3299 5 : 3 + (ecode[1] << 8) + ecode[2]),
3300 offset_top, md, ims, FALSE, eptr);
3303 /* The condition is an assertion. Call match() to evaluate it - setting
3304 the final argument TRUE causes it to stop at the end of an assertion. */
3308 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3310 ecode += 3 + (ecode[4] << 8) + ecode[5];
3311 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3313 else ecode += (ecode[1] << 8) + ecode[2];
3314 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3316 /* Control never reaches here */
3318 /* Skip over conditional reference data if encountered (should not be) */
3324 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3325 an empty string - recursion will then try other alternatives, if any. */
3328 if (md->notempty && eptr == md->start_match) return FALSE;
3329 md->end_match_ptr = eptr; /* Record where we ended */
3330 md->end_offset_top = offset_top; /* and how many extracts were taken */
3333 /* Change option settings */
3338 DPRINTF(("ims set to %02lx\n", ims));
3341 /* Assertion brackets. Check the alternative branches in turn - the
3342 matching won't pass the KET for an assertion. If any one branch matches,
3343 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3344 start of each branch to move the current point backwards, so the code at
3345 this level is identical to the lookahead case. */
3351 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3352 ecode += (ecode[1] << 8) + ecode[2];
3354 while (*ecode == OP_ALT);
3355 if (*ecode == OP_KET) return FALSE;
3357 /* If checking an assertion for a condition, return TRUE. */
3359 if (condassert) return TRUE;
3361 /* Continue from after the assertion, updating the offsets high water
3362 mark, since extracts may have been taken during the assertion. */
3364 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3366 offset_top = md->end_offset_top;
3369 /* Negative assertion: all branches must fail to match */
3372 case OP_ASSERTBACK_NOT:
3375 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3376 ecode += (ecode[1] << 8) + ecode[2];
3378 while (*ecode == OP_ALT);
3380 if (condassert) return TRUE;
3384 /* Move the subject pointer back. This occurs only at the start of
3385 each branch of a lookbehind assertion. If we are too close to the start to
3386 move back, this match function fails. */
3389 eptr -= (ecode[1] << 8) + ecode[2];
3390 if (eptr < md->start_subject) return FALSE;
3394 /* Recursion matches the current regex, nested. If there are any capturing
3395 brackets started but not finished, we have to save their starting points
3396 and reinstate them after the recursion. However, we don't know how many
3397 such there are (offset_top records the completed total) so we just have
3398 to save all the potential data. There may be up to 99 such values, which
3399 is a bit large to put on the stack, but using malloc for small numbers
3400 seems expensive. As a compromise, the stack is used when there are fewer
3401 than 16 values to store; otherwise malloc is used. A problem is what to do
3402 if the malloc fails ... there is no way of returning to the top level with
3403 an error. Save the top 15 values on the stack, and accept that the rest
3414 if (c < 16) save = stacksave; else
3416 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3424 for (i = 1; i <= c; i++)
3425 save[i] = md->offset_vector[md->offset_end - i];
3426 rc = match(eptr, md->start_pattern, offset_top, md, ims, FALSE, eptrb);
3427 for (i = 1; i <= c; i++)
3428 md->offset_vector[md->offset_end - i] = save[i];
3429 if (save != stacksave) (pcre_free)(save);
3430 if (!rc) return FALSE;
3432 /* In case the recursion has set more capturing values, save the final
3433 number, then move along the subject till after the recursive match,
3434 and advance one byte in the pattern code. */
3436 offset_top = md->end_offset_top;
3437 eptr = md->end_match_ptr;
3442 /* "Once" brackets are like assertion brackets except that after a match,
3443 the point in the subject string is not moved back. Thus there can never be
3444 a move back into the brackets. Check the alternative branches in turn - the
3445 matching won't pass the KET for this kind of subpattern. If any one branch
3446 matches, we carry on as at the end of a normal bracket, leaving the subject
3451 const uschar *prev = ecode;
3455 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3456 ecode += (ecode[1] << 8) + ecode[2];
3458 while (*ecode == OP_ALT);
3460 /* If hit the end of the group (which could be repeated), fail */
3462 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3464 /* Continue as from after the assertion, updating the offsets high water
3465 mark, since extracts may have been taken. */
3467 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3469 offset_top = md->end_offset_top;
3470 eptr = md->end_match_ptr;
3472 /* For a non-repeating ket, just continue at this level. This also
3473 happens for a repeating ket if no characters were matched in the group.
3474 This is the forcible breaking of infinite loops as implemented in Perl
3475 5.005. If there is an options reset, it will get obeyed in the normal
3476 course of events. */
3478 if (*ecode == OP_KET || eptr == eptrb)
3484 /* The repeating kets try the rest of the pattern or restart from the
3485 preceding bracket, in the appropriate order. We need to reset any options
3486 that changed within the bracket before re-running it, so check the next
3489 if (ecode[3] == OP_OPT)
3491 ims = (ims & ~PCRE_IMS) | ecode[4];
3492 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3495 if (*ecode == OP_KETRMIN)
3497 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3498 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3500 else /* OP_KETRMAX */
3502 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3503 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3508 /* An alternation is the end of a branch; scan along to find the end of the
3509 bracketed group and go to there. */
3512 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3515 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3516 that it may occur zero times. It may repeat infinitely, or not at all -
3517 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3518 repeat limits are compiled as a number of copies, with the optional ones
3519 preceded by BRAZERO or BRAMINZERO. */
3523 const uschar *next = ecode+1;
3524 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3525 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3532 const uschar *next = ecode+1;
3533 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3534 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3539 /* End of a group, repeated or non-repeating. If we are at the end of
3540 an assertion "group", stop matching and return TRUE, but record the
3541 current high water mark for use by positive assertions. Do this also
3542 for the "once" (not-backup up) groups. */
3548 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3550 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3551 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3554 md->end_match_ptr = eptr; /* For ONCE */
3555 md->end_offset_top = offset_top;
3559 /* In all other cases except a conditional group we have to check the
3560 group number back at the start and if necessary complete handling an
3561 extraction by setting the offsets and bumping the high water mark. */
3563 if (*prev != OP_COND)
3565 int number = *prev - OP_BRA;
3566 int offset = number << 1;
3568 DPRINTF(("end bracket %d\n", number));
3572 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3574 md->offset_vector[offset] =
3575 md->offset_vector[md->offset_end - number];
3576 md->offset_vector[offset+1] = eptr - md->start_subject;
3577 if (offset_top <= offset) offset_top = offset + 2;
3582 /* Reset the value of the ims flags, in case they got changed during
3586 DPRINTF(("ims reset to %02lx\n", ims));
3588 /* For a non-repeating ket, just continue at this level. This also
3589 happens for a repeating ket if no characters were matched in the group.
3590 This is the forcible breaking of infinite loops as implemented in Perl
3591 5.005. If there is an options reset, it will get obeyed in the normal
3592 course of events. */
3594 if (*ecode == OP_KET || eptr == eptrb)
3600 /* The repeating kets try the rest of the pattern or restart from the
3601 preceding bracket, in the appropriate order. */
3603 if (*ecode == OP_KETRMIN)
3605 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3606 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3608 else /* OP_KETRMAX */
3610 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3611 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3616 /* Start of subject unless notbol, or after internal newline if multiline */
3619 if (md->notbol && eptr == md->start_subject) return FALSE;
3620 if ((ims & PCRE_MULTILINE) != 0)
3622 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3626 /* ... else fall through */
3628 /* Start of subject assertion */
3631 if (eptr != md->start_subject) return FALSE;
3635 /* Assert before internal newline if multiline, or before a terminating
3636 newline unless endonly is set, else end of subject unless noteol is set. */
3639 if ((ims & PCRE_MULTILINE) != 0)
3641 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3642 else { if (md->noteol) return FALSE; }
3648 if (md->noteol) return FALSE;
3651 if (eptr < md->end_subject - 1 ||
3652 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3658 /* ... else fall through */
3660 /* End of subject assertion (\z) */
3663 if (eptr < md->end_subject) return FALSE;
3667 /* End of subject or ending \n assertion (\Z) */
3670 if (eptr < md->end_subject - 1 ||
3671 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3675 /* Word boundary assertions */
3677 case OP_NOT_WORD_BOUNDARY:
3678 case OP_WORD_BOUNDARY:
3680 BOOL prev_is_word = (eptr != md->start_subject) &&
3681 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3682 BOOL cur_is_word = (eptr < md->end_subject) &&
3683 ((md->ctypes[*eptr] & ctype_word) != 0);
3684 if ((*ecode++ == OP_WORD_BOUNDARY)?
3685 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3690 /* Match a single character type; inline for speed */
3693 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3695 if (eptr++ >= md->end_subject) return FALSE;
3700 if (eptr >= md->end_subject ||
3701 (md->ctypes[*eptr++] & ctype_digit) != 0)
3707 if (eptr >= md->end_subject ||
3708 (md->ctypes[*eptr++] & ctype_digit) == 0)
3713 case OP_NOT_WHITESPACE:
3714 if (eptr >= md->end_subject ||
3715 (md->ctypes[*eptr++] & ctype_space) != 0)
3721 if (eptr >= md->end_subject ||
3722 (md->ctypes[*eptr++] & ctype_space) == 0)
3727 case OP_NOT_WORDCHAR:
3728 if (eptr >= md->end_subject ||
3729 (md->ctypes[*eptr++] & ctype_word) != 0)
3735 if (eptr >= md->end_subject ||
3736 (md->ctypes[*eptr++] & ctype_word) == 0)
3741 /* Match a back reference, possibly repeatedly. Look past the end of the
3742 item to see if there is repeat information following. The code is similar
3743 to that for character classes, but repeated for efficiency. Then obey
3744 similar code to character type repeats - written out again for speed.
3745 However, if the referenced string is the empty string, always treat
3746 it as matched, any number of times (otherwise there could be infinite
3752 int offset = ecode[1] << 1; /* Doubled reference number */
3753 ecode += 2; /* Advance past the item */
3755 /* If the reference is unset, set the length to be longer than the amount
3756 of subject left; this ensures that every attempt at a match fails. We
3757 can't just fail here, because of the possibility of quantifiers with zero
3760 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3761 md->end_subject - eptr + 1 :
3762 md->offset_vector[offset+1] - md->offset_vector[offset];
3764 /* Set up for repetition, or handle the non-repeated case */
3774 c = *ecode++ - OP_CRSTAR;
3775 minimize = (c & 1) != 0;
3776 min = rep_min[c]; /* Pick up values from tables; */
3777 max = rep_max[c]; /* zero for max => infinity */
3778 if (max == 0) max = INT_MAX;
3783 minimize = (*ecode == OP_CRMINRANGE);
3784 min = (ecode[1] << 8) + ecode[2];
3785 max = (ecode[3] << 8) + ecode[4];
3786 if (max == 0) max = INT_MAX;
3790 default: /* No repeat follows */
3791 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3793 continue; /* With the main loop */
3796 /* If the length of the reference is zero, just continue with the
3799 if (length == 0) continue;
3801 /* First, ensure the minimum number of matches are present. We get back
3802 the length of the reference string explicitly rather than passing the
3803 address of eptr, so that eptr can be a register variable. */
3805 for (i = 1; i <= min; i++)
3807 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3811 /* If min = max, continue at the same level without recursion.
3812 They are not both allowed to be zero. */
3814 if (min == max) continue;
3816 /* If minimizing, keep trying and advancing the pointer */
3822 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3824 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3828 /* Control never gets here */
3831 /* If maximizing, find the longest string and work backwards */
3835 const uschar *pp = eptr;
3836 for (i = min; i < max; i++)
3838 if (!match_ref(offset, eptr, length, md, ims)) break;
3843 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3850 /* Control never gets here */
3854 /* Match a character class, possibly repeatedly. Look past the end of the
3855 item to see if there is repeat information following. Then obey similar
3856 code to character type repeats - written out again for speed. */
3860 const uschar *data = ecode + 1; /* Save for matching */
3861 ecode += 33; /* Advance past the item */
3871 c = *ecode++ - OP_CRSTAR;
3872 minimize = (c & 1) != 0;
3873 min = rep_min[c]; /* Pick up values from tables; */
3874 max = rep_max[c]; /* zero for max => infinity */
3875 if (max == 0) max = INT_MAX;
3880 minimize = (*ecode == OP_CRMINRANGE);
3881 min = (ecode[1] << 8) + ecode[2];
3882 max = (ecode[3] << 8) + ecode[4];
3883 if (max == 0) max = INT_MAX;
3887 default: /* No repeat follows */
3892 /* First, ensure the minimum number of matches are present. */
3894 for (i = 1; i <= min; i++)
3896 if (eptr >= md->end_subject) return FALSE;
3898 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3902 /* If max == min we can continue with the main loop without the
3905 if (min == max) continue;
3907 /* If minimizing, keep testing the rest of the expression and advancing
3908 the pointer while it matches the class. */
3914 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3916 if (i >= max || eptr >= md->end_subject) return FALSE;
3918 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3921 /* Control never gets here */
3924 /* If maximizing, find the longest possible run, then work backwards. */
3928 const uschar *pp = eptr;
3929 for (i = min; i < max; eptr++, i++)
3931 if (eptr >= md->end_subject) break;
3933 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3938 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3943 /* Control never gets here */
3945 /* Match a run of characters */
3949 register int length = ecode[1];
3952 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3953 if (eptr >= md->end_subject)
3954 printf("matching subject <null> against pattern ");
3957 printf("matching subject ");
3958 pchars(eptr, length, TRUE, md);
3959 printf(" against pattern ");
3961 pchars(ecode, length, FALSE, md);
3965 if (length > md->end_subject - eptr) return FALSE;
3966 if ((ims & PCRE_CASELESS) != 0)
3968 while (length-- > 0)
3969 if (md->lcc[*ecode++] != md->lcc[*eptr++])
3974 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3979 /* Match a single character repeatedly; different opcodes share code. */
3982 min = max = (ecode[1] << 8) + ecode[2];
3989 max = (ecode[1] << 8) + ecode[2];
3990 minimize = *ecode == OP_MINUPTO;
4000 c = *ecode++ - OP_STAR;
4001 minimize = (c & 1) != 0;
4002 min = rep_min[c]; /* Pick up values from tables; */
4003 max = rep_max[c]; /* zero for max => infinity */
4004 if (max == 0) max = INT_MAX;
4006 /* Common code for all repeated single-character matches. We can give
4007 up quickly if there are fewer than the minimum number of characters left in
4011 if (min > md->end_subject - eptr) return FALSE;
4014 /* The code is duplicated for the caseless and caseful cases, for speed,
4015 since matching characters is likely to be quite common. First, ensure the
4016 minimum number of matches are present. If min = max, continue at the same
4017 level without recursing. Otherwise, if minimizing, keep trying the rest of
4018 the expression and advancing one matching character if failing, up to the
4019 maximum. Alternatively, if maximizing, find the maximum number of
4020 characters and work backwards. */
4022 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4025 if ((ims & PCRE_CASELESS) != 0)
4028 for (i = 1; i <= min; i++)
4029 if (c != md->lcc[*eptr++]) return FALSE;
4030 if (min == max) continue;
4035 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4037 if (i >= max || eptr >= md->end_subject ||
4038 c != md->lcc[*eptr++])
4041 /* Control never gets here */
4045 const uschar *pp = eptr;
4046 for (i = min; i < max; i++)
4048 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4052 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4056 /* Control never gets here */
4059 /* Caseful comparisons */
4063 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4064 if (min == max) continue;
4069 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4071 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4073 /* Control never gets here */
4077 const uschar *pp = eptr;
4078 for (i = min; i < max; i++)
4080 if (eptr >= md->end_subject || c != *eptr) break;
4084 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4089 /* Control never gets here */
4091 /* Match a negated single character */
4094 if (eptr >= md->end_subject) return FALSE;
4096 if ((ims & PCRE_CASELESS) != 0)
4098 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4102 if (*ecode++ == *eptr++) return FALSE;
4106 /* Match a negated single character repeatedly. This is almost a repeat of
4107 the code for a repeated single character, but I haven't found a nice way of
4108 commoning these up that doesn't require a test of the positive/negative
4109 option for each character match. Maybe that wouldn't add very much to the
4110 time taken, but character matching *is* what this is all about... */
4113 min = max = (ecode[1] << 8) + ecode[2];
4120 max = (ecode[1] << 8) + ecode[2];
4121 minimize = *ecode == OP_NOTMINUPTO;
4130 case OP_NOTMINQUERY:
4131 c = *ecode++ - OP_NOTSTAR;
4132 minimize = (c & 1) != 0;
4133 min = rep_min[c]; /* Pick up values from tables; */
4134 max = rep_max[c]; /* zero for max => infinity */
4135 if (max == 0) max = INT_MAX;
4137 /* Common code for all repeated single-character matches. We can give
4138 up quickly if there are fewer than the minimum number of characters left in
4142 if (min > md->end_subject - eptr) return FALSE;
4145 /* The code is duplicated for the caseless and caseful cases, for speed,
4146 since matching characters is likely to be quite common. First, ensure the
4147 minimum number of matches are present. If min = max, continue at the same
4148 level without recursing. Otherwise, if minimizing, keep trying the rest of
4149 the expression and advancing one matching character if failing, up to the
4150 maximum. Alternatively, if maximizing, find the maximum number of
4151 characters and work backwards. */
4153 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4156 if ((ims & PCRE_CASELESS) != 0)
4159 for (i = 1; i <= min; i++)
4160 if (c == md->lcc[*eptr++]) return FALSE;
4161 if (min == max) continue;
4166 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4168 if (i >= max || eptr >= md->end_subject ||
4169 c == md->lcc[*eptr++])
4172 /* Control never gets here */
4176 const uschar *pp = eptr;
4177 for (i = min; i < max; i++)
4179 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4183 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4187 /* Control never gets here */
4190 /* Caseful comparisons */
4194 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4195 if (min == max) continue;
4200 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4202 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4204 /* Control never gets here */
4208 const uschar *pp = eptr;
4209 for (i = min; i < max; i++)
4211 if (eptr >= md->end_subject || c == *eptr) break;
4215 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4220 /* Control never gets here */
4222 /* Match a single character type repeatedly; several different opcodes
4223 share code. This is very similar to the code for single characters, but we
4224 repeat it in the interests of efficiency. */
4227 min = max = (ecode[1] << 8) + ecode[2];
4233 case OP_TYPEMINUPTO:
4235 max = (ecode[1] << 8) + ecode[2];
4236 minimize = *ecode == OP_TYPEMINUPTO;
4241 case OP_TYPEMINSTAR:
4243 case OP_TYPEMINPLUS:
4245 case OP_TYPEMINQUERY:
4246 c = *ecode++ - OP_TYPESTAR;
4247 minimize = (c & 1) != 0;
4248 min = rep_min[c]; /* Pick up values from tables; */
4249 max = rep_max[c]; /* zero for max => infinity */
4250 if (max == 0) max = INT_MAX;
4252 /* Common code for all repeated single character type matches */
4255 ctype = *ecode++; /* Code for the character type */
4257 /* First, ensure the minimum number of matches are present. Use inline
4258 code for maximizing the speed, and do the type test once at the start
4259 (i.e. keep it out of the loop). Also test that there are at least the
4260 minimum number of characters before we start. */
4262 if (min > md->end_subject - eptr) return FALSE;
4263 if (min > 0) switch(ctype)
4266 if ((ims & PCRE_DOTALL) == 0)
4267 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4272 for (i = 1; i <= min; i++)
4273 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4277 for (i = 1; i <= min; i++)
4278 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4281 case OP_NOT_WHITESPACE:
4282 for (i = 1; i <= min; i++)
4283 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4287 for (i = 1; i <= min; i++)
4288 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4291 case OP_NOT_WORDCHAR:
4292 for (i = 1; i <= min; i++)
4293 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4298 for (i = 1; i <= min; i++)
4299 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4304 /* If min = max, continue at the same level without recursing */
4306 if (min == max) continue;
4308 /* If minimizing, we have to test the rest of the pattern before each
4309 subsequent match. */
4315 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
4316 if (i >= max || eptr >= md->end_subject) return FALSE;
4322 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4326 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4330 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4333 case OP_NOT_WHITESPACE:
4334 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4338 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4341 case OP_NOT_WORDCHAR:
4342 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4346 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4350 /* Control never gets here */
4353 /* If maximizing it is worth using inline code for speed, doing the type
4354 test once at the start (i.e. keep it out of the loop). */
4358 const uschar *pp = eptr;
4362 if ((ims & PCRE_DOTALL) == 0)
4364 for (i = min; i < max; i++)
4366 if (eptr >= md->end_subject || *eptr == '\n') break;
4373 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4379 for (i = min; i < max; i++)
4381 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4388 for (i = min; i < max; i++)
4390 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4396 case OP_NOT_WHITESPACE:
4397 for (i = min; i < max; i++)
4399 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4406 for (i = min; i < max; i++)
4408 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4414 case OP_NOT_WORDCHAR:
4415 for (i = min; i < max; i++)
4417 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4424 for (i = min; i < max; i++)
4426 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4434 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4438 /* Control never gets here */
4440 /* There's been some horrible disaster. */
4443 DPRINTF(("Unknown opcode %d\n", *ecode));
4444 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4448 /* Do not stick any code in here without much thought; it is assumed
4449 that "continue" in the code above comes out to here to repeat the main
4452 } /* End of main loop */
4453 /* Control never reaches here */
4459 /*************************************************
4460 * Execute a Regular Expression *
4461 *************************************************/
4463 /* This function applies a compiled re to a subject string and picks out
4464 portions of the string if it matches. Two elements in the vector are set for
4465 each substring: the offsets to the start and end of the substring.
4468 external_re points to the compiled expression
4469 external_extra points to "hints" from pcre_study() or is NULL
4470 subject points to the subject string
4471 length length of subject string (may contain binary zeros)
4472 start_offset where to start in the subject string
4474 offsets points to a vector of ints to be filled in with offsets
4475 offsetcount the number of elements in the vector
4477 Returns: > 0 => success; value is the number of elements filled in
4478 = 0 => success, but offsets is not big enough
4479 -1 => failed to match
4480 < -1 => some kind of unexpected problem
4484 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4485 const char *subject, int length, int start_offset, int options, int *offsets,
4488 int resetcount, ocount;
4489 int first_char = -1;
4492 unsigned long int ims = 0;
4493 match_data match_block;
4494 const uschar *start_bits = NULL;
4495 const uschar *start_match = (const uschar *)subject + start_offset;
4496 const uschar *end_subject;
4497 const uschar *req_char_ptr = start_match - 1;
4498 const real_pcre *re = (const real_pcre *)external_re;
4499 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4500 BOOL using_temporary_offsets = FALSE;
4501 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4502 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4504 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4506 if (re == NULL || subject == NULL ||
4507 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4508 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4510 match_block.start_pattern = re->code;
4511 match_block.start_subject = (const uschar *)subject;
4512 match_block.end_subject = match_block.start_subject + length;
4513 end_subject = match_block.end_subject;
4515 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4517 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4518 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4519 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4521 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4523 match_block.lcc = re->tables + lcc_offset;
4524 match_block.ctypes = re->tables + ctypes_offset;
4526 /* The ims options can vary during the matching as a result of the presence
4527 of (?ims) items in the pattern. They are kept in a local variable so that
4528 restoring at the exit of a group is easy. */
4530 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4532 /* If the expression has got more back references than the offsets supplied can
4533 hold, we get a temporary bit of working store to use during the matching.
4534 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4537 ocount = offsetcount - (offsetcount % 3);
4539 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4541 ocount = re->top_backref * 3 + 3;
4542 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4543 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4544 using_temporary_offsets = TRUE;
4545 DPRINTF(("Got memory to hold back references\n"));
4547 else match_block.offset_vector = offsets;
4549 match_block.offset_end = ocount;
4550 match_block.offset_max = (2*ocount)/3;
4551 match_block.offset_overflow = FALSE;
4553 /* Compute the minimum number of offsets that we need to reset each time. Doing
4554 this makes a huge difference to execution time when there aren't many brackets
4557 resetcount = 2 + re->top_bracket * 2;
4558 if (resetcount > offsetcount) resetcount = ocount;
4560 /* Reset the working variable associated with each extraction. These should
4561 never be used unless previously set, but they get saved and restored, and so we
4562 initialize them to avoid reading uninitialized locations. */
4564 if (match_block.offset_vector != NULL)
4566 register int *iptr = match_block.offset_vector + ocount;
4567 register int *iend = iptr - resetcount/2 + 1;
4568 while (--iptr >= iend) *iptr = -1;
4571 /* Set up the first character to match, if available. The first_char value is
4572 never set for an anchored regular expression, but the anchoring may be forced
4573 at run time, so we have to test for anchoring. The first char may be unset for
4574 an unanchored pattern, of course. If there's no first char and the pattern was
4575 studied, there may be a bitmap of possible first characters. */
4579 if ((re->options & PCRE_FIRSTSET) != 0)
4581 first_char = re->first_char;
4582 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4585 if (!startline && extra != NULL &&
4586 (extra->options & PCRE_STUDY_MAPPED) != 0)
4587 start_bits = extra->start_bits;
4590 /* For anchored or unanchored matches, there may be a "last known required
4591 character" set. If the PCRE_CASELESS is set, implying that the match starts
4592 caselessly, or if there are any changes of this flag within the regex, set up
4593 both cases of the character. Otherwise set the two values the same, which will
4594 avoid duplicate testing (which takes significant time). This covers the vast
4595 majority of cases. It will be suboptimal when the case flag changes in a regex
4596 and the required character in fact is caseful. */
4598 if ((re->options & PCRE_REQCHSET) != 0)
4600 req_char = re->req_char;
4601 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4602 (re->tables + fcc_offset)[req_char] : req_char;
4605 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4606 the loop runs just once. */
4611 register int *iptr = match_block.offset_vector;
4612 register int *iend = iptr + resetcount;
4614 /* Reset the maximum number of extractions we might see. */
4616 while (iptr < iend) *iptr++ = -1;
4618 /* Advance to a unique first char if possible */
4620 if (first_char >= 0)
4622 if ((ims & PCRE_CASELESS) != 0)
4623 while (start_match < end_subject &&
4624 match_block.lcc[*start_match] != first_char)
4627 while (start_match < end_subject && *start_match != first_char)
4631 /* Or to just after \n for a multiline match if possible */
4635 if (start_match > match_block.start_subject + start_offset)
4637 while (start_match < end_subject && start_match[-1] != '\n')
4642 /* Or to a non-unique first char after study */
4644 else if (start_bits != NULL)
4646 while (start_match < end_subject)
4648 register int c = *start_match;
4649 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4653 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4654 printf(">>>> Match against: ");
4655 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4659 /* If req_char is set, we know that that character must appear in the subject
4660 for the match to succeed. If the first character is set, req_char must be
4661 later in the subject; otherwise the test starts at the match point. This
4662 optimization can save a huge amount of backtracking in patterns with nested
4663 unlimited repeats that aren't going to match. We don't know what the state of
4664 case matching may be when this character is hit, so test for it in both its
4665 cases if necessary. However, the different cased versions will not be set up
4666 unless PCRE_CASELESS was given or the casing state changes within the regex.
4667 Writing separate code makes it go faster, as does using an autoincrement and
4668 backing off on a match. */
4672 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4674 /* We don't need to repeat the search if we haven't yet reached the
4675 place we found it at last time. */
4677 if (p > req_char_ptr)
4679 /* Do a single test if no case difference is set up */
4681 if (req_char == req_char2)
4683 while (p < end_subject)
4685 if (*p++ == req_char) { p--; break; }
4689 /* Otherwise test for either case */
4693 while (p < end_subject)
4695 register int pp = *p++;
4696 if (pp == req_char || pp == req_char2) { p--; break; }
4700 /* If we can't find the required character, break the matching loop */
4702 if (p >= end_subject) break;
4704 /* If we have found the required character, save the point where we
4705 found it, so that we don't search again next time round the loop if
4706 the start hasn't passed this character yet. */
4712 /* When a match occurs, substrings will be set for all internal extractions;
4713 we just need to set up the whole thing as substring 0 before returning. If
4714 there were too many extractions, set the return code to zero. In the case
4715 where we had to get some local store to hold offsets for backreferences, copy
4716 those back references that we can. In this case there need not be overflow
4717 if certain parts of the pattern were not used. */
4719 match_block.start_match = start_match;
4720 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4723 /* Copy the offset information from temporary store if necessary */
4725 if (using_temporary_offsets)
4727 if (offsetcount >= 4)
4729 memcpy(offsets + 2, match_block.offset_vector + 2,
4730 (offsetcount - 2) * sizeof(int));
4731 DPRINTF(("Copied offsets from temporary memory\n"));
4733 if (match_block.end_offset_top > offsetcount)
4734 match_block.offset_overflow = TRUE;
4736 DPRINTF(("Freeing temporary memory\n"));
4737 (pcre_free)(match_block.offset_vector);
4740 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4742 if (match_block.offset_end < 2) rc = 0; else
4744 offsets[0] = start_match - match_block.start_subject;
4745 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4748 DPRINTF((">>>> returning %d\n", rc));
4752 /* This "while" is the end of the "do" above */
4755 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4756 start_match++ < end_subject);
4758 if (using_temporary_offsets)
4760 DPRINTF(("Freeing temporary memory\n"));
4761 (pcre_free)(match_block.offset_vector);
4764 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4766 return match_block.errorcode;