From 90a2d1979486b2e7fa14be9c3210b9d321c668f8 Mon Sep 17 00:00:00 2001 From: Nuno Lopes Date: Sat, 11 Apr 2009 18:57:27 +0000 Subject: [PATCH] upgrade PCRE to version 7.9 --- NEWS | 1 + ext/pcre/pcrelib/AUTHORS | 2 +- ext/pcre/pcrelib/ChangeLog | 122 ++ ext/pcre/pcrelib/LICENCE | 2 +- ext/pcre/pcrelib/NEWS | 5 + ext/pcre/pcrelib/NON-UNIX-USE | 13 +- ext/pcre/pcrelib/README | 27 +- ext/pcre/pcrelib/config.h | 16 +- ext/pcre/pcrelib/dftables.c | 2 +- ext/pcre/pcrelib/doc/pcre.txt | 1894 +++++++++++++------------ ext/pcre/pcrelib/pcre.h | 9 +- ext/pcre/pcrelib/pcre_compile.c | 861 ++++++----- ext/pcre/pcrelib/pcre_config.c | 4 +- ext/pcre/pcrelib/pcre_exec.c | 233 +-- ext/pcre/pcrelib/pcre_fullinfo.c | 2 +- ext/pcre/pcrelib/pcre_get.c | 2 +- ext/pcre/pcrelib/pcre_globals.c | 2 +- ext/pcre/pcrelib/pcre_info.c | 2 +- ext/pcre/pcrelib/pcre_internal.h | 588 +++++++- ext/pcre/pcrelib/pcre_maketables.c | 2 +- ext/pcre/pcrelib/pcre_newline.c | 6 +- ext/pcre/pcrelib/pcre_ord2utf8.c | 2 +- ext/pcre/pcrelib/pcre_printint.src | 6 +- ext/pcre/pcrelib/pcre_refcount.c | 2 +- ext/pcre/pcrelib/pcre_study.c | 2 +- ext/pcre/pcrelib/pcre_tables.c | 358 +++-- ext/pcre/pcrelib/pcre_try_flipped.c | 2 +- ext/pcre/pcrelib/pcre_valid_utf8.c | 4 +- ext/pcre/pcrelib/pcre_version.c | 2 +- ext/pcre/pcrelib/pcre_xclass.c | 7 +- ext/pcre/pcrelib/pcreposix.c | 1 + ext/pcre/pcrelib/pcreposix.h | 1 + ext/pcre/pcrelib/testdata/grepinputx | 1 + ext/pcre/pcrelib/testdata/grepoutput | 27 +- ext/pcre/pcrelib/testdata/testinput1 | 25 + ext/pcre/pcrelib/testdata/testinput2 | 47 + ext/pcre/pcrelib/testdata/testinput5 | 5 + ext/pcre/pcrelib/testdata/testinput6 | 8 + ext/pcre/pcrelib/testdata/testinput7 | 33 +- ext/pcre/pcrelib/testdata/testoutput1 | 37 + ext/pcre/pcrelib/testdata/testoutput2 | 202 +++ ext/pcre/pcrelib/testdata/testoutput5 | 11 + ext/pcre/pcrelib/testdata/testoutput6 | 10 + ext/pcre/pcrelib/testdata/testoutput7 | 120 +- ext/pcre/pcrelib/ucp.h | 5 +- 45 files changed, 3205 insertions(+), 1508 
deletions(-) diff --git a/NEWS b/NEWS index a1a9050dfd..319edf87d5 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,7 @@ PHP NEWS ?? ??? 200?, PHP 5.3.0 RC 2 - Undeprecated ticks. (Arnaud) - Upgraded bundled sqlite to version 3.6.12. (Scott) +- Upgraded bundled PCRE to version 7.9. (Nuno) - Added 'n' flag to fopen to allow passing O_NONBLOCK to the underlying open(2) system call. (Mikko) diff --git a/ext/pcre/pcrelib/AUTHORS b/ext/pcre/pcrelib/AUTHORS index 88b993b698..44ff433d8d 100644 --- a/ext/pcre/pcrelib/AUTHORS +++ b/ext/pcre/pcrelib/AUTHORS @@ -8,7 +8,7 @@ Email domain: cam.ac.uk University of Cambridge Computing Service, Cambridge, England. -Copyright (c) 1997-2008 University of Cambridge +Copyright (c) 1997-2009 University of Cambridge All rights reserved diff --git a/ext/pcre/pcrelib/ChangeLog b/ext/pcre/pcrelib/ChangeLog index fa137fb74d..93a5415ee7 100644 --- a/ext/pcre/pcrelib/ChangeLog +++ b/ext/pcre/pcrelib/ChangeLog @@ -1,6 +1,128 @@ ChangeLog for PCRE ------------------ +Version 7.9 11-Apr-09 +--------------------- + +1. When building with support for bzlib/zlib (pcregrep) and/or readline + (pcretest), all targets were linked against these libraries. This included + libpcre, libpcreposix, and libpcrecpp, even though they do not use these + libraries. This caused unwanted dependencies to be created. This problem + has been fixed, and now only pcregrep is linked with bzlib/zlib and only + pcretest is linked with readline. + +2. The "typedef int BOOL" in pcre_internal.h that was included inside the + "#ifndef FALSE" condition by an earlier change (probably 7.8/18) has been + moved outside it again, because FALSE and TRUE are already defined in AIX, + but BOOL is not. + +3. The pcre_config() function was treating the PCRE_MATCH_LIMIT and + PCRE_MATCH_LIMIT_RECURSION values as ints, when they should be long ints. + +4. 
The pcregrep documentation said spaces were inserted as well as colons (or + hyphens) following file names and line numbers when outputting matching + lines. This is not true; no spaces are inserted. I have also clarified the + wording for the --colour (or --color) option. + +5. In pcregrep, when --colour was used with -o, the list of matching strings + was not coloured; this is different to GNU grep, so I have changed it to be + the same. + +6. When --colo(u)r was used in pcregrep, only the first matching substring in + each matching line was coloured. Now it goes on to look for further matches + of any of the test patterns, which is the same behaviour as GNU grep. + +7. A pattern that could match an empty string could cause pcregrep to loop; it + doesn't make sense to accept an empty string match in pcregrep, so I have + locked it out (using PCRE's PCRE_NOTEMPTY option). By experiment, this + seems to be how GNU grep behaves. + +8. The pattern (?(?=.*b)b|^) was incorrectly compiled as "match must be at + start or after a newline", because the conditional assertion was not being + correctly handled. The rule now is that both the assertion and what follows + in the first alternative must satisfy the test. + +9. If auto-callout was enabled in a pattern with a conditional group whose + condition was an assertion, PCRE could crash during matching, both with + pcre_exec() and pcre_dfa_exec(). + +10. The PCRE_DOLLAR_ENDONLY option was not working when pcre_dfa_exec() was + used for matching. + +11. Unicode property support in character classes was not working for + characters (bytes) greater than 127 when not in UTF-8 mode. + +12. Added the -M command line option to pcretest. + +14. Added the non-standard REG_NOTEMPTY option to the POSIX interface. + +15. Added the PCRE_NO_START_OPTIMIZE match-time option. + +16. Added comments and documentation about mis-use of no_arg in the C++ + wrapper. + +17. 
Implemented support for UTF-8 encoding in EBCDIC environments, a patch + from Martin Jerabek that uses macro names for all relevant character and + string constants. + +18. Added to pcre_internal.h two configuration checks: (a) If both EBCDIC and + SUPPORT_UTF8 are set, give an error; (b) If SUPPORT_UCP is set without + SUPPORT_UTF8, define SUPPORT_UTF8. The "configure" script handles both of + these, but not everybody uses configure. + +19. A conditional group that had only one branch was not being correctly + recognized as an item that could match an empty string. This meant that an + enclosing group might also not be so recognized, causing infinite looping + (and probably a segfault) for patterns such as ^"((?(?=[a])[^"])|b)*"$ + with the subject "ab", where knowledge that the repeated group can match + nothing is needed in order to break the loop. + +20. If a pattern that was compiled with callouts was matched using pcre_dfa_ + exec(), but without supplying a callout function, matching went wrong. + +21. If PCRE_ERROR_MATCHLIMIT occurred during a recursion, there was a memory + leak if the size of the offset vector was greater than 30. When the vector + is smaller, the saved offsets during recursion go onto a local stack + vector, but for larger vectors malloc() is used. It was failing to free + when the recursion yielded PCRE_ERROR_MATCHLIMIT (or any other "abnormal" + error, in fact). + +22. There was a missing #ifdef SUPPORT_UTF8 round one of the variables in the + heapframe that is used only when UTF-8 support is enabled. This caused no + problem, but was untidy. + +23. Steven Van Ingelgem's patch to CMakeLists.txt to change the name + CMAKE_BINARY_DIR to PROJECT_BINARY_DIR so that it works when PCRE is + included within another project. + +24. Steven Van Ingelgem's patches to add more options to the CMake support, + slightly modified by me: + + (a) PCRE_BUILD_TESTS can be set OFF not to build the tests, including + not building pcregrep. 
+ + (b) PCRE_BUILD_PCREGREP can be set OFF not to build pcregrep, but only + if PCRE_BUILD_TESTS is also set OFF, because the tests use pcregrep. + +25. Forward references, both numeric and by name, in patterns that made use of + duplicate group numbers, could behave incorrectly or give incorrect errors, + because when scanning forward to find the reference group, PCRE was not + taking into account the duplicate group numbers. A pattern such as + ^X(?3)(a)(?|(b)|(q))(Y) is an example. + +26. Changed a few more instances of "const unsigned char *" to USPTR, making + the feature of a custom pointer more persuasive (as requested by a user). + +27. Wrapped the definitions of fileno and isatty for Windows, which appear in + pcretest.c, inside #ifndefs, because it seems they are sometimes already + pre-defined. + +28. Added support for (*UTF8) at the start of a pattern. + +29. Arrange for flags added by the "release type" setting in CMake to be shown + in the configuration summary. + + Version 7.8 05-Sep-08 --------------------- diff --git a/ext/pcre/pcrelib/LICENCE b/ext/pcre/pcrelib/LICENCE index 03fabc6aef..ff443a929c 100644 --- a/ext/pcre/pcrelib/LICENCE +++ b/ext/pcre/pcrelib/LICENCE @@ -22,7 +22,7 @@ Email domain: cam.ac.uk University of Cambridge Computing Service, Cambridge, England. -Copyright (c) 1997-2008 University of Cambridge +Copyright (c) 1997-2009 University of Cambridge All rights reserved. diff --git a/ext/pcre/pcrelib/NEWS b/ext/pcre/pcrelib/NEWS index 43e47b6ed6..2b26fccf51 100644 --- a/ext/pcre/pcrelib/NEWS +++ b/ext/pcre/pcrelib/NEWS @@ -1,6 +1,11 @@ News about PCRE releases ------------------------ +Release 7.9 11-Apr-09 +--------------------- + +Mostly bugfixes and tidies with just a couple of minor functional additions. 
+ Release 7.8 05-Sep-08 --------------------- diff --git a/ext/pcre/pcrelib/NON-UNIX-USE b/ext/pcre/pcrelib/NON-UNIX-USE index bf5c41a5ee..803e73e98b 100644 --- a/ext/pcre/pcrelib/NON-UNIX-USE +++ b/ext/pcre/pcrelib/NON-UNIX-USE @@ -23,8 +23,8 @@ I (Philip Hazel) have no experience of Windows or VMS sytems and how their libraries work. The items in the PCRE distribution and Makefile that relate to anything other than Unix-like systems are untested by me. -There are some other comments and files in the Contrib directory on the ftp -site that you may find useful. See +There are some other comments and files (including some documentation in CHM +format) in the Contrib directory on the FTP site: ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib @@ -373,6 +373,13 @@ Michael Roy sent these comments about building PCRE under Windows with BCC5.5: line. +BUILDING UNDER WINDOWS CE WITH VISUAL STUDIO 200x + +Vincent Richomme sent a zip archive of files to help with this process. They +can be found in the file "pcre-vsbuild.zip" in the Contrib directory of the FTP +site. + + BUILDING PCRE ON OPENVMS Dan Mooney sent the following comments about building PCRE on OpenVMS. They @@ -437,5 +444,5 @@ $! Locale could not be set to fr $! ========================= -Last Updated: 05 September 2008 +Last Updated: 17 March 2009 **** diff --git a/ext/pcre/pcrelib/README b/ext/pcre/pcrelib/README index 3879038936..6b7c83fed3 100644 --- a/ext/pcre/pcrelib/README +++ b/ext/pcre/pcrelib/README @@ -85,6 +85,10 @@ documentation is supplied in two other forms: in various ways, and rooted in a file called index.html, is distributed in doc/html and installed in /share/doc/pcre/html. +Users of PCRE have contributed files containing the documentation for various +releases in CHM format. These can be found in the Contrib directory of the FTP +site (see next section). + Contributions by users of PCRE ------------------------------ @@ -161,10 +165,13 @@ library. 
You can read more about them in the pcrebuild man page. it will try to find a C++ compiler and C++ header files, and if it succeeds, it will try to build the C++ wrapper. -. If you want to make use of the support for UTF-8 character strings in PCRE, - you must add --enable-utf8 to the "configure" command. Without it, the code - for handling UTF-8 is not included in the library. (Even when included, it - still has to be enabled by an option at run time.) +. If you want to make use of the support for UTF-8 Unicode character strings in + PCRE, you must add --enable-utf8 to the "configure" command. Without it, the + code for handling UTF-8 is not included in the library. Even when included, + it still has to be enabled by an option at run time. When PCRE is compiled + with this option, its input can only either be ASCII or UTF-8, even when + running on EBCDIC platforms. It is not possible to use both --enable-utf8 and + --enable-ebcdic at the same time. . If, in addition to support for UTF-8 character strings, you want to include support for the \P, \p, and \X sequences that recognize Unicode character @@ -255,11 +262,13 @@ library. You can read more about them in the pcrebuild man page. pcre_chartables.c.dist. See "Character tables" below for further information. . It is possible to compile PCRE for use on systems that use EBCDIC as their - default character code (as opposed to ASCII) by specifying + character code (as opposed to ASCII) by specifying --enable-ebcdic - This automatically implies --enable-rebuild-chartables (see above). + This automatically implies --enable-rebuild-chartables (see above). However, + when PCRE is built this way, it always operates in EBCDIC. It cannot support + both EBCDIC and UTF-8. . It is possible to compile pcregrep to use libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by specifying one or both of @@ -286,7 +295,9 @@ library. You can read more about them in the pcrebuild man page. 
to specify something like LIBS="-lncurses" as well. This is because, to quote the readline INSTALL, "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing applications which link - with readline the to choose an appropriate library." + with readline the to choose an appropriate library." If you get error + messages about missing functions tgetstr, tgetent, tputs, tgetflag, or tgoto, + this is the problem, and linking with the ncurses library should fix it. The "configure" script builds the following files for the basic C library: @@ -753,4 +764,4 @@ The distribution should contain the following files: Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -Last updated: 05 September 2008 +Last updated: 21 March 2009 diff --git a/ext/pcre/pcrelib/config.h b/ext/pcre/pcrelib/config.h index 98e827f1ca..8133d978ba 100644 --- a/ext/pcre/pcrelib/config.h +++ b/ext/pcre/pcrelib/config.h @@ -50,7 +50,10 @@ them both to 0; an emulation function will be used. */ /* If you are compiling for a system that uses EBCDIC instead of ASCII character codes, define this macro as 1. On systems that can use - "configure", this can be done via --enable-ebcdic. */ + "configure", this can be done via --enable-ebcdic. PCRE will then assume + that all input strings are in EBCDIC. If you do not define this macro, PCRE + will assume input strings are ASCII or UTF-8 Unicode. It is not possible to + build a version of PCRE that supports both EBCDIC and UTF-8. */ /* #undef EBCDIC */ /* Define to 1 if you have the `bcopy' function. */ @@ -259,13 +262,13 @@ them both to 0; an emulation function will be used. */ #define PACKAGE_NAME "PCRE" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "PCRE 7.8" +#define PACKAGE_STRING "PCRE 7.9" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pcre" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "7.8" +#define PACKAGE_VERSION "7.9" /* If you are compiling for a system other than a Unix-like system or @@ -313,12 +316,15 @@ them both to 0; an emulation function will be used. */ /* Define to enable support for Unicode properties */ /* #undef SUPPORT_UCP */ -/* Define to enable support for the UTF-8 Unicode encoding. */ +/* Define to enable support for the UTF-8 Unicode encoding. This will work + even in an EBCDIC environment, but it is incompatible with the EBCDIC + macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8, but + not both at once. */ /* #undef SUPPORT_UTF8 */ /* Version number of package */ #ifndef VERSION -#define VERSION "7.8" +#define VERSION "7.9" #endif /* Define to empty if `const' does not conform to ANSI C. */ diff --git a/ext/pcre/pcrelib/dftables.c b/ext/pcre/pcrelib/dftables.c index 9593e6456c..63fc7074ec 100644 --- a/ext/pcre/pcrelib/dftables.c +++ b/ext/pcre/pcrelib/dftables.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2009 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/doc/pcre.txt b/ext/pcre/pcrelib/doc/pcre.txt index d07bfea006..9a2ce31598 100644 --- a/ext/pcre/pcrelib/doc/pcre.txt +++ b/ext/pcre/pcrelib/doc/pcre.txt @@ -28,7 +28,7 @@ INTRODUCTION mately with Perl 5.10, including support for UTF-8 encoded strings and Unicode general category properties. However, UTF-8 and Unicode support has to be explicitly enabled; it is not the default. The Unicode tables - correspond to Unicode release 5.0.0. + correspond to Unicode release 5.1. 
In addition to the Perl-compatible matching function, PCRE contains an alternative matching function that matches the same compiled patterns @@ -94,21 +94,21 @@ USER DOCUMENTATION pcrestack discussion of stack usage pcretest description of the pcretest testing command - In addition, in the "man" and HTML formats, there is a short page for + In addition, in the "man" and HTML formats, there is a short page for each C library function, listing its arguments and results. LIMITATIONS - There are some size limitations in PCRE but it is hoped that they will + There are some size limitations in PCRE but it is hoped that they will never in practice be relevant. - The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE + The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is compiled with the default internal linkage size of 2. If you want to - process regular expressions that are truly enormous, you can compile - PCRE with an internal linkage size of 3 or 4 (see the README file in - the source distribution and the pcrebuild documentation for details). - In these cases the limit is substantially larger. However, the speed + process regular expressions that are truly enormous, you can compile + PCRE with an internal linkage size of 3 or 4 (see the README file in + the source distribution and the pcrebuild documentation for details). + In these cases the limit is substantially larger. However, the speed of execution is slower. All values in repeating quantifiers must be less than 65536. @@ -119,26 +119,27 @@ LIMITATIONS The maximum length of name for a named subpattern is 32 characters, and the maximum number of named subpatterns is 10000. - The maximum length of a subject string is the largest positive number - that an integer variable can hold. However, when using the traditional + The maximum length of a subject string is the largest positive number + that an integer variable can hold. 
However, when using the traditional matching function, PCRE uses recursion to handle subpatterns and indef- - inite repetition. This means that the available stack space may limit + inite repetition. This means that the available stack space may limit the size of a subject string that can be processed by certain patterns. For a discussion of stack issues, see the pcrestack documentation. UTF-8 AND UNICODE PROPERTY SUPPORT - From release 3.3, PCRE has had some support for character strings - encoded in the UTF-8 format. For release 4.0 this was greatly extended - to cover most common requirements, and in release 5.0 additional sup- + From release 3.3, PCRE has had some support for character strings + encoded in the UTF-8 format. For release 4.0 this was greatly extended + to cover most common requirements, and in release 5.0 additional sup- port for Unicode general category properties was added. - In order process UTF-8 strings, you must build PCRE to include UTF-8 - support in the code, and, in addition, you must call pcre_compile() - with the PCRE_UTF8 option flag. When you do this, both the pattern and - any subject strings that are matched against it are treated as UTF-8 - strings instead of just strings of bytes. + In order process UTF-8 strings, you must build PCRE to include UTF-8 + support in the code, and, in addition, you must call pcre_compile() + with the PCRE_UTF8 option flag, or the pattern must start with the + sequence (*UTF8). When either of these is the case, both the pattern + and any subject strings that are matched against it are treated as + UTF-8 strings instead of just strings of bytes. If you compile PCRE with UTF-8 support, but do not use it at run time, the library will be a bit bigger, but the additional run time overhead @@ -224,24 +225,25 @@ UTF-8 AND UNICODE PROPERTY SUPPORT includes Unicode property support, because to do otherwise would slow down PCRE in many common cases. 
If you really want to test for a wider sense of, say, "digit", you must use Unicode property tests such as - \p{Nd}. + \p{Nd}. Note that this also applies to \b, because it is defined in + terms of \w and \W. - 7. Similarly, characters that match the POSIX named character classes + 7. Similarly, characters that match the POSIX named character classes are all low-valued characters. - 8. However, the Perl 5.10 horizontal and vertical whitespace matching + 8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char- acters. - 9. Case-insensitive matching applies only to characters whose values - are less than 128, unless PCRE is built with Unicode property support. - Even when Unicode property support is available, PCRE still uses its - own character tables when checking the case of low-valued characters, - so as not to degrade performance. The Unicode property information is + 9. Case-insensitive matching applies only to characters whose values + are less than 128, unless PCRE is built with Unicode property support. + Even when Unicode property support is available, PCRE still uses its + own character tables when checking the case of low-valued characters, + so as not to degrade performance. The Unicode property information is used only for characters with higher values. Even when Unicode property support is available, PCRE supports case-insensitive matching only when - there is a one-to-one mapping between a letter's cases. There are a - small number of many-to-one mappings in Unicode; these are not sup- + there is a one-to-one mapping between a letter's cases. There are a + small number of many-to-one mappings in Unicode; these are not sup- ported by PCRE. @@ -251,15 +253,15 @@ AUTHOR University Computing Service Cambridge CB2 3QH, England. - Putting an actual email address here seems to have been a spam magnet, - so I've taken it away. 
If you want to email me, use my two initials, + Putting an actual email address here seems to have been a spam magnet, + so I've taken it away. If you want to email me, use my two initials, followed by the two digits 10, at the domain cam.ac.uk. REVISION - Last updated: 12 April 2008 - Copyright (c) 1997-2008 University of Cambridge. + Last updated: 11 April 2009 + Copyright (c) 1997-2009 University of Cambridge. ------------------------------------------------------------------------------ @@ -307,7 +309,7 @@ C++ SUPPORT UTF-8 SUPPORT - To build PCRE with support for UTF-8 character strings, add + To build PCRE with support for UTF-8 Unicode character strings, add --enable-utf8 @@ -316,6 +318,12 @@ UTF-8 SUPPORT have have to set the PCRE_UTF8 option when you call the pcre_compile() function. + If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE + expects its input to be either ASCII or UTF-8 (depending on the runtime + option). It is not possible to support both EBCDIC and UTF-8 codes in + the same version of the library. Consequently, --enable-utf8 and + --enable-ebcdic are mutually exclusive. + UNICODE CHARACTER PROPERTY SUPPORT @@ -337,10 +345,10 @@ UNICODE CHARACTER PROPERTY SUPPORT CODE VALUE OF NEWLINE - By default, PCRE interprets character 10 (linefeed, LF) as indicating + By default, PCRE interprets the linefeed (LF) character as indicating the end of a line. This is the normal newline character on Unix-like - systems. You can compile PCRE to use character 13 (carriage return, CR) - instead, by adding + systems. You can compile PCRE to use carriage return (CR) instead, by + adding --enable-newline-is-cr @@ -363,28 +371,28 @@ CODE VALUE OF NEWLINE causes PCRE to recognize any Unicode newline sequence. - Whatever line ending convention is selected when PCRE is built can be - overridden when the library functions are called. 
At build time it is + Whatever line ending convention is selected when PCRE is built can be + overridden when the library functions are called. At build time it is conventional to use the standard for your operating system. WHAT \R MATCHES - By default, the sequence \R in a pattern matches any Unicode newline - sequence, whatever has been selected as the line ending sequence. If + By default, the sequence \R in a pattern matches any Unicode newline + sequence, whatever has been selected as the line ending sequence. If you specify --enable-bsr-anycrlf - the default is changed so that \R matches only CR, LF, or CRLF. What- - ever is selected when PCRE is built can be overridden when the library + the default is changed so that \R matches only CR, LF, or CRLF. What- + ever is selected when PCRE is built can be overridden when the library functions are called. BUILDING SHARED AND STATIC LIBRARIES - The PCRE building process uses libtool to build both shared and static - Unix libraries by default. You can suppress one of these by adding one + The PCRE building process uses libtool to build both shared and static + Unix libraries by default. You can suppress one of these by adding one of --disable-shared @@ -396,9 +404,9 @@ BUILDING SHARED AND STATIC LIBRARIES POSIX MALLOC USAGE When PCRE is called through the POSIX interface (see the pcreposix doc- - umentation), additional working storage is required for holding the - pointers to capturing substrings, because PCRE requires three integers - per substring, whereas the POSIX interface provides only two. If the + umentation), additional working storage is required for holding the + pointers to capturing substrings, because PCRE requires three integers + per substring, whereas the POSIX interface provides only two. If the number of expected substrings is small, the wrapper function uses space on the stack, because this is faster than using malloc() for each call. 
The default threshold above which the stack is no longer used is 10; it @@ -411,112 +419,113 @@ POSIX MALLOC USAGE HANDLING VERY LARGE PATTERNS - Within a compiled pattern, offset values are used to point from one - part to another (for example, from an opening parenthesis to an alter- - nation metacharacter). By default, two-byte values are used for these - offsets, leading to a maximum size for a compiled pattern of around - 64K. This is sufficient to handle all but the most gigantic patterns. - Nevertheless, some people do want to process enormous patterns, so it - is possible to compile PCRE to use three-byte or four-byte offsets by + Within a compiled pattern, offset values are used to point from one + part to another (for example, from an opening parenthesis to an alter- + nation metacharacter). By default, two-byte values are used for these + offsets, leading to a maximum size for a compiled pattern of around + 64K. This is sufficient to handle all but the most gigantic patterns. + Nevertheless, some people do want to process enormous patterns, so it + is possible to compile PCRE to use three-byte or four-byte offsets by adding a setting such as --with-link-size=3 - to the configure command. The value given must be 2, 3, or 4. Using - longer offsets slows down the operation of PCRE because it has to load + to the configure command. The value given must be 2, 3, or 4. Using + longer offsets slows down the operation of PCRE because it has to load additional bytes when handling them. AVOIDING EXCESSIVE STACK USAGE When matching with the pcre_exec() function, PCRE implements backtrack- - ing by making recursive calls to an internal function called match(). - In environments where the size of the stack is limited, this can se- - verely limit PCRE's operation. (The Unix environment does not usually + ing by making recursive calls to an internal function called match(). 
+ In environments where the size of the stack is limited, this can se- + verely limit PCRE's operation. (The Unix environment does not usually suffer from this problem, but it may sometimes be necessary to increase - the maximum stack size. There is a discussion in the pcrestack docu- - mentation.) An alternative approach to recursion that uses memory from - the heap to remember data, instead of using recursive function calls, - has been implemented to work round the problem of limited stack size. + the maximum stack size. There is a discussion in the pcrestack docu- + mentation.) An alternative approach to recursion that uses memory from + the heap to remember data, instead of using recursive function calls, + has been implemented to work round the problem of limited stack size. If you want to build a version of PCRE that works this way, add --disable-stack-for-recursion - to the configure command. With this configuration, PCRE will use the - pcre_stack_malloc and pcre_stack_free variables to call memory manage- - ment functions. By default these point to malloc() and free(), but you + to the configure command. With this configuration, PCRE will use the + pcre_stack_malloc and pcre_stack_free variables to call memory manage- + ment functions. By default these point to malloc() and free(), but you can replace the pointers so that your own functions are used. - Separate functions are provided rather than using pcre_malloc and - pcre_free because the usage is very predictable: the block sizes - requested are always the same, and the blocks are always freed in - reverse order. A calling program might be able to implement optimized - functions that perform better than malloc() and free(). PCRE runs + Separate functions are provided rather than using pcre_malloc and + pcre_free because the usage is very predictable: the block sizes + requested are always the same, and the blocks are always freed in + reverse order. 
A calling program might be able to implement optimized + functions that perform better than malloc() and free(). PCRE runs noticeably more slowly when built in this way. This option affects only - the pcre_exec() function; it is not relevant for the the + the pcre_exec() function; it is not relevant for the the pcre_dfa_exec() function. LIMITING PCRE RESOURCE USAGE - Internally, PCRE has a function called match(), which it calls repeat- - edly (sometimes recursively) when matching a pattern with the - pcre_exec() function. By controlling the maximum number of times this - function may be called during a single matching operation, a limit can - be placed on the resources used by a single call to pcre_exec(). The - limit can be changed at run time, as described in the pcreapi documen- - tation. The default is 10 million, but this can be changed by adding a + Internally, PCRE has a function called match(), which it calls repeat- + edly (sometimes recursively) when matching a pattern with the + pcre_exec() function. By controlling the maximum number of times this + function may be called during a single matching operation, a limit can + be placed on the resources used by a single call to pcre_exec(). The + limit can be changed at run time, as described in the pcreapi documen- + tation. The default is 10 million, but this can be changed by adding a setting such as --with-match-limit=500000 - to the configure command. This setting has no effect on the + to the configure command. This setting has no effect on the pcre_dfa_exec() matching function. - In some environments it is desirable to limit the depth of recursive + In some environments it is desirable to limit the depth of recursive calls of match() more strictly than the total number of calls, in order - to restrict the maximum amount of stack (or heap, if --disable-stack- + to restrict the maximum amount of stack (or heap, if --disable-stack- for-recursion is specified) that is used. 
A second limit controls this; - it defaults to the value that is set for --with-match-limit, which - imposes no additional constraints. However, you can set a lower limit + it defaults to the value that is set for --with-match-limit, which + imposes no additional constraints. However, you can set a lower limit by adding, for example, --with-match-limit-recursion=10000 - to the configure command. This value can also be overridden at run + to the configure command. This value can also be overridden at run time. CREATING CHARACTER TABLES AT BUILD TIME - PCRE uses fixed tables for processing characters whose code values are - less than 256. By default, PCRE is built with a set of tables that are - distributed in the file pcre_chartables.c.dist. These tables are for + PCRE uses fixed tables for processing characters whose code values are + less than 256. By default, PCRE is built with a set of tables that are + distributed in the file pcre_chartables.c.dist. These tables are for ASCII codes only. If you add --enable-rebuild-chartables - to the configure command, the distributed tables are no longer used. - Instead, a program called dftables is compiled and run. This outputs + to the configure command, the distributed tables are no longer used. + Instead, a program called dftables is compiled and run. This outputs the source for new set of tables, created in the default locale of your C runtime system. (This method of replacing the tables does not work if - you are cross compiling, because dftables is run on the local host. If - you need to create alternative tables when cross compiling, you will + you are cross compiling, because dftables is run on the local host. If + you need to create alternative tables when cross compiling, you will have to do so "by hand".) USING EBCDIC CODE - PCRE assumes by default that it will run in an environment where the - character code is ASCII (or Unicode, which is a superset of ASCII). 
- This is the case for most computer operating systems. PCRE can, how- + PCRE assumes by default that it will run in an environment where the + character code is ASCII (or Unicode, which is a superset of ASCII). + This is the case for most computer operating systems. PCRE can, how- ever, be compiled to run in an EBCDIC environment by adding --enable-ebcdic to the configure command. This setting implies --enable-rebuild-charta- - bles. You should only use it if you know that you are in an EBCDIC - environment (for example, an IBM mainframe operating system). + bles. You should only use it if you know that you are in an EBCDIC + environment (for example, an IBM mainframe operating system). The + --enable-ebcdic option is incompatible with --enable-utf8. PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT @@ -578,8 +587,8 @@ AUTHOR REVISION - Last updated: 13 April 2008 - Copyright (c) 1997-2008 University of Cambridge. + Last updated: 17 March 2009 + Copyright (c) 1997-2009 University of Cambridge. ------------------------------------------------------------------------------ @@ -999,7 +1008,7 @@ MULTITHREADING pcre_malloc, pcre_free, pcre_stack_malloc, and pcre_stack_free, and the callout function pointed to by pcre_callout, are shared by all threads. - The compiled form of a regular expression is not altered during match- + The compiled form of a regular expression is not altered during match- ing, so the same compiled pattern can safely be used by several threads at once. @@ -1007,10 +1016,10 @@ MULTITHREADING SAVING PRECOMPILED PATTERNS FOR LATER USE The compiled form of a regular expression can be saved and re-used at a - later time, possibly by a different program, and even on a host other - than the one on which it was compiled. Details are given in the - pcreprecompile documentation. 
However, compiling a regular expression - with one version of PCRE for use with a different version is not guar- + later time, possibly by a different program, and even on a host other + than the one on which it was compiled. Details are given in the + pcreprecompile documentation. However, compiling a regular expression + with one version of PCRE for use with a different version is not guar- anteed to work and may cause crashes. @@ -1018,33 +1027,34 @@ CHECKING BUILD-TIME OPTIONS int pcre_config(int what, void *where); - The function pcre_config() makes it possible for a PCRE client to dis- + The function pcre_config() makes it possible for a PCRE client to dis- cover which optional features have been compiled into the PCRE library. - The pcrebuild documentation has more details about these optional fea- + The pcrebuild documentation has more details about these optional fea- tures. - The first argument for pcre_config() is an integer, specifying which + The first argument for pcre_config() is an integer, specifying which information is required; the second argument is a pointer to a variable - into which the information is placed. The following information is + into which the information is placed. The following information is available: PCRE_CONFIG_UTF8 - The output is an integer that is set to one if UTF-8 support is avail- + The output is an integer that is set to one if UTF-8 support is avail- able; otherwise it is set to zero. PCRE_CONFIG_UNICODE_PROPERTIES - The output is an integer that is set to one if support for Unicode + The output is an integer that is set to one if support for Unicode character properties is available; otherwise it is set to zero. PCRE_CONFIG_NEWLINE - The output is an integer whose value specifies the default character - sequence that is recognized as meaning "newline". The four values that + The output is an integer whose value specifies the default character + sequence that is recognized as meaning "newline". 
The four values that are supported are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, - and -1 for ANY. The default should normally be the standard sequence - for your operating system. + and -1 for ANY. Though they are derived from ASCII, the same values + are returned in EBCDIC environments. The default should normally corre- + spond to the standard sequence for your operating system. PCRE_CONFIG_BSR @@ -1071,24 +1081,25 @@ CHECKING BUILD-TIME OPTIONS PCRE_CONFIG_MATCH_LIMIT - The output is an integer that gives the default limit for the number of - internal matching function calls in a pcre_exec() execution. Further - details are given with pcre_exec() below. + The output is a long integer that gives the default limit for the num- + ber of internal matching function calls in a pcre_exec() execution. + Further details are given with pcre_exec() below. PCRE_CONFIG_MATCH_LIMIT_RECURSION - The output is an integer that gives the default limit for the depth of - recursion when calling the internal matching function in a pcre_exec() - execution. Further details are given with pcre_exec() below. + The output is a long integer that gives the default limit for the depth + of recursion when calling the internal matching function in a + pcre_exec() execution. Further details are given with pcre_exec() + below. PCRE_CONFIG_STACKRECURSE - The output is an integer that is set to one if internal recursion when + The output is an integer that is set to one if internal recursion when running pcre_exec() is implemented by recursive function calls that use - the stack to remember their state. This is the usual way that PCRE is + the stack to remember their state. This is the usual way that PCRE is compiled. The output is zero if PCRE was compiled to use blocks of data - on the heap instead of recursive function calls. In this case, - pcre_stack_malloc and pcre_stack_free are called to manage memory + on the heap instead of recursive function calls. 
In this case, + pcre_stack_malloc and pcre_stack_free are called to manage memory blocks on the heap, thus avoiding the use of the stack. @@ -1105,31 +1116,32 @@ COMPILING A PATTERN Either of the functions pcre_compile() or pcre_compile2() can be called to compile a pattern into an internal form. The only difference between - the two interfaces is that pcre_compile2() has an additional argument, + the two interfaces is that pcre_compile2() has an additional argument, errorcodeptr, via which a numerical error code can be returned. The pattern is a C string terminated by a binary zero, and is passed in - the pattern argument. A pointer to a single block of memory that is - obtained via pcre_malloc is returned. This contains the compiled code + the pattern argument. A pointer to a single block of memory that is + obtained via pcre_malloc is returned. This contains the compiled code and related data. The pcre type is defined for the returned block; this is a typedef for a structure whose contents are not externally defined. It is up to the caller to free the memory (via pcre_free) when it is no longer required. - Although the compiled code of a PCRE regex is relocatable, that is, it + Although the compiled code of a PCRE regex is relocatable, that is, it does not depend on memory location, the complete pcre data block is not - fully relocatable, because it may contain a copy of the tableptr argu- + fully relocatable, because it may contain a copy of the tableptr argu- ment, which is an address (see below). The options argument contains various bit settings that affect the com- - pilation. It should be zero if no options are required. The available - options are described below. Some of them, in particular, those that - are compatible with Perl, can also be set and unset from within the - pattern (see the detailed description in the pcrepattern documenta- - tion). 
For these options, the contents of the options argument speci- - fies their initial settings at the start of compilation and execution. - The PCRE_ANCHORED and PCRE_NEWLINE_xxx options can be set at the time - of matching as well as at compile time. + pilation. It should be zero if no options are required. The available + options are described below. Some of them (in particular, those that + are compatible with Perl, but also some others) can also be set and + unset from within the pattern (see the detailed description in the + pcrepattern documentation). For those options that can be different in + different parts of the pattern, the contents of the options argument + specifies their initial settings at the start of compilation and execu- + tion. The PCRE_ANCHORED and PCRE_NEWLINE_xxx options can be set at the + time of matching as well as at compile time. If errptr is NULL, pcre_compile() returns NULL immediately. Otherwise, if compilation of a pattern fails, pcre_compile() returns NULL, and @@ -1335,51 +1347,51 @@ COMPILING A PATTERN and are therefore ignored. The newline option that is set at compile time becomes the default that - is used for pcre_exec() and pcre_dfa_exec(), but it can be overridden. + is used for pcre_exec() and pcre_dfa_exec(), but it can be overridden. PCRE_NO_AUTO_CAPTURE If this option is set, it disables the use of numbered capturing paren- - theses in the pattern. Any opening parenthesis that is not followed by - ? behaves as if it were followed by ?: but named parentheses can still - be used for capturing (and they acquire numbers in the usual way). + theses in the pattern. Any opening parenthesis that is not followed by + ? behaves as if it were followed by ?: but named parentheses can still + be used for capturing (and they acquire numbers in the usual way). There is no equivalent of this option in Perl. 
PCRE_UNGREEDY - This option inverts the "greediness" of the quantifiers so that they - are not greedy by default, but become greedy if followed by "?". It is - not compatible with Perl. It can also be set by a (?U) option setting + This option inverts the "greediness" of the quantifiers so that they + are not greedy by default, but become greedy if followed by "?". It is + not compatible with Perl. It can also be set by a (?U) option setting within the pattern. PCRE_UTF8 - This option causes PCRE to regard both the pattern and the subject as - strings of UTF-8 characters instead of single-byte character strings. - However, it is available only when PCRE is built to include UTF-8 sup- - port. If not, the use of this option provokes an error. Details of how - this option changes the behaviour of PCRE are given in the section on + This option causes PCRE to regard both the pattern and the subject as + strings of UTF-8 characters instead of single-byte character strings. + However, it is available only when PCRE is built to include UTF-8 sup- + port. If not, the use of this option provokes an error. Details of how + this option changes the behaviour of PCRE are given in the section on UTF-8 support in the main pcre page. PCRE_NO_UTF8_CHECK When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is - automatically checked. There is a discussion about the validity of - UTF-8 strings in the main pcre page. If an invalid UTF-8 sequence of - bytes is found, pcre_compile() returns an error. If you already know + automatically checked. There is a discussion about the validity of + UTF-8 strings in the main pcre page. If an invalid UTF-8 sequence of + bytes is found, pcre_compile() returns an error. If you already know that your pattern is valid, and you want to skip this check for perfor- - mance reasons, you can set the PCRE_NO_UTF8_CHECK option. When it is - set, the effect of passing an invalid UTF-8 string as a pattern is - undefined. 
It may cause your program to crash. Note that this option - can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress the + mance reasons, you can set the PCRE_NO_UTF8_CHECK option. When it is + set, the effect of passing an invalid UTF-8 string as a pattern is + undefined. It may cause your program to crash. Note that this option + can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress the UTF-8 validity checking of subject strings. COMPILATION ERROR CODES - The following table lists the error codes than may be returned by - pcre_compile2(), along with the error messages that may be returned by - both compiling functions. As PCRE has developed, some error codes have + The following table lists the error codes that may be returned by + pcre_compile2(), along with the error messages that may be returned by + both compiling functions. As PCRE has developed, some error codes have fallen out of use. To avoid confusion, they have not been re-used. 0 no error @@ -1435,7 +1447,7 @@ COMPILATION ERROR CODES 50 [this code is not in use] 51 octal value is greater than \377 (not in UTF-8 mode) 52 internal error: overran compiling workspace - 53 internal error: previously-checked referenced subpattern not + 53 internal error: previously-checked referenced subpattern not found 54 DEFINE group contains more than one branch 55 repeating a DEFINE group is not allowed @@ -1450,7 +1462,7 @@ COMPILATION ERROR CODES 63 digit expected after (?+ 64 ] is an invalid data character in JavaScript compatibility mode - The numbers 32 and 10000 in errors 48 and 49 are defaults; different + The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may be used if the limits were changed when PCRE was built.
@@ -1459,32 +1471,32 @@ STUDYING A PATTERN pcre_extra *pcre_study(const pcre *code, int options const char **errptr); - If a compiled pattern is going to be used several times, it is worth + If a compiled pattern is going to be used several times, it is worth spending more time analyzing it in order to speed up the time taken for - matching. The function pcre_study() takes a pointer to a compiled pat- + matching. The function pcre_study() takes a pointer to a compiled pat- tern as its first argument. If studying the pattern produces additional - information that will help speed up matching, pcre_study() returns a - pointer to a pcre_extra block, in which the study_data field points to + information that will help speed up matching, pcre_study() returns a + pointer to a pcre_extra block, in which the study_data field points to the results of the study. The returned value from pcre_study() can be passed directly to - pcre_exec(). However, a pcre_extra block also contains other fields - that can be set by the caller before the block is passed; these are + pcre_exec(). However, a pcre_extra block also contains other fields + that can be set by the caller before the block is passed; these are described below in the section on matching a pattern. - If studying the pattern does not produce any additional information + If studying the pattern does not produce any additional information pcre_study() returns NULL. In that circumstance, if the calling program - wants to pass any of the other fields to pcre_exec(), it must set up + wants to pass any of the other fields to pcre_exec(), it must set up its own pcre_extra block. - The second argument of pcre_study() contains option bits. At present, + The second argument of pcre_study() contains option bits. At present, no options are defined, and this argument should always be zero. - The third argument for pcre_study() is a pointer for an error message. 
- If studying succeeds (even if no data is returned), the variable it - points to is set to NULL. Otherwise it is set to point to a textual + The third argument for pcre_study() is a pointer for an error message. + If studying succeeds (even if no data is returned), the variable it + points to is set to NULL. Otherwise it is set to point to a textual error message. This is a static string that is part of the library. You - must not try to free it. You should test the error pointer for NULL + must not try to free it. You should test the error pointer for NULL after calling pcre_study(), to be sure that it has run successfully. This is a typical call to pcre_study(): @@ -1496,62 +1508,62 @@ STUDYING A PATTERN &error); /* set to NULL or points to a message */ At present, studying a pattern is useful only for non-anchored patterns - that do not have a single fixed starting character. A bitmap of possi- + that do not have a single fixed starting character. A bitmap of possi- ble starting bytes is created. LOCALE SUPPORT - PCRE handles caseless matching, and determines whether characters are - letters, digits, or whatever, by reference to a set of tables, indexed - by character value. When running in UTF-8 mode, this applies only to - characters with codes less than 128. Higher-valued codes never match - escapes such as \w or \d, but can be tested with \p if PCRE is built - with Unicode character property support. The use of locales with Uni- - code is discouraged. If you are handling characters with codes greater - than 128, you should either use UTF-8 and Unicode, or use locales, but + PCRE handles caseless matching, and determines whether characters are + letters, digits, or whatever, by reference to a set of tables, indexed + by character value. When running in UTF-8 mode, this applies only to + characters with codes less than 128. 
Higher-valued codes never match + escapes such as \w or \d, but can be tested with \p if PCRE is built + with Unicode character property support. The use of locales with Uni- + code is discouraged. If you are handling characters with codes greater + than 128, you should either use UTF-8 and Unicode, or use locales, but not try to mix the two. - PCRE contains an internal set of tables that are used when the final - argument of pcre_compile() is NULL. These are sufficient for many + PCRE contains an internal set of tables that are used when the final + argument of pcre_compile() is NULL. These are sufficient for many applications. Normally, the internal tables recognize only ASCII char- acters. However, when PCRE is built, it is possible to cause the inter- nal tables to be rebuilt in the default "C" locale of the local system, which may cause them to be different. - The internal tables can always be overridden by tables supplied by the + The internal tables can always be overridden by tables supplied by the application that calls PCRE. These may be created in a different locale - from the default. As more and more applications change to using Uni- + from the default. As more and more applications change to using Uni- code, the need for this locale support is expected to die away. - External tables are built by calling the pcre_maketables() function, - which has no arguments, in the relevant locale. The result can then be - passed to pcre_compile() or pcre_exec() as often as necessary. For - example, to build and use tables that are appropriate for the French - locale (where accented characters with values greater than 128 are + External tables are built by calling the pcre_maketables() function, + which has no arguments, in the relevant locale. The result can then be + passed to pcre_compile() or pcre_exec() as often as necessary. 
For + example, to build and use tables that are appropriate for the French + locale (where accented characters with values greater than 128 are treated as letters), the following code could be used: setlocale(LC_CTYPE, "fr_FR"); tables = pcre_maketables(); re = pcre_compile(..., tables); - The locale name "fr_FR" is used on Linux and other Unix-like systems; + The locale name "fr_FR" is used on Linux and other Unix-like systems; if you are using Windows, the name for the French locale is "french". - When pcre_maketables() runs, the tables are built in memory that is - obtained via pcre_malloc. It is the caller's responsibility to ensure - that the memory containing the tables remains available for as long as + When pcre_maketables() runs, the tables are built in memory that is + obtained via pcre_malloc. It is the caller's responsibility to ensure + that the memory containing the tables remains available for as long as it is needed. The pointer that is passed to pcre_compile() is saved with the compiled - pattern, and the same tables are used via this pointer by pcre_study() + pattern, and the same tables are used via this pointer by pcre_study() and normally also by pcre_exec(). Thus, by default, for any single pat- tern, compilation, studying and matching all happen in the same locale, but different patterns can be compiled in different locales. - It is possible to pass a table pointer or NULL (indicating the use of - the internal tables) to pcre_exec(). Although not intended for this - purpose, this facility could be used to match a pattern in a different + It is possible to pass a table pointer or NULL (indicating the use of + the internal tables) to pcre_exec(). Although not intended for this + purpose, this facility could be used to match a pattern in a different locale from the one in which it was compiled. Passing table pointers at run time is discussed below in the section on matching a pattern. 
@@ -1561,15 +1573,15 @@ INFORMATION ABOUT A PATTERN int pcre_fullinfo(const pcre *code, const pcre_extra *extra, int what, void *where); - The pcre_fullinfo() function returns information about a compiled pat- + The pcre_fullinfo() function returns information about a compiled pat- tern. It replaces the obsolete pcre_info() function, which is neverthe- less retained for backwards compability (and is documented below). - The first argument for pcre_fullinfo() is a pointer to the compiled - pattern. The second argument is the result of pcre_study(), or NULL if - the pattern was not studied. The third argument specifies which piece - of information is required, and the fourth argument is a pointer to a - variable to receive the data. The yield of the function is zero for + The first argument for pcre_fullinfo() is a pointer to the compiled + pattern. The second argument is the result of pcre_study(), or NULL if + the pattern was not studied. The third argument specifies which piece + of information is required, and the fourth argument is a pointer to a + variable to receive the data. The yield of the function is zero for success, or one of the following negative numbers: PCRE_ERROR_NULL the argument code was NULL @@ -1577,9 +1589,9 @@ INFORMATION ABOUT A PATTERN PCRE_ERROR_BADMAGIC the "magic number" was not found PCRE_ERROR_BADOPTION the value of what was invalid - The "magic number" is placed at the start of each compiled pattern as - an simple check against passing an arbitrary memory pointer. Here is a - typical call of pcre_fullinfo(), to obtain the length of the compiled + The "magic number" is placed at the start of each compiled pattern as + a simple check against passing an arbitrary memory pointer.
Here is a + typical call of pcre_fullinfo(), to obtain the length of the compiled pattern: int rc; @@ -1590,76 +1602,76 @@ INFORMATION ABOUT A PATTERN PCRE_INFO_SIZE, /* what is required */ &length); /* where to put the data */ - The possible values for the third argument are defined in pcre.h, and + The possible values for the third argument are defined in pcre.h, and are as follows: PCRE_INFO_BACKREFMAX - Return the number of the highest back reference in the pattern. The - fourth argument should point to an int variable. Zero is returned if + Return the number of the highest back reference in the pattern. The + fourth argument should point to an int variable. Zero is returned if there are no back references. PCRE_INFO_CAPTURECOUNT - Return the number of capturing subpatterns in the pattern. The fourth + Return the number of capturing subpatterns in the pattern. The fourth argument should point to an int variable. PCRE_INFO_DEFAULT_TABLES - Return a pointer to the internal default character tables within PCRE. - The fourth argument should point to an unsigned char * variable. This + Return a pointer to the internal default character tables within PCRE. + The fourth argument should point to an unsigned char * variable. This information call is provided for internal use by the pcre_study() func- - tion. External callers can cause PCRE to use its internal tables by + tion. External callers can cause PCRE to use its internal tables by passing a NULL table pointer. PCRE_INFO_FIRSTBYTE - Return information about the first byte of any matched string, for a - non-anchored pattern. The fourth argument should point to an int vari- - able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name + Return information about the first byte of any matched string, for a + non-anchored pattern. The fourth argument should point to an int vari- + able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is still recognized for backwards compatibility.) 
- If there is a fixed first byte, for example, from a pattern such as + If there is a fixed first byte, for example, from a pattern such as (cat|cow|coyote), its value is returned. Otherwise, if either - (a) the pattern was compiled with the PCRE_MULTILINE option, and every + (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch starts with "^", or (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set (if it were set, the pattern would be anchored), - -1 is returned, indicating that the pattern matches only at the start - of a subject string or after any newline within the string. Otherwise + -1 is returned, indicating that the pattern matches only at the start + of a subject string or after any newline within the string. Otherwise -2 is returned. For anchored patterns, -2 is returned. PCRE_INFO_FIRSTTABLE - If the pattern was studied, and this resulted in the construction of a + If the pattern was studied, and this resulted in the construction of a 256-bit table indicating a fixed set of bytes for the first byte in any - matching string, a pointer to the table is returned. Otherwise NULL is - returned. The fourth argument should point to an unsigned char * vari- + matching string, a pointer to the table is returned. Otherwise NULL is + returned. The fourth argument should point to an unsigned char * vari- able. PCRE_INFO_HASCRORLF - Return 1 if the pattern contains any explicit matches for CR or LF - characters, otherwise 0. The fourth argument should point to an int - variable. An explicit match is either a literal CR or LF character, or + Return 1 if the pattern contains any explicit matches for CR or LF + characters, otherwise 0. The fourth argument should point to an int + variable. An explicit match is either a literal CR or LF character, or \r or \n. PCRE_INFO_JCHANGED - Return 1 if the (?J) or (?-J) option setting is used in the pattern, - otherwise 0. The fourth argument should point to an int variable. 
(?J) + Return 1 if the (?J) or (?-J) option setting is used in the pattern, + otherwise 0. The fourth argument should point to an int variable. (?J) and (?-J) set and unset the local PCRE_DUPNAMES option, respectively. PCRE_INFO_LASTLITERAL - Return the value of the rightmost literal byte that must exist in any - matched string, other than at its start, if such a byte has been + Return the value of the rightmost literal byte that must exist in any + matched string, other than at its start, if such a byte has been recorded. The fourth argument should point to an int variable. If there - is no such byte, -1 is returned. For anchored patterns, a last literal - byte is recorded only if it follows something of variable length. For + is no such byte, -1 is returned. For anchored patterns, a last literal + byte is recorded only if it follows something of variable length. For example, for the pattern /^a\d+z\d+/ the returned value is "z", but for /^a\dz\d/ the returned value is -1. @@ -1667,34 +1679,34 @@ INFORMATION ABOUT A PATTERN PCRE_INFO_NAMEENTRYSIZE PCRE_INFO_NAMETABLE - PCRE supports the use of named as well as numbered capturing parenthe- - ses. The names are just an additional way of identifying the parenthe- + PCRE supports the use of named as well as numbered capturing parenthe- + ses. The names are just an additional way of identifying the parenthe- ses, which still acquire numbers. Several convenience functions such as - pcre_get_named_substring() are provided for extracting captured sub- - strings by name. It is also possible to extract the data directly, by - first converting the name to a number in order to access the correct + pcre_get_named_substring() are provided for extracting captured sub- + strings by name. It is also possible to extract the data directly, by + first converting the name to a number in order to access the correct pointers in the output vector (described with pcre_exec() below). 
To do - the conversion, you need to use the name-to-number map, which is + the conversion, you need to use the name-to-number map, which is described by these three values. The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT gives the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size - of each entry; both of these return an int value. The entry size - depends on the length of the longest name. PCRE_INFO_NAMETABLE returns - a pointer to the first entry of the table (a pointer to char). The + of each entry; both of these return an int value. The entry size + depends on the length of the longest name. PCRE_INFO_NAMETABLE returns + a pointer to the first entry of the table (a pointer to char). The first two bytes of each entry are the number of the capturing parenthe- - sis, most significant byte first. The rest of the entry is the corre- - sponding name, zero terminated. The names are in alphabetical order. + sis, most significant byte first. The rest of the entry is the corre- + sponding name, zero terminated. The names are in alphabetical order. When PCRE_DUPNAMES is set, duplicate names are in order of their paren- - theses numbers. For example, consider the following pattern (assume - PCRE_EXTENDED is set, so white space - including newlines - is + theses numbers. For example, consider the following pattern (assume + PCRE_EXTENDED is set, so white space - including newlines - is ignored): (?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) ) - There are four named subpatterns, so the table has four entries, and - each entry in the table is eight bytes long. The table is as follows, + There are four named subpatterns, so the table has four entries, and + each entry in the table is eight bytes long. The table is as follows, with non-printing bytes shows in hexadecimal, and undefined bytes shown as ??: @@ -1703,29 +1715,29 @@ INFORMATION ABOUT A PATTERN 00 04 m o n t h 00 00 02 y e a r 00 ??
- When writing code to extract data from named subpatterns using the - name-to-number map, remember that the length of the entries is likely + When writing code to extract data from named subpatterns using the + name-to-number map, remember that the length of the entries is likely to be different for each compiled pattern. PCRE_INFO_OKPARTIAL - Return 1 if the pattern can be used for partial matching, otherwise 0. - The fourth argument should point to an int variable. The pcrepartial - documentation lists the restrictions that apply to patterns when par- + Return 1 if the pattern can be used for partial matching, otherwise 0. + The fourth argument should point to an int variable. The pcrepartial + documentation lists the restrictions that apply to patterns when par- tial matching is used. PCRE_INFO_OPTIONS - Return a copy of the options with which the pattern was compiled. The - fourth argument should point to an unsigned long int variable. These + Return a copy of the options with which the pattern was compiled. The + fourth argument should point to an unsigned long int variable. These option bits are those specified in the call to pcre_compile(), modified by any top-level option settings at the start of the pattern itself. In - other words, they are the options that will be in force when matching - starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with - the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE, + other words, they are the options that will be in force when matching + starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with + the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED. 
- A pattern is automatically anchored by PCRE if all of its top-level + A pattern is automatically anchored by PCRE if all of its top-level alternatives begin with one of the following: ^ unless PCRE_MULTILINE is set @@ -1739,7 +1751,7 @@ INFORMATION ABOUT A PATTERN PCRE_INFO_SIZE - Return the size of the compiled pattern, that is, the value that was + Return the size of the compiled pattern, that is, the value that was passed as the argument to pcre_malloc() when PCRE was getting memory in which to place the compiled data. The fourth argument should point to a size_t variable. @@ -1747,9 +1759,9 @@ INFORMATION ABOUT A PATTERN PCRE_INFO_STUDYSIZE Return the size of the data block pointed to by the study_data field in - a pcre_extra block. That is, it is the value that was passed to + a pcre_extra block. That is, it is the value that was passed to pcre_malloc() when PCRE was getting memory into which to place the data - created by pcre_study(). The fourth argument should point to a size_t + created by pcre_study(). The fourth argument should point to a size_t variable. @@ -1757,21 +1769,21 @@ OBSOLETE INFO FUNCTION int pcre_info(const pcre *code, int *optptr, int *firstcharptr); - The pcre_info() function is now obsolete because its interface is too - restrictive to return all the available data about a compiled pattern. - New programs should use pcre_fullinfo() instead. The yield of - pcre_info() is the number of capturing subpatterns, or one of the fol- + The pcre_info() function is now obsolete because its interface is too + restrictive to return all the available data about a compiled pattern. + New programs should use pcre_fullinfo() instead. 
The yield of + pcre_info() is the number of capturing subpatterns, or one of the fol- lowing negative numbers: PCRE_ERROR_NULL the argument code was NULL PCRE_ERROR_BADMAGIC the "magic number" was not found - If the optptr argument is not NULL, a copy of the options with which - the pattern was compiled is placed in the integer it points to (see + If the optptr argument is not NULL, a copy of the options with which + the pattern was compiled is placed in the integer it points to (see PCRE_INFO_OPTIONS above). - If the pattern is not anchored and the firstcharptr argument is not - NULL, it is used to pass back information about the first character of + If the pattern is not anchored and the firstcharptr argument is not + NULL, it is used to pass back information about the first character of any matched string (see PCRE_INFO_FIRSTBYTE above). @@ -1779,21 +1791,21 @@ REFERENCE COUNTS int pcre_refcount(pcre *code, int adjust); - The pcre_refcount() function is used to maintain a reference count in + The pcre_refcount() function is used to maintain a reference count in the data block that contains a compiled pattern. It is provided for the - benefit of applications that operate in an object-oriented manner, + benefit of applications that operate in an object-oriented manner, where different parts of the application may be using the same compiled pattern, but you want to free the block when they are all done. When a pattern is compiled, the reference count field is initialized to - zero. It is changed only by calling this function, whose action is to - add the adjust value (which may be positive or negative) to it. The + zero. It is changed only by calling this function, whose action is to + add the adjust value (which may be positive or negative) to it. The yield of the function is the new value. However, the value of the count - is constrained to lie between 0 and 65535, inclusive. If the new value + is constrained to lie between 0 and 65535, inclusive. 
If the new value is outside these limits, it is forced to the appropriate limit value. - Except when it is zero, the reference count is not correctly preserved - if a pattern is compiled on one host and then transferred to a host + Except when it is zero, the reference count is not correctly preserved + if a pattern is compiled on one host and then transferred to a host whose byte-order is different. (This seems a highly unlikely scenario.) @@ -1887,51 +1899,51 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION the total number of calls, because not all calls to match() are recur- sive. This limit is of use only if it is set smaller than match_limit. - Limiting the recursion depth limits the amount of stack that can be + Limiting the recursion depth limits the amount of stack that can be used, or, when PCRE has been compiled to use memory on the heap instead of the stack, the amount of heap memory that can be used. - The default value for match_limit_recursion can be set when PCRE is - built; the default default is the same value as the default for - match_limit. You can override the default by suppling pcre_exec() with - a pcre_extra block in which match_limit_recursion is set, and - PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the + The default value for match_limit_recursion can be set when PCRE is + built; the default default is the same value as the default for + match_limit. You can override the default by suppling pcre_exec() with + a pcre_extra block in which match_limit_recursion is set, and + PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the limit is exceeded, pcre_exec() returns PCRE_ERROR_RECURSIONLIMIT. - The pcre_callout field is used in conjunction with the "callout" fea- + The pcre_callout field is used in conjunction with the "callout" fea- ture, which is described in the pcrecallout documentation. 
- The tables field is used to pass a character tables pointer to - pcre_exec(); this overrides the value that is stored with the compiled - pattern. A non-NULL value is stored with the compiled pattern only if - custom tables were supplied to pcre_compile() via its tableptr argu- + The tables field is used to pass a character tables pointer to + pcre_exec(); this overrides the value that is stored with the compiled + pattern. A non-NULL value is stored with the compiled pattern only if + custom tables were supplied to pcre_compile() via its tableptr argu- ment. If NULL is passed to pcre_exec() using this mechanism, it forces - PCRE's internal tables to be used. This facility is helpful when re- - using patterns that have been saved after compiling with an external - set of tables, because the external tables might be at a different - address when pcre_exec() is called. See the pcreprecompile documenta- + PCRE's internal tables to be used. This facility is helpful when re- + using patterns that have been saved after compiling with an external + set of tables, because the external tables might be at a different + address when pcre_exec() is called. See the pcreprecompile documenta- tion for a discussion of saving compiled patterns for later use. Option bits for pcre_exec() - The unused bits of the options argument for pcre_exec() must be zero. - The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx, - PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK and - PCRE_PARTIAL. + The unused bits of the options argument for pcre_exec() must be zero. + The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx, + PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE, + PCRE_NO_UTF8_CHECK and PCRE_PARTIAL. PCRE_ANCHORED - The PCRE_ANCHORED option limits pcre_exec() to matching at the first - matching position. 
If a pattern was compiled with PCRE_ANCHORED, or - turned out to be anchored by virtue of its contents, it cannot be made + The PCRE_ANCHORED option limits pcre_exec() to matching at the first + matching position. If a pattern was compiled with PCRE_ANCHORED, or + turned out to be anchored by virtue of its contents, it cannot be made unachored at matching time. PCRE_BSR_ANYCRLF PCRE_BSR_UNICODE These options (which are mutually exclusive) control what the \R escape - sequence matches. The choice is either to match only CR, LF, or CRLF, - or to match any Unicode newline sequence. These options override the + sequence matches. The choice is either to match only CR, LF, or CRLF, + or to match any Unicode newline sequence. These options override the choice that was made or defaulted when the pattern was compiled. PCRE_NEWLINE_CR @@ -1940,77 +1952,88 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION PCRE_NEWLINE_ANYCRLF PCRE_NEWLINE_ANY - These options override the newline definition that was chosen or - defaulted when the pattern was compiled. For details, see the descrip- - tion of pcre_compile() above. During matching, the newline choice - affects the behaviour of the dot, circumflex, and dollar metacharac- - ters. It may also alter the way the match position is advanced after a + These options override the newline definition that was chosen or + defaulted when the pattern was compiled. For details, see the descrip- + tion of pcre_compile() above. During matching, the newline choice + affects the behaviour of the dot, circumflex, and dollar metacharac- + ters. It may also alter the way the match position is advanced after a match failure for an unanchored pattern. 
- When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is - set, and a match attempt for an unanchored pattern fails when the cur- - rent position is at a CRLF sequence, and the pattern contains no - explicit matches for CR or LF characters, the match position is + When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is + set, and a match attempt for an unanchored pattern fails when the cur- + rent position is at a CRLF sequence, and the pattern contains no + explicit matches for CR or LF characters, the match position is advanced by two characters instead of one, in other words, to after the CRLF. The above rule is a compromise that makes the most common cases work as - expected. For example, if the pattern is .+A (and the PCRE_DOTALL + expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not set), it does not match the string "\r\nA" because, after - failing at the start, it skips both the CR and the LF before retrying. - However, the pattern [\r\n]A does match that string, because it con- + failing at the start, it skips both the CR and the LF before retrying. + However, the pattern [\r\n]A does match that string, because it con- tains an explicit CR or LF reference, and so advances only by one char- acter after the first failure. An explicit match for CR of LF is either a literal appearance of one of - those characters, or one of the \r or \n escape sequences. Implicit - matches such as [^X] do not count, nor does \s (which includes CR and + those characters, or one of the \r or \n escape sequences. Implicit + matches such as [^X] do not count, nor does \s (which includes CR and LF in the characters that it matches). - Notwithstanding the above, anomalous effects may still occur when CRLF + Notwithstanding the above, anomalous effects may still occur when CRLF is a valid newline sequence and explicit \r or \n escapes appear in the pattern. 
PCRE_NOTBOL This option specifies that first character of the subject string is not - the beginning of a line, so the circumflex metacharacter should not - match before it. Setting this without PCRE_MULTILINE (at compile time) - causes circumflex never to match. This option affects only the behav- + the beginning of a line, so the circumflex metacharacter should not + match before it. Setting this without PCRE_MULTILINE (at compile time) + causes circumflex never to match. This option affects only the behav- iour of the circumflex metacharacter. It does not affect \A. PCRE_NOTEOL This option specifies that the end of the subject string is not the end - of a line, so the dollar metacharacter should not match it nor (except - in multiline mode) a newline immediately before it. Setting this with- + of a line, so the dollar metacharacter should not match it nor (except + in multiline mode) a newline immediately before it. Setting this with- out PCRE_MULTILINE (at compile time) causes dollar never to match. This - option affects only the behaviour of the dollar metacharacter. It does + option affects only the behaviour of the dollar metacharacter. It does not affect \Z or \z. PCRE_NOTEMPTY An empty string is not considered to be a valid match if this option is - set. If there are alternatives in the pattern, they are tried. If all - the alternatives match the empty string, the entire match fails. For + set. If there are alternatives in the pattern, they are tried. If all + the alternatives match the empty string, the entire match fails. For example, if the pattern a?b? - is applied to a string not beginning with "a" or "b", it matches the - empty string at the start of the subject. With PCRE_NOTEMPTY set, this + is applied to a string not beginning with "a" or "b", it matches the + empty string at the start of the subject. With PCRE_NOTEMPTY set, this match is not valid, so PCRE searches further into the string for occur- rences of "a" or "b". 
Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a spe- - cial case of a pattern match of the empty string within its split() - function, and when using the /g modifier. It is possible to emulate + cial case of a pattern match of the empty string within its split() + function, and when using the /g modifier. It is possible to emulate Perl's behaviour after matching a null string by first trying the match again at the same offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and then - if that fails by advancing the starting offset (see below) and trying + if that fails by advancing the starting offset (see below) and trying an ordinary match again. There is some code that demonstrates how to do this in the pcredemo.c sample program. + PCRE_NO_START_OPTIMIZE + + There are a number of optimizations that pcre_exec() uses at the start + of a match, in order to speed up the process. For example, if it is + known that a match must start with a specific character, it searches + the subject for that character, and fails immediately if it cannot find + it, without actually running the main matching function. When callouts + are in use, these optimizations can cause them to be skipped. This + option disables the "start-up" optimizations, causing performance to + suffer, but ensuring that the callouts do occur. + PCRE_NO_UTF8_CHECK When PCRE_UTF8 is set at compile time, the validity of the subject as a @@ -2239,12 +2262,12 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION PCRE_ERROR_BADCOUNT (-15) - This error is given if the value of the ovecsize argument is negative. + This error is given if the value of the ovecsize argument is negative. PCRE_ERROR_RECURSIONLIMIT (-21) The internal recursion limit, as specified by the match_limit_recursion - field in a pcre_extra structure (or defaulted) was reached. See the + field in a pcre_extra structure (or defaulted) was reached. See the description above. 
PCRE_ERROR_BADNEWLINE (-23) @@ -2267,78 +2290,78 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER int pcre_get_substring_list(const char *subject, int *ovector, int stringcount, const char ***listptr); - Captured substrings can be accessed directly by using the offsets - returned by pcre_exec() in ovector. For convenience, the functions + Captured substrings can be accessed directly by using the offsets + returned by pcre_exec() in ovector. For convenience, the functions pcre_copy_substring(), pcre_get_substring(), and pcre_get_sub- - string_list() are provided for extracting captured substrings as new, - separate, zero-terminated strings. These functions identify substrings - by number. The next section describes functions for extracting named + string_list() are provided for extracting captured substrings as new, + separate, zero-terminated strings. These functions identify substrings + by number. The next section describes functions for extracting named substrings. - A substring that contains a binary zero is correctly extracted and has - a further zero added on the end, but the result is not, of course, a C - string. However, you can process such a string by referring to the - length that is returned by pcre_copy_substring() and pcre_get_sub- + A substring that contains a binary zero is correctly extracted and has + a further zero added on the end, but the result is not, of course, a C + string. However, you can process such a string by referring to the + length that is returned by pcre_copy_substring() and pcre_get_sub- string(). Unfortunately, the interface to pcre_get_substring_list() is - not adequate for handling strings containing binary zeros, because the + not adequate for handling strings containing binary zeros, because the end of the final string is not independently indicated. 
- The first three arguments are the same for all three of these func- - tions: subject is the subject string that has just been successfully + The first three arguments are the same for all three of these func- + tions: subject is the subject string that has just been successfully matched, ovector is a pointer to the vector of integer offsets that was passed to pcre_exec(), and stringcount is the number of substrings that - were captured by the match, including the substring that matched the + were captured by the match, including the substring that matched the entire regular expression. This is the value returned by pcre_exec() if - it is greater than zero. If pcre_exec() returned zero, indicating that - it ran out of space in ovector, the value passed as stringcount should + it is greater than zero. If pcre_exec() returned zero, indicating that + it ran out of space in ovector, the value passed as stringcount should be the number of elements in the vector divided by three. - The functions pcre_copy_substring() and pcre_get_substring() extract a - single substring, whose number is given as stringnumber. A value of - zero extracts the substring that matched the entire pattern, whereas - higher values extract the captured substrings. For pcre_copy_sub- - string(), the string is placed in buffer, whose length is given by - buffersize, while for pcre_get_substring() a new block of memory is - obtained via pcre_malloc, and its address is returned via stringptr. - The yield of the function is the length of the string, not including + The functions pcre_copy_substring() and pcre_get_substring() extract a + single substring, whose number is given as stringnumber. A value of + zero extracts the substring that matched the entire pattern, whereas + higher values extract the captured substrings. 
For pcre_copy_sub- + string(), the string is placed in buffer, whose length is given by + buffersize, while for pcre_get_substring() a new block of memory is + obtained via pcre_malloc, and its address is returned via stringptr. + The yield of the function is the length of the string, not including the terminating zero, or one of these error codes: PCRE_ERROR_NOMEMORY (-6) - The buffer was too small for pcre_copy_substring(), or the attempt to + The buffer was too small for pcre_copy_substring(), or the attempt to get memory failed for pcre_get_substring(). PCRE_ERROR_NOSUBSTRING (-7) There is no substring whose number is stringnumber. - The pcre_get_substring_list() function extracts all available sub- - strings and builds a list of pointers to them. All this is done in a + The pcre_get_substring_list() function extracts all available sub- + strings and builds a list of pointers to them. All this is done in a single block of memory that is obtained via pcre_malloc. The address of - the memory block is returned via listptr, which is also the start of - the list of string pointers. The end of the list is marked by a NULL - pointer. The yield of the function is zero if all went well, or the + the memory block is returned via listptr, which is also the start of + the list of string pointers. The end of the list is marked by a NULL + pointer. The yield of the function is zero if all went well, or the error code PCRE_ERROR_NOMEMORY (-6) if the attempt to get the memory block failed. - When any of these functions encounter a substring that is unset, which - can happen when capturing subpattern number n+1 matches some part of - the subject, but subpattern n has not been used at all, they return an + When any of these functions encounter a substring that is unset, which + can happen when capturing subpattern number n+1 matches some part of + the subject, but subpattern n has not been used at all, they return an empty string. 
This can be distinguished from a genuine zero-length sub- - string by inspecting the appropriate offset in ovector, which is nega- + string by inspecting the appropriate offset in ovector, which is nega- tive for unset substrings. - The two convenience functions pcre_free_substring() and pcre_free_sub- - string_list() can be used to free the memory returned by a previous + The two convenience functions pcre_free_substring() and pcre_free_sub- + string_list() can be used to free the memory returned by a previous call of pcre_get_substring() or pcre_get_substring_list(), respec- - tively. They do nothing more than call the function pointed to by - pcre_free, which of course could be called directly from a C program. - However, PCRE is used in some situations where it is linked via a spe- - cial interface to another programming language that cannot use - pcre_free directly; it is for these cases that the functions are pro- + tively. They do nothing more than call the function pointed to by + pcre_free, which of course could be called directly from a C program. + However, PCRE is used in some situations where it is linked via a spe- + cial interface to another programming language that cannot use + pcre_free directly; it is for these cases that the functions are pro- vided. @@ -2357,7 +2380,7 @@ EXTRACTING CAPTURED SUBSTRINGS BY NAME int stringcount, const char *stringname, const char **stringptr); - To extract a substring by name, you first have to find associated num- + To extract a substring by name, you first have to find associated num- ber. For example, for this pattern (a+)b(?\d+)... @@ -2366,29 +2389,34 @@ EXTRACTING CAPTURED SUBSTRINGS BY NAME be unique (PCRE_DUPNAMES was not set), you can find the number from the name by calling pcre_get_stringnumber(). The first argument is the com- piled pattern, and the second is the name. 
The yield of the function is - the subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no + the subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no subpattern of that name. Given the number, you can extract the substring directly, or use one of the functions described in the previous section. For convenience, there are also two functions that do the whole job. - Most of the arguments of pcre_copy_named_substring() and - pcre_get_named_substring() are the same as those for the similarly - named functions that extract by number. As these are described in the - previous section, they are not re-described here. There are just two + Most of the arguments of pcre_copy_named_substring() and + pcre_get_named_substring() are the same as those for the similarly + named functions that extract by number. As these are described in the + previous section, they are not re-described here. There are just two differences: - First, instead of a substring number, a substring name is given. Sec- + First, instead of a substring number, a substring name is given. Sec- ond, there is an extra argument, given at the start, which is a pointer - to the compiled pattern. This is needed in order to gain access to the + to the compiled pattern. This is needed in order to gain access to the name-to-number translation table. - These functions call pcre_get_stringnumber(), and if it succeeds, they - then call pcre_copy_substring() or pcre_get_substring(), as appropri- - ate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names, the + These functions call pcre_get_stringnumber(), and if it succeeds, they + then call pcre_copy_substring() or pcre_get_substring(), as appropri- + ate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names, the behaviour may not be what you want (see the next section). 
+ Warning: If the pattern uses the "(?|" feature to set up multiple sub- + patterns with the same number, you cannot use names to distinguish + them, because names are not included in the compiled code. The matching + process uses only numbers. + DUPLICATE SUBPATTERN NAMES @@ -2596,7 +2624,7 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION SEE ALSO pcrebuild(3), pcrecallout(3), pcrecpp(3)(3), pcrematching(3), pcrepar- - tial(3), pcreposix(3), pcreprecompile(3), pcresample(3), pcrestack(3). + tial(3), pcreposix(3), pcreprecompile(3), pcresample(3), pcrestack(3). AUTHOR @@ -2608,8 +2636,8 @@ AUTHOR REVISION - Last updated: 24 August 2008 - Copyright (c) 1997-2008 University of Cambridge. + Last updated: 11 April 2009 + Copyright (c) 1997-2009 University of Cambridge. ------------------------------------------------------------------------------ @@ -2660,8 +2688,8 @@ PCRE CALLOUTS MISSING CALLOUTS You should be aware that, because of optimizations in the way PCRE - matches patterns, callouts sometimes do not happen. For example, if the - pattern is + matches patterns by default, callouts sometimes do not happen. For + example, if the pattern is ab(?C4)cd @@ -2670,13 +2698,18 @@ MISSING CALLOUTS ever start, and the callout is never reached. However, with "abyd", though the result is still no match, the callout is obeyed. + You can disable these optimizations by passing the PCRE_NO_START_OPTI- + MIZE option to pcre_exec() or pcre_dfa_exec(). This slows down the + matching process, but does ensure that callouts such as the example + above are obeyed. + THE CALLOUT INTERFACE - During matching, when PCRE reaches a callout point, the external func- - tion defined by pcre_callout is called (if it is set). This applies to - both the pcre_exec() and the pcre_dfa_exec() matching functions. 
The - only argument to the callout function is a pointer to a pcre_callout + During matching, when PCRE reaches a callout point, the external func- + tion defined by pcre_callout is called (if it is set). This applies to + both the pcre_exec() and the pcre_dfa_exec() matching functions. The + only argument to the callout function is a pointer to a pcre_callout block. This structure contains the following fields: int version; @@ -2692,9 +2725,9 @@ THE CALLOUT INTERFACE int pattern_position; int next_item_length; - The version field is an integer containing the version number of the - block format. The initial version was 0; the current version is 1. The - version number will change again in future if additional fields are + The version field is an integer containing the version number of the + block format. The initial version was 0; the current version is 1. The + version number will change again in future if additional fields are added, but the intention is never to remove any of the existing fields. The callout_number field contains the number of the callout, as com- @@ -2779,8 +2812,8 @@ AUTHOR REVISION - Last updated: 29 May 2007 - Copyright (c) 1997-2007 University of Cambridge. + Last updated: 15 March 2009 + Copyright (c) 1997-2009 University of Cambridge. ------------------------------------------------------------------------------ @@ -2952,10 +2985,16 @@ PCRE REGULAR EXPRESSION DETAILS The original operation of PCRE was on strings of one-byte characters. However, there is now also support for UTF-8 character strings. To use this, you must build PCRE to include UTF-8 support, and then call - pcre_compile() with the PCRE_UTF8 option. How this affects pattern - matching is mentioned in several places below. There is also a summary - of UTF-8 features in the section on UTF-8 support in the main pcre - page. + pcre_compile() with the PCRE_UTF8 option. 
There is also a special + sequence that can be given at the start of a pattern: + + (*UTF8) + + Starting a pattern with this sequence is equivalent to setting the + PCRE_UTF8 option. This feature is not Perl-compatible. How setting + UTF-8 mode affects pattern matching is mentioned in several places + below. There is also a summary of UTF-8 features in the section on + UTF-8 support in the main pcre page. The remainder of this document discusses the patterns that are sup- ported by PCRE when its main matching function, pcre_exec(), is used. @@ -3059,33 +3098,33 @@ CHARACTERS AND METACHARACTERS syntax) ] terminates the character class - The following sections describe the use of each of the metacharacters. + The following sections describe the use of each of the metacharacters. BACKSLASH The backslash character has several uses. Firstly, if it is followed by - a non-alphanumeric character, it takes away any special meaning that - character may have. This use of backslash as an escape character + a non-alphanumeric character, it takes away any special meaning that + character may have. This use of backslash as an escape character applies both inside and outside character classes. - For example, if you want to match a * character, you write \* in the - pattern. This escaping action applies whether or not the following - character would otherwise be interpreted as a metacharacter, so it is - always safe to precede a non-alphanumeric with backslash to specify - that it stands for itself. In particular, if you want to match a back- + For example, if you want to match a * character, you write \* in the + pattern. This escaping action applies whether or not the following + character would otherwise be interpreted as a metacharacter, so it is + always safe to precede a non-alphanumeric with backslash to specify + that it stands for itself. In particular, if you want to match a back- slash, you write \\. 
- If a pattern is compiled with the PCRE_EXTENDED option, whitespace in - the pattern (other than in a character class) and characters between a + If a pattern is compiled with the PCRE_EXTENDED option, whitespace in + the pattern (other than in a character class) and characters between a # outside a character class and the next newline are ignored. An escap- - ing backslash can be used to include a whitespace or # character as + ing backslash can be used to include a whitespace or # character as part of the pattern. - If you want to remove the special meaning from a sequence of charac- - ters, you can do so by putting them between \Q and \E. This is differ- - ent from Perl in that $ and @ are handled as literals in \Q...\E - sequences in PCRE, whereas in Perl, $ and @ cause variable interpola- + If you want to remove the special meaning from a sequence of charac- + ters, you can do so by putting them between \Q and \E. This is differ- + ent from Perl in that $ and @ are handled as literals in \Q...\E + sequences in PCRE, whereas in Perl, $ and @ cause variable interpola- tion. Note the following examples: Pattern PCRE matches Perl matches @@ -3095,16 +3134,16 @@ BACKSLASH \Qabc\$xyz\E abc\$xyz abc\$xyz \Qabc\E\$\Qxyz\E abc$xyz abc$xyz - The \Q...\E sequence is recognized both inside and outside character + The \Q...\E sequence is recognized both inside and outside character classes. Non-printing characters A second use of backslash provides a way of encoding non-printing char- - acters in patterns in a visible manner. There is no restriction on the - appearance of non-printing characters, apart from the binary zero that - terminates a pattern, but when a pattern is being prepared by text - editing, it is usually easier to use one of the following escape + acters in patterns in a visible manner. 
There is no restriction on the + appearance of non-printing characters, apart from the binary zero that + terminates a pattern, but when a pattern is being prepared by text + editing, it is usually easier to use one of the following escape sequences than the binary character it represents: \a alarm, that is, the BEL character (hex 07) @@ -3118,48 +3157,48 @@ BACKSLASH \xhh character with hex code hh \x{hhh..} character with hex code hhh.. - The precise effect of \cx is as follows: if x is a lower case letter, - it is converted to upper case. Then bit 6 of the character (hex 40) is - inverted. Thus \cz becomes hex 1A, but \c{ becomes hex 3B, while \c; + The precise effect of \cx is as follows: if x is a lower case letter, + it is converted to upper case. Then bit 6 of the character (hex 40) is + inverted. Thus \cz becomes hex 1A, but \c{ becomes hex 3B, while \c; becomes hex 7B. - After \x, from zero to two hexadecimal digits are read (letters can be - in upper or lower case). Any number of hexadecimal digits may appear - between \x{ and }, but the value of the character code must be less + After \x, from zero to two hexadecimal digits are read (letters can be + in upper or lower case). Any number of hexadecimal digits may appear + between \x{ and }, but the value of the character code must be less than 256 in non-UTF-8 mode, and less than 2**31 in UTF-8 mode. That is, - the maximum value in hexadecimal is 7FFFFFFF. Note that this is bigger + the maximum value in hexadecimal is 7FFFFFFF. Note that this is bigger than the largest Unicode code point, which is 10FFFF. - If characters other than hexadecimal digits appear between \x{ and }, + If characters other than hexadecimal digits appear between \x{ and }, or if there is no terminating }, this form of escape is not recognized. 
- Instead, the initial \x will be interpreted as a basic hexadecimal - escape, with no following digits, giving a character whose value is + Instead, the initial \x will be interpreted as a basic hexadecimal + escape, with no following digits, giving a character whose value is zero. Characters whose value is less than 256 can be defined by either of the - two syntaxes for \x. There is no difference in the way they are han- + two syntaxes for \x. There is no difference in the way they are han- dled. For example, \xdc is exactly the same as \x{dc}. - After \0 up to two further octal digits are read. If there are fewer - than two digits, just those that are present are used. Thus the + After \0 up to two further octal digits are read. If there are fewer + than two digits, just those that are present are used. Thus the sequence \0\x\07 specifies two binary zeros followed by a BEL character - (code value 7). Make sure you supply two digits after the initial zero + (code value 7). Make sure you supply two digits after the initial zero if the pattern character that follows is itself an octal digit. The handling of a backslash followed by a digit other than 0 is compli- cated. Outside a character class, PCRE reads it and any following dig- - its as a decimal number. If the number is less than 10, or if there + its as a decimal number. If the number is less than 10, or if there have been at least that many previous capturing left parentheses in the - expression, the entire sequence is taken as a back reference. A - description of how this works is given later, following the discussion + expression, the entire sequence is taken as a back reference. A + description of how this works is given later, following the discussion of parenthesized subpatterns. 
- Inside a character class, or if the decimal number is greater than 9 - and there have not been that many capturing subpatterns, PCRE re-reads + Inside a character class, or if the decimal number is greater than 9 + and there have not been that many capturing subpatterns, PCRE re-reads up to three octal digits following the backslash, and uses them to gen- - erate a data character. Any subsequent digits stand for themselves. In - non-UTF-8 mode, the value of a character specified in octal must be - less than \400. In UTF-8 mode, values up to \777 are permitted. For + erate a data character. Any subsequent digits stand for themselves. In + non-UTF-8 mode, the value of a character specified in octal must be + less than \400. In UTF-8 mode, values up to \777 are permitted. For example: \040 is another way of writing a space @@ -3177,30 +3216,30 @@ BACKSLASH \81 is either a back reference, or a binary zero followed by the two characters "8" and "1" - Note that octal values of 100 or greater must not be introduced by a + Note that octal values of 100 or greater must not be introduced by a leading zero, because no more than three octal digits are ever read. All the sequences that define a single character value can be used both - inside and outside character classes. In addition, inside a character - class, the sequence \b is interpreted as the backspace character (hex - 08), and the sequences \R and \X are interpreted as the characters "R" - and "X", respectively. Outside a character class, these sequences have + inside and outside character classes. In addition, inside a character + class, the sequence \b is interpreted as the backspace character (hex + 08), and the sequences \R and \X are interpreted as the characters "R" + and "X", respectively. Outside a character class, these sequences have different meanings (see below). 
Absolute and relative back references - The sequence \g followed by an unsigned or a negative number, option- - ally enclosed in braces, is an absolute or relative back reference. A + The sequence \g followed by an unsigned or a negative number, option- + ally enclosed in braces, is an absolute or relative back reference. A named back reference can be coded as \g{name}. Back references are dis- cussed later, following the discussion of parenthesized subpatterns. Absolute and relative subroutine calls - For compatibility with Oniguruma, the non-Perl syntax \g followed by a + For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is - an alternative syntax for referencing a subpattern as a "subroutine". - Details are discussed later. Note that \g{...} (Perl syntax) and - \g<...> (Oniguruma syntax) are not synonymous. The former is a back + an alternative syntax for referencing a subpattern as a "subroutine". + Details are discussed later. Note that \g{...} (Perl syntax) and + \g<...> (Oniguruma syntax) are not synonymous. The former is a back reference; the latter is a subroutine call. Generic character types @@ -3220,25 +3259,26 @@ BACKSLASH \W any "non-word" character Each pair of escape sequences partitions the complete set of characters - into two disjoint sets. Any given character matches one, and only one, + into two disjoint sets. Any given character matches one, and only one, of each pair. These character type sequences can appear both inside and outside char- - acter classes. They each match one character of the appropriate type. - If the current matching point is at the end of the subject string, all + acter classes. They each match one character of the appropriate type. + If the current matching point is at the end of the subject string, all of them fail, since there is no character to match. - For compatibility with Perl, \s does not match the VT character (code - 11). 
This makes it different from the POSIX "space" class. The \s - characters are HT (9), LF (10), FF (12), CR (13), and space (32). If + For compatibility with Perl, \s does not match the VT character (code + 11). This makes it different from the POSIX "space" class. The \s + characters are HT (9), LF (10), FF (12), CR (13), and space (32). If "use locale;" is included in a Perl script, \s may match the VT charac- ter. In PCRE, it never does. - In UTF-8 mode, characters with values greater than 128 never match \d, + In UTF-8 mode, characters with values greater than 128 never match \d, \s, or \w, and always match \D, \S, and \W. This is true even when Uni- - code character property support is available. These sequences retain + code character property support is available. These sequences retain their original meanings from before UTF-8 support was available, mainly - for efficiency reasons. + for efficiency reasons. Note that this also affects \b, because it is + defined in terms of \w and \W. The sequences \h, \H, \v, and \V are Perl 5.10 features. In contrast to the other sequences, these do match certain high-valued codepoints in @@ -3764,17 +3804,17 @@ POSIX CHARACTER CLASSES VERTICAL BAR - Vertical bar characters are used to separate alternative patterns. For + Vertical bar characters are used to separate alternative patterns. For example, the pattern gilbert|sullivan - matches either "gilbert" or "sullivan". Any number of alternatives may - appear, and an empty alternative is permitted (matching the empty + matches either "gilbert" or "sullivan". Any number of alternatives may + appear, and an empty alternative is permitted (matching the empty string). The matching process tries each alternative in turn, from left - to right, and the first one that succeeds is used. If the alternatives - are within a subpattern (defined below), "succeeds" means matching the - rest of the main pattern as well as the alternative in the subpattern.
+ to right, and the first one that succeeds is used. If the alternatives + are within a subpattern (defined below), "succeeds" means matching the + rest of the main pattern as well as the alternative in the subpattern. INTERNAL OPTION SETTING @@ -3800,11 +3840,11 @@ INTERNAL OPTION SETTING can be changed in the same way as the Perl-compatible options by using the characters J, U and X respectively. - When an option change occurs at top level (that is, not inside subpat- - tern parentheses), the change applies to the remainder of the pattern - that follows. If the change is placed right at the start of a pattern, - PCRE extracts it into the global options (and it will therefore show up - in data extracted by the pcre_fullinfo() function). + When one of these option changes occurs at top level (that is, not + inside subpattern parentheses), the change applies to the remainder of + the pattern that follows. If the change is placed right at the start of + a pattern, PCRE extracts it into the global options (and it will there- + fore show up in data extracted by the pcre_fullinfo() function). An option change within a subpattern (see below for a description of subpatterns) affects only that part of the current pattern that follows @@ -3827,9 +3867,11 @@ INTERNAL OPTION SETTING Note: There are other PCRE-specific options that can be set by the application when the compile or match functions are called. In some - cases the pattern can contain special leading sequences to override - what the application has set or what has been defaulted. Details are - given in the section entitled "Newline sequences" above. + cases the pattern can contain special leading sequences such as (*CRLF) + to override what the application has set or what has been defaulted. + Details are given in the section entitled "Newline sequences" above. + There is also the (*UTF8) leading sequence that can be used to set + UTF-8 mode; this is equivalent to setting the PCRE_UTF8 option. 
SUBPATTERNS @@ -3968,6 +4010,10 @@ NAMED SUBPATTERNS lowest number is used. For further details of the interfaces for han- dling named subpatterns, see the pcreapi documentation. + Warning: You cannot use different names to distinguish between two sub- + patterns with the same number (see the previous section) because PCRE + uses only the numbers when matching. + REPETITION @@ -4008,44 +4054,44 @@ REPETITION the syntax of a quantifier, is taken as a literal character. For exam- ple, {,6} is not a quantifier, but a literal string of four characters. - In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to + In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to individual bytes. Thus, for example, \x{100}{2} matches two UTF-8 char- acters, each of which is represented by a two-byte sequence. Similarly, when Unicode property support is available, \X{3} matches three Unicode - extended sequences, each of which may be several bytes long (and they + extended sequences, each of which may be several bytes long (and they may be of different lengths). The quantifier {0} is permitted, causing the expression to behave as if the previous item and the quantifier were not present. This may be use- - ful for subpatterns that are referenced as subroutines from elsewhere + ful for subpatterns that are referenced as subroutines from elsewhere in the pattern. Items other than subpatterns that have a {0} quantifier are omitted from the compiled pattern. - For convenience, the three most common quantifiers have single-charac- + For convenience, the three most common quantifiers have single-charac- ter abbreviations: * is equivalent to {0,} + is equivalent to {1,} ? 
is equivalent to {0,1} - It is possible to construct infinite loops by following a subpattern + It is possible to construct infinite loops by following a subpattern that can match no characters with a quantifier that has no upper limit, for example: (a?)* Earlier versions of Perl and PCRE used to give an error at compile time - for such patterns. However, because there are cases where this can be - useful, such patterns are now accepted, but if any repetition of the - subpattern does in fact match no characters, the loop is forcibly bro- + for such patterns. However, because there are cases where this can be + useful, such patterns are now accepted, but if any repetition of the + subpattern does in fact match no characters, the loop is forcibly bro- ken. - By default, the quantifiers are "greedy", that is, they match as much - as possible (up to the maximum number of permitted times), without - causing the rest of the pattern to fail. The classic example of where + By default, the quantifiers are "greedy", that is, they match as much + as possible (up to the maximum number of permitted times), without + causing the rest of the pattern to fail. The classic example of where this gives problems is in trying to match comments in C programs. These - appear between /* and */ and within the comment, individual * and / - characters may appear. An attempt to match C comments by applying the + appear between /* and */ and within the comment, individual * and / + characters may appear. An attempt to match C comments by applying the pattern /\*.*\*/ @@ -4054,19 +4100,19 @@ REPETITION /* first comment */ not comment /* second comment */ - fails, because it matches the entire string owing to the greediness of + fails, because it matches the entire string owing to the greediness of the .* item. 
- However, if a quantifier is followed by a question mark, it ceases to + However, if a quantifier is followed by a question mark, it ceases to be greedy, and instead matches the minimum number of times possible, so the pattern /\*.*?\*/ - does the right thing with the C comments. The meaning of the various - quantifiers is not otherwise changed, just the preferred number of - matches. Do not confuse this use of question mark with its use as a - quantifier in its own right. Because it has two uses, it can sometimes + does the right thing with the C comments. The meaning of the various + quantifiers is not otherwise changed, just the preferred number of + matches. Do not confuse this use of question mark with its use as a + quantifier in its own right. Because it has two uses, it can sometimes appear doubled, as in \d??\d @@ -4074,36 +4120,36 @@ REPETITION which matches one digit by preference, but can match two if that is the only way the rest of the pattern matches. - If the PCRE_UNGREEDY option is set (an option that is not available in - Perl), the quantifiers are not greedy by default, but individual ones - can be made greedy by following them with a question mark. In other + If the PCRE_UNGREEDY option is set (an option that is not available in + Perl), the quantifiers are not greedy by default, but individual ones + can be made greedy by following them with a question mark. In other words, it inverts the default behaviour. - When a parenthesized subpattern is quantified with a minimum repeat - count that is greater than 1 or with a limited maximum, more memory is - required for the compiled pattern, in proportion to the size of the + When a parenthesized subpattern is quantified with a minimum repeat + count that is greater than 1 or with a limited maximum, more memory is + required for the compiled pattern, in proportion to the size of the minimum or maximum. 
If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equiv- - alent to Perl's /s) is set, thus allowing the dot to match newlines, - the pattern is implicitly anchored, because whatever follows will be - tried against every character position in the subject string, so there - is no point in retrying the overall match at any position after the - first. PCRE normally treats such a pattern as though it were preceded + alent to Perl's /s) is set, thus allowing the dot to match newlines, + the pattern is implicitly anchored, because whatever follows will be + tried against every character position in the subject string, so there + is no point in retrying the overall match at any position after the + first. PCRE normally treats such a pattern as though it were preceded by \A. - In cases where it is known that the subject string contains no new- - lines, it is worth setting PCRE_DOTALL in order to obtain this opti- + In cases where it is known that the subject string contains no new- + lines, it is worth setting PCRE_DOTALL in order to obtain this opti- mization, or alternatively using ^ to indicate anchoring explicitly. - However, there is one situation where the optimization cannot be used. - When .* is inside capturing parentheses that are the subject of a - backreference elsewhere in the pattern, a match at the start may fail + However, there is one situation where the optimization cannot be used. + When .* is inside capturing parentheses that are the subject of a + backreference elsewhere in the pattern, a match at the start may fail where a later one succeeds. Consider, for example: (.*)abc\1 - If the subject is "xyz123abc123" the match point is the fourth charac- + If the subject is "xyz123abc123" the match point is the fourth charac- ter. For this reason, such a pattern is not implicitly anchored. 
When a capturing subpattern is repeated, the value captured is the sub- @@ -4112,8 +4158,8 @@ REPETITION (tweedle[dume]{3}\s*)+ has matched "tweedledum tweedledee" the value of the captured substring - is "tweedledee". However, if there are nested capturing subpatterns, - the corresponding captured values may have been set in previous itera- + is "tweedledee". However, if there are nested capturing subpatterns, + the corresponding captured values may have been set in previous itera- tions. For example, after /(a|(b))+/ @@ -4123,28 +4169,28 @@ REPETITION ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS - With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") - repetition, failure of what follows normally causes the repeated item - to be re-evaluated to see if a different number of repeats allows the - rest of the pattern to match. Sometimes it is useful to prevent this, - either to change the nature of the match, or to cause it to fail earlier - than it otherwise might, when the author of the pattern knows there is + With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") + repetition, failure of what follows normally causes the repeated item + to be re-evaluated to see if a different number of repeats allows the + rest of the pattern to match. Sometimes it is useful to prevent this, + either to change the nature of the match, or to cause it to fail earlier + than it otherwise might, when the author of the pattern knows there is no point in carrying on. - Consider, for example, the pattern \d+foo when applied to the subject + Consider, for example, the pattern \d+foo when applied to the subject line 123456bar After matching all 6 digits and then failing to match "foo", the normal - action of the matcher is to try again with only 5 digits matching the - \d+ item, and then with 4, and so on, before ultimately failing.
- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides - the means for specifying that once a subpattern has matched, it is not + action of the matcher is to try again with only 5 digits matching the + \d+ item, and then with 4, and so on, before ultimately failing. + "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides + the means for specifying that once a subpattern has matched, it is not to be re-evaluated in this way. - If we use atomic grouping for the previous example, the matcher gives - up immediately on failing to match "foo" the first time. The notation + If we use atomic grouping for the previous example, the matcher gives + up immediately on failing to match "foo" the first time. The notation is a kind of special parenthesis, starting with (?> as in this example: (?>\d+)foo @@ -4222,44 +4268,44 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS ((?>\D+)|<\d+>)*[!?] - sequences of non-digits cannot be broken, and failure happens quickly. + sequences of non-digits cannot be broken, and failure happens quickly. BACK REFERENCES Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a back reference to a capturing sub- - pattern earlier (that is, to its left) in the pattern, provided there + pattern earlier (that is, to its left) in the pattern, provided there have been that many previous capturing left parentheses. However, if the decimal number following the backslash is less than 10, - it is always taken as a back reference, and causes an error only if - there are not that many capturing left parentheses in the entire pat- - tern. In other words, the parentheses that are referenced need not be - to the left of the reference for numbers less than 10. 
A "forward back - reference" of this type can make sense when a repetition is involved - and the subpattern to the right has participated in an earlier itera- + it is always taken as a back reference, and causes an error only if + there are not that many capturing left parentheses in the entire pat- + tern. In other words, the parentheses that are referenced need not be + to the left of the reference for numbers less than 10. A "forward back + reference" of this type can make sense when a repetition is involved + and the subpattern to the right has participated in an earlier itera- tion. - It is not possible to have a numerical "forward back reference" to a - subpattern whose number is 10 or more using this syntax because a - sequence such as \50 is interpreted as a character defined in octal. + It is not possible to have a numerical "forward back reference" to a + subpattern whose number is 10 or more using this syntax because a + sequence such as \50 is interpreted as a character defined in octal. See the subsection entitled "Non-printing characters" above for further - details of the handling of digits following a backslash. There is no - such problem when named parentheses are used. A back reference to any + details of the handling of digits following a backslash. There is no + such problem when named parentheses are used. A back reference to any subpattern is possible using named parentheses (see below). - Another way of avoiding the ambiguity inherent in the use of digits + Another way of avoiding the ambiguity inherent in the use of digits following a backslash is to use the \g escape sequence, which is a fea- - ture introduced in Perl 5.10. This escape must be followed by an - unsigned number or a negative number, optionally enclosed in braces. + ture introduced in Perl 5.10. This escape must be followed by an + unsigned number or a negative number, optionally enclosed in braces. 
These examples are all identical: (ring), \1 (ring), \g1 (ring), \g{1} - An unsigned number specifies an absolute reference without the ambigu- + An unsigned number specifies an absolute reference without the ambigu- ity that is present in the older syntax. It is also useful when literal digits follow the reference. A negative number is a relative reference. Consider this example: @@ -4267,33 +4313,33 @@ BACK REFERENCES (abc(def)ghi)\g{-1} The sequence \g{-1} is a reference to the most recently started captur- - ing subpattern before \g, that is, it is equivalent to \2. Similarly, + ing subpattern before \g, that is, it is equivalent to \2. Similarly, \g{-2} would be equivalent to \1. The use of relative references can be - helpful in long patterns, and also in patterns that are created by + helpful in long patterns, and also in patterns that are created by joining together fragments that contain references within themselves. - A back reference matches whatever actually matched the capturing sub- - pattern in the current subject string, rather than anything matching + A back reference matches whatever actually matched the capturing sub- + pattern in the current subject string, rather than anything matching the subpattern itself (see "Subpatterns as subroutines" below for a way of doing that). So the pattern (sens|respons)e and \1ibility - matches "sense and sensibility" and "response and responsibility", but - not "sense and responsibility". If caseful matching is in force at the - time of the back reference, the case of letters is relevant. + matches "sense and sensibility" and "response and responsibility", but + not "sense and responsibility". If caseful matching is in force at the + time of the back reference, the case of letters is relevant.
For exam- ple, ((?i)rah)\s+\1 - matches "rah rah" and "RAH RAH", but not "RAH rah", even though the + matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original capturing subpattern is matched caselessly. - There are several different ways of writing back references to named - subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or - \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's + There are several different ways of writing back references to named + subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or + \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's unified back reference syntax, in which \g can be used for both numeric - and named references, is also supported. We could rewrite the above + and named references, is also supported. We could rewrite the above example in any of the following ways: (?<p1>(?i)rah)\s+\k<p1> @@ -4301,57 +4347,57 @@ BACK REFERENCES (?P<p1>(?i)rah)\s+(?P=p1) (?<p1>(?i)rah)\s+\g{p1} - A subpattern that is referenced by name may appear in the pattern + A subpattern that is referenced by name may appear in the pattern before or after the reference. - There may be more than one back reference to the same subpattern. If a - subpattern has not actually been used in a particular match, any back + There may be more than one back reference to the same subpattern. If a + subpattern has not actually been used in a particular match, any back references to it always fail. For example, the pattern (a|(bc))\2 - always fails if it starts to match "a" rather than "bc". Because there - may be many capturing parentheses in a pattern, all digits following - the backslash are taken as part of a potential back reference number. + always fails if it starts to match "a" rather than "bc". Because there + may be many capturing parentheses in a pattern, all digits following + the backslash are taken as part of a potential back reference number.
If the pattern continues with a digit character, some delimiter must be - used to terminate the back reference. If the PCRE_EXTENDED option is - set, this can be whitespace. Otherwise an empty comment (see "Com- + used to terminate the back reference. If the PCRE_EXTENDED option is + set, this can be whitespace. Otherwise an empty comment (see "Com- ments" below) can be used. - A back reference that occurs inside the parentheses to which it refers - fails when the subpattern is first used, so, for example, (a\1) never - matches. However, such references can be useful inside repeated sub- + A back reference that occurs inside the parentheses to which it refers + fails when the subpattern is first used, so, for example, (a\1) never + matches. However, such references can be useful inside repeated sub- patterns. For example, the pattern (a|b\1)+ matches any number of "a"s and also "aba", "ababbaa" etc. At each iter- - ation of the subpattern, the back reference matches the character - string corresponding to the previous iteration. In order for this to - work, the pattern must be such that the first iteration does not need - to match the back reference. This can be done using alternation, as in + ation of the subpattern, the back reference matches the character + string corresponding to the previous iteration. In order for this to + work, the pattern must be such that the first iteration does not need + to match the back reference. This can be done using alternation, as in the example above, or by a quantifier with a minimum of zero. ASSERTIONS - An assertion is a test on the characters following or preceding the - current matching point that does not actually consume any characters. - The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are + An assertion is a test on the characters following or preceding the + current matching point that does not actually consume any characters. 
+ The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described above. - More complicated assertions are coded as subpatterns. There are two - kinds: those that look ahead of the current position in the subject - string, and those that look behind it. An assertion subpattern is - matched in the normal way, except that it does not cause the current + More complicated assertions are coded as subpatterns. There are two + kinds: those that look ahead of the current position in the subject + string, and those that look behind it. An assertion subpattern is + matched in the normal way, except that it does not cause the current matching position to be changed. - Assertion subpatterns are not capturing subpatterns, and may not be - repeated, because it makes no sense to assert the same thing several - times. If any kind of assertion contains capturing subpatterns within - it, these are counted for the purposes of numbering the capturing sub- + Assertion subpatterns are not capturing subpatterns, and may not be + repeated, because it makes no sense to assert the same thing several + times. If any kind of assertion contains capturing subpatterns within + it, these are counted for the purposes of numbering the capturing sub- patterns in the whole pattern. However, substring capturing is carried - out only for positive assertions, because it does not make sense for + out only for positive assertions, because it does not make sense for negative assertions. Lookahead assertions @@ -4361,37 +4407,37 @@ ASSERTIONS \w+(?=;) - matches a word followed by a semicolon, but does not include the semi- + matches a word followed by a semicolon, but does not include the semi- colon in the match, and foo(?!bar) - matches any occurrence of "foo" that is not followed by "bar". Note + matches any occurrence of "foo" that is not followed by "bar". 
Note that the apparently similar pattern (?!foo)bar - does not find an occurrence of "bar" that is preceded by something - other than "foo"; it finds any occurrence of "bar" whatsoever, because + does not find an occurrence of "bar" that is preceded by something + other than "foo"; it finds any occurrence of "bar" whatsoever, because the assertion (?!foo) is always true when the next three characters are "bar". A lookbehind assertion is needed to achieve the other effect. If you want to force a matching failure at some point in a pattern, the - most convenient way to do it is with (?!) because an empty string - always matches, so an assertion that requires there not to be an empty + most convenient way to do it is with (?!) because an empty string + always matches, so an assertion that requires there not to be an empty string must always fail. Lookbehind assertions - Lookbehind assertions start with (?<= for positive assertions and (?<! for negative assertions. [...] - Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a - used subpattern by name. For compatibility with earlier versions of - PCRE, which had this facility before Perl, the syntax (?(name)...) is - also recognized. However, there is a possible ambiguity with this syn- - tax, because subpattern names may consist entirely of digits. PCRE - looks first for a named subpattern; if it cannot find one and the name - consists entirely of digits, PCRE looks for a subpattern of that num- - ber, which must be greater than zero. Using subpattern names that con- + Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a + used subpattern by name. For compatibility with earlier versions of + PCRE, which had this facility before Perl, the syntax (?(name)...) is + also recognized. However, there is a possible ambiguity with this syn- + tax, because subpattern names may consist entirely of digits.
PCRE + looks first for a named subpattern; if it cannot find one and the name + consists entirely of digits, PCRE looks for a subpattern of that num- + ber, which must be greater than zero. Using subpattern names that con- sist entirely of digits is not recommended. Rewriting the above example to use a named subpattern gives this: @@ -4563,85 +4609,85 @@ CONDITIONAL SUBPATTERNS Checking for pattern recursion If the condition is the string (R), and there is no subpattern with the - name R, the condition is true if a recursive call to the whole pattern + name R, the condition is true if a recursive call to the whole pattern or any subpattern has been made. If digits or a name preceded by amper- sand follow the letter R, for example: (?(R3)...) or (?(R&name)...) - the condition is true if the most recent recursion is into the subpat- - tern whose number or name is given. This condition does not check the + the condition is true if the most recent recursion is into the subpat- + tern whose number or name is given. This condition does not check the entire recursion stack. - At "top level", all these recursion test conditions are false. Recur- + At "top level", all these recursion test conditions are false. Recur- sive patterns are described below. Defining subpatterns for use by reference only - If the condition is the string (DEFINE), and there is no subpattern - with the name DEFINE, the condition is always false. In this case, - there may be only one alternative in the subpattern. It is always - skipped if control reaches this point in the pattern; the idea of - DEFINE is that it can be used to define "subroutines" that can be ref- - erenced from elsewhere. (The use of "subroutines" is described below.) - For example, a pattern to match an IPv4 address could be written like + If the condition is the string (DEFINE), and there is no subpattern + with the name DEFINE, the condition is always false. In this case, + there may be only one alternative in the subpattern. 
It is always + skipped if control reaches this point in the pattern; the idea of + DEFINE is that it can be used to define "subroutines" that can be ref- + erenced from elsewhere. (The use of "subroutines" is described below.) + For example, a pattern to match an IPv4 address could be written like this (ignore whitespace and line breaks): (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) ) \b (?&byte) (\.(?&byte)){3} \b - The first part of the pattern is a DEFINE group inside which a another - group named "byte" is defined. This matches an individual component of - an IPv4 address (a number less than 256). When matching takes place, - this part of the pattern is skipped because DEFINE acts like a false + The first part of the pattern is a DEFINE group inside which another + group named "byte" is defined. This matches an individual component of + an IPv4 address (a number less than 256). When matching takes place, + this part of the pattern is skipped because DEFINE acts like a false condition. The rest of the pattern uses references to the named group to match the - four dot-separated components of an IPv4 address, insisting on a word + four dot-separated components of an IPv4 address, insisting on a word boundary at each end. Assertion conditions - If the condition is not in any of the above formats, it must be an - assertion. This may be a positive or negative lookahead or lookbehind - assertion. Consider this pattern, again containing non-significant + If the condition is not in any of the above formats, it must be an + assertion. This may be a positive or negative lookahead or lookbehind + assertion. Consider this pattern, again containing non-significant white space, and with the two alternatives on the second line: (?(?=[^a-z]*[a-z]) \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} ) - The condition is a positive lookahead assertion that matches an - optional sequence of non-letters followed by a letter.
In other words, - it tests for the presence of at least one letter in the subject. If a - letter is found, the subject is matched against the first alternative; - otherwise it is matched against the second. This pattern matches - strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are + The condition is a positive lookahead assertion that matches an + optional sequence of non-letters followed by a letter. In other words, + it tests for the presence of at least one letter in the subject. If a + letter is found, the subject is matched against the first alternative; + otherwise it is matched against the second. This pattern matches + strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits. COMMENTS - The sequence (?# marks the start of a comment that continues up to the - next closing parenthesis. Nested parentheses are not permitted. The - characters that make up a comment play no part in the pattern matching + The sequence (?# marks the start of a comment that continues up to the + next closing parenthesis. Nested parentheses are not permitted. The + characters that make up a comment play no part in the pattern matching at all. - If the PCRE_EXTENDED option is set, an unescaped # character outside a - character class introduces a comment that continues to immediately + If the PCRE_EXTENDED option is set, an unescaped # character outside a + character class introduces a comment that continues to immediately after the next newline in the pattern. RECURSIVE PATTERNS - Consider the problem of matching a string in parentheses, allowing for - unlimited nested parentheses. Without the use of recursion, the best - that can be done is to use a pattern that matches up to some fixed - depth of nesting. It is not possible to handle an arbitrary nesting + Consider the problem of matching a string in parentheses, allowing for + unlimited nested parentheses. 
Without the use of recursion, the best + that can be done is to use a pattern that matches up to some fixed + depth of nesting. It is not possible to handle an arbitrary nesting depth. For some time, Perl has provided a facility that allows regular expres- - sions to recurse (amongst other things). It does this by interpolating - Perl code in the expression at run time, and the code can refer to the + sions to recurse (amongst other things). It does this by interpolating + Perl code in the expression at run time, and the code can refer to the expression itself. A Perl pattern using code interpolation to solve the parentheses problem can be created like this: @@ -4651,117 +4697,117 @@ RECURSIVE PATTERNS refers recursively to the pattern in which it appears. Obviously, PCRE cannot support the interpolation of Perl code. Instead, - it supports special syntax for recursion of the entire pattern, and - also for individual subpattern recursion. After its introduction in - PCRE and Python, this kind of recursion was introduced into Perl at + it supports special syntax for recursion of the entire pattern, and + also for individual subpattern recursion. After its introduction in + PCRE and Python, this kind of recursion was introduced into Perl at release 5.10. - A special item that consists of (? followed by a number greater than + A special item that consists of (? followed by a number greater than zero and a closing parenthesis is a recursive call of the subpattern of - the given number, provided that it occurs inside that subpattern. (If - not, it is a "subroutine" call, which is described in the next sec- - tion.) The special item (?R) or (?0) is a recursive call of the entire + the given number, provided that it occurs inside that subpattern. (If + not, it is a "subroutine" call, which is described in the next sec- + tion.) The special item (?R) or (?0) is a recursive call of the entire regular expression. 
- In PCRE (like Python, but unlike Perl), a recursive subpattern call is + In PCRE (like Python, but unlike Perl), a recursive subpattern call is always treated as an atomic group. That is, once it has matched some of the subject string, it is never re-entered, even if it contains untried alternatives and there is a subsequent matching failure. - This PCRE pattern solves the nested parentheses problem (assume the + This PCRE pattern solves the nested parentheses problem (assume the PCRE_EXTENDED option is set so that white space is ignored): \( ( (?>[^()]+) | (?R) )* \) - First it matches an opening parenthesis. Then it matches any number of - substrings which can either be a sequence of non-parentheses, or a - recursive match of the pattern itself (that is, a correctly parenthe- + First it matches an opening parenthesis. Then it matches any number of + substrings which can either be a sequence of non-parentheses, or a + recursive match of the pattern itself (that is, a correctly parenthe- sized substring). Finally there is a closing parenthesis. - If this were part of a larger pattern, you would not want to recurse + If this were part of a larger pattern, you would not want to recurse the entire pattern, so instead you could use this: ( \( ( (?>[^()]+) | (?1) )* \) ) - We have put the pattern into parentheses, and caused the recursion to + We have put the pattern into parentheses, and caused the recursion to refer to them instead of the whole pattern. - In a larger pattern, keeping track of parenthesis numbers can be - tricky. This is made easier by the use of relative references. (A Perl - 5.10 feature.) Instead of (?1) in the pattern above you can write + In a larger pattern, keeping track of parenthesis numbers can be + tricky. This is made easier by the use of relative references. (A Perl + 5.10 feature.) Instead of (?1) in the pattern above you can write (?-2) to refer to the second most recently opened parentheses preceding - the recursion. 
In other words, a negative number counts capturing + the recursion. In other words, a negative number counts capturing parentheses leftwards from the point at which it is encountered. - It is also possible to refer to subsequently opened parentheses, by - writing references such as (?+2). However, these cannot be recursive - because the reference is not inside the parentheses that are refer- - enced. They are always "subroutine" calls, as described in the next + It is also possible to refer to subsequently opened parentheses, by + writing references such as (?+2). However, these cannot be recursive + because the reference is not inside the parentheses that are refer- + enced. They are always "subroutine" calls, as described in the next section. - An alternative approach is to use named parentheses instead. The Perl - syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also + An alternative approach is to use named parentheses instead. The Perl + syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also supported. We could rewrite the above example as follows: (? \( ( (?>[^()]+) | (?&pn) )* \) ) - If there is more than one subpattern with the same name, the earliest + If there is more than one subpattern with the same name, the earliest one is used. - This particular example pattern that we have been looking at contains - nested unlimited repeats, and so the use of atomic grouping for match- - ing strings of non-parentheses is important when applying the pattern + This particular example pattern that we have been looking at contains + nested unlimited repeats, and so the use of atomic grouping for match- + ing strings of non-parentheses is important when applying the pattern to strings that do not match. For example, when this pattern is applied to (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() - it yields "no match" quickly. 
However, if atomic grouping is not used, - the match runs for a very long time indeed because there are so many - different ways the + and * repeats can carve up the subject, and all + it yields "no match" quickly. However, if atomic grouping is not used, + the match runs for a very long time indeed because there are so many + different ways the + and * repeats can carve up the subject, and all have to be tested before failure can be reported. At the end of a match, the values set for any capturing subpatterns are those from the outermost level of the recursion at which the subpattern - value is set. If you want to obtain intermediate values, a callout - function can be used (see below and the pcrecallout documentation). If + value is set. If you want to obtain intermediate values, a callout + function can be used (see below and the pcrecallout documentation). If the pattern above is matched against (ab(cd)ef) - the value for the capturing parentheses is "ef", which is the last - value taken on at the top level. If additional parentheses are added, + the value for the capturing parentheses is "ef", which is the last + value taken on at the top level. If additional parentheses are added, giving \( ( ( (?>[^()]+) | (?R) )* ) \) ^ ^ ^ ^ - the string they capture is "ab(cd)ef", the contents of the top level - parentheses. If there are more than 15 capturing parentheses in a pat- + the string they capture is "ab(cd)ef", the contents of the top level + parentheses. If there are more than 15 capturing parentheses in a pat- tern, PCRE has to obtain extra memory to store data during a recursion, - which it does by using pcre_malloc, freeing it via pcre_free after- - wards. If no memory can be obtained, the match fails with the + which it does by using pcre_malloc, freeing it via pcre_free after- + wards. If no memory can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error. - Do not confuse the (?R) item with the condition (R), which tests for - recursion. 
Consider this pattern, which matches text in angle brack- - ets, allowing for arbitrary nesting. Only digits are allowed in nested - brackets (that is, when recursing), whereas any characters are permit- + Do not confuse the (?R) item with the condition (R), which tests for + recursion. Consider this pattern, which matches text in angle brack- + ets, allowing for arbitrary nesting. Only digits are allowed in nested + brackets (that is, when recursing), whereas any characters are permit- ted at the outer level. < (?: (?(R) \d++ | [^<>]*+) | (?R)) * > - In this pattern, (?(R) is the start of a conditional subpattern, with - two different alternatives for the recursive and non-recursive cases. + In this pattern, (?(R) is the start of a conditional subpattern, with + two different alternatives for the recursive and non-recursive cases. The (?R) item is the actual recursive call. SUBPATTERNS AS SUBROUTINES If the syntax for a recursive subpattern reference (either by number or - by name) is used outside the parentheses to which it refers, it oper- - ates like a subroutine in a programming language. The "called" subpat- + by name) is used outside the parentheses to which it refers, it oper- + ates like a subroutine in a programming language. The "called" subpat- tern may be defined before or after the reference. A numbered reference can be absolute or relative, as in these examples: @@ -4773,105 +4819,105 @@ SUBPATTERNS AS SUBROUTINES (sens|respons)e and \1ibility - matches "sense and sensibility" and "response and responsibility", but + matches "sense and sensibility" and "response and responsibility", but not "sense and responsibility". If instead the pattern (sens|respons)e and (?1)ibility - is used, it does match "sense and responsibility" as well as the other - two strings. Another example is given in the discussion of DEFINE + is used, it does match "sense and responsibility" as well as the other + two strings. 
Another example is given in the discussion of DEFINE above. Like recursive subpatterns, a "subroutine" call is always treated as an - atomic group. That is, once it has matched some of the subject string, - it is never re-entered, even if it contains untried alternatives and + atomic group. That is, once it has matched some of the subject string, + it is never re-entered, even if it contains untried alternatives and there is a subsequent matching failure. - When a subpattern is used as a subroutine, processing options such as + When a subpattern is used as a subroutine, processing options such as case-independence are fixed when the subpattern is defined. They cannot be changed for different calls. For example, consider this pattern: (abc)(?i:(?-1)) - It matches "abcabc". It does not match "abcABC" because the change of + It matches "abcabc". It does not match "abcABC" because the change of processing option does not affect the called subpattern. ONIGURUMA SUBROUTINE SYNTAX - For compatibility with Oniguruma, the non-Perl syntax \g followed by a + For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is - an alternative syntax for referencing a subpattern as a subroutine, - possibly recursively. Here are two of the examples used above, rewrit- + an alternative syntax for referencing a subpattern as a subroutine, + possibly recursively. Here are two of the examples used above, rewrit- ten using this syntax: (? \( ( (?>[^()]+) | \g )* \) ) (sens|respons)e and \g'1'ibility - PCRE supports an extension to Oniguruma: if a number is preceded by a + PCRE supports an extension to Oniguruma: if a number is preceded by a plus or a minus sign it is taken as a relative reference. For example: (abc)(?i:\g<-1>) - Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not - synonymous. 
The former is a back reference; the latter is a subroutine + Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not + synonymous. The former is a back reference; the latter is a subroutine call. CALLOUTS Perl has a feature whereby using the sequence (?{...}) causes arbitrary - Perl code to be obeyed in the middle of matching a regular expression. + Perl code to be obeyed in the middle of matching a regular expression. This makes it possible, amongst other things, to extract different sub- strings that match the same pair of parentheses when there is a repeti- tion. PCRE provides a similar feature, but of course it cannot obey arbitrary Perl code. The feature is called "callout". The caller of PCRE provides - an external function by putting its entry point in the global variable - pcre_callout. By default, this variable contains NULL, which disables + an external function by putting its entry point in the global variable + pcre_callout. By default, this variable contains NULL, which disables all calling out. - Within a regular expression, (?C) indicates the points at which the - external function is to be called. If you want to identify different - callout points, you can put a number less than 256 after the letter C. - The default value is zero. For example, this pattern has two callout + Within a regular expression, (?C) indicates the points at which the + external function is to be called. If you want to identify different + callout points, you can put a number less than 256 after the letter C. + The default value is zero. For example, this pattern has two callout points: (?C1)abc(?C2)def If the PCRE_AUTO_CALLOUT flag is passed to pcre_compile(), callouts are - automatically installed before each item in the pattern. They are all + automatically installed before each item in the pattern. They are all numbered 255. During matching, when PCRE reaches a callout point (and pcre_callout is - set), the external function is called. 
It is provided with the number - of the callout, the position in the pattern, and, optionally, one item - of data originally supplied by the caller of pcre_exec(). The callout - function may cause matching to proceed, to backtrack, or to fail alto- + set), the external function is called. It is provided with the number + of the callout, the position in the pattern, and, optionally, one item + of data originally supplied by the caller of pcre_exec(). The callout + function may cause matching to proceed, to backtrack, or to fail alto- gether. A complete description of the interface to the callout function is given in the pcrecallout documentation. BACKTRACKING CONTROL - Perl 5.10 introduced a number of "Special Backtracking Control Verbs", + Perl 5.10 introduced a number of "Special Backtracking Control Verbs", which are described in the Perl documentation as "experimental and sub- - ject to change or removal in a future version of Perl". It goes on to - say: "Their usage in production code should be noted to avoid problems + ject to change or removal in a future version of Perl". It goes on to + say: "Their usage in production code should be noted to avoid problems during upgrades." The same remarks apply to the PCRE features described in this section. - Since these verbs are specifically related to backtracking, most of - them can be used only when the pattern is to be matched using + Since these verbs are specifically related to backtracking, most of + them can be used only when the pattern is to be matched using pcre_exec(), which uses a backtracking algorithm. With the exception of (*FAIL), which behaves like a failing negative assertion, they cause an error if encountered by pcre_dfa_exec(). - The new verbs make use of what was previously invalid syntax: an open- + The new verbs make use of what was previously invalid syntax: an open- ing parenthesis followed by an asterisk. 
In Perl, they are generally of the form (*VERB:ARG) but PCRE does not support the use of arguments, so - its general form is just (*VERB). Any number of these verbs may occur + its general form is just (*VERB). Any number of these verbs may occur in a pattern. There are two kinds: Verbs that act immediately @@ -4880,94 +4926,94 @@ BACKTRACKING CONTROL (*ACCEPT) - This verb causes the match to end successfully, skipping the remainder - of the pattern. When inside a recursion, only the innermost pattern is - ended immediately. PCRE differs from Perl in what happens if the - (*ACCEPT) is inside capturing parentheses. In Perl, the data so far is + This verb causes the match to end successfully, skipping the remainder + of the pattern. When inside a recursion, only the innermost pattern is + ended immediately. PCRE differs from Perl in what happens if the + (*ACCEPT) is inside capturing parentheses. In Perl, the data so far is captured: in PCRE no data is captured. For example: A(A|B(*ACCEPT)|C)D - This matches "AB", "AAD", or "ACD", but when it matches "AB", no data + This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is captured. (*FAIL) or (*F) - This verb causes the match to fail, forcing backtracking to occur. It - is equivalent to (?!) but easier to read. The Perl documentation notes - that it is probably useful only when combined with (?{}) or (??{}). - Those are, of course, Perl features that are not present in PCRE. The - nearest equivalent is the callout feature, as for example in this pat- + This verb causes the match to fail, forcing backtracking to occur. It + is equivalent to (?!) but easier to read. The Perl documentation notes + that it is probably useful only when combined with (?{}) or (??{}). + Those are, of course, Perl features that are not present in PCRE. 
The + nearest equivalent is the callout feature, as for example in this pat- tern: a+(?C)(*FAIL) - A match with the string "aaaa" always fails, but the callout is taken + A match with the string "aaaa" always fails, but the callout is taken before each backtrack happens (in this example, 10 times). Verbs that act after backtracking The following verbs do nothing when they are encountered. Matching con- - tinues with what follows, but if there is no subsequent match, a fail- - ure is forced. The verbs differ in exactly what kind of failure + tinues with what follows, but if there is no subsequent match, a fail- + ure is forced. The verbs differ in exactly what kind of failure occurs. (*COMMIT) - This verb causes the whole match to fail outright if the rest of the - pattern does not match. Even if the pattern is unanchored, no further - attempts to find a match by advancing the start point take place. Once - (*COMMIT) has been passed, pcre_exec() is committed to finding a match + This verb causes the whole match to fail outright if the rest of the + pattern does not match. Even if the pattern is unanchored, no further + attempts to find a match by advancing the start point take place. Once + (*COMMIT) has been passed, pcre_exec() is committed to finding a match at the current starting point, or not at all. For example: a+(*COMMIT)b - This matches "xxaab" but not "aacaab". It can be thought of as a kind + This matches "xxaab" but not "aacaab". It can be thought of as a kind of dynamic anchor, or "I've started, so I must finish." (*PRUNE) - This verb causes the match to fail at the current position if the rest + This verb causes the match to fail at the current position if the rest of the pattern does not match. If the pattern is unanchored, the normal - "bumpalong" advance to the next starting character then happens. 
Back- - tracking can occur as usual to the left of (*PRUNE), or when matching - to the right of (*PRUNE), but if there is no match to the right, back- - tracking cannot cross (*PRUNE). In simple cases, the use of (*PRUNE) + "bumpalong" advance to the next starting character then happens. Back- + tracking can occur as usual to the left of (*PRUNE), or when matching + to the right of (*PRUNE), but if there is no match to the right, back- + tracking cannot cross (*PRUNE). In simple cases, the use of (*PRUNE) is just an alternative to an atomic group or possessive quantifier, but - there are some uses of (*PRUNE) that cannot be expressed in any other + there are some uses of (*PRUNE) that cannot be expressed in any other way. (*SKIP) - This verb is like (*PRUNE), except that if the pattern is unanchored, - the "bumpalong" advance is not to the next character, but to the posi- - tion in the subject where (*SKIP) was encountered. (*SKIP) signifies - that whatever text was matched leading up to it cannot be part of a + This verb is like (*PRUNE), except that if the pattern is unanchored, + the "bumpalong" advance is not to the next character, but to the posi- + tion in the subject where (*SKIP) was encountered. (*SKIP) signifies + that whatever text was matched leading up to it cannot be part of a successful match. Consider: a+(*SKIP)b - If the subject is "aaaac...", after the first match attempt fails - (starting at the first character in the string), the starting point + If the subject is "aaaac...", after the first match attempt fails + (starting at the first character in the string), the starting point skips on to start the next attempt at "c". 
Note that a possessive quan- - tifer does not have the same effect in this example; although it would - suppress backtracking during the first match attempt, the second - attempt would start at the second character instead of skipping on to + tifier does not have the same effect in this example; although it would + suppress backtracking during the first match attempt, the second + attempt would start at the second character instead of skipping on to "c". (*THEN) This verb causes a skip to the next alternation if the rest of the pat- tern does not match. That is, it cancels pending backtracking, but only - within the current alternation. Its name comes from the observation + within the current alternation. Its name comes from the observation that it can be used for a pattern-based if-then-else block: ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ... - If the COND1 pattern matches, FOO is tried (and possibly further items - after the end of the group if FOO succeeds); on failure the matcher - skips to the second alternative and tries COND2, without backtracking - into COND1. If (*THEN) is used outside of any alternation, it acts + If the COND1 pattern matches, FOO is tried (and possibly further items + after the end of the group if FOO succeeds); on failure the matcher + skips to the second alternative and tries COND2, without backtracking + into COND1. If (*THEN) is used outside of any alternation, it acts exactly like (*PRUNE). @@ -4985,8 +5031,8 @@ AUTHOR REVISION - Last updated: 19 April 2008 - Copyright (c) 1997-2008 University of Cambridge. + Last updated: 11 April 2009 + Copyright (c) 1997-2009 University of Cambridge.
------------------------------------------------------------------------------ @@ -5098,14 +5144,16 @@ GENERAL CATEGORY PROPERTY CODES FOR \p and \P SCRIPT NAMES FOR \p AND \P Arabic, Armenian, Balinese, Bengali, Bopomofo, Braille, Buginese, - Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic, Cuneiform, - Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian, Glagolitic, - Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hira- - gana, Inherited, Kannada, Katakana, Kharoshthi, Khmer, Lao, Latin, - Limbu, Linear_B, Malayalam, Mongolian, Myanmar, New_Tai_Lue, Nko, - Ogham, Old_Italic, Old_Persian, Oriya, Osmanya, Phags_Pa, Phoenician, - Runic, Shavian, Sinhala, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, - Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, Tifinagh, Ugaritic, Yi. + Buhid, Canadian_Aboriginal, Carian, Cham, Cherokee, Common, Coptic, Cu- + neiform, Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian, + Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, + Hebrew, Hiragana, Inherited, Kannada, Katakana, Kayah_Li, Kharoshthi, + Khmer, Lao, Latin, Lepcha, Limbu, Linear_B, Lycian, Lydian, Malayalam, + Mongolian, Myanmar, New_Tai_Lue, Nko, Ogham, Old_Italic, Old_Persian, + Ol_Chiki, Oriya, Osmanya, Phags_Pa, Phoenician, Rejang, Runic, Saurash- + tra, Shavian, Sinhala, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tag- + banwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, Tifinagh, + Ugaritic, Vai, Yi. CHARACTER CLASSES @@ -5157,7 +5205,7 @@ QUANTIFIERS ANCHORS AND SIMPLE ASSERTIONS - \b word boundary + \b word boundary (only ASCII letters recognized) \B not a word boundary ^ start of subject also after internal newline in multiline mode @@ -5183,75 +5231,80 @@ ALTERNATION CAPTURING - (...) capturing group - (?<name>...) named capturing group (Perl) - (?'name'...) named capturing group (Perl) - (?P<name>...) named capturing group (Python) - (?:...) non-capturing group - (?|...)
non-capturing group; reset group numbers for - capturing groups in each alternative + (...) capturing group + (?...) named capturing group (Perl) + (?'name'...) named capturing group (Perl) + (?P...) named capturing group (Python) + (?:...) non-capturing group + (?|...) non-capturing group; reset group numbers for + capturing groups in each alternative ATOMIC GROUPS - (?>...) atomic, non-capturing group + (?>...) atomic, non-capturing group COMMENT - (?#....) comment (not nestable) + (?#....) comment (not nestable) OPTION SETTING - (?i) caseless - (?J) allow duplicate names - (?m) multiline - (?s) single line (dotall) - (?U) default ungreedy (lazy) - (?x) extended (ignore white space) - (?-...) unset option(s) + (?i) caseless + (?J) allow duplicate names + (?m) multiline + (?s) single line (dotall) + (?U) default ungreedy (lazy) + (?x) extended (ignore white space) + (?-...) unset option(s) + + The following is recognized only at the start of a pattern or after one + of the newline-setting options with similar syntax: + + (*UTF8) set UTF-8 mode LOOKAHEAD AND LOOKBEHIND ASSERTIONS - (?=...) positive look ahead - (?!...) negative look ahead - (?<=...) positive look behind - (? 
reference by name (Perl) - \k'name' reference by name (Perl) - \g{name} reference by name (Perl) - \k{name} reference by name (.NET) - (?P=name) reference by name (Python) + \n reference by number (can be ambiguous) + \gn reference by number + \g{n} reference by number + \g{-n} relative reference by number + \k<name> reference by name (Perl) + \k'name' reference by name (Perl) + \g{name} reference by name (Perl) + \k{name} reference by name (.NET) + (?P=name) reference by name (Python) SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) - (?R) recurse whole pattern - (?n) call subpattern by absolute number - (?+n) call subpattern by relative number - (?-n) call subpattern by relative number - (?&name) call subpattern by name (Perl) - (?P>name) call subpattern by name (Python) - \g<name> call subpattern by name (Oniguruma) - \g'name' call subpattern by name (Oniguruma) - \g<n> call subpattern by absolute number (Oniguruma) - \g'n' call subpattern by absolute number (Oniguruma) - \g<+n> call subpattern by relative number (PCRE extension) - \g'+n' call subpattern by relative number (PCRE extension) - \g<-n> call subpattern by relative number (PCRE extension) - \g'-n' call subpattern by relative number (PCRE extension) + (?R) recurse whole pattern + (?n) call subpattern by absolute number + (?+n) call subpattern by relative number + (?-n) call subpattern by relative number + (?&name) call subpattern by name (Perl) + (?P>name) call subpattern by name (Python) + \g<name> call subpattern by name (Oniguruma) + \g'name' call subpattern by name (Oniguruma) + \g<n> call subpattern by absolute number (Oniguruma) + \g'n' call subpattern by absolute number (Oniguruma) + \g<+n> call subpattern by relative number (PCRE extension) + \g'+n' call subpattern by relative number (PCRE extension) + \g<-n> call subpattern by relative number (PCRE extension) + \g'-n' call subpattern by relative number (PCRE extension) CONDITIONAL PATTERNS @@ -5259,56 +5312,56 @@ CONDITIONAL PATTERNS (?(condition)yes-pattern)
(?(condition)yes-pattern|no-pattern) - (?(n)... absolute reference condition - (?(+n)... relative reference condition - (?(-n)... relative reference condition - (?(<name>)... named reference condition (Perl) - (?('name')... named reference condition (Perl) - (?(name)... named reference condition (PCRE) - (?(R)... overall recursion condition - (?(Rn)... specific group recursion condition - (?(R&name)... specific recursion condition - (?(DEFINE)... define subpattern for reference - (?(assert)... assertion condition + (?(n)... absolute reference condition + (?(+n)... relative reference condition + (?(-n)... relative reference condition + (?(<name>)... named reference condition (Perl) + (?('name')... named reference condition (Perl) + (?(name)... named reference condition (PCRE) + (?(R)... overall recursion condition + (?(Rn)... specific group recursion condition + (?(R&name)... specific recursion condition + (?(DEFINE)... define subpattern for reference + (?(assert)... assertion condition BACKTRACKING CONTROL The following act immediately they are reached: - (*ACCEPT) force successful match - (*FAIL) force backtrack; synonym (*F) + (*ACCEPT) force successful match + (*FAIL) force backtrack; synonym (*F) - The following act only when a subsequent match failure causes a back- + The following act only when a subsequent match failure causes a back- track to reach them. They all force a match failure, but they differ in what happens afterwards. Those that advance the start-of-match point do so only if the pattern is not anchored. 
- (*COMMIT) overall failure, no advance of starting point - (*PRUNE) advance to next starting character - (*SKIP) advance start to current matching position - (*THEN) local failure, backtrack to next alternation + (*COMMIT) overall failure, no advance of starting point + (*PRUNE) advance to next starting character + (*SKIP) advance start to current matching position + (*THEN) local failure, backtrack to next alternation NEWLINE CONVENTIONS - These are recognized only at the very start of the pattern or after a - (*BSR_...) option. + These are recognized only at the very start of the pattern or after a + (*BSR_...) or (*UTF8) option. - (*CR) - (*LF) - (*CRLF) - (*ANYCRLF) - (*ANY) + (*CR) carriage return only + (*LF) linefeed only + (*CRLF) carriage return followed by linefeed + (*ANYCRLF) all three of the above + (*ANY) any Unicode newline sequence WHAT \R MATCHES - These are recognized only at the very start of the pattern or after a - (*...) option that sets the newline convention. + These are recognized only at the very start of the pattern or after a + (*...) option that sets the newline convention or UTF-8 mode. - (*BSR_ANYCRLF) - (*BSR_UNICODE) + (*BSR_ANYCRLF) CR, LF, or CRLF + (*BSR_UNICODE) any Unicode newline sequence CALLOUTS @@ -5331,8 +5384,8 @@ AUTHOR REVISION - Last updated: 09 April 2008 - Copyright (c) 1997-2008 University of Cambridge. + Last updated: 11 April 2009 + Copyright (c) 1997-2009 University of Cambridge. ------------------------------------------------------------------------------ @@ -5514,13 +5567,13 @@ MULTI-SEGMENT MATCHING WITH pcre_dfa_exec() 0: dogsbody 1: dog - The pattern matches the words "dog" or "dogsbody". When the subject is - presented in several parts ("do" and "gsb" being the first two) the - match stops when "dog" has been found, and it is not possible to con- - tinue. On the other hand, if "dogsbody" is presented as a single + The pattern matches the words "dog" or "dogsbody". 
When the subject is + presented in several parts ("do" and "gsb" being the first two) the + match stops when "dog" has been found, and it is not possible to con- + tinue. On the other hand, if "dogsbody" is presented as a single string, both matches are found. - Because of this phenomenon, it does not usually make sense to end a + Because of this phenomenon, it does not usually make sense to end a pattern that is going to be matched in this way with a variable repeat. 4. Patterns that contain alternatives at the top level which do not all @@ -5867,12 +5920,12 @@ DESCRIPTION command for linking an application that uses them. Because the POSIX functions call the native ones, it is also necessary to add -lpcre. - I have implemented only those option bits that can be reasonably mapped - to PCRE native options. In addition, the option REG_EXTENDED is defined - with the value zero. This has no effect, but since programs that are - written to the POSIX interface often use it, this makes it easier to - slot in PCRE as a replacement library. Other POSIX options are not even - defined. + I have implemented only those POSIX option bits that can be reasonably + mapped to PCRE native options. In addition, the option REG_EXTENDED is + defined with the value zero. This has no effect, but since programs + that are written to the POSIX interface often use it, this makes it + easier to slot in PCRE as a replacement library. Other POSIX options + are not even defined. When PCRE is called via these functions, it is only the API that is POSIX-like in style. The syntax and semantics of the regular expres- @@ -5952,9 +6005,9 @@ COMPILING A PATTERN MATCHING NEWLINE CHARACTERS This area is not simple, because POSIX and Perl take different views of - things. It is not possible to get PCRE to obey POSIX semantics, but - then PCRE was never intended to be a POSIX engine. The following table - lists the different possibilities for matching newline characters in + things. 
It is not possible to get PCRE to obey POSIX semantics, but + then PCRE was never intended to be a POSIX engine. The following table + lists the different possibilities for matching newline characters in PCRE: Default Change with @@ -5976,19 +6029,19 @@ MATCHING NEWLINE CHARACTERS ^ matches \n in middle no REG_NEWLINE PCRE's behaviour is the same as Perl's, except that there is no equiva- - lent for PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is + lent for PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is no way to stop newline from matching [^a]. - The default POSIX newline handling can be obtained by setting - PCRE_DOTALL and PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE + The default POSIX newline handling can be obtained by setting + PCRE_DOTALL and PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE behave exactly as for the REG_NEWLINE action. MATCHING A PATTERN - The function regexec() is called to match a compiled pattern preg - against a given string, which is by default terminated by a zero byte - (but see REG_STARTEND below), subject to the options in eflags. These + The function regexec() is called to match a compiled pattern preg + against a given string, which is by default terminated by a zero byte + (but see REG_STARTEND below), subject to the options in eflags. These can be: REG_NOTBOL @@ -5996,6 +6049,13 @@ MATCHING A PATTERN The PCRE_NOTBOL option is set when calling the underlying PCRE matching function. + REG_NOTEMPTY + + The PCRE_NOTEMPTY option is set when calling the underlying PCRE match- + ing function. Note that REG_NOTEMPTY is not part of the POSIX standard. + However, setting this option can give more POSIX-like behaviour in some + situations. + REG_NOTEOL The PCRE_NOTEOL option is set when calling the underlying PCRE matching @@ -6058,8 +6118,8 @@ AUTHOR REVISION - Last updated: 05 April 2008 - Copyright (c) 1997-2008 University of Cambridge. 
+ Last updated: 11 March 2009 + Copyright (c) 1997-2009 University of Cambridge. ------------------------------------------------------------------------------ @@ -6163,6 +6223,10 @@ MATCHING INTERFACE need more, consider using the more general interface pcrecpp::RE::DoMatch. See pcrecpp.h for the signature for DoMatch. + NOTE: Do not use no_arg, which is used internally to mark the end of a + list of optional arguments, as a placeholder for missing arguments, as + this can lead to segfaults. + QUOTING METACHARACTERS @@ -6396,7 +6460,7 @@ AUTHOR REVISION - Last updated: 12 November 2007 + Last updated: 17 March 2009 ------------------------------------------------------------------------------ diff --git a/ext/pcre/pcrelib/pcre.h b/ext/pcre/pcrelib/pcre.h index efd105a40a..c5fc4c13e4 100644 --- a/ext/pcre/pcrelib/pcre.h +++ b/ext/pcre/pcrelib/pcre.h @@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE. /* The current PCRE version information. */ #define PCRE_MAJOR 7 -#define PCRE_MINOR 8 +#define PCRE_MINOR 9 #define PCRE_PRERELEASE -#define PCRE_DATE 2008-09-05 +#define PCRE_DATE 2009-04-11 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. When building PCRE, the appropriate @@ -95,7 +95,8 @@ it is needed here for malloc. */ extern "C" { #endif -/* Options */ +/* Options. Some are compile-time only, some are run-time only, and some are +both, so we keep them all distinct. 
*/ #define PCRE_CASELESS 0x00000001 #define PCRE_MULTILINE 0x00000002 @@ -125,6 +126,8 @@ extern "C" { #define PCRE_BSR_ANYCRLF 0x00800000 #define PCRE_BSR_UNICODE 0x01000000 #define PCRE_JAVASCRIPT_COMPAT 0x02000000 +#define PCRE_NO_START_OPTIMIZE 0x04000000 +#define PCRE_NO_START_OPTIMISE 0x04000000 /* Exec-time and get/set-time error codes */ diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c index b079d1962f..1e0672c5cd 100644 --- a/ext/pcre/pcrelib/pcre_compile.c +++ b/ext/pcre/pcrelib/pcre_compile.c @@ -95,21 +95,56 @@ are simple data values; negative values are for special things like \d and so on. Zero means further processing is needed (for things like \x), or the escape is invalid. */ -#ifndef EBCDIC /* This is the "normal" table for ASCII systems */ +#ifndef EBCDIC + +/* This is the "normal" table for ASCII systems or for EBCDIC systems running +in UTF-8 mode. */ + static const short int escapes[] = { - 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ - 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? 
*/ - '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ --ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */ --ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */ --ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ - '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ --ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ --ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ - 0, 0, -ESC_z /* x - z */ + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + CHAR_COLON, CHAR_SEMICOLON, + CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, + CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, + CHAR_COMMERCIAL_AT, -ESC_A, + -ESC_B, -ESC_C, + -ESC_D, -ESC_E, + 0, -ESC_G, + -ESC_H, 0, + 0, -ESC_K, + 0, 0, + 0, 0, + -ESC_P, -ESC_Q, + -ESC_R, -ESC_S, + 0, 0, + -ESC_V, -ESC_W, + -ESC_X, 0, + -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, + CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, + CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, + CHAR_GRAVE_ACCENT, 7, + -ESC_b, 0, + -ESC_d, ESC_e, + ESC_f, 0, + -ESC_h, 0, + 0, -ESC_k, + 0, 0, + ESC_n, 0, + -ESC_p, 0, + ESC_r, -ESC_s, + ESC_tee, 0, + -ESC_v, -ESC_w, + 0, 0, + -ESC_z }; -#else /* This is the "abnormal" table for EBCDIC systems */ +#else + +/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */ + static const short int escapes[] = { /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, @@ -140,7 +175,9 @@ static const short int escapes[] = { /* Table of special "verbs" like (*PRUNE). This is a short table, so it is searched linearly. Put all the names into a single string, in order to reduce -the number of relocations when a shared library is dynamically linked. */ +the number of relocations when a shared library is dynamically linked. The +string is built from string macros so that it works in UTF-8 mode on EBCDIC +platforms. 
*/ typedef struct verbitem { int len; @@ -148,13 +185,13 @@ typedef struct verbitem { } verbitem; static const char verbnames[] = - "ACCEPT\0" - "COMMIT\0" - "F\0" - "FAIL\0" - "PRUNE\0" - "SKIP\0" - "THEN"; + STRING_ACCEPT0 + STRING_COMMIT0 + STRING_F0 + STRING_FAIL0 + STRING_PRUNE0 + STRING_SKIP0 + STRING_THEN; static const verbitem verbs[] = { { 6, OP_ACCEPT }, @@ -176,9 +213,10 @@ length entry. The first three must be alpha, lower, upper, as this is assumed for handling case independence. */ static const char posix_names[] = - "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" - "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" - "word\0" "xdigit"; + STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 + STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 + STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 + STRING_word0 STRING_xdigit; static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; @@ -320,7 +358,11 @@ For convenience, we use the same bit definitions as in chartables: Then we can use ctype_digit and ctype_xdigit in the code. */ -#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */ +#ifndef EBCDIC + +/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in +UTF-8 mode. */ + static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ @@ -356,7 +398,10 @@ static const unsigned char digitab[] = 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ -#else /* This is the "abnormal" case, for EBCDIC systems */ +#else + +/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ + static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ @@ -501,9 +546,9 @@ if (c == 0) *errorcodeptr = ERR1; in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. 
*/ -#ifndef EBCDIC /* ASCII coding */ -else if (c < '0' || c > 'z') {} /* Not alphanumeric */ -else if ((i = escapes[c - '0']) != 0) c = i; +#ifndef EBCDIC /* ASCII/UTF-8 coding */ +else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */ +else if ((i = escapes[c - CHAR_0]) != 0) c = i; #else /* EBCDIC coding */ else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ @@ -522,11 +567,11 @@ else /* A number of Perl escapes are not handled by PCRE. We give an explicit error. */ - case 'l': - case 'L': - case 'N': - case 'u': - case 'U': + case CHAR_l: + case CHAR_L: + case CHAR_N: + case CHAR_u: + case CHAR_U: *errorcodeptr = ERR37; break; @@ -546,8 +591,8 @@ else (possibly recursive) subroutine calls, _not_ backreferences. Just return the -ESC_g code (cf \k). */ - case 'g': - if (ptr[1] == '<' || ptr[1] == '\'') + case CHAR_g: + if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) { c = -ESC_g; break; @@ -555,12 +600,12 @@ else /* Handle the Perl-compatible cases */ - if (ptr[1] == '{') + if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { const uschar *p; - for (p = ptr+2; *p != 0 && *p != '}'; p++) - if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break; - if (*p != 0 && *p != '}') + for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++) + if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break; + if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET) { c = -ESC_k; break; @@ -570,7 +615,7 @@ else } else braced = FALSE; - if (ptr[1] == '-') + if (ptr[1] == CHAR_MINUS) { negated = TRUE; ptr++; @@ -579,7 +624,7 @@ else c = 0; while ((digitab[ptr[1]] & ctype_digit) != 0) - c = c * 10 + *(++ptr) - '0'; + c = c * 10 + *(++ptr) - CHAR_0; if (c < 0) /* Integer overflow */ { @@ -587,7 +632,7 @@ else break; } - if (braced && *(++ptr) != '}') + if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) { *errorcodeptr = ERR57; break; @@ -624,15 +669,15 @@ else value is greater than 377, the least significant 8 bits are taken. 
Inside a character class, \ followed by a digit is always an octal number. */ - case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': + case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: + case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: if (!isclass) { oldptr = ptr; - c -= '0'; + c -= CHAR_0; while ((digitab[ptr[1]] & ctype_digit) != 0) - c = c * 10 + *(++ptr) - '0'; + c = c * 10 + *(++ptr) - CHAR_0; if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; @@ -650,7 +695,7 @@ else generates a binary zero byte and treats the digit as a following literal. Thus we have to pull back the pointer by one. */ - if ((c = *ptr) >= '8') + if ((c = *ptr) >= CHAR_8) { ptr--; c = 0; @@ -663,10 +708,10 @@ else to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more than 3 octal digits. */ - case '0': - c -= '0'; - while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') - c = c * 8 + *(++ptr) - '0'; + case CHAR_0: + c -= CHAR_0; + while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) + c = c * 8 + *(++ptr) - CHAR_0; if (!utf8 && c > 255) *errorcodeptr = ERR51; break; @@ -674,8 +719,8 @@ else than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is treated as a data character. */ - case 'x': - if (ptr[1] == '{') + case CHAR_x: + if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { const uschar *pt = ptr + 2; int count = 0; @@ -684,19 +729,19 @@ else while ((digitab[*pt] & ctype_xdigit) != 0) { register int cc = *pt++; - if (c == 0 && cc == '0') continue; /* Leading zeroes */ + if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ count++; -#ifndef EBCDIC /* ASCII coding */ - if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ + c = (c << 4) + cc - ((cc < CHAR_A)? 
CHAR_0 : (CHAR_A - 10)); #else /* EBCDIC coding */ - if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ - c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); + if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ + c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif } - if (*pt == '}') + if (*pt == CHAR_RIGHT_CURLY_BRACKET) { if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; ptr = pt; @@ -712,14 +757,14 @@ else c = 0; while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) { - int cc; /* Some compilers don't like ++ */ - cc = *(++ptr); /* in initializers */ -#ifndef EBCDIC /* ASCII coding */ - if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); + int cc; /* Some compilers don't like */ + cc = *(++ptr); /* ++ in initializers */ +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ + c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); #else /* EBCDIC coding */ - if (cc <= 'z') cc += 64; /* Convert to upper case */ - c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); + if (cc <= CHAR_z) cc += 64; /* Convert to upper case */ + c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif } break; @@ -728,7 +773,7 @@ else This coding is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ - case 'c': + case CHAR_c: c = *(++ptr); if (c == 0) { @@ -736,11 +781,11 @@ else break; } -#ifndef EBCDIC /* ASCII coding */ - if (c >= 'a' && c <= 'z') c -= 32; +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (c >= CHAR_a && c <= CHAR_z) c -= 32; c ^= 0x40; #else /* EBCDIC coding */ - if (c >= 'a' && c <= 'z') c += 64; + if (c >= CHAR_a && c <= CHAR_z) c += 64; c ^= 0xC0; #endif break; @@ -802,9 +847,9 @@ if (c == 0) goto ERROR_RETURN; /* \P or \p can be followed by a name in {}, optionally preceded by ^ for negation. 
*/ -if (c == '{') +if (c == CHAR_LEFT_CURLY_BRACKET) { - if (ptr[1] == '^') + if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) { *negptr = TRUE; ptr++; @@ -813,10 +858,10 @@ if (c == '{') { c = *(++ptr); if (c == 0) goto ERROR_RETURN; - if (c == '}') break; + if (c == CHAR_RIGHT_CURLY_BRACKET) break; name[i] = c; } - if (c !='}') goto ERROR_RETURN; + if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; name[i] = 0; } @@ -881,15 +926,15 @@ is_counted_repeat(const uschar *p) { if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; -if (*p == '}') return TRUE; +if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; -if (*p++ != ',') return FALSE; -if (*p == '}') return TRUE; +if (*p++ != CHAR_COMMA) return FALSE; +if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; -return (*p == '}'); +return (*p == CHAR_RIGHT_CURLY_BRACKET); } @@ -922,7 +967,7 @@ int max = -1; /* Read the minimum value and do a paranoid check: a negative value indicates an integer overflow. */ -while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; +while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0; if (min < 0 || min > 65535) { *errorcodeptr = ERR5; @@ -932,12 +977,12 @@ if (min < 0 || min > 65535) /* Read the maximum value if there is one, and again do a paranoid on its size. Also, max must not be less than min. 
*/ -if (*p == '}') max = min; else +if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else { - if (*(++p) != '}') + if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) { max = 0; - while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; + while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0; if (max < 0 || max > 65535) { *errorcodeptr = ERR5; @@ -962,47 +1007,116 @@ return p; /************************************************* -* Find forward referenced subpattern * +* Subroutine for finding forward reference * *************************************************/ -/* This function scans along a pattern's text looking for capturing +/* This recursive function is called only from find_parens() below. The +top-level call starts at the beginning of the pattern. All other calls must +start at a parenthesis. It scans along a pattern's text looking for capturing subpatterns, and counting them. If it finds a named pattern that matches the name it is given, it returns its number. Alternatively, if the name is NULL, it -returns when it reaches a given numbered subpattern. This is used for forward -references to subpatterns. We know that if (?P< is encountered, the name will -be terminated by '>' because that is checked in the first pass. +returns when it reaches a given numbered subpattern. We know that if (?P< is +encountered, the name will be terminated by '>' because that is checked in the +first pass. Recursion is used to keep track of subpatterns that reset the +capturing group numbers - the (?| feature. 
Arguments: - ptr current position in the pattern + ptrptr address of the current character pointer (updated) cd compile background data name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode + count pointer to the current capturing subpattern number (updated) Returns: the number of the named subpattern, or -1 if not found */ static int -find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, - BOOL xmode) +find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn, + BOOL xmode, int *count) { -const uschar *thisname; -int count = cd->bracount; +uschar *ptr = *ptrptr; +int start_count = *count; +int hwm_count = start_count; +BOOL dup_parens = FALSE; -for (; *ptr != 0; ptr++) +/* If the first character is a parenthesis, check on the type of group we are +dealing with. The very first call may not start with a parenthesis. */ + +if (ptr[0] == CHAR_LEFT_PARENTHESIS) { - int term; + if (ptr[1] == CHAR_QUESTION_MARK && + ptr[2] == CHAR_VERTICAL_LINE) + { + ptr += 3; + dup_parens = TRUE; + } + + /* Handle a normal, unnamed capturing parenthesis */ + + else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK) + { + *count += 1; + if (name == NULL && *count == lorn) return *count; + ptr++; + } + + /* Handle a condition. If it is an assertion, just carry on so that it + is processed as normal. If not, skip to the closing parenthesis of the + condition (there can't be any nested parens). */ + + else if (ptr[2] == CHAR_LEFT_PARENTHESIS) + { + ptr += 2; + if (ptr[1] != CHAR_QUESTION_MARK) + { + while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; + if (*ptr != 0) ptr++; + } + } + + /* We have either (? or (* and not a condition */ + + else + { + ptr += 2; + if (*ptr == CHAR_P) ptr++; /* Allow optional P */ + + /* We have to disambiguate (?<! and (?<= from (?<name> 
for named groups */ + + if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK && + ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE) + { + int term; + const uschar *thisname; + *count += 1; + if (name == NULL && *count == lorn) return *count; + term = *ptr++; + if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN; + thisname = ptr; + while (*ptr != term) ptr++; + if (name != NULL && lorn == ptr - thisname && + strncmp((const char *)name, (const char *)thisname, lorn) == 0) + return *count; + } + } + } +/* Past any initial parenthesis handling, scan for parentheses or vertical +bars. */ + +for (; *ptr != 0; ptr++) + { /* Skip over backslashed characters and also entire \Q...\E */ - if (*ptr == '\\') + if (*ptr == CHAR_BACKSLASH) { - if (*(++ptr) == 0) return -1; - if (*ptr == 'Q') for (;;) + if (*(++ptr) == 0) goto FAIL_EXIT; + if (*ptr == CHAR_Q) for (;;) { - while (*(++ptr) != 0 && *ptr != '\\') {}; - if (*ptr == 0) return -1; - if (*(++ptr) == 'E') break; + while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; + if (*ptr == 0) goto FAIL_EXIT; + if (*(++ptr) == CHAR_E) break; } continue; } @@ -1010,21 +1124,26 @@ for (; *ptr != 0; ptr++) /* Skip over character classes; this logic must be similar to the way they are handled for real. If the first character is '^', skip it. Also, if the first few characters (either before or after ^) are \Q\E or \E we skip them - too. This makes for compatibility with Perl. */ + too. This makes for compatibility with Perl. Note the use of STR macros to + encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. 
*/ - if (*ptr == '[') + if (*ptr == CHAR_LEFT_SQUARE_BRACKET) { BOOL negate_class = FALSE; for (;;) { int c = *(++ptr); - if (c == '\\') + if (c == CHAR_BACKSLASH) { - if (ptr[1] == 'E') ptr++; - else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; - else break; + if (ptr[1] == CHAR_E) + ptr++; + else if (strncmp((const char *)ptr+1, + STR_Q STR_BACKSLASH STR_E, 3) == 0) + ptr += 3; + else + break; } - else if (!negate_class && c == '^') + else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) negate_class = TRUE; else break; } @@ -1032,20 +1151,21 @@ for (; *ptr != 0; ptr++) /* If the next character is ']', it is a data character that must be skipped, except in JavaScript compatibility mode. */ - if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) + if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET && + (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) ptr++; - while (*(++ptr) != ']') + while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET) { if (*ptr == 0) return -1; - if (*ptr == '\\') + if (*ptr == CHAR_BACKSLASH) { - if (*(++ptr) == 0) return -1; - if (*ptr == 'Q') for (;;) + if (*(++ptr) == 0) goto FAIL_EXIT; + if (*ptr == CHAR_Q) for (;;) { - while (*(++ptr) != 0 && *ptr != '\\') {}; - if (*ptr == 0) return -1; - if (*(++ptr) == 'E') break; + while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; + if (*ptr == 0) goto FAIL_EXIT; + if (*(++ptr) == CHAR_E) break; } continue; } @@ -1055,49 +1175,92 @@ for (; *ptr != 0; ptr++) /* Skip comments in /x mode */ - if (xmode && *ptr == '#') + if (xmode && *ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0 && *ptr != '\n') {}; - if (*ptr == 0) return -1; + while (*(++ptr) != 0 && *ptr != CHAR_NL) {}; + if (*ptr == 0) goto FAIL_EXIT; continue; } - /* An opening parens must now be a real metacharacter */ + /* Check for the special metacharacters */ - if (*ptr != '(') continue; - if (ptr[1] != '?' 
&& ptr[1] != '*') + { - count++; - if (name == NULL && count == lorn) return count; - continue; + int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count); + if (rc > 0) return rc; + if (*ptr == 0) goto FAIL_EXIT; + } + + else if (*ptr == CHAR_RIGHT_PARENTHESIS) + { + if (dup_parens && *count < hwm_count) *count = hwm_count; + *ptrptr = ptr; + return -1; } - ptr += 2; - if (*ptr == 'P') ptr++; /* Allow optional P */ + else if (*ptr == CHAR_VERTICAL_LINE && dup_parens) + { + if (*count > hwm_count) hwm_count = *count; + *count = start_count; + } + } + +FAIL_EXIT: +*ptrptr = ptr; +return -1; +} + - /* We have to disambiguate (?<! and (?<= from (?<name> */ - if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && - *ptr != '\'') - continue; - count++; +/************************************************* +* Find forward referenced subpattern * +*************************************************/ + +/* This function scans along a pattern's text looking for capturing +subpatterns, and counting them. If it finds a named pattern that matches the +name it is given, it returns its number. Alternatively, if the name is NULL, it +returns when it reaches a given numbered subpattern. This is used for forward +references to subpatterns. We used to be able to start this scan from the +current compiling point, using the current count value from cd->bracount, and +do it all in a single loop, but the addition of the possibility of duplicate +subpattern numbers means that we have to scan from the very start, in order to +take account of such duplicates, and to use a recursive function to keep track +of the different types of group. 
+ +Arguments: + cd compile background data + name name to seek, or NULL if seeking a numbered subpattern + lorn name length, or subpattern number if name is NULL + xmode TRUE if we are in /x mode + +Returns: the number of the found subpattern, or -1 if not found +*/ + +static int +find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode) +{ +uschar *ptr = (uschar *)cd->start_pattern; +int count = 0; +int rc; - if (name == NULL && count == lorn) return count; - term = *ptr++; - if (term == '<') term = '>'; - thisname = ptr; - while (*ptr != term) ptr++; - if (name != NULL && lorn == ptr - thisname && - strncmp((const char *)name, (const char *)thisname, lorn) == 0) - return count; +/* If the pattern does not start with an opening parenthesis, the first call +to find_parens_sub() will scan right to the end (if necessary). However, if it +does start with a parenthesis, find_parens_sub() will return when it hits the +matching closing parens. That is why we have to have a loop. */ + +for (;;) + { + rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count); + if (rc > 0 || *ptr++ == 0) break; } -return -1; +return rc; } + /************************************************* * Find first significant op code * *************************************************/ @@ -1611,17 +1774,25 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE BOOL empty_branch; if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ - /* Scan a closed bracket */ + /* If a conditional group has only one branch, there is a second, implied, + empty branch, so just skip over the conditional, because it could be empty. + Otherwise, scan the individual branches of the group. 
*/ - empty_branch = FALSE; - do - { - if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) - empty_branch = TRUE; + if (c == OP_COND && code[GET(code, 1)] != OP_ALT) code += GET(code, 1); + else + { + empty_branch = FALSE; + do + { + if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) + empty_branch = TRUE; + code += GET(code, 1); + } + while (*code == OP_ALT); + if (!empty_branch) return FALSE; /* All branches are non-empty */ } - while (*code == OP_ALT); - if (!empty_branch) return FALSE; /* All branches are non-empty */ + c = *code; continue; } @@ -1823,10 +1994,10 @@ int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ for (++ptr; *ptr != 0; ptr++) { - if (*ptr == '\\' && ptr[1] == ']') ptr++; else + if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else { - if (*ptr == ']') return FALSE; - if (*ptr == terminator && ptr[1] == ']') + if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; + if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) { *endptr = ptr; return TRUE; @@ -2072,7 +2243,7 @@ if ((options & PCRE_EXTENDED) != 0) for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; - if (*ptr == '#') + if (*ptr == CHAR_NUMBER_SIGN) { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } @@ -2084,7 +2255,7 @@ if ((options & PCRE_EXTENDED) != 0) /* If the next item is one that we can handle, get its value. A non-negative value is a character, a negative value is an escape value. 
*/ -if (*ptr == '\\') +if (*ptr == CHAR_BACKSLASH) { int temperrorcode = 0; next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); @@ -2109,7 +2280,7 @@ if ((options & PCRE_EXTENDED) != 0) for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; - if (*ptr == '#') + if (*ptr == CHAR_NUMBER_SIGN) { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } @@ -2120,8 +2291,9 @@ if ((options & PCRE_EXTENDED) != 0) /* If the next thing is itself optional, we have to give up. */ -if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) - return FALSE; +if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || + strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) + return FALSE; /* Now compare the next item with the previous opcode. If the previous is a positive single character match, "item" either contains the character or, if @@ -2559,7 +2731,7 @@ for (;; ptr++) if (inescq && c != 0) { - if (c == '\\' && ptr[1] == 'E') + if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) { inescq = FALSE; ptr++; @@ -2585,8 +2757,9 @@ for (;; ptr++) /* Fill in length of a previous callout, except when the next thing is a quantifier. */ - is_quantifier = c == '*' || c == '+' || c == '?' 
|| - (c == '{' && is_counted_repeat(ptr+1)); + is_quantifier = + c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || + (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1)); if (!is_quantifier && previous_callout != NULL && after_manual_callout-- <= 0) @@ -2601,7 +2774,7 @@ for (;; ptr++) if ((options & PCRE_EXTENDED) != 0) { if ((cd->ctypes[c] & ctype_space) != 0) continue; - if (c == '#') + if (c == CHAR_NUMBER_SIGN) { while (*(++ptr) != 0) { @@ -2626,8 +2799,8 @@ for (;; ptr++) { /* ===================================================================*/ case 0: /* The branch terminates at string end */ - case '|': /* or | or ) */ - case ')': + case CHAR_VERTICAL_LINE: /* or | or ) */ + case CHAR_RIGHT_PARENTHESIS: *firstbyteptr = firstbyte; *reqbyteptr = reqbyte; *codeptr = code; @@ -2649,7 +2822,7 @@ for (;; ptr++) /* Handle single-character metacharacters. In multiline mode, ^ disables the setting of any following char as a first character. */ - case '^': + case CHAR_CIRCUMFLEX_ACCENT: if ((options & PCRE_MULTILINE) != 0) { if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; @@ -2658,7 +2831,7 @@ for (;; ptr++) *code++ = OP_CIRC; break; - case '$': + case CHAR_DOLLAR_SIGN: previous = NULL; *code++ = OP_DOLL; break; @@ -2666,7 +2839,7 @@ for (;; ptr++) /* There can never be a first char if '.' is first, whatever happens about repeats. The value of reqbyte doesn't change either. */ - case '.': + case CHAR_DOT: if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; @@ -2690,7 +2863,7 @@ for (;; ptr++) In JavaScript compatibility mode, an isolated ']' causes an error. In default (Perl) mode, it is treated as a data character. 
*/ - case ']': + case CHAR_RIGHT_SQUARE_BRACKET: if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { *errorcodeptr = ERR64; @@ -2698,16 +2871,17 @@ for (;; ptr++) } goto NORMAL_CHAR; - case '[': + case CHAR_LEFT_SQUARE_BRACKET: previous = code; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if they are encountered at the top level, so we'll do that too. */ - if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && + if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || + ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr)) { - *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; + *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31; goto FAILED; } @@ -2719,13 +2893,17 @@ for (;; ptr++) for (;;) { c = *(++ptr); - if (c == '\\') + if (c == CHAR_BACKSLASH) { - if (ptr[1] == 'E') ptr++; - else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; - else break; + if (ptr[1] == CHAR_E) + ptr++; + else if (strncmp((const char *)ptr+1, + STR_Q STR_BACKSLASH STR_E, 3) == 0) + ptr += 3; + else + break; } - else if (!negate_class && c == '^') + else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) negate_class = TRUE; else break; } @@ -2735,7 +2913,8 @@ for (;; ptr++) that. In JS mode, [] must always fail, so generate OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */ - if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) + if (c == CHAR_RIGHT_SQUARE_BRACKET && + (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { *code++ = negate_class? OP_ALLANY : OP_FAIL; if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; @@ -2800,7 +2979,7 @@ for (;; ptr++) if (inescq) { - if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ + if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */ { inescq = FALSE; /* Reset literal state */ ptr++; /* Skip the 'E' */ @@ -2815,23 +2994,23 @@ for (;; ptr++) [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 5.6 and 5.8 do. 
*/ - if (c == '[' && - (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr)) + if (c == CHAR_LEFT_SQUARE_BRACKET && + (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || + ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr)) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; register const uschar *cbits = cd->cbits; uschar pbits[32]; - if (ptr[1] != ':') + if (ptr[1] != CHAR_COLON) { *errorcodeptr = ERR31; goto FAILED; } ptr += 2; - if (*ptr == '^') + if (*ptr == CHAR_CIRCUMFLEX_ACCENT) { local_negate = TRUE; should_flip_negation = TRUE; /* Note negative special */ @@ -2904,17 +3083,17 @@ for (;; ptr++) to 'or' into the one we are building. We assume they have more than one character in them, so set class_charcount bigger than one. */ - if (c == '\\') + if (c == CHAR_BACKSLASH) { c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */ - else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ - else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ + if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ + else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */ + else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */ else if (-c == ESC_Q) /* Handle start of quoted string */ { - if (ptr[1] == '\\' && ptr[2] == 'E') + if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) { ptr += 2; /* avoid empty string */ } @@ -3140,7 +3319,7 @@ for (;; ptr++) entirely. The code for handling \Q and \E is messy. 
*/ CHECK_RANGE: - while (ptr[1] == '\\' && ptr[2] == 'E') + while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) { inescq = FALSE; ptr += 2; @@ -3150,28 +3329,29 @@ for (;; ptr++) /* Remember \r or \n */ - if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF; + if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; /* Check for range */ - if (!inescq && ptr[1] == '-') + if (!inescq && ptr[1] == CHAR_MINUS) { int d; ptr += 2; - while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; + while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2; /* If we hit \Q (not followed by \E) at this point, go into escaped mode. */ - while (*ptr == '\\' && ptr[1] == 'Q') + while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q) { ptr += 2; - if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } + if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) + { ptr += 2; continue; } inescq = TRUE; break; } - if (*ptr == 0 || (!inescq && *ptr == ']')) + if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) { ptr = oldptr; goto LONE_SINGLE_CHARACTER; @@ -3190,7 +3370,7 @@ for (;; ptr++) not any of the other escapes. Perl 5.6 treats a hyphen as a literal in such circumstances. 
*/ - if (!inescq && d == '\\') + if (!inescq && d == CHAR_BACKSLASH) { d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; @@ -3200,9 +3380,9 @@ for (;; ptr++) if (d < 0) { - if (d == -ESC_b) d = '\b'; - else if (d == -ESC_X) d = 'X'; - else if (d == -ESC_R) d = 'R'; else + if (d == -ESC_b) d = CHAR_BS; + else if (d == -ESC_X) d = CHAR_X; + else if (d == -ESC_R) d = CHAR_R; else { ptr = oldptr; goto LONE_SINGLE_CHARACTER; /* A few lines below */ @@ -3223,7 +3403,7 @@ for (;; ptr++) /* Remember \r or \n */ - if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF; + if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless @@ -3370,7 +3550,7 @@ for (;; ptr++) /* Loop until ']' reached. This "while" is the end of the "do" above. */ - while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); + while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); if (c == 0) /* Missing terminating ']' */ { @@ -3515,23 +3695,23 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Various kinds of repeat; '{' is not necessarily a quantifier, but this has been tested above. */ - case '{': + case CHAR_LEFT_CURLY_BRACKET: if (!is_quantifier) goto NORMAL_CHAR; ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); if (*errorcodeptr != 0) goto FAILED; goto REPEAT; - case '*': + case CHAR_ASTERISK: repeat_min = 0; repeat_max = -1; goto REPEAT; - case '+': + case CHAR_PLUS: repeat_min = 1; repeat_max = -1; goto REPEAT; - case '?': + case CHAR_QUESTION_MARK: repeat_min = 0; repeat_max = 1; @@ -3566,13 +3746,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ but if PCRE_UNGREEDY is set, it works the other way round. We change the repeat type to the non-default. 
*/ - if (ptr[1] == '+') + if (ptr[1] == CHAR_PLUS) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; ptr++; } - else if (ptr[1] == '?') + else if (ptr[1] == CHAR_QUESTION_MARK) { repeat_type = greedy_non_default; ptr++; @@ -4205,7 +4385,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ lookbehind or option setting or condition or all the other extended parenthesis forms. */ - case '(': + case CHAR_LEFT_PARENTHESIS: newoptions = options; skipbytes = 0; bravalue = OP_CBRA; @@ -4214,19 +4394,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* First deal with various "verbs" that can be introduced by '*'. */ - if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) + if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0) { int i, namelen; const char *vn = verbnames; const uschar *name = ++ptr; previous = NULL; while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; - if (*ptr == ':') + if (*ptr == CHAR_COLON) { *errorcodeptr = ERR59; /* Not supported */ goto FAILED; } - if (*ptr != ')') + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR60; goto FAILED; @@ -4251,7 +4431,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Deal with the extended parentheses; all are introduced by '?', and the appearance of any of them means that this is not a capturing group. */ - else if (*ptr == '?') + else if (*ptr == CHAR_QUESTION_MARK) { int i, set, unset, namelen; int *optset; @@ -4260,9 +4440,9 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ switch (*(++ptr)) { - case '#': /* Comment; skip to ket */ + case CHAR_NUMBER_SIGN: /* Comment; skip to ket */ ptr++; - while (*ptr != 0 && *ptr != ')') ptr++; + while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; if (*ptr == 0) { *errorcodeptr = ERR18; @@ -4272,19 +4452,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ /* ------------------------------------------------------------ */ - case '|': /* Reset capture count for each branch */ + case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ reset_bracount = TRUE; /* Fall through */ /* ------------------------------------------------------------ */ - case ':': /* Non-capturing bracket */ + case CHAR_COLON: /* Non-capturing bracket */ bravalue = OP_BRA; ptr++; break; /* ------------------------------------------------------------ */ - case '(': + case CHAR_LEFT_PARENTHESIS: bravalue = OP_COND; /* Conditional group */ /* A condition can be an assertion, a number (referring to a numbered @@ -4304,7 +4484,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ the switch. This will take control down to where bracketed groups, including assertions, are processed. */ - if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<')) + if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN || + ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN)) break; /* Most other conditions use OP_CREF (a couple change to OP_RREF @@ -4316,7 +4497,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Check for a test for recursion in a named group. */ - if (ptr[1] == 'R' && ptr[2] == '&') + if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND) { terminator = -1; ptr += 2; @@ -4326,20 +4507,20 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ /* Check for a test for a named group's having been set, using the Perl syntax (?(<name>) or (?('name') */ - else if (ptr[1] == '<') + else if (ptr[1] == CHAR_LESS_THAN_SIGN) { - terminator = '>'; + terminator = CHAR_GREATER_THAN_SIGN; ptr++; } - else if (ptr[1] == '\'') + else if (ptr[1] == CHAR_APOSTROPHE) { - terminator = '\''; + terminator = CHAR_APOSTROPHE; ptr++; } else { terminator = 0; - if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr); + if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr); } /* We now expect to read a name; any thing else is an error */ @@ -4359,12 +4540,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { if (recno >= 0) recno = ((digitab[*ptr] & ctype_digit) != 0)? - recno * 10 + *ptr - '0' : -1; + recno * 10 + *ptr - CHAR_0 : -1; ptr++; } namelen = ptr - name; - if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')') + if ((terminator > 0 && *ptr++ != terminator) || + *ptr++ != CHAR_RIGHT_PARENTHESIS) { ptr--; /* Error offset */ *errorcodeptr = ERR26; @@ -4386,7 +4568,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ *errorcodeptr = ERR58; goto FAILED; } - recno = (refsign == '-')? + recno = (refsign == CHAR_MINUS)? cd->bracount - recno + 1 : recno +cd->bracount; if (recno <= 0 || recno > cd->final_bracount) { @@ -4417,7 +4599,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Search the pattern for a forward reference */ - else if ((i = find_parens(ptr, cd, name, namelen, + else if ((i = find_parens(cd, name, namelen, (options & PCRE_EXTENDED) != 0)) > 0) { PUT2(code, 2+LINK_SIZE, i); @@ -4438,7 +4620,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Check for (?(R) for recursion. Allow digits after R to specify a specific group number. 
*/ - else if (*name == 'R') + else if (*name == CHAR_R) { recno = 0; for (i = 1; i < namelen; i++) @@ -4448,7 +4630,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ *errorcodeptr = ERR15; goto FAILED; } - recno = recno * 10 + name[i] - '0'; + recno = recno * 10 + name[i] - CHAR_0; } if (recno == 0) recno = RREF_ANY; code[1+LINK_SIZE] = OP_RREF; /* Change test type */ @@ -4458,7 +4640,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Similarly, check for the (?(DEFINE) "condition", which is always false. */ - else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0) + else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0) { code[1+LINK_SIZE] = OP_DEF; skipbytes = 1; @@ -4483,16 +4665,16 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* ------------------------------------------------------------ */ - case '=': /* Positive lookahead */ + case CHAR_EQUALS_SIGN: /* Positive lookahead */ bravalue = OP_ASSERT; ptr++; break; /* ------------------------------------------------------------ */ - case '!': /* Negative lookahead */ + case CHAR_EXCLAMATION_MARK: /* Negative lookahead */ ptr++; - if (*ptr == ')') /* Optimize (?!) */ + if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */ { *code++ = OP_FAIL; previous = NULL; @@ -4503,15 +4685,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ /* ------------------------------------------------------------ */ - case '<': /* Lookbehind or named define */ + case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */ switch (ptr[1]) { - case '=': /* Positive lookbehind */ + case CHAR_EQUALS_SIGN: /* Positive lookbehind */ bravalue = OP_ASSERTBACK; ptr += 2; break; - case '!': /* Negative lookbehind */ + case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ bravalue = OP_ASSERTBACK_NOT; ptr += 2; break; @@ -4526,22 +4708,22 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* ------------------------------------------------------------ */ - case '>': /* One-time brackets */ + case CHAR_GREATER_THAN_SIGN: /* One-time brackets */ bravalue = OP_ONCE; ptr++; break; /* ------------------------------------------------------------ */ - case 'C': /* Callout - may be followed by digits; */ + case CHAR_C: /* Callout - may be followed by digits; */ previous_callout = code; /* Save for later completion */ after_manual_callout = 1; /* Skip one item before completing */ *code++ = OP_CALLOUT; { int n = 0; while ((digitab[*(++ptr)] & ctype_digit) != 0) - n = n * 10 + *ptr - '0'; - if (*ptr != ')') + n = n * 10 + *ptr - CHAR_0; + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR39; goto FAILED; @@ -4561,14 +4743,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ /* ------------------------------------------------------------ */ - case 'P': /* Python-style named subpattern handling */ - if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */ + case CHAR_P: /* Python-style named subpattern handling */ + if (*(++ptr) == CHAR_EQUALS_SIGN || + *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */ { - is_recurse = *ptr == '>'; - terminator = ')'; + is_recurse = *ptr == CHAR_GREATER_THAN_SIGN; + terminator = CHAR_RIGHT_PARENTHESIS; goto NAMED_REF_OR_RECURSE; } - else if (*ptr != '<') /* Test for Python-style definition */ + else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */ { *errorcodeptr = ERR41; goto FAILED; @@ -4578,9 +4761,10 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* ------------------------------------------------------------ */ DEFINE_NAME: /* Come here from (?< handling */ - case '\'': + case CHAR_APOSTROPHE: { - terminator = (*ptr == '<')? '>' : '\''; + terminator = (*ptr == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; name = ++ptr; while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; @@ -4654,8 +4838,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* ------------------------------------------------------------ */ - case '&': /* Perl recursion/subroutine syntax */ - terminator = ')'; + case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */ + terminator = CHAR_RIGHT_PARENTHESIS; is_recurse = TRUE; /* Fall through */ @@ -4714,7 +4898,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ recno = GET2(slot, 0); } else if ((recno = /* Forward back reference */ - find_parens(ptr, cd, name, namelen, + find_parens(cd, name, namelen, (options & PCRE_EXTENDED) != 0)) <= 0) { *errorcodeptr = ERR15; @@ -4730,18 +4914,18 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ /* ------------------------------------------------------------ */ - case 'R': /* Recursion */ + case CHAR_R: /* Recursion */ ptr++; /* Same as (?0) */ /* Fall through */ /* ------------------------------------------------------------ */ - case '-': case '+': - case '0': case '1': case '2': case '3': case '4': /* Recursion or */ - case '5': case '6': case '7': case '8': case '9': /* subroutine */ + case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */ + case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: + case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: { const uschar *called; - terminator = ')'; + terminator = CHAR_RIGHT_PARENTHESIS; /* Come here from the \g<...> and \g'...' code (Oniguruma compatibility). However, the syntax has been checked to ensure that @@ -4751,7 +4935,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ HANDLE_NUMERICAL_RECURSION: - if ((refsign = *ptr) == '+') + if ((refsign = *ptr) == CHAR_PLUS) { ptr++; if ((digitab[*ptr] & ctype_digit) == 0) @@ -4760,7 +4944,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ goto FAILED; } } - else if (refsign == '-') + else if (refsign == CHAR_MINUS) { if ((digitab[ptr[1]] & ctype_digit) == 0) goto OTHER_CHAR_AFTER_QUERY; @@ -4769,7 +4953,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ recno = 0; while((digitab[*ptr] & ctype_digit) != 0) - recno = recno * 10 + *ptr++ - '0'; + recno = recno * 10 + *ptr++ - CHAR_0; if (*ptr != terminator) { @@ -4777,7 +4961,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ goto FAILED; } - if (refsign == '-') + if (refsign == CHAR_MINUS) { if (recno == 0) { @@ -4791,7 +4975,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ goto FAILED; } } - else if (refsign == '+') + else if (refsign == CHAR_PLUS) { if (recno == 0) { @@ -4824,7 +5008,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (called == NULL) { - if (find_parens(ptr, cd, NULL, recno, + if (find_parens(cd, NULL, recno, (options & PCRE_EXTENDED) != 0) < 0) { *errorcodeptr = ERR15; @@ -4877,23 +5061,23 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ set = unset = 0; optset = &set; - while (*ptr != ')' && *ptr != ':') + while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON) { switch (*ptr++) { - case '-': optset = &unset; break; + case CHAR_MINUS: optset = &unset; break; - case 'J': /* Record that it changed in the external options */ + case CHAR_J: /* Record that it changed in the external options */ *optset |= PCRE_DUPNAMES; cd->external_flags |= PCRE_JCHANGED; break; - case 'i': *optset |= PCRE_CASELESS; break; - case 'm': *optset |= PCRE_MULTILINE; break; - case 's': *optset |= PCRE_DOTALL; break; - case 'x': *optset |= PCRE_EXTENDED; break; - case 'U': *optset |= PCRE_UNGREEDY; break; - case 'X': *optset |= PCRE_EXTRA; break; + case CHAR_i: *optset |= PCRE_CASELESS; break; + case CHAR_m: *optset |= PCRE_MULTILINE; break; + case CHAR_s: *optset |= PCRE_DOTALL; break; + case CHAR_x: *optset |= PCRE_EXTENDED; break; + case CHAR_U: *optset |= PCRE_UNGREEDY; break; + case CHAR_X: *optset |= PCRE_EXTRA; break; default: *errorcodeptr = ERR12; ptr--; /* Correct the offset */ @@ -4927,7 +5111,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ options if this setting actually changes any of them, and reset the greedy defaults and the case value for firstbyte and reqbyte. */ - if (*ptr == ')') + if (*ptr == CHAR_RIGHT_PARENTHESIS) { if (code == cd->start_code + 1 + LINK_SIZE && (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) @@ -5067,7 +5251,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ /* Error if hit end of pattern */ - if (*ptr != ')') + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR14; goto FAILED; @@ -5165,7 +5349,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ We can test for values between ESC_b and ESC_Z for the latter; this may have to change if any new ones are ever created. */ - case '\\': + case CHAR_BACKSLASH: tempptr = ptr; c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); if (*errorcodeptr != 0) goto FAILED; @@ -5174,8 +5358,9 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { if (-c == ESC_Q) /* Handle start of quoted string */ { - if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */ - else inescq = TRUE; + if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) + ptr += 2; /* avoid empty string */ + else inescq = TRUE; continue; } @@ -5203,7 +5388,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { const uschar *p; save_hwm = cd->hwm; /* Normally this is set when '(' is read */ - terminator = (*(++ptr) == '<')? '>' : '\''; + terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; /* These two statements stop the compiler for warning about possibly unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In @@ -5215,7 +5401,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Test for a name */ - if (ptr[1] != '+' && ptr[1] != '-') + if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS) { BOOL isnumber = TRUE; for (p = ptr + 1; *p != 0 && *p != terminator; p++) @@ -5253,10 +5439,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* \k<name> or \k'name' is a back reference by name (Perl syntax). 
We also support \k{name} (.NET syntax) */ - if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{')) + if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN || + ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET)) { is_recurse = FALSE; - terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}'; + terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? + CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; goto NAMED_REF_OR_RECURSE; } @@ -5359,7 +5548,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Remember if \r or \n were seen */ - if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n') + if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; /* Set the first and required bytes appropriately. If no previous first @@ -5604,7 +5793,7 @@ for (;;) compile a resetting op-code following, except at the very end of the pattern. Return leaving the pointer at the terminating char. */ - if (*ptr != '|') + if (*ptr != CHAR_VERTICAL_LINE) { if (lengthptr == NULL) { @@ -5627,7 +5816,7 @@ for (;;) /* Resetting option if needed */ - if ((options & PCRE_IMS) != oldims && *ptr == ')') + if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS) { *code++ = OP_OPT; *code++ = oldims; @@ -5809,6 +5998,32 @@ do { NULL, 0, FALSE); register int op = *scode; + /* If we are at the start of a conditional assertion group, *both* the + conditional assertion *and* what follows the condition must satisfy the test + for start of line. Other kinds of condition fail. Note that there may be an + auto-callout at the start of a condition. 
*/ + + if (op == OP_COND) + { + scode += 1 + LINK_SIZE; + if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT]; + switch (*scode) + { + case OP_CREF: + case OP_RREF: + case OP_DEF: + return FALSE; + + default: /* Assertion */ + if (!is_startline(scode, bracket_map, backref_map)) return FALSE; + do scode += GET(scode, 1); while (*scode == OP_ALT); + scode += 1 + LINK_SIZE; + break; + } + scode = first_significant_code(scode, NULL, 0, FALSE); + op = *scode; + } + /* Non-capturing brackets */ if (op == OP_BRA) @@ -5827,8 +6042,10 @@ do { /* Other brackets */ - else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) - { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } + else if (op == OP_ASSERT || op == OP_ONCE) + { + if (!is_startline(scode, bracket_map, backref_map)) return FALSE; + } /* .* means "start at start or after \n" if it isn't in brackets that may be referenced. */ @@ -6007,30 +6224,6 @@ if (erroroffset == NULL) *erroroffset = 0; -/* Can't support UTF8 unless PCRE has been compiled to include the code. 
*/ - -#ifdef SUPPORT_UTF8 -utf8 = (options & PCRE_UTF8) != 0; -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && - (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) - { - errorcode = ERR44; - goto PCRE_EARLY_ERROR_RETURN2; - } -#else -if ((options & PCRE_UTF8) != 0) - { - errorcode = ERR32; - goto PCRE_EARLY_ERROR_RETURN; - } -#endif - -if ((options & ~PUBLIC_OPTIONS) != 0) - { - errorcode = ERR17; - goto PCRE_EARLY_ERROR_RETURN; - } - /* Set up pointers to the individual character tables */ if (tables == NULL) tables = _pcre_default_tables; @@ -6039,28 +6232,40 @@ cd->fcc = tables + fcc_offset; cd->cbits = tables + cbits_offset; cd->ctypes = tables + ctypes_offset; +/* Check that all undefined public option bits are zero */ + +if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) + { + errorcode = ERR17; + goto PCRE_EARLY_ERROR_RETURN; + } + /* Check for global one-time settings at the start of the pattern, and remember the offset for later. */ -while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*') +while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && + ptr[skipatstart+1] == CHAR_ASTERISK) { int newnl = 0; int newbsr = 0; - if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0) + if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0) + { skipatstart += 7; options |= PCRE_UTF8; continue; } + + if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } - else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0) { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) 
== 0) { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } - else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0) { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } - else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } - else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } if (newnl != 0) @@ -6070,6 +6275,24 @@ while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*') else break; } +/* Can't support UTF8 unless PCRE has been compiled to include the code. */ + +#ifdef SUPPORT_UTF8 +utf8 = (options & PCRE_UTF8) != 0; +if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && + (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) + { + errorcode = ERR44; + goto PCRE_EARLY_ERROR_RETURN2; + } +#else +if ((options & PCRE_UTF8) != 0) + { + errorcode = ERR32; + goto PCRE_EARLY_ERROR_RETURN; + } +#endif + /* Check validity of \R options. 
*/ switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) @@ -6088,10 +6311,10 @@ current code allows for fixed one- or two-byte sequences, plus "any" and switch (options & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; /* Build-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR: newline = CHAR_CR; break; + case PCRE_NEWLINE_LF: newline = CHAR_NL; break; case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; diff --git a/ext/pcre/pcrelib/pcre_config.c b/ext/pcre/pcrelib/pcre_config.c index 5796f75d71..3e7421577a 100644 --- a/ext/pcre/pcrelib/pcre_config.c +++ b/ext/pcre/pcrelib/pcre_config.c @@ -102,11 +102,11 @@ switch (what) break; case PCRE_CONFIG_MATCH_LIMIT: - *((unsigned int *)where) = MATCH_LIMIT; + *((unsigned long int *)where) = MATCH_LIMIT; break; case PCRE_CONFIG_MATCH_LIMIT_RECURSION: - *((unsigned int *)where) = MATCH_LIMIT_RECURSION; + *((unsigned long int *)where) = MATCH_LIMIT_RECURSION; break; case PCRE_CONFIG_STACKRECURSE: diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c index 91b5047731..073cf2410a 100644 --- a/ext/pcre/pcrelib/pcre_exec.c +++ b/ext/pcre/pcrelib/pcre_exec.c @@ -320,9 +320,9 @@ typedef struct heapframe { /* Function arguments that may change */ - const uschar *Xeptr; + USPTR Xeptr; const uschar *Xecode; - const uschar *Xmstart; + USPTR Xmstart; int Xoffset_top; long int Xims; eptrblock *Xeptrb; @@ -331,13 +331,15 @@ typedef struct heapframe { /* Function local variables */ - const uschar *Xcallpat; - const uschar *Xcharptr; - const uschar *Xdata; - const uschar *Xnext; - const uschar *Xpp; - const uschar *Xprev; - const uschar *Xsaved_eptr; + USPTR Xcallpat; +#ifdef SUPPORT_UTF8 + USPTR 
Xcharptr; +#endif + USPTR Xdata; + USPTR Xnext; + USPTR Xpp; + USPTR Xprev; + USPTR Xsaved_eptr; recursion_info Xnew_recursive; @@ -358,6 +360,7 @@ typedef struct heapframe { uschar Xocchars[8]; #endif + int Xcodelink; int Xctype; unsigned int Xfc; int Xfi; @@ -423,7 +426,7 @@ Returns: MATCH_MATCH if matched ) these values are >= 0 */ static int -match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, +match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth) { @@ -437,6 +440,7 @@ register unsigned int c; /* Character values not kept over RMATCH() calls */ register BOOL utf8; /* Local copy of UTF-8 flag for speed */ BOOL minimize, possessive; /* Quantifier options */ +int condcode; /* When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame" which is obtained from @@ -479,6 +483,7 @@ HEAP_RECURSE: #define charptr frame->Xcharptr #endif #define callpat frame->Xcallpat +#define codelink frame->Xcodelink #define data frame->Xdata #define next frame->Xnext #define pp frame->Xpp @@ -559,6 +564,7 @@ int oclength; uschar occhars[8]; #endif +int codelink; int ctype; int length; int max; @@ -785,7 +791,39 @@ for (;;) case OP_COND: case OP_SCOND: - if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ + codelink= GET(ecode, 1); + + /* Because of the way auto-callout works during compile, a callout item is + inserted between OP_COND and an assertion condition. 
*/ + + if (ecode[LINK_SIZE+1] == OP_CALLOUT) + { + if (pcre_callout != NULL) + { + pcre_callout_block cb; + cb.version = 1; /* Version 1 of the callout block */ + cb.callout_number = ecode[LINK_SIZE+2]; + cb.offset_vector = md->offset_vector; + cb.subject = (PCRE_SPTR)md->start_subject; + cb.subject_length = md->end_subject - md->start_subject; + cb.start_match = mstart - md->start_subject; + cb.current_position = eptr - md->start_subject; + cb.pattern_position = GET(ecode, LINK_SIZE + 3); + cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); + cb.capture_top = offset_top/2; + cb.capture_last = md->capture_last; + cb.callout_data = md->callout_data; + if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if (rrc < 0) RRETURN(rrc); + } + ecode += _pcre_OP_lengths[OP_CALLOUT]; + } + + condcode = ecode[LINK_SIZE+1]; + + /* Now see what the actual condition is */ + + if (condcode == OP_RREF) /* Recursion test */ { offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ condition = md->recursive != NULL && @@ -793,14 +831,14 @@ for (;;) ecode += condition? 3 : GET(ecode, 1); } - else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ + else if (condcode == OP_CREF) /* Group used test */ { offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; ecode += condition? 
3 : GET(ecode, 1); } - else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ + else if (condcode == OP_DEF) /* DEFINE - always false */ { condition = FALSE; ecode += GET(ecode, 1); @@ -827,7 +865,7 @@ for (;;) else { condition = FALSE; - ecode += GET(ecode, 1); + ecode += codelink; } } @@ -850,7 +888,7 @@ for (;;) goto TAIL_RECURSE; } } - else /* Condition false & no 2nd alternative */ + else /* Condition false & no alternative */ { ecode += 1 + LINK_SIZE; } @@ -1073,6 +1111,8 @@ for (;;) else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { DPRINTF(("Recursion gave error %d\n", rrc)); + if (new_recursive.offset_save != stacksave) + (pcre_free)(new_recursive.offset_save); RRETURN(rrc); } @@ -1419,7 +1459,7 @@ for (;;) { if (eptr == md->start_subject) prev_is_word = FALSE; else { - const uschar *lastptr = eptr - 1; + USPTR lastptr = eptr - 1; while((*lastptr & 0xc0) == 0x80) lastptr--; GETCHAR(c, lastptr); prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; @@ -1677,7 +1717,7 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - const ucd_record * prop = GET_UCD(c); + const ucd_record *prop = GET_UCD(c); switch(ecode[1]) { @@ -2045,7 +2085,8 @@ for (;;) /* Match an extended character class. This opcode is encountered only - in UTF-8 mode, because that's the only time it is compiled. */ + when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8 + mode, because Unicode properties are supported in non-UTF-8 mode.
*/ #ifdef SUPPORT_UTF8 case OP_XCLASS: @@ -2087,7 +2128,7 @@ for (;;) for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } @@ -2106,7 +2147,7 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -2121,7 +2162,7 @@ for (;;) { int len = 1; if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); if (!_pcre_xclass(c, data)) break; eptr += len; } @@ -4531,10 +4572,10 @@ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; /* Compile-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR: newline = CHAR_CR; break; + case PCRE_NEWLINE_LF: newline = CHAR_NL; break; case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: return PCRE_ERROR_BADNEWLINE; @@ -4576,11 +4617,11 @@ back the character offset. */ #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { - if (_pcre_valid_utf8((uschar *)subject, length) >= 0) + if (_pcre_valid_utf8((USPTR)subject, length) >= 0) return PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { - int tb = ((uschar *)subject)[start_offset]; + int tb = ((USPTR)subject)[start_offset]; if (tb > 127) { tb &= 0xc0; @@ -4686,11 +4727,11 @@ for(;;) while (iptr < iend) *iptr++ = -1; } - /* Advance to a unique first char if possible. 
If firstline is TRUE, the - start of the match is constrained to the first line of a multiline string. - That is, the match must be before or at the first newline. Implement this by - temporarily adjusting end_subject so that we stop scanning at a newline. If - the match fails at the newline, later code breaks this loop. */ + /* If firstline is TRUE, the start of the match is constrained to the first + line of a multiline string. That is, the match must be before or at the first + newline. Implement this by temporarily adjusting end_subject so that we stop + scanning at a newline. If the match fails at the newline, later code breaks + this loop. */ if (firstline) { @@ -4710,62 +4751,70 @@ for(;;) end_subject = t; } - /* Now advance to a unique first byte if there is one. */ + /* There are some optimizations that avoid running the match if a known + starting point is not found, or if a known later character is not present. + However, there is an option that disables these, for testing and for ensuring + that all callouts do actually occur. */ - if (first_byte >= 0) + if ((options & PCRE_NO_START_OPTIMIZE) == 0) { - if (first_byte_caseless) - while (start_match < end_subject && md->lcc[*start_match] != first_byte) - start_match++; - else - while (start_match < end_subject && *start_match != first_byte) - start_match++; - } + /* Advance to a unique first byte if there is one. 
*/ - /* Or to just after a linebreak for a multiline match */ + if (first_byte >= 0) + { + if (first_byte_caseless) + while (start_match < end_subject && md->lcc[*start_match] != first_byte) + start_match++; + else + while (start_match < end_subject && *start_match != first_byte) + start_match++; + } - else if (startline) - { - if (start_match > md->start_subject + start_offset) + /* Or to just after a linebreak for a multiline match */ + + else if (startline) { -#ifdef SUPPORT_UTF8 - if (utf8) + if (start_match > md->start_subject + start_offset) { - while (start_match < end_subject && !WAS_NEWLINE(start_match)) +#ifdef SUPPORT_UTF8 + if (utf8) { - start_match++; - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } } - } - else + else #endif - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - start_match++; - - /* If we have just passed a CR and the newline option is ANY or ANYCRLF, - and we are now at a LF, advance the match position by one more character. - */ - - if (start_match[-1] == '\r' && - (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - *start_match == '\n') - start_match++; + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; + + /* If we have just passed a CR and the newline option is ANY or ANYCRLF, + and we are now at a LF, advance the match position by one more character. 
+ */ + + if (start_match[-1] == CHAR_CR && + (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && + *start_match == CHAR_NL) + start_match++; + } } - } - /* Or to a non-unique first byte after study */ + /* Or to a non-unique first byte after study */ - else if (start_bits != NULL) - { - while (start_match < end_subject) + else if (start_bits != NULL) { - register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; - else break; + while (start_match < end_subject) + { + register unsigned int c = *start_match; + if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; + else break; + } } - } + } /* Starting optimizations */ /* Restore fudged end_subject */ @@ -4777,23 +4826,25 @@ for(;;) printf("\n"); #endif - /* If req_byte is set, we know that that character must appear in the subject - for the match to succeed. If the first character is set, req_byte must be - later in the subject; otherwise the test starts at the match point. This - optimization can save a huge amount of backtracking in patterns with nested - unlimited repeats that aren't going to match. Writing separate code for - cased/caseless versions makes it go faster, as does using an autoincrement - and backing off on a match. - - HOWEVER: when the subject string is very, very long, searching to its end can - take a long time, and give bad performance on quite ordinary patterns. This - showed up when somebody was matching something like /^\d+C/ on a 32-megabyte - string... so we don't do this when the string is sufficiently long. - - ALSO: this processing is disabled when partial matching is requested. - */ - - if (req_byte >= 0 && + /* If req_byte is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, req_byte + must be later in the subject; otherwise the test starts at the match point. 
+ This optimization can save a huge amount of backtracking in patterns with + nested unlimited repeats that aren't going to match. Writing separate code + for cased/caseless versions makes it go faster, as does using an + autoincrement and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end + can take a long time, and give bad performance on quite ordinary patterns. + This showed up when somebody was matching something like /^\d+C/ on a + 32-megabyte string... so we don't do this when the string is sufficiently + long. + + ALSO: this processing is disabled when partial matching is requested, or if + disabling is explicitly requested. */ + + if ((options & PCRE_NO_START_OPTIMIZE) == 0 && + req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX && !md->partial) { @@ -4901,9 +4952,9 @@ for(;;) not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. */ - if (start_match[-1] == '\r' && + if (start_match[-1] == CHAR_CR && start_match < end_subject && - *start_match == '\n' && + *start_match == CHAR_NL && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF || diff --git a/ext/pcre/pcrelib/pcre_fullinfo.c b/ext/pcre/pcrelib/pcre_fullinfo.c index 4012b7a4e8..44fa91bbc0 100644 --- a/ext/pcre/pcrelib/pcre_fullinfo.c +++ b/ext/pcre/pcrelib/pcre_fullinfo.c @@ -87,7 +87,7 @@ if (re->magic_number != MAGIC_NUMBER) switch (what) { case PCRE_INFO_OPTIONS: - *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS; + *((unsigned long int *)where) = re->options & PUBLIC_COMPILE_OPTIONS; break; case PCRE_INFO_SIZE: diff --git a/ext/pcre/pcrelib/pcre_get.c b/ext/pcre/pcrelib/pcre_get.c index 65a81c99b0..92b3808820 100644 --- a/ext/pcre/pcrelib/pcre_get.c +++ b/ext/pcre/pcrelib/pcre_get.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_globals.c b/ext/pcre/pcrelib/pcre_globals.c index f684edbe34..aa3ef90a2a 100644 --- a/ext/pcre/pcrelib/pcre_globals.c +++ b/ext/pcre/pcrelib/pcre_globals.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_info.c b/ext/pcre/pcrelib/pcre_info.c index 0e0a02f595..e6d435b166 100644 --- a/ext/pcre/pcrelib/pcre_info.c +++ b/ext/pcre/pcrelib/pcre_info.c @@ -81,7 +81,7 @@ if (re->magic_number != MAGIC_NUMBER) re = _pcre_try_flipped(re, &internal_re, NULL, NULL); if (re == NULL) return PCRE_ERROR_BADMAGIC; } -if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); +if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS); if (first_byte != NULL) *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; diff --git a/ext/pcre/pcrelib/pcre_internal.h b/ext/pcre/pcrelib/pcre_internal.h index c652ed9de0..e168f3909b 100644 --- a/ext/pcre/pcrelib/pcre_internal.h +++ b/ext/pcre/pcrelib/pcre_internal.h @@ -51,6 +51,20 @@ functions whose names all begin with "_pcre_". */ #define DEBUG #endif +/* We do not support both EBCDIC and UTF-8 at the same time. The "configure" +script prevents both being selected, but not everybody uses "configure". */ + +#if defined EBCDIC && defined SUPPORT_UTF8 +#error The use of both EBCDIC and SUPPORT_UTF8 is not supported. 
+#endif + +/* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The +"configure" script ensures this, but not everybody uses "configure". */ + +#if defined SUPPORT_UCP && !defined SUPPORT_UTF8 +#define SUPPORT_UTF8 1 +#endif + /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef inline, and there are *still* stupid compilers about that don't like indented pre-processor statements, or at least there were when I first wrote this. After @@ -478,6 +492,26 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */ len += gcaa; \ } +/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the +pointer, incrementing length if there are extra bytes. This is called when we +may or may not be in UTF-8 mode. */ + +#define GETCHARLENTEST(c, eptr, len) \ + c = *eptr; \ + if (utf8 && c >= 0xc0) \ + { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + for (gcii = 1; gcii <= gcaa; gcii++) \ + { \ + gcss -= 6; \ + c |= (eptr[gcii] & 0x3f) << gcss; \ + } \ + len += gcaa; \ + } + /* If the pointer is not at the start of a character, move it back until it is. This is called only in UTF-8 mode - we don't put a test within the macro because almost all calls are already within a block of UTF-8 only code. */ @@ -520,7 +554,7 @@ time, run time, or study time, respectively.
*/ #define PUBLIC_EXEC_OPTIONS \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) + PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ + PCRE_NO_START_OPTIMIZE) #define PUBLIC_DFA_EXEC_OPTIONS \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \ - PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) + PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE) #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ @@ -560,38 +595,571 @@ variable-length repeat, or a anything other than literal characters. */ #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in -environments where these macros are defined elsewhere. */ +environments where these macros are defined elsewhere. Unfortunately, there +is no way to do the same for the typedef. */ -#ifndef FALSE typedef int BOOL; +#ifndef FALSE #define FALSE 0 #define TRUE 1 #endif +/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal +character constants like '*' because the compiler would emit their EBCDIC code, +which is different from their ASCII/UTF-8 code. Instead we define macros for +the characters so that they always use the ASCII/UTF-8 code when UTF-8 support +is enabled. When UTF-8 support is not enabled, the definitions use character +literals. Both character and string versions of each character are needed, and +there are some longer strings as well. + +This means that, on EBCDIC platforms, the PCRE library can handle either +EBCDIC, or UTF-8, but not both. To support both in the same compiled library +would need different lookups depending on whether PCRE_UTF8 was set or not. +This would make it impossible to use characters in switch/case statements, +which would reduce performance. 
For a theoretical use (which nobody has asked +for) in a minority area (EBCDIC platforms), this is not sensible. Any +application that did need both could compile two versions of the library, using +macros to give the functions distinct names. */ + +#ifndef SUPPORT_UTF8 + +/* UTF-8 support is not enabled; use the platform-dependent character literals +so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ + +#define CHAR_HT '\t' +#define CHAR_VT '\v' +#define CHAR_FF '\f' +#define CHAR_CR '\r' +#define CHAR_NL '\n' +#define CHAR_BS '\b' +#define CHAR_BEL '\a' +#ifdef EBCDIC +#define CHAR_ESC '\047' +#define CHAR_DEL '\007' +#else +#define CHAR_ESC '\033' +#define CHAR_DEL '\177' +#endif + +#define CHAR_SPACE ' ' +#define CHAR_EXCLAMATION_MARK '!' +#define CHAR_QUOTATION_MARK '"' +#define CHAR_NUMBER_SIGN '#' +#define CHAR_DOLLAR_SIGN '$' +#define CHAR_PERCENT_SIGN '%' +#define CHAR_AMPERSAND '&' +#define CHAR_APOSTROPHE '\'' +#define CHAR_LEFT_PARENTHESIS '(' +#define CHAR_RIGHT_PARENTHESIS ')' +#define CHAR_ASTERISK '*' +#define CHAR_PLUS '+' +#define CHAR_COMMA ',' +#define CHAR_MINUS '-' +#define CHAR_DOT '.' +#define CHAR_SLASH '/' +#define CHAR_0 '0' +#define CHAR_1 '1' +#define CHAR_2 '2' +#define CHAR_3 '3' +#define CHAR_4 '4' +#define CHAR_5 '5' +#define CHAR_6 '6' +#define CHAR_7 '7' +#define CHAR_8 '8' +#define CHAR_9 '9' +#define CHAR_COLON ':' +#define CHAR_SEMICOLON ';' +#define CHAR_LESS_THAN_SIGN '<' +#define CHAR_EQUALS_SIGN '=' +#define CHAR_GREATER_THAN_SIGN '>' +#define CHAR_QUESTION_MARK '?' 
+#define CHAR_COMMERCIAL_AT '@' +#define CHAR_A 'A' +#define CHAR_B 'B' +#define CHAR_C 'C' +#define CHAR_D 'D' +#define CHAR_E 'E' +#define CHAR_F 'F' +#define CHAR_G 'G' +#define CHAR_H 'H' +#define CHAR_I 'I' +#define CHAR_J 'J' +#define CHAR_K 'K' +#define CHAR_L 'L' +#define CHAR_M 'M' +#define CHAR_N 'N' +#define CHAR_O 'O' +#define CHAR_P 'P' +#define CHAR_Q 'Q' +#define CHAR_R 'R' +#define CHAR_S 'S' +#define CHAR_T 'T' +#define CHAR_U 'U' +#define CHAR_V 'V' +#define CHAR_W 'W' +#define CHAR_X 'X' +#define CHAR_Y 'Y' +#define CHAR_Z 'Z' +#define CHAR_LEFT_SQUARE_BRACKET '[' +#define CHAR_BACKSLASH '\\' +#define CHAR_RIGHT_SQUARE_BRACKET ']' +#define CHAR_CIRCUMFLEX_ACCENT '^' +#define CHAR_UNDERSCORE '_' +#define CHAR_GRAVE_ACCENT '`' +#define CHAR_a 'a' +#define CHAR_b 'b' +#define CHAR_c 'c' +#define CHAR_d 'd' +#define CHAR_e 'e' +#define CHAR_f 'f' +#define CHAR_g 'g' +#define CHAR_h 'h' +#define CHAR_i 'i' +#define CHAR_j 'j' +#define CHAR_k 'k' +#define CHAR_l 'l' +#define CHAR_m 'm' +#define CHAR_n 'n' +#define CHAR_o 'o' +#define CHAR_p 'p' +#define CHAR_q 'q' +#define CHAR_r 'r' +#define CHAR_s 's' +#define CHAR_t 't' +#define CHAR_u 'u' +#define CHAR_v 'v' +#define CHAR_w 'w' +#define CHAR_x 'x' +#define CHAR_y 'y' +#define CHAR_z 'z' +#define CHAR_LEFT_CURLY_BRACKET '{' +#define CHAR_VERTICAL_LINE '|' +#define CHAR_RIGHT_CURLY_BRACKET '}' +#define CHAR_TILDE '~' + +#define STR_HT "\t" +#define STR_VT "\v" +#define STR_FF "\f" +#define STR_CR "\r" +#define STR_NL "\n" +#define STR_BS "\b" +#define STR_BEL "\a" +#ifdef EBCDIC +#define STR_ESC "\047" +#define STR_DEL "\007" +#else +#define STR_ESC "\033" +#define STR_DEL "\177" +#endif + +#define STR_SPACE " " +#define STR_EXCLAMATION_MARK "!" 
+#define STR_QUOTATION_MARK "\"" +#define STR_NUMBER_SIGN "#" +#define STR_DOLLAR_SIGN "$" +#define STR_PERCENT_SIGN "%" +#define STR_AMPERSAND "&" +#define STR_APOSTROPHE "'" +#define STR_LEFT_PARENTHESIS "(" +#define STR_RIGHT_PARENTHESIS ")" +#define STR_ASTERISK "*" +#define STR_PLUS "+" +#define STR_COMMA "," +#define STR_MINUS "-" +#define STR_DOT "." +#define STR_SLASH "/" +#define STR_0 "0" +#define STR_1 "1" +#define STR_2 "2" +#define STR_3 "3" +#define STR_4 "4" +#define STR_5 "5" +#define STR_6 "6" +#define STR_7 "7" +#define STR_8 "8" +#define STR_9 "9" +#define STR_COLON ":" +#define STR_SEMICOLON ";" +#define STR_LESS_THAN_SIGN "<" +#define STR_EQUALS_SIGN "=" +#define STR_GREATER_THAN_SIGN ">" +#define STR_QUESTION_MARK "?" +#define STR_COMMERCIAL_AT "@" +#define STR_A "A" +#define STR_B "B" +#define STR_C "C" +#define STR_D "D" +#define STR_E "E" +#define STR_F "F" +#define STR_G "G" +#define STR_H "H" +#define STR_I "I" +#define STR_J "J" +#define STR_K "K" +#define STR_L "L" +#define STR_M "M" +#define STR_N "N" +#define STR_O "O" +#define STR_P "P" +#define STR_Q "Q" +#define STR_R "R" +#define STR_S "S" +#define STR_T "T" +#define STR_U "U" +#define STR_V "V" +#define STR_W "W" +#define STR_X "X" +#define STR_Y "Y" +#define STR_Z "Z" +#define STR_LEFT_SQUARE_BRACKET "[" +#define STR_BACKSLASH "\\" +#define STR_RIGHT_SQUARE_BRACKET "]" +#define STR_CIRCUMFLEX_ACCENT "^" +#define STR_UNDERSCORE "_" +#define STR_GRAVE_ACCENT "`" +#define STR_a "a" +#define STR_b "b" +#define STR_c "c" +#define STR_d "d" +#define STR_e "e" +#define STR_f "f" +#define STR_g "g" +#define STR_h "h" +#define STR_i "i" +#define STR_j "j" +#define STR_k "k" +#define STR_l "l" +#define STR_m "m" +#define STR_n "n" +#define STR_o "o" +#define STR_p "p" +#define STR_q "q" +#define STR_r "r" +#define STR_s "s" +#define STR_t "t" +#define STR_u "u" +#define STR_v "v" +#define STR_w "w" +#define STR_x "x" +#define STR_y "y" +#define STR_z "z" +#define STR_LEFT_CURLY_BRACKET 
"{" +#define STR_VERTICAL_LINE "|" +#define STR_RIGHT_CURLY_BRACKET "}" +#define STR_TILDE "~" + +#define STRING_ACCEPT0 "ACCEPT\0" +#define STRING_COMMIT0 "COMMIT\0" +#define STRING_F0 "F\0" +#define STRING_FAIL0 "FAIL\0" +#define STRING_PRUNE0 "PRUNE\0" +#define STRING_SKIP0 "SKIP\0" +#define STRING_THEN "THEN" + +#define STRING_alpha0 "alpha\0" +#define STRING_lower0 "lower\0" +#define STRING_upper0 "upper\0" +#define STRING_alnum0 "alnum\0" +#define STRING_ascii0 "ascii\0" +#define STRING_blank0 "blank\0" +#define STRING_cntrl0 "cntrl\0" +#define STRING_digit0 "digit\0" +#define STRING_graph0 "graph\0" +#define STRING_print0 "print\0" +#define STRING_punct0 "punct\0" +#define STRING_space0 "space\0" +#define STRING_word0 "word\0" +#define STRING_xdigit "xdigit" + +#define STRING_DEFINE "DEFINE" + +#define STRING_CR_RIGHTPAR "CR)" +#define STRING_LF_RIGHTPAR "LF)" +#define STRING_CRLF_RIGHTPAR "CRLF)" +#define STRING_ANY_RIGHTPAR "ANY)" +#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" +#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" +#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" +#define STRING_UTF8_RIGHTPAR "UTF8)" + +#else /* SUPPORT_UTF8 */ + +/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This +works in both modes non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode +only. 
*/ + +#define CHAR_HT '\011' +#define CHAR_VT '\013' +#define CHAR_FF '\014' +#define CHAR_CR '\015' +#define CHAR_NL '\012' +#define CHAR_BS '\010' +#define CHAR_BEL '\007' +#define CHAR_ESC '\033' +#define CHAR_DEL '\177' + +#define CHAR_SPACE '\040' +#define CHAR_EXCLAMATION_MARK '\041' +#define CHAR_QUOTATION_MARK '\042' +#define CHAR_NUMBER_SIGN '\043' +#define CHAR_DOLLAR_SIGN '\044' +#define CHAR_PERCENT_SIGN '\045' +#define CHAR_AMPERSAND '\046' +#define CHAR_APOSTROPHE '\047' +#define CHAR_LEFT_PARENTHESIS '\050' +#define CHAR_RIGHT_PARENTHESIS '\051' +#define CHAR_ASTERISK '\052' +#define CHAR_PLUS '\053' +#define CHAR_COMMA '\054' +#define CHAR_MINUS '\055' +#define CHAR_DOT '\056' +#define CHAR_SLASH '\057' +#define CHAR_0 '\060' +#define CHAR_1 '\061' +#define CHAR_2 '\062' +#define CHAR_3 '\063' +#define CHAR_4 '\064' +#define CHAR_5 '\065' +#define CHAR_6 '\066' +#define CHAR_7 '\067' +#define CHAR_8 '\070' +#define CHAR_9 '\071' +#define CHAR_COLON '\072' +#define CHAR_SEMICOLON '\073' +#define CHAR_LESS_THAN_SIGN '\074' +#define CHAR_EQUALS_SIGN '\075' +#define CHAR_GREATER_THAN_SIGN '\076' +#define CHAR_QUESTION_MARK '\077' +#define CHAR_COMMERCIAL_AT '\100' +#define CHAR_A '\101' +#define CHAR_B '\102' +#define CHAR_C '\103' +#define CHAR_D '\104' +#define CHAR_E '\105' +#define CHAR_F '\106' +#define CHAR_G '\107' +#define CHAR_H '\110' +#define CHAR_I '\111' +#define CHAR_J '\112' +#define CHAR_K '\113' +#define CHAR_L '\114' +#define CHAR_M '\115' +#define CHAR_N '\116' +#define CHAR_O '\117' +#define CHAR_P '\120' +#define CHAR_Q '\121' +#define CHAR_R '\122' +#define CHAR_S '\123' +#define CHAR_T '\124' +#define CHAR_U '\125' +#define CHAR_V '\126' +#define CHAR_W '\127' +#define CHAR_X '\130' +#define CHAR_Y '\131' +#define CHAR_Z '\132' +#define CHAR_LEFT_SQUARE_BRACKET '\133' +#define CHAR_BACKSLASH '\134' +#define CHAR_RIGHT_SQUARE_BRACKET '\135' +#define CHAR_CIRCUMFLEX_ACCENT '\136' +#define CHAR_UNDERSCORE '\137' +#define 
CHAR_GRAVE_ACCENT '\140' +#define CHAR_a '\141' +#define CHAR_b '\142' +#define CHAR_c '\143' +#define CHAR_d '\144' +#define CHAR_e '\145' +#define CHAR_f '\146' +#define CHAR_g '\147' +#define CHAR_h '\150' +#define CHAR_i '\151' +#define CHAR_j '\152' +#define CHAR_k '\153' +#define CHAR_l '\154' +#define CHAR_m '\155' +#define CHAR_n '\156' +#define CHAR_o '\157' +#define CHAR_p '\160' +#define CHAR_q '\161' +#define CHAR_r '\162' +#define CHAR_s '\163' +#define CHAR_t '\164' +#define CHAR_u '\165' +#define CHAR_v '\166' +#define CHAR_w '\167' +#define CHAR_x '\170' +#define CHAR_y '\171' +#define CHAR_z '\172' +#define CHAR_LEFT_CURLY_BRACKET '\173' +#define CHAR_VERTICAL_LINE '\174' +#define CHAR_RIGHT_CURLY_BRACKET '\175' +#define CHAR_TILDE '\176' + +#define STR_HT "\011" +#define STR_VT "\013" +#define STR_FF "\014" +#define STR_CR "\015" +#define STR_NL "\012" +#define STR_BS "\010" +#define STR_BEL "\007" +#define STR_ESC "\033" +#define STR_DEL "\177" + +#define STR_SPACE "\040" +#define STR_EXCLAMATION_MARK "\041" +#define STR_QUOTATION_MARK "\042" +#define STR_NUMBER_SIGN "\043" +#define STR_DOLLAR_SIGN "\044" +#define STR_PERCENT_SIGN "\045" +#define STR_AMPERSAND "\046" +#define STR_APOSTROPHE "\047" +#define STR_LEFT_PARENTHESIS "\050" +#define STR_RIGHT_PARENTHESIS "\051" +#define STR_ASTERISK "\052" +#define STR_PLUS "\053" +#define STR_COMMA "\054" +#define STR_MINUS "\055" +#define STR_DOT "\056" +#define STR_SLASH "\057" +#define STR_0 "\060" +#define STR_1 "\061" +#define STR_2 "\062" +#define STR_3 "\063" +#define STR_4 "\064" +#define STR_5 "\065" +#define STR_6 "\066" +#define STR_7 "\067" +#define STR_8 "\070" +#define STR_9 "\071" +#define STR_COLON "\072" +#define STR_SEMICOLON "\073" +#define STR_LESS_THAN_SIGN "\074" +#define STR_EQUALS_SIGN "\075" +#define STR_GREATER_THAN_SIGN "\076" +#define STR_QUESTION_MARK "\077" +#define STR_COMMERCIAL_AT "\100" +#define STR_A "\101" +#define STR_B "\102" +#define STR_C "\103" +#define STR_D 
"\104" +#define STR_E "\105" +#define STR_F "\106" +#define STR_G "\107" +#define STR_H "\110" +#define STR_I "\111" +#define STR_J "\112" +#define STR_K "\113" +#define STR_L "\114" +#define STR_M "\115" +#define STR_N "\116" +#define STR_O "\117" +#define STR_P "\120" +#define STR_Q "\121" +#define STR_R "\122" +#define STR_S "\123" +#define STR_T "\124" +#define STR_U "\125" +#define STR_V "\126" +#define STR_W "\127" +#define STR_X "\130" +#define STR_Y "\131" +#define STR_Z "\132" +#define STR_LEFT_SQUARE_BRACKET "\133" +#define STR_BACKSLASH "\134" +#define STR_RIGHT_SQUARE_BRACKET "\135" +#define STR_CIRCUMFLEX_ACCENT "\136" +#define STR_UNDERSCORE "\137" +#define STR_GRAVE_ACCENT "\140" +#define STR_a "\141" +#define STR_b "\142" +#define STR_c "\143" +#define STR_d "\144" +#define STR_e "\145" +#define STR_f "\146" +#define STR_g "\147" +#define STR_h "\150" +#define STR_i "\151" +#define STR_j "\152" +#define STR_k "\153" +#define STR_l "\154" +#define STR_m "\155" +#define STR_n "\156" +#define STR_o "\157" +#define STR_p "\160" +#define STR_q "\161" +#define STR_r "\162" +#define STR_s "\163" +#define STR_t "\164" +#define STR_u "\165" +#define STR_v "\166" +#define STR_w "\167" +#define STR_x "\170" +#define STR_y "\171" +#define STR_z "\172" +#define STR_LEFT_CURLY_BRACKET "\173" +#define STR_VERTICAL_LINE "\174" +#define STR_RIGHT_CURLY_BRACKET "\175" +#define STR_TILDE "\176" + +#define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0" +#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0" +#define STRING_F0 STR_F "\0" +#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0" +#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0" +#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0" +#define STRING_THEN STR_T STR_H STR_E STR_N + +#define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0" +#define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0" +#define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0" +#define STRING_alnum0 STR_a STR_l STR_n 
STR_u STR_m "\0" +#define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0" +#define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0" +#define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0" +#define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0" +#define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0" +#define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0" +#define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0" +#define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0" +#define STRING_word0 STR_w STR_o STR_r STR_d "\0" +#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t + +#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E + +#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS +#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS +#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS +#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS + +#endif /* SUPPORT_UTF8 */ + /* Escape items that are just an encoding of a particular data value. */ #ifndef ESC_e -#define ESC_e 27 +#define ESC_e CHAR_ESC #endif #ifndef ESC_f -#define ESC_f '\f' +#define ESC_f CHAR_FF #endif #ifndef ESC_n -#define ESC_n '\n' +#define ESC_n CHAR_NL #endif #ifndef ESC_r -#define ESC_r '\r' +#define ESC_r CHAR_CR #endif /* We can't officially use ESC_t because it is a POSIX reserved identifier (presumably because of all the others like size_t). 
*/ #ifndef ESC_tee -#define ESC_tee '\t' +#define ESC_tee CHAR_HT #endif /* Codes for different types of Unicode property */ diff --git a/ext/pcre/pcrelib/pcre_maketables.c b/ext/pcre/pcrelib/pcre_maketables.c index 5b12322090..2ba612cec7 100644 --- a/ext/pcre/pcrelib/pcre_maketables.c +++ b/ext/pcre/pcrelib/pcre_maketables.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_newline.c b/ext/pcre/pcrelib/pcre_newline.c index 0b682437bf..3f6160ef22 100644 --- a/ext/pcre/pcrelib/pcre_newline.c +++ b/ext/pcre/pcrelib/pcre_newline.c @@ -71,8 +71,7 @@ Returns: TRUE or FALSE */ BOOL -_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, - int *lenptr, BOOL utf8) +_pcre_is_newline(USPTR ptr, int type, USPTR endptr, int *lenptr, BOOL utf8) { int c; if (utf8) { GETCHAR(c, ptr); } else c = *ptr; @@ -121,8 +120,7 @@ Returns: TRUE or FALSE */ BOOL -_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, - int *lenptr, BOOL utf8) +_pcre_was_newline(USPTR ptr, int type, USPTR startptr, int *lenptr, BOOL utf8) { int c; ptr--; diff --git a/ext/pcre/pcrelib/pcre_ord2utf8.c b/ext/pcre/pcrelib/pcre_ord2utf8.c index c0a574cc86..7931241d67 100644 --- a/ext/pcre/pcrelib/pcre_ord2utf8.c +++ b/ext/pcre/pcrelib/pcre_ord2utf8.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_printint.src b/ext/pcre/pcrelib/pcre_printint.src index 98b42aa804..5f45fc1985 100644 --- a/ext/pcre/pcrelib/pcre_printint.src +++ b/ext/pcre/pcrelib/pcre_printint.src @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2009 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -54,7 +54,11 @@ hexadecimal. We don't use isprint() because that can vary from system to system (even without the use of locales) and we want the output always to be the same, for testing purposes. This macro is used in pcretest as well as in this file. */ +#ifdef EBCDIC +#define PRINTABLE(c) ((c) >= 64 && (c) < 255) +#else #define PRINTABLE(c) ((c) >= 32 && (c) < 127) +#endif /* The table of operator names. */ diff --git a/ext/pcre/pcrelib/pcre_refcount.c b/ext/pcre/pcrelib/pcre_refcount.c index 2cc5a9d9f8..c92d578e6e 100644 --- a/ext/pcre/pcrelib/pcre_refcount.c +++ b/ext/pcre/pcrelib/pcre_refcount.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c index 97e3a92f77..226cc65941 100644 --- a/ext/pcre/pcrelib/pcre_study.c +++ b/ext/pcre/pcrelib/pcre_study.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_tables.c b/ext/pcre/pcrelib/pcre_tables.c index 08e5a4a846..87e3c73b68 100644 --- a/ext/pcre/pcrelib/pcre_tables.c +++ b/ext/pcre/pcrelib/pcre_tables.c @@ -107,126 +107,248 @@ putting all the names into a single, large string and then using offsets in the table itself. Maintenance is more error-prone, but frequent changes to this data are unlikely. -July 2008: There is now a script called maint/GenerateUtt.py which can be used -to generate this data instead of maintaining it entirely by hand. */ +July 2008: There is now a script called maint/GenerateUtt.py that can be used +to generate this data instead of maintaining it entirely by hand. + +The script was updated in March 2009 to generate a new EBCDIC-compliant +version. Like all other character and string literals that are compared against +the regular expression pattern, we must use STR_ macros instead of literal +strings to make sure that UTF-8 support works on EBCDIC platforms. 
*/ + +#define STRING_Any0 STR_A STR_n STR_y "\0" +#define STRING_Arabic0 STR_A STR_r STR_a STR_b STR_i STR_c "\0" +#define STRING_Armenian0 STR_A STR_r STR_m STR_e STR_n STR_i STR_a STR_n "\0" +#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0" +#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0" +#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0" +#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0" +#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0" +#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0" +#define STRING_C0 STR_C "\0" +#define STRING_Canadian_Aboriginal0 STR_C STR_a STR_n STR_a STR_d STR_i STR_a STR_n STR_UNDERSCORE STR_A STR_b STR_o STR_r STR_i STR_g STR_i STR_n STR_a STR_l "\0" +#define STRING_Carian0 STR_C STR_a STR_r STR_i STR_a STR_n "\0" +#define STRING_Cc0 STR_C STR_c "\0" +#define STRING_Cf0 STR_C STR_f "\0" +#define STRING_Cham0 STR_C STR_h STR_a STR_m "\0" +#define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0" +#define STRING_Cn0 STR_C STR_n "\0" +#define STRING_Co0 STR_C STR_o "\0" +#define STRING_Common0 STR_C STR_o STR_m STR_m STR_o STR_n "\0" +#define STRING_Coptic0 STR_C STR_o STR_p STR_t STR_i STR_c "\0" +#define STRING_Cs0 STR_C STR_s "\0" +#define STRING_Cuneiform0 STR_C STR_u STR_n STR_e STR_i STR_f STR_o STR_r STR_m "\0" +#define STRING_Cypriot0 STR_C STR_y STR_p STR_r STR_i STR_o STR_t "\0" +#define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0" +#define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0" +#define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0" +#define STRING_Ethiopic0 STR_E STR_t STR_h STR_i STR_o STR_p STR_i STR_c "\0" +#define STRING_Georgian0 STR_G STR_e STR_o STR_r STR_g STR_i STR_a STR_n "\0" +#define STRING_Glagolitic0 STR_G STR_l STR_a STR_g STR_o STR_l STR_i STR_t STR_i STR_c "\0" +#define 
STRING_Gothic0 STR_G STR_o STR_t STR_h STR_i STR_c "\0" +#define STRING_Greek0 STR_G STR_r STR_e STR_e STR_k "\0" +#define STRING_Gujarati0 STR_G STR_u STR_j STR_a STR_r STR_a STR_t STR_i "\0" +#define STRING_Gurmukhi0 STR_G STR_u STR_r STR_m STR_u STR_k STR_h STR_i "\0" +#define STRING_Han0 STR_H STR_a STR_n "\0" +#define STRING_Hangul0 STR_H STR_a STR_n STR_g STR_u STR_l "\0" +#define STRING_Hanunoo0 STR_H STR_a STR_n STR_u STR_n STR_o STR_o "\0" +#define STRING_Hebrew0 STR_H STR_e STR_b STR_r STR_e STR_w "\0" +#define STRING_Hiragana0 STR_H STR_i STR_r STR_a STR_g STR_a STR_n STR_a "\0" +#define STRING_Inherited0 STR_I STR_n STR_h STR_e STR_r STR_i STR_t STR_e STR_d "\0" +#define STRING_Kannada0 STR_K STR_a STR_n STR_n STR_a STR_d STR_a "\0" +#define STRING_Katakana0 STR_K STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0" +#define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0" +#define STRING_Kharoshthi0 STR_K STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0" +#define STRING_Khmer0 STR_K STR_h STR_m STR_e STR_r "\0" +#define STRING_L0 STR_L "\0" +#define STRING_L_AMPERSAND0 STR_L STR_AMPERSAND "\0" +#define STRING_Lao0 STR_L STR_a STR_o "\0" +#define STRING_Latin0 STR_L STR_a STR_t STR_i STR_n "\0" +#define STRING_Lepcha0 STR_L STR_e STR_p STR_c STR_h STR_a "\0" +#define STRING_Limbu0 STR_L STR_i STR_m STR_b STR_u "\0" +#define STRING_Linear_B0 STR_L STR_i STR_n STR_e STR_a STR_r STR_UNDERSCORE STR_B "\0" +#define STRING_Ll0 STR_L STR_l "\0" +#define STRING_Lm0 STR_L STR_m "\0" +#define STRING_Lo0 STR_L STR_o "\0" +#define STRING_Lt0 STR_L STR_t "\0" +#define STRING_Lu0 STR_L STR_u "\0" +#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0" +#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0" +#define STRING_M0 STR_M "\0" +#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0" +#define STRING_Mc0 STR_M STR_c "\0" +#define STRING_Me0 STR_M STR_e "\0" +#define STRING_Mn0 
STR_M STR_n "\0" +#define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0" +#define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0" +#define STRING_N0 STR_N "\0" +#define STRING_Nd0 STR_N STR_d "\0" +#define STRING_New_Tai_Lue0 STR_N STR_e STR_w STR_UNDERSCORE STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_u STR_e "\0" +#define STRING_Nko0 STR_N STR_k STR_o "\0" +#define STRING_Nl0 STR_N STR_l "\0" +#define STRING_No0 STR_N STR_o "\0" +#define STRING_Ogham0 STR_O STR_g STR_h STR_a STR_m "\0" +#define STRING_Ol_Chiki0 STR_O STR_l STR_UNDERSCORE STR_C STR_h STR_i STR_k STR_i "\0" +#define STRING_Old_Italic0 STR_O STR_l STR_d STR_UNDERSCORE STR_I STR_t STR_a STR_l STR_i STR_c "\0" +#define STRING_Old_Persian0 STR_O STR_l STR_d STR_UNDERSCORE STR_P STR_e STR_r STR_s STR_i STR_a STR_n "\0" +#define STRING_Oriya0 STR_O STR_r STR_i STR_y STR_a "\0" +#define STRING_Osmanya0 STR_O STR_s STR_m STR_a STR_n STR_y STR_a "\0" +#define STRING_P0 STR_P "\0" +#define STRING_Pc0 STR_P STR_c "\0" +#define STRING_Pd0 STR_P STR_d "\0" +#define STRING_Pe0 STR_P STR_e "\0" +#define STRING_Pf0 STR_P STR_f "\0" +#define STRING_Phags_Pa0 STR_P STR_h STR_a STR_g STR_s STR_UNDERSCORE STR_P STR_a "\0" +#define STRING_Phoenician0 STR_P STR_h STR_o STR_e STR_n STR_i STR_c STR_i STR_a STR_n "\0" +#define STRING_Pi0 STR_P STR_i "\0" +#define STRING_Po0 STR_P STR_o "\0" +#define STRING_Ps0 STR_P STR_s "\0" +#define STRING_Rejang0 STR_R STR_e STR_j STR_a STR_n STR_g "\0" +#define STRING_Runic0 STR_R STR_u STR_n STR_i STR_c "\0" +#define STRING_S0 STR_S "\0" +#define STRING_Saurashtra0 STR_S STR_a STR_u STR_r STR_a STR_s STR_h STR_t STR_r STR_a "\0" +#define STRING_Sc0 STR_S STR_c "\0" +#define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0" +#define STRING_Sinhala0 STR_S STR_i STR_n STR_h STR_a STR_l STR_a "\0" +#define STRING_Sk0 STR_S STR_k "\0" +#define STRING_Sm0 STR_S STR_m "\0" +#define STRING_So0 STR_S STR_o "\0" +#define STRING_Sundanese0 
STR_S STR_u STR_n STR_d STR_a STR_n STR_e STR_s STR_e "\0" +#define STRING_Syloti_Nagri0 STR_S STR_y STR_l STR_o STR_t STR_i STR_UNDERSCORE STR_N STR_a STR_g STR_r STR_i "\0" +#define STRING_Syriac0 STR_S STR_y STR_r STR_i STR_a STR_c "\0" +#define STRING_Tagalog0 STR_T STR_a STR_g STR_a STR_l STR_o STR_g "\0" +#define STRING_Tagbanwa0 STR_T STR_a STR_g STR_b STR_a STR_n STR_w STR_a "\0" +#define STRING_Tai_Le0 STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_e "\0" +#define STRING_Tamil0 STR_T STR_a STR_m STR_i STR_l "\0" +#define STRING_Telugu0 STR_T STR_e STR_l STR_u STR_g STR_u "\0" +#define STRING_Thaana0 STR_T STR_h STR_a STR_a STR_n STR_a "\0" +#define STRING_Thai0 STR_T STR_h STR_a STR_i "\0" +#define STRING_Tibetan0 STR_T STR_i STR_b STR_e STR_t STR_a STR_n "\0" +#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0" +#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0" +#define STRING_Vai0 STR_V STR_a STR_i "\0" +#define STRING_Yi0 STR_Y STR_i "\0" +#define STRING_Z0 STR_Z "\0" +#define STRING_Zl0 STR_Z STR_l "\0" +#define STRING_Zp0 STR_Z STR_p "\0" +#define STRING_Zs0 STR_Z STR_s "\0" const char _pcre_utt_names[] = - "Any\0" - "Arabic\0" - "Armenian\0" - "Balinese\0" - "Bengali\0" - "Bopomofo\0" - "Braille\0" - "Buginese\0" - "Buhid\0" - "C\0" - "Canadian_Aboriginal\0" - "Carian\0" - "Cc\0" - "Cf\0" - "Cham\0" - "Cherokee\0" - "Cn\0" - "Co\0" - "Common\0" - "Coptic\0" - "Cs\0" - "Cuneiform\0" - "Cypriot\0" - "Cyrillic\0" - "Deseret\0" - "Devanagari\0" - "Ethiopic\0" - "Georgian\0" - "Glagolitic\0" - "Gothic\0" - "Greek\0" - "Gujarati\0" - "Gurmukhi\0" - "Han\0" - "Hangul\0" - "Hanunoo\0" - "Hebrew\0" - "Hiragana\0" - "Inherited\0" - "Kannada\0" - "Katakana\0" - "Kayah_Li\0" - "Kharoshthi\0" - "Khmer\0" - "L\0" - "L&\0" - "Lao\0" - "Latin\0" - "Lepcha\0" - "Limbu\0" - "Linear_B\0" - "Ll\0" - "Lm\0" - "Lo\0" - "Lt\0" - "Lu\0" - "Lycian\0" - "Lydian\0" - "M\0" - "Malayalam\0" - "Mc\0" - "Me\0" - "Mn\0" - 
"Mongolian\0" - "Myanmar\0" - "N\0" - "Nd\0" - "New_Tai_Lue\0" - "Nko\0" - "Nl\0" - "No\0" - "Ogham\0" - "Ol_Chiki\0" - "Old_Italic\0" - "Old_Persian\0" - "Oriya\0" - "Osmanya\0" - "P\0" - "Pc\0" - "Pd\0" - "Pe\0" - "Pf\0" - "Phags_Pa\0" - "Phoenician\0" - "Pi\0" - "Po\0" - "Ps\0" - "Rejang\0" - "Runic\0" - "S\0" - "Saurashtra\0" - "Sc\0" - "Shavian\0" - "Sinhala\0" - "Sk\0" - "Sm\0" - "So\0" - "Sundanese\0" - "Syloti_Nagri\0" - "Syriac\0" - "Tagalog\0" - "Tagbanwa\0" - "Tai_Le\0" - "Tamil\0" - "Telugu\0" - "Thaana\0" - "Thai\0" - "Tibetan\0" - "Tifinagh\0" - "Ugaritic\0" - "Vai\0" - "Yi\0" - "Z\0" - "Zl\0" - "Zp\0" - "Zs\0"; + STRING_Any0 + STRING_Arabic0 + STRING_Armenian0 + STRING_Balinese0 + STRING_Bengali0 + STRING_Bopomofo0 + STRING_Braille0 + STRING_Buginese0 + STRING_Buhid0 + STRING_C0 + STRING_Canadian_Aboriginal0 + STRING_Carian0 + STRING_Cc0 + STRING_Cf0 + STRING_Cham0 + STRING_Cherokee0 + STRING_Cn0 + STRING_Co0 + STRING_Common0 + STRING_Coptic0 + STRING_Cs0 + STRING_Cuneiform0 + STRING_Cypriot0 + STRING_Cyrillic0 + STRING_Deseret0 + STRING_Devanagari0 + STRING_Ethiopic0 + STRING_Georgian0 + STRING_Glagolitic0 + STRING_Gothic0 + STRING_Greek0 + STRING_Gujarati0 + STRING_Gurmukhi0 + STRING_Han0 + STRING_Hangul0 + STRING_Hanunoo0 + STRING_Hebrew0 + STRING_Hiragana0 + STRING_Inherited0 + STRING_Kannada0 + STRING_Katakana0 + STRING_Kayah_Li0 + STRING_Kharoshthi0 + STRING_Khmer0 + STRING_L0 + STRING_L_AMPERSAND0 + STRING_Lao0 + STRING_Latin0 + STRING_Lepcha0 + STRING_Limbu0 + STRING_Linear_B0 + STRING_Ll0 + STRING_Lm0 + STRING_Lo0 + STRING_Lt0 + STRING_Lu0 + STRING_Lycian0 + STRING_Lydian0 + STRING_M0 + STRING_Malayalam0 + STRING_Mc0 + STRING_Me0 + STRING_Mn0 + STRING_Mongolian0 + STRING_Myanmar0 + STRING_N0 + STRING_Nd0 + STRING_New_Tai_Lue0 + STRING_Nko0 + STRING_Nl0 + STRING_No0 + STRING_Ogham0 + STRING_Ol_Chiki0 + STRING_Old_Italic0 + STRING_Old_Persian0 + STRING_Oriya0 + STRING_Osmanya0 + STRING_P0 + STRING_Pc0 + STRING_Pd0 + STRING_Pe0 + STRING_Pf0 + 
STRING_Phags_Pa0 + STRING_Phoenician0 + STRING_Pi0 + STRING_Po0 + STRING_Ps0 + STRING_Rejang0 + STRING_Runic0 + STRING_S0 + STRING_Saurashtra0 + STRING_Sc0 + STRING_Shavian0 + STRING_Sinhala0 + STRING_Sk0 + STRING_Sm0 + STRING_So0 + STRING_Sundanese0 + STRING_Syloti_Nagri0 + STRING_Syriac0 + STRING_Tagalog0 + STRING_Tagbanwa0 + STRING_Tai_Le0 + STRING_Tamil0 + STRING_Telugu0 + STRING_Thaana0 + STRING_Thai0 + STRING_Tibetan0 + STRING_Tifinagh0 + STRING_Ugaritic0 + STRING_Vai0 + STRING_Yi0 + STRING_Z0 + STRING_Zl0 + STRING_Zp0 + STRING_Zs0; const ucp_type_table _pcre_utt[] = { { 0, PT_ANY, 0 }, diff --git a/ext/pcre/pcrelib/pcre_try_flipped.c b/ext/pcre/pcrelib/pcre_try_flipped.c index 6a8a770da8..5e67943cc0 100644 --- a/ext/pcre/pcrelib/pcre_try_flipped.c +++ b/ext/pcre/pcrelib/pcre_try_flipped.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_valid_utf8.c b/ext/pcre/pcrelib/pcre_valid_utf8.c index 62f47b952f..3c81dc9ecc 100644 --- a/ext/pcre/pcrelib/pcre_valid_utf8.c +++ b/ext/pcre/pcrelib/pcre_valid_utf8.c @@ -73,10 +73,10 @@ Returns: < 0 if the string is a valid UTF-8 string */ int -_pcre_valid_utf8(const uschar *string, int length) +_pcre_valid_utf8(USPTR string, int length) { #ifdef SUPPORT_UTF8 -register const uschar *p; +register USPTR p; if (length < 0) { diff --git a/ext/pcre/pcrelib/pcre_version.c b/ext/pcre/pcrelib/pcre_version.c index 13330d1a5c..bd63f41e6e 100644 --- a/ext/pcre/pcrelib/pcre_version.c +++ b/ext/pcre/pcrelib/pcre_version.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/ext/pcre/pcrelib/pcre_xclass.c b/ext/pcre/pcrelib/pcre_xclass.c index c939662f00..3fc3ba368a 100644 --- a/ext/pcre/pcrelib/pcre_xclass.c +++ b/ext/pcre/pcrelib/pcre_xclass.c @@ -39,8 +39,7 @@ POSSIBILITY OF SUCH DAMAGE. /* This module contains an internal function that is used to match an extended -class (one that contains characters whose values are > 255). It is used by both -pcre_exec() and pcre_def_exec(). */ +class. It is used by both pcre_exec() and pcre_def_exec(). */ #include "config.h" @@ -53,7 +52,7 @@ pcre_exec() and pcre_def_exec(). */ *************************************************/ /* This function is called to match a character against an extended class that -might contain values > 255. +might contain values > 255 and/or Unicode properties. 
Arguments: c the character @@ -102,7 +101,7 @@ while ((t = *data++) != XCL_END) #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { - const ucd_record * prop = GET_UCD(c); + const ucd_record *prop = GET_UCD(c); switch(*data) { diff --git a/ext/pcre/pcrelib/pcreposix.c b/ext/pcre/pcrelib/pcreposix.c index 87c695b2a1..645e060acc 100644 --- a/ext/pcre/pcrelib/pcreposix.c +++ b/ext/pcre/pcrelib/pcreposix.c @@ -272,6 +272,7 @@ BOOL nosub = if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL; if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL; +if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE_NOTEMPTY; ((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile */ diff --git a/ext/pcre/pcrelib/pcreposix.h b/ext/pcre/pcrelib/pcreposix.h index 4f59d918ee..7c5af72437 100644 --- a/ext/pcre/pcrelib/pcreposix.h +++ b/ext/pcre/pcrelib/pcreposix.h @@ -60,6 +60,7 @@ extern "C" { #define REG_NOSUB 0x0020 #define REG_UTF8 0x0040 /* NOT defined by POSIX. */ #define REG_STARTEND 0x0080 /* BSD feature: pass subject string by so,eo */ +#define REG_NOTEMPTY 0x0100 /* NOT defined by POSIX. */ /* This is not used by PCRE, but by defining it we make it easier to slot PCRE into existing programs that make POSIX calls. */ diff --git a/ext/pcre/pcrelib/testdata/grepinputx b/ext/pcre/pcrelib/testdata/grepinputx index aebba02770..730cc8a0d0 100644 --- a/ext/pcre/pcrelib/testdata/grepinputx +++ b/ext/pcre/pcrelib/testdata/grepinputx @@ -39,4 +39,5 @@ eighteen nineteen twenty +This line contains pattern not on a line by itself. This is the last line of this file. diff --git a/ext/pcre/pcrelib/testdata/grepoutput b/ext/pcre/pcrelib/testdata/grepoutput index 3241984c1c..882344e11f 100644 --- a/ext/pcre/pcrelib/testdata/grepoutput +++ b/ext/pcre/pcrelib/testdata/grepoutput @@ -18,6 +18,7 @@ PATTERN at the start of a line. ./testdata/grepinput:608:Check up on PATTERN near the end. ./testdata/grepinputx:3:Here is the pattern again. 
./testdata/grepinputx:5:Pattern +./testdata/grepinputx:42:This line contains pattern not on a line by itself. ---------------------------- Test 6 ------------------------------ 7:PATTERN at the start of a line. 8:In the middle of a line, PATTERN appears. @@ -25,6 +26,7 @@ PATTERN at the start of a line. 608:Check up on PATTERN near the end. 3:Here is the pattern again. 5:Pattern +42:This line contains pattern not on a line by itself. ---------------------------- Test 7 ------------------------------ ./testdata/grepinput ./testdata/grepinputx @@ -75,12 +77,13 @@ RC=1 39:nineteen 40:twenty 41: -42:This is the last line of this file. +43:This is the last line of this file. ---------------------------- Test 12 ----------------------------- Pattern ---------------------------- Test 13 ----------------------------- Here is the pattern again. That time it was on a line by itself. +This line contains pattern not on a line by itself. ---------------------------- Test 14 ----------------------------- ./testdata/grepinputx:To pat or not to pat, that is the question. ---------------------------- Test 15 ----------------------------- @@ -157,6 +160,7 @@ eighteen nineteen twenty +This line contains pattern not on a line by itself. This is the last line of this file. ---------------------------- Test 25 ----------------------------- 15- @@ -207,6 +211,7 @@ eighteen nineteen twenty +This line contains pattern not on a line by itself. This is the last line of this file. ---------------------------- Test 27 ----------------------------- four @@ -227,6 +232,7 @@ eighteen nineteen twenty +This line contains pattern not on a line by itself. This is the last line of this file. ---------------------------- Test 28 ----------------------------- 14-of lines all by themselves. @@ -279,6 +285,7 @@ eighteen nineteen twenty +This line contains pattern not on a line by itself. This is the last line of this file. 
---------------------------- Test 30 ----------------------------- ./testdata/grepinput-4-features should be added at the end, because some of the tests involve the @@ -299,6 +306,11 @@ This is the last line of this file. ./testdata/grepinputx:3:Here is the pattern again. ./testdata/grepinputx-4- ./testdata/grepinputx:5:Pattern +-- +./testdata/grepinputx-39-nineteen +./testdata/grepinputx-40-twenty +./testdata/grepinputx-41- +./testdata/grepinputx:42:This line contains pattern not on a line by itself. ---------------------------- Test 31 ----------------------------- ./testdata/grepinput:7:PATTERN at the start of a line. ./testdata/grepinput:8:In the middle of a line, PATTERN appears. @@ -317,6 +329,9 @@ This is the last line of this file. ./testdata/grepinputx-6-That time it was on a line by itself. ./testdata/grepinputx-7- ./testdata/grepinputx-8-To pat or not to pat, that is the question. +-- +./testdata/grepinputx:42:This line contains pattern not on a line by itself. +./testdata/grepinputx-43-This is the last line of this file. ---------------------------- Test 32 ----------------------------- ./testdata/grepinputx ---------------------------- Test 33 ----------------------------- @@ -336,11 +351,11 @@ aaaaa0 aaaaa2 RC=0 ======== STDERR ======== -pcregrep: pcre_exec() error -8 while matching this line: +pcregrep: pcre_exec() error -8 while matching this text: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa pcregrep: error -8 means that a resource limit was exceeded pcregrep: check your regex for nested unlimited loops -pcregrep: pcre_exec() error -8 while matching this line: +pcregrep: pcre_exec() error -8 while matching this text: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ---------------------------- Test 38 ------------------------------ This line contains a binary zero here >< for testing. @@ -388,8 +403,10 @@ PUT NEW DATA ABOVE THIS LINE. 
---------------------------- Test 49 ------------------------------ ---------------------------- Test 50 ------------------------------ over the lazy dog. +This time it jumps and jumps and jumps. ---------------------------- Test 51 ------------------------------ fox jumps +This time it jumps and jumps and jumps. ---------------------------- Test 52 ------------------------------ 36972,6 36990,4 @@ -402,3 +419,7 @@ fox jumps 596:28,4 597:15,5 597:32,4 +---------------------------- Test 54 ----------------------------- +Here is the pattern again. +That time it was on a line by itself. +This line contains pattern not on a line by itself. diff --git a/ext/pcre/pcrelib/testdata/testinput1 b/ext/pcre/pcrelib/testdata/testinput1 index 081575bc7b..8b0caa4cc9 100644 --- a/ext/pcre/pcrelib/testdata/testinput1 +++ b/ext/pcre/pcrelib/testdata/testinput1 @@ -4039,4 +4039,29 @@ /.*[op][xyz]/ fooabcfoo +/(?(?=.*b)b|^)/ + adc + abc + +/(?(?=^.*b)b|^)/ + adc + abc + +/(?(?=.*b)b|^)*/ + adc + abc + +/(?(?=.*b)b|^)+/ + adc + abc + +/(?(?=b).*b|^d)/ + abc + +/(?(?=.*b).*b|^d)/ + abc + +/^%((?(?=[a])[^%])|b)*%$/ + %ab% + / End of testinput1 / diff --git a/ext/pcre/pcrelib/testdata/testinput2 b/ext/pcre/pcrelib/testdata/testinput2 index 2b64546a8e..beca157c6a 100644 --- a/ext/pcre/pcrelib/testdata/testinput2 +++ b/ext/pcre/pcrelib/testdata/testinput2 @@ -2726,4 +2726,51 @@ a random value. 
/Ix /(abc|pqr|123){0}[xyz]/SI +/(?(?=.*b)b|^)/CI + adc + abc + +/(?(?=b).*b|^d)/I + +/(?(?=.*b).*b|^d)/I + +/a?|b?/P + abc + ** Failers + ddd\N + +/xyz/C + xyz + abcxyz + abcxyz\Y + ** Failers + abc + abc\Y + abcxypqr + abcxypqr\Y + +/^"((?(?=[a])[^"])|b)*"$/C + "ab" + +/^"((?(?=[a])[^"])|b)*"$/ + "ab" + +/^X(?5)(a)(?|(b)|(q))(c)(d)(Y)/ + XYabcdY + +/^X(?5)(a)(?|(b)|(q))(c)(d)Y/ + XYabcdY + +/^X(?&N)(a)(?|(b)|(q))(c)(d)(?Y)/ + XYabcdY + +/^X(?7)(a)(?|(b)|(q)(r)(s))(c)(d)(Y)/ + XYabcdY + +/^X(?7)(a)(?|(b|(r)(s))|(q))(c)(d)(Y)/ + XYabcdY + +/^X(?7)(a)(?|(b|(?|(r)|(t))(s))|(q))(c)(d)(Y)/ + XYabcdY + / End of testinput2 / diff --git a/ext/pcre/pcrelib/testdata/testinput5 b/ext/pcre/pcrelib/testdata/testinput5 index 7d64b436fd..38e22b8c01 100644 --- a/ext/pcre/pcrelib/testdata/testinput5 +++ b/ext/pcre/pcrelib/testdata/testinput5 @@ -480,4 +480,9 @@ can't tell the difference.) --/ /X/8f A\x{1ec5}ABCXYZ +/(*UTF8)\x{1234}/ + abcd\x{1234}pqr + +/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I + / End of testinput5 / diff --git a/ext/pcre/pcrelib/testdata/testinput6 b/ext/pcre/pcrelib/testdata/testinput6 index a8640f9920..628646d7d3 100644 --- a/ext/pcre/pcrelib/testdata/testinput6 +++ b/ext/pcre/pcrelib/testdata/testinput6 @@ -942,5 +942,13 @@ was broken in all cases./ \x{10a}\x{10b} \x{10b}\x{10b} \x{10b}\x{10a} + +/The next two tests are for property support in non-UTF-8 mode/ + +/(?:\p{Lu}|\x20)+/ + \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59 + +/[\p{Lu}\x20]+/ + \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59 / End of testinput6 / diff --git a/ext/pcre/pcrelib/testdata/testinput7 b/ext/pcre/pcrelib/testdata/testinput7 index 5ec48270b0..047a975790 100644 --- a/ext/pcre/pcrelib/testdata/testinput7 +++ b/ext/pcre/pcrelib/testdata/testinput7 @@ -2733,8 +2733,6 @@ abc abq -/ab{1,}bc/ - /ab+bc/ abbbbc @@ -4392,4 +4390,35 @@ ** Failers ab +/X$/E + X + ** Failers + X\n + +/X$/ + X + X\n + +/xyz/C + xyz + abcxyz + abcxyz\Y + ** Failers + abc + abc\Y + abcxypqr + abcxypqr\Y + +/(?C)ab/ 
+ ab + \C-ab + +/ab/C + ab + \C-ab + +/^"((?(?=[a])[^"])|b)*"$/C + "ab" + \C-"ab" + / End of testinput7 / diff --git a/ext/pcre/pcrelib/testdata/testoutput1 b/ext/pcre/pcrelib/testdata/testoutput1 index 208df49bcb..81b0cb868a 100644 --- a/ext/pcre/pcrelib/testdata/testoutput1 +++ b/ext/pcre/pcrelib/testdata/testoutput1 @@ -6609,4 +6609,41 @@ No match fooabcfoo No match +/(?(?=.*b)b|^)/ + adc + 0: + abc + 0: b + +/(?(?=^.*b)b|^)/ + adc + 0: + abc +No match + +/(?(?=.*b)b|^)*/ + adc + 0: + abc + 0: + +/(?(?=.*b)b|^)+/ + adc + 0: + abc + 0: b + +/(?(?=b).*b|^d)/ + abc + 0: b + +/(?(?=.*b).*b|^d)/ + abc + 0: ab + +/^%((?(?=[a])[^%])|b)*%$/ + %ab% + 0: %ab% + 1: + / End of testinput1 / diff --git a/ext/pcre/pcrelib/testdata/testoutput2 b/ext/pcre/pcrelib/testdata/testoutput2 index 2ac018b3b6..420e75dc40 100644 --- a/ext/pcre/pcrelib/testdata/testoutput2 +++ b/ext/pcre/pcrelib/testdata/testoutput2 @@ -9638,4 +9638,206 @@ No first char No need char Starting byte set: x y z +/(?(?=.*b)b|^)/CI +Capturing subpattern count = 0 +Partial matching not supported +Options: +No first char +No need char + adc +--->adc + +0 ^ (?(?=.*b)b|^) + +2 ^ (?=.*b) + +5 ^ .* + +7 ^ ^ b + +7 ^ ^ b + +7 ^^ b + +7 ^ b ++12 ^ ) ++13 ^ + 0: + abc +--->abc + +0 ^ (?(?=.*b)b|^) + +2 ^ (?=.*b) + +5 ^ .* + +7 ^ ^ b + +7 ^ ^ b + +7 ^^ b + +8 ^ ^ ) + +9 ^ b + +0 ^ (?(?=.*b)b|^) + +2 ^ (?=.*b) + +5 ^ .* + +7 ^ ^ b + +7 ^^ b + +7 ^ b + +8 ^^ ) + +9 ^ b ++10 ^^ | ++13 ^^ + 0: b + +/(?(?=b).*b|^d)/I +Capturing subpattern count = 0 +Partial matching not supported +No options +No first char +No need char + +/(?(?=.*b).*b|^d)/I +Capturing subpattern count = 0 +Partial matching not supported +No options +First char at start or follows newline +No need char + +/a?|b?/P + abc + 0: a + ** Failers + 0: + ddd\N +No match: POSIX code 17: match failed + +/xyz/C + xyz +--->xyz + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +3 ^ ^ + 0: xyz + abcxyz +--->abcxyz + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +3 ^ ^ + 0: xyz + abcxyz\Y +--->abcxyz + +0 ^ 
x + +0 ^ x + +0 ^ x + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +3 ^ ^ + 0: xyz + ** Failers +No match + abc +No match + abc\Y +--->abc + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x +No match + abcxypqr +No match + abcxypqr\Y +--->abcxypqr + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x +No match + +/^"((?(?=[a])[^"])|b)*"$/C + "ab" +--->"ab" + +0 ^ ^ + +1 ^ " + +2 ^^ ((?(?=[a])[^"])|b)* + +3 ^^ (?(?=[a])[^"]) + +5 ^^ (?=[a]) + +8 ^^ [a] ++11 ^ ^ ) ++12 ^^ [^"] ++16 ^ ^ ) ++17 ^ ^ | + +3 ^ ^ (?(?=[a])[^"]) + +5 ^ ^ (?=[a]) + +8 ^ ^ [a] ++21 ^ ^ " ++18 ^ ^ b ++19 ^ ^ ) + +3 ^ ^ (?(?=[a])[^"]) + +5 ^ ^ (?=[a]) + +8 ^ ^ [a] ++21 ^ ^ " ++22 ^ ^ $ ++23 ^ ^ + 0: "ab" + 1: + +/^"((?(?=[a])[^"])|b)*"$/ + "ab" + 0: "ab" + 1: + +/^X(?5)(a)(?|(b)|(q))(c)(d)(Y)/ + XYabcdY + 0: XYabcdY + 1: a + 2: b + 3: c + 4: d + 5: Y + +/^X(?5)(a)(?|(b)|(q))(c)(d)Y/ +Failed: reference to non-existent subpattern at offset 5 + +/^X(?&N)(a)(?|(b)|(q))(c)(d)(?Y)/ + XYabcdY + 0: XYabcdY + 1: a + 2: b + 3: c + 4: d + 5: Y + +/^X(?7)(a)(?|(b)|(q)(r)(s))(c)(d)(Y)/ + XYabcdY + 0: XYabcdY + 1: a + 2: b + 3: + 4: + 5: c + 6: d + 7: Y + +/^X(?7)(a)(?|(b|(r)(s))|(q))(c)(d)(Y)/ + XYabcdY + 0: XYabcdY + 1: a + 2: b + 3: + 4: + 5: c + 6: d + 7: Y + +/^X(?7)(a)(?|(b|(?|(r)|(t))(s))|(q))(c)(d)(Y)/ + XYabcdY + 0: XYabcdY + 1: a + 2: b + 3: + 4: + 5: c + 6: d + 7: Y + / End of testinput2 / diff --git a/ext/pcre/pcrelib/testdata/testoutput5 b/ext/pcre/pcrelib/testdata/testoutput5 index 9567233703..75630b4500 100644 --- a/ext/pcre/pcrelib/testdata/testoutput5 +++ b/ext/pcre/pcrelib/testdata/testoutput5 @@ -1641,4 +1641,15 @@ No match A\x{1ec5}ABCXYZ 0: X +/(*UTF8)\x{1234}/ + abcd\x{1234}pqr + 0: \x{1234} + +/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I +Capturing subpattern count = 0 +Options: bsr_unicode utf8 +Forced newline sequence: CRLF +First char = 'a' +Need char = 'b' + / End of testinput5 / diff --git a/ext/pcre/pcrelib/testdata/testoutput6 b/ext/pcre/pcrelib/testdata/testoutput6 
index caba466bc9..cb6f7d55ca 100644 --- a/ext/pcre/pcrelib/testdata/testoutput6 +++ b/ext/pcre/pcrelib/testdata/testoutput6 @@ -1746,5 +1746,15 @@ No match \x{10b}\x{10a} 0: \x{10b}\x{10a} 1: \x{10b} + +/The next two tests are for property support in non-UTF-8 mode/ + +/(?:\p{Lu}|\x20)+/ + \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59 + 0: A P\xc2T\xc9 TODAY + +/[\p{Lu}\x20]+/ + \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59 + 0: A P\xc2T\xc9 TODAY / End of testinput6 / diff --git a/ext/pcre/pcrelib/testdata/testoutput7 b/ext/pcre/pcrelib/testdata/testoutput7 index aef4b6ceb5..78c892313c 100644 --- a/ext/pcre/pcrelib/testdata/testoutput7 +++ b/ext/pcre/pcrelib/testdata/testoutput7 @@ -4573,8 +4573,6 @@ No match abq No match -/ab{1,}bc/ - /ab+bc/ abbbbc 0: abbbbc @@ -7254,4 +7252,122 @@ No match ab No match +/X$/E + X + 0: X + ** Failers +No match + X\n +No match + +/X$/ + X + 0: X + X\n + 0: X + +/xyz/C + xyz +--->xyz + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +3 ^ ^ + 0: xyz + abcxyz +--->abcxyz + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +3 ^ ^ + 0: xyz + abcxyz\Y +--->abcxyz + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +3 ^ ^ + 0: xyz + ** Failers +No match + abc +No match + abc\Y +--->abc + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x +No match + abcxypqr +No match + abcxypqr\Y +--->abcxypqr + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x + +1 ^^ y + +2 ^ ^ z + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x + +0 ^ x +No match + +/(?C)ab/ + ab +--->ab + 0 ^ a + 0: ab + \C-ab + 0: ab + +/ab/C + ab +--->ab + +0 ^ a + +1 ^^ b + +2 ^ ^ + 0: ab + \C-ab + 0: ab + +/^"((?(?=[a])[^"])|b)*"$/C + "ab" +--->"ab" + +0 ^ ^ + +1 ^ " + +2 ^^ ((?(?=[a])[^"])|b)* ++21 ^^ " + +3 ^^ (?(?=[a])[^"]) ++18 ^^ b + +5 ^^ (?=[a]) + +8 ^ [a] ++11 ^^ ) ++12 ^^ [^"] ++16 ^ ^ ) ++17 ^ ^ | ++21 ^ ^ " + +3 ^ ^ (?(?=[a])[^"]) ++18 ^ ^ b + +5 ^ ^ (?=[a]) + +8 ^ [a] ++19 ^ ^ ) ++21 ^ ^ " + +3 ^ ^ (?(?=[a])[^"]) ++18 ^ ^ b + +5 ^ ^ (?=[a]) + +8 ^ [a] ++17 ^ ^ | ++22 ^ ^ $ ++23 ^ ^ + 0: "ab" + \C-"ab" + 0: "ab" + / End of testinput7 / 
diff --git a/ext/pcre/pcrelib/ucp.h b/ext/pcre/pcrelib/ucp.h index 1968e4765c..ef62e40583 100644 --- a/ext/pcre/pcrelib/ucp.h +++ b/ext/pcre/pcrelib/ucp.h @@ -6,9 +6,8 @@ #define _UCP_H /* This file contains definitions of the property values that are returned by -the function _pcre_ucp_findprop(). New values that are added for new releases -of Unicode should always be at the end of each enum, for backwards -compatibility. */ +the UCD access macros. New values that are added for new releases of Unicode +should always be at the end of each enum, for backwards compatibility. */ /* These are the general character categories. */ -- 2.40.0