From: Andy Heninger Date: Fri, 27 Apr 2012 21:29:34 +0000 (+0000) Subject: ICU-9283 fix for look-behind assertions w/ case insensitive matching. X-Git-Tag: milestone-59-0-1~3842 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b916fcb50b04ee230f6fe47711d7581af6a31e5c;p=icu ICU-9283 fix for look-behind assertions w/ case insensitive matching. X-SVN-Rev: 31782 --- diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 5a15ab6b9dd..146d73cfafe 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -3306,10 +3306,31 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { } case URX_STRING_I: - // TODO: Is the case-folded string the longest? - // If so we can optimize this the same as URX_STRING. - loc++; - currentLen = INT32_MAX; + // TODO: This code assumes that any user string that matches will be no longer + // than our compiled string, with case insensitive matching. + // Our compiled string has been case-folded already. + // + // Any matching user string will have no more code points than our + // compiled (folded) string. Folding may add code points, but + // not remove them. + // + // There is a potential problem if a supplemental code point + // case-folds to a BMP code point. In this case our compiled string + // could be shorter (in code units) than a matching user string. + // + // At this time (Unicode 6.1) there are no such characters, and this case + // is not being handled. A test, intltest regex/Bug9283, will fail if + // any problematic characters are added to Unicode. + // + // If this happens, we can make a set of the BMP chars that the + // troublesome supplementals fold to, scan our string, and bump the + // currentLen one extra for each that is found. 
+ // + { + loc++; + int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); + currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); + } break; case URX_CTR_INIT: diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 09f205aad99..1e197a90e45 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -26,6 +26,7 @@ #include "unicode/regex.h" #include "unicode/uchar.h" #include "unicode/ucnv.h" +#include "unicode/uniset.h" #include "unicode/ustring.h" #include "regextst.h" #include "uvector.h" @@ -127,6 +128,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch case 20: name = "CheckInvBufSize"; if (exec) CheckInvBufSize(); break; + case 21: name = "Bug 9283"; + if (exec) Bug9283(); + break; default: name = ""; break; //needed to end loop @@ -5184,6 +5188,34 @@ void RegexTest::Bug7029() { delete pMatcher; } +// Bug 9283 +// This test is checking for the existence of any supplemental characters that case-fold +// to a BMP character. +// +// At the time of this writing there are none. If any should appear in a subsequent release +// of Unicode, the code in regular expressions compilation that determines the longest +// possible match for a literal string will need to be enhanced. +// +// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength() +// for details on what to do in case of a failure of this test. 
+// +void RegexTest::Bug9283() { + UErrorCode status = U_ZERO_ERROR; + UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status); + REGEX_CHECK_STATUS; + int32_t index; + UChar32 c; + for (index=0; ; index++) { + c = supplementalsWithCaseFolding.charAt(index); + if (c == -1) { + break; + } + UnicodeString cf = UnicodeString(c).foldCase(); + REGEX_ASSERT(cf.length() >= 2); + } +} + + void RegexTest::CheckInvBufSize() { if(inv_next>=INV_BUFSIZ) { errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h index 77742fb6b15..22a77eaa2e3 100644 --- a/icu4c/source/test/intltest/regextst.h +++ b/icu4c/source/test/intltest/regextst.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2011, International Business Machines Corporation and + * Copyright (c) 2002-2012, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -45,6 +45,7 @@ public: virtual void Bug7740(); virtual void Bug8479(); virtual void Bug7029(); + virtual void Bug9283(); virtual void CheckInvBufSize(); // The following functions are internal to the regexp tests. diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 21ecae9377d..53bd73a7ef3 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -1141,6 +1141,11 @@ "[\w]+" " <0>abc\u200cdef\u200dghi " "[\w]+" i " <0>abc\u200cdef\u200dghi " +# Bug 9283 +# uregex_open fails for look-behind assertion + case-insensitive + +"(ab)?(?<=ab)cd|ef" i "<0><1>abcd" + # Random debugging, Temporary # #"^(?:a?b?)*$" "a--"