From 03937347fbe800c63af14aa4739a806697a53ed2 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 18 Oct 2019 19:00:32 -0700 Subject: [PATCH] ICU-20863 Regex, lazy creation and reduced size of map from capture group names to numbers. --- icu4c/source/i18n/regexcmp.cpp | 9 ++- icu4c/source/i18n/rematch.cpp | 2 +- icu4c/source/i18n/repattrn.cpp | 57 +++++++++------ icu4c/source/i18n/unicode/regex.h | 5 +- icu4c/source/i18n/uregex.cpp | 3 +- icu4c/source/test/intltest/regextst.cpp | 92 +++++++++++++++++++++++++ icu4c/source/test/intltest/regextst.h | 1 + 7 files changed, 143 insertions(+), 26 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index cacc069e962..7274ca9a925 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -490,6 +490,12 @@ UBool RegexCompile::doParseActions(int32_t action) // If this is a named capture group, add the name->group number mapping. if (fCaptureName != NULL) { + if (!fRXPat->initNamedCaptureMap()) { + if (U_SUCCESS(*fStatus)) { + error(fRXPat->fDeferredStatus); + } + break; + } int32_t groupNumber = fRXPat->fGroupMap->size(); int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus); fCaptureName = NULL; // hash table takes ownership of the name (key) string. @@ -1345,7 +1351,8 @@ UBool RegexCompile::doParseActions(int32_t action) case doCompleteNamedBackRef: { - int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName); + int32_t groupNumber = + fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0; if (groupNumber == 0) { // Group name has not been defined. // Could be a forward reference. If we choose to support them at some diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 6d6ea0fb5dd..5be3fa7ddca 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -429,7 +429,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest, (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9 groupName.append(nextChar); } else if (nextChar == RIGHTBRACKET) { - groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName); + groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0; if (groupNum == 0) { status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index 1386d186c4d..ee6bc6836d1 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -138,18 +138,20 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { } // Copy the named capture group hash map. - int32_t hashPos = UHASH_FIRST; - while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { - if (U_FAILURE(fDeferredStatus)) { - break; - } - const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer; - UnicodeString *key = new UnicodeString(*name); - int32_t val = hashEl->value.integer; - if (key == NULL) { - fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; - } else { - uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); + if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) { + int32_t hashPos = UHASH_FIRST; + while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { + if (U_FAILURE(fDeferredStatus)) { + break; + } + const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer; + UnicodeString *key = new UnicodeString(*name); + int32_t val = hashEl->value.integer; + if (key == NULL) { + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + } else { + uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); + } } } return *this; @@ -191,27 +193,38 @@ void RegexPattern::init() { fSets = new UVector(fDeferredStatus); fInitialChars = new UnicodeSet; fInitialChars8 = new Regex8BitSet; - fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function - uhash_compareUnicodeString, // Key comparator function - uhash_compareLong, // Value comparator function - &fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return; } if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || - fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) { + fInitialChars == NULL || fInitialChars8 == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; } // Slot zero of the vector of sets is reserved. Fill it here. fSets->addElement((int32_t)0, fDeferredStatus); +} + + +bool RegexPattern::initNamedCaptureMap() { + if (fNamedCaptureMap) { + return true; + } + fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function + uhash_compareUnicodeString, // Key comparator function + uhash_compareLong, // Value comparator function + 7, // Initial table capacity + &fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return false; + } // fNamedCaptureMap owns its key strings, type (UnicodeString *) uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject); + return true; } - //-------------------------------------------------------------------------- // // zap Delete everything owned by this RegexPattern. @@ -246,8 +259,10 @@ void RegexPattern::zap() { delete fPatternString; fPatternString = NULL; } - uhash_close(fNamedCaptureMap); - fNamedCaptureMap = NULL; + if (fNamedCaptureMap != NULL) { + uhash_close(fNamedCaptureMap); + fNamedCaptureMap = NULL; + } } @@ -618,7 +633,7 @@ int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UError // No need to explicitly check for syntactically valid names. // Invalid ones will never be in the map, and the lookup will fail. - int32_t number = uhash_geti(fNamedCaptureMap, &groupName); + int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0; if (number == 0) { status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 69fb7ecd093..6338eb7c754 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -635,8 +635,9 @@ private: // // Implementation Methods // - void init(); // Common initialization, for use by constructors. - void zap(); // Common cleanup + void init(); // Common initialization, for use by constructors. + bool initNamedCaptureMap(); // Lazy init for fNamedCaptureMap. + void zap(); // Common cleanup void dumpOp(int32_t index) const; diff --git a/icu4c/source/i18n/uregex.cpp b/icu4c/source/i18n/uregex.cpp index 57c2febe9d0..7f41918cff9 100644 --- a/icu4c/source/i18n/uregex.cpp +++ b/icu4c/source/i18n/uregex.cpp @@ -1508,7 +1508,8 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 groupName.append(c32); } else if (c32 == RIGHTBRACKET) { - groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName); + groupNum = regexp->fPat->fNamedCaptureMap ? + uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0; if (groupNum == 0) { // Name not defined by pattern. *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index b6391d2fa28..681ae1150e2 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -105,6 +105,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch TESTCASE_AUTO(TestBug13631); TESTCASE_AUTO(TestBug13632); TESTCASE_AUTO(TestBug20359); + TESTCASE_AUTO(TestBug20863); TESTCASE_AUTO_END; } @@ -5913,4 +5914,95 @@ void RegexTest::TestBug20359() { assertSuccess(WHERE, status); } + +void RegexTest::TestBug20863() { + // Test that patterns with a large number of named capture groups work correctly. + // + // The ticket was not for a bug per se, but to reduce memory usage by using lazy + // construction of the map from capture names to numbers, and decreasing the + // default size of the map. + + constexpr int GROUP_COUNT = 2000; + std::vector groupNames; + for (int32_t i=0; i.)"); + } + + UErrorCode status = U_ZERO_ERROR; + UParseError pe; + LocalPointer pattern(RegexPattern::compile(patternString, pe, status), status); + if (!assertSuccess(WHERE, status)) { + return; + } + + for (int32_t i=0; igroupNumberFromName(groupNames[i], status); + if (!assertSuccess(WHERE, status)) { + return; + } + assertEquals(WHERE, i+1, group); + // Note: group 0 is the overall match; group 1 is the first separate capture group. + } + + // Verify that assignment of patterns with various combinations of named capture work. + // Lazy creation of the internal named capture map changed the implementation logic here. + { + LocalPointer pat1(RegexPattern::compile(u"abc", pe, status), status); + LocalPointer pat2(RegexPattern::compile(u"a(?b)c", pe, status), status); + assertSuccess(WHERE, status); + assertFalse(WHERE, *pat1 == *pat2); + *pat1 = *pat2; + assertTrue(WHERE, *pat1 == *pat2); + assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status)); + assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status)); + assertSuccess(WHERE, status); + } + + { + LocalPointer pat1(RegexPattern::compile(u"abc", pe, status), status); + LocalPointer pat2(RegexPattern::compile(u"a(?b)c", pe, status), status); + assertSuccess(WHERE, status); + assertFalse(WHERE, *pat1 == *pat2); + *pat2 = *pat1; + assertTrue(WHERE, *pat1 == *pat2); + assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status)); + assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status); + status = U_ZERO_ERROR; + assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status)); + assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status); + status = U_ZERO_ERROR; + } + + { + LocalPointer pat1(RegexPattern::compile(u"a(?b)c", pe, status), status); + LocalPointer pat2(RegexPattern::compile(u"a(?b)c", pe, status), status); + assertSuccess(WHERE, status); + assertFalse(WHERE, *pat1 == *pat2); + *pat2 = *pat1; + assertTrue(WHERE, *pat1 == *pat2); + assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status)); + assertSuccess(WHERE, status); + assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status)); + assertSuccess(WHERE, status); + assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status)); + assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status); + status = U_ZERO_ERROR; + assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status)); + assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status); + status = U_ZERO_ERROR; + } + +} + + #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h index 58e9acb22c6..46494c568fc 100644 --- a/icu4c/source/test/intltest/regextst.h +++ b/icu4c/source/test/intltest/regextst.h @@ -60,6 +60,7 @@ public: virtual void TestBug13631(); virtual void TestBug13632(); virtual void TestBug20359(); + virtual void TestBug20863(); // The following functions are internal to the regexp tests. virtual void assertUText(const char *expected, UText *actual, const char *file, int line); -- 2.40.0