]> granicus.if.org Git - icu/commitdiff
ICU-20863 Regex, lazy creation and reduced size of map from capture group names to...
authorAndy Heninger <andy.heninger@gmail.com>
Sat, 19 Oct 2019 02:00:32 +0000 (19:00 -0700)
committerAndy Heninger <andy.heninger@gmail.com>
Wed, 23 Oct 2019 00:23:26 +0000 (17:23 -0700)
icu4c/source/i18n/regexcmp.cpp
icu4c/source/i18n/rematch.cpp
icu4c/source/i18n/repattrn.cpp
icu4c/source/i18n/unicode/regex.h
icu4c/source/i18n/uregex.cpp
icu4c/source/test/intltest/regextst.cpp
icu4c/source/test/intltest/regextst.h

index cacc069e962cb5d97bf9862d5433a476a4541f6e..7274ca9a9255091b4644b6a56aa44588a16684ec 100644 (file)
@@ -490,6 +490,12 @@ UBool RegexCompile::doParseActions(int32_t action)
 
             // If this is a named capture group, add the name->group number mapping.
             if (fCaptureName != NULL) {
+                if (!fRXPat->initNamedCaptureMap()) {
+                    if (U_SUCCESS(*fStatus)) {
+                        error(fRXPat->fDeferredStatus);
+                    }
+                    break;
+                }
                 int32_t groupNumber = fRXPat->fGroupMap->size();
                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
@@ -1345,7 +1351,8 @@ UBool RegexCompile::doParseActions(int32_t action)
 
     case doCompleteNamedBackRef:
         {
-        int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
+        int32_t groupNumber =
+            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
         if (groupNumber == 0) {
             // Group name has not been defined.
             //   Could be a forward reference. If we choose to support them at some
index 6d6ea0fb5dd4e9d754e6176c734b2714b566107f..5be3fa7ddca8acb461921240dc69f8f7abd538e2 100644 (file)
@@ -429,7 +429,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
                         groupName.append(nextChar);
                     } else if (nextChar == RIGHTBRACKET) {
-                        groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
+                        groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
                         if (groupNum == 0) {
                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                         }
index 1386d186c4d2051036dfe9b5116149b696974cb9..ee6bc6836d12986b07e179b3ea7c7dabcd43dc35 100644 (file)
@@ -138,18 +138,20 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
     }
 
     // Copy the named capture group hash map.
-    int32_t hashPos = UHASH_FIRST;
-    while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
-        if (U_FAILURE(fDeferredStatus)) {
-            break;
-        }
-        const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
-        UnicodeString *key = new UnicodeString(*name);
-        int32_t val = hashEl->value.integer;
-        if (key == NULL) {
-            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
-        } else {
-            uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
+    if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) {
+        int32_t hashPos = UHASH_FIRST;
+        while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
+            if (U_FAILURE(fDeferredStatus)) {
+                break;
+            }
+            const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
+            UnicodeString *key = new UnicodeString(*name);
+            int32_t val = hashEl->value.integer;
+            if (key == NULL) {
+                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+            } else {
+                uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
+            }
         }
     }
     return *this;
@@ -191,27 +193,38 @@ void RegexPattern::init() {
     fSets             = new UVector(fDeferredStatus);
     fInitialChars     = new UnicodeSet;
     fInitialChars8    = new Regex8BitSet;
-    fNamedCaptureMap  = uhash_open(uhash_hashUnicodeString,     // Key hash function
-                                   uhash_compareUnicodeString,  // Key comparator function
-                                   uhash_compareLong,           // Value comparator function
-                                   &fDeferredStatus);
     if (U_FAILURE(fDeferredStatus)) {
         return;
     }
     if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
-            fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
+            fInitialChars == NULL || fInitialChars8 == NULL) {
         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
 
     // Slot zero of the vector of sets is reserved.  Fill it here.
     fSets->addElement((int32_t)0, fDeferredStatus);
+}
+
+
+bool RegexPattern::initNamedCaptureMap() {
+    if (fNamedCaptureMap) {
+        return true;
+    }
+    fNamedCaptureMap  = uhash_openSize(uhash_hashUnicodeString,     // Key hash function
+                                       uhash_compareUnicodeString,  // Key comparator function
+                                       uhash_compareLong,           // Value comparator function
+                                       7,                           // Initial table capacity
+                                       &fDeferredStatus);
+    if (U_FAILURE(fDeferredStatus)) {
+        return false;
+    }
 
     // fNamedCaptureMap owns its key strings, type (UnicodeString *)
     uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
+    return true;
 }
 
-
 //--------------------------------------------------------------------------
 //
 //   zap            Delete everything owned by this RegexPattern.
@@ -246,8 +259,10 @@ void RegexPattern::zap() {
         delete fPatternString;
         fPatternString = NULL;
     }
-    uhash_close(fNamedCaptureMap);
-    fNamedCaptureMap = NULL;
+    if (fNamedCaptureMap != NULL) {
+        uhash_close(fNamedCaptureMap);
+        fNamedCaptureMap = NULL;
+    }
 }
 
 
@@ -618,7 +633,7 @@ int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UError
     // No need to explicitly check for syntactically valid names.
     // Invalid ones will never be in the map, and the lookup will fail.
 
-    int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
+    int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
     if (number == 0) {
         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
     }
index 69fb7ecd093ce792b50483f64d6cde611951429d..6338eb7c754eff8a097bd864a08088dd28d215b2 100644 (file)
@@ -635,8 +635,9 @@ private:
     //
     //  Implementation Methods
     //
-    void        init();            // Common initialization, for use by constructors.
-    void        zap();             // Common cleanup
+    void        init();                 // Common initialization, for use by constructors.
+    bool        initNamedCaptureMap();  // Lazy init for fNamedCaptureMap.
+    void        zap();                  // Common cleanup
 
     void        dumpOp(int32_t index) const;
 
index 57c2febe9d02bcb7f52e368897be6cd369886d7a..7f41918cff93a458f220ecf7d1332b35f61aa79a 100644 (file)
@@ -1508,7 +1508,8 @@ int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
                         (c32 >= 0x31 && c32 <= 0x39)) {       // 0..9
                     groupName.append(c32);
                 } else if (c32 == RIGHTBRACKET) {
-                    groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
+                    groupNum = regexp->fPat->fNamedCaptureMap ?
+                            uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0;
                     if (groupNum == 0) {
                         // Name not defined by pattern.
                         *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
index b6391d2fa289781b3f5241a280ea8d2e8238afe1..681ae1150e2ad02ce51f8ee867f65b3366a3fcb8 100644 (file)
@@ -105,6 +105,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
     TESTCASE_AUTO(TestBug13631);
     TESTCASE_AUTO(TestBug13632);
     TESTCASE_AUTO(TestBug20359);
+    TESTCASE_AUTO(TestBug20863);
     TESTCASE_AUTO_END;
 }
 
@@ -5913,4 +5914,95 @@ void RegexTest::TestBug20359() {
     assertSuccess(WHERE, status);
 }
 
+
+void RegexTest::TestBug20863() {
+    // Test that patterns with a large number of named capture groups work correctly.
+    //
+    // The ticket was not for a bug per se, but to reduce memory usage by using lazy
+    // construction of the map from capture names to numbers, and decreasing the
+    // default size of the map.
+
+    constexpr int GROUP_COUNT = 2000;
+    std::vector<UnicodeString> groupNames;
+    for (int32_t i=0; i<GROUP_COUNT; ++i) {
+        UnicodeString name;
+        name.append(u"name");
+        name.append(Int64ToUnicodeString(i));
+        groupNames.push_back(name);
+    }
+
+    UnicodeString patternString;
+    for (UnicodeString name: groupNames) {
+        patternString.append(u"(?<");
+        patternString.append(name);
+        patternString.append(u">.)");
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
+    if (!assertSuccess(WHERE, status)) {
+        return;
+    }
+
+    for (int32_t i=0; i<GROUP_COUNT; ++i) {
+        int32_t group = pattern->groupNumberFromName(groupNames[i], status);
+        if (!assertSuccess(WHERE, status)) {
+            return;
+        }
+        assertEquals(WHERE, i+1, group);
+        // Note: group 0 is the overall match; group 1 is the first separate capture group.
+    }
+
+    // Verify that assignment of patterns with various combinations of named capture work.
+    // Lazy creation of the internal named capture map changed the implementation logic here.
+    {
+        LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
+        LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
+        assertSuccess(WHERE, status);
+        assertFalse(WHERE, *pat1 == *pat2);
+        *pat1 = *pat2;
+        assertTrue(WHERE, *pat1 == *pat2);
+        assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
+        assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
+        assertSuccess(WHERE, status);
+    }
+
+    {
+        LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
+        LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
+        assertSuccess(WHERE, status);
+        assertFalse(WHERE, *pat1 == *pat2);
+        *pat2 = *pat1;
+        assertTrue(WHERE, *pat1 == *pat2);
+        assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
+        assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+        status = U_ZERO_ERROR;
+        assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
+        assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+        status = U_ZERO_ERROR;
+    }
+
+    {
+        LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
+        LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
+        assertSuccess(WHERE, status);
+        assertFalse(WHERE, *pat1 == *pat2);
+        *pat2 = *pat1;
+        assertTrue(WHERE, *pat1 == *pat2);
+        assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
+        assertSuccess(WHERE, status);
+        assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
+        assertSuccess(WHERE, status);
+        assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
+        assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+        status = U_ZERO_ERROR;
+        assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
+        assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+        status = U_ZERO_ERROR;
+    }
+
+}
+
+
 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
index 58e9acb22c66b3c872c11055e94c3e751fe85d16..46494c568fc12f2049f6469fc812dec39767684b 100644 (file)
@@ -60,6 +60,7 @@ public:
     virtual void TestBug13631();
     virtual void TestBug13632();
     virtual void TestBug20359();
+    virtual void TestBug20863();
 
     // The following functions are internal to the regexp tests.
     virtual void assertUText(const char *expected, UText *actual, const char *file, int line);