// If this is a named capture group, add the name->group number mapping.
if (fCaptureName != NULL) {
+ if (!fRXPat->initNamedCaptureMap()) {
+ if (U_SUCCESS(*fStatus)) {
+ error(fRXPat->fDeferredStatus);
+ }
+ break;
+ }
int32_t groupNumber = fRXPat->fGroupMap->size();
int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
fCaptureName = NULL; // hash table takes ownership of the name (key) string.
case doCompleteNamedBackRef:
{
- int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
+ int32_t groupNumber =
+ fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
if (groupNumber == 0) {
// Group name has not been defined.
// Could be a forward reference. If we choose to support them at some
(nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
groupName.append(nextChar);
} else if (nextChar == RIGHTBRACKET) {
- groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
+ groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
if (groupNum == 0) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
}
// Copy the named capture group hash map.
- int32_t hashPos = UHASH_FIRST;
- while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
- if (U_FAILURE(fDeferredStatus)) {
- break;
- }
- const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
- UnicodeString *key = new UnicodeString(*name);
- int32_t val = hashEl->value.integer;
- if (key == NULL) {
- fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
- } else {
- uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
+ if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) {
+ int32_t hashPos = UHASH_FIRST;
+ while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
+ if (U_FAILURE(fDeferredStatus)) {
+ break;
+ }
+ const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
+ UnicodeString *key = new UnicodeString(*name);
+ int32_t val = hashEl->value.integer;
+ if (key == NULL) {
+ fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+ } else {
+ uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
+ }
}
}
return *this;
fSets = new UVector(fDeferredStatus);
fInitialChars = new UnicodeSet;
fInitialChars8 = new Regex8BitSet;
- fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function
- uhash_compareUnicodeString, // Key comparator function
- uhash_compareLong, // Value comparator function
- &fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return;
}
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
- fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
+ fInitialChars == NULL || fInitialChars8 == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
// Slot zero of the vector of sets is reserved. Fill it here.
fSets->addElement((int32_t)0, fDeferredStatus);
+}
+
+
+bool RegexPattern::initNamedCaptureMap() {
+ if (fNamedCaptureMap) {
+ return true;
+ }
+ fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function
+ uhash_compareUnicodeString, // Key comparator function
+ uhash_compareLong, // Value comparator function
+ 7, // Initial table capacity
+ &fDeferredStatus);
+ if (U_FAILURE(fDeferredStatus)) {
+ return false;
+ }
// fNamedCaptureMap owns its key strings, type (UnicodeString *)
uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
+ return true;
}
-
//--------------------------------------------------------------------------
//
// zap Delete everything owned by this RegexPattern.
delete fPatternString;
fPatternString = NULL;
}
- uhash_close(fNamedCaptureMap);
- fNamedCaptureMap = NULL;
+ if (fNamedCaptureMap != NULL) {
+ uhash_close(fNamedCaptureMap);
+ fNamedCaptureMap = NULL;
+ }
}
// No need to explicitly check for syntactically valid names.
// Invalid ones will never be in the map, and the lookup will fail.
- int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
+ int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
if (number == 0) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
//
// Implementation Methods
//
- void init(); // Common initialization, for use by constructors.
- void zap(); // Common cleanup
+ void init(); // Common initialization, for use by constructors.
+ bool initNamedCaptureMap(); // Lazy init for fNamedCaptureMap.
+ void zap(); // Common cleanup
void dumpOp(int32_t index) const;
(c32 >= 0x31 && c32 <= 0x39)) { // 0..9
groupName.append(c32);
} else if (c32 == RIGHTBRACKET) {
- groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
+ groupNum = regexp->fPat->fNamedCaptureMap ?
+ uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0;
if (groupNum == 0) {
// Name not defined by pattern.
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
TESTCASE_AUTO(TestBug13631);
TESTCASE_AUTO(TestBug13632);
TESTCASE_AUTO(TestBug20359);
+ TESTCASE_AUTO(TestBug20863);
TESTCASE_AUTO_END;
}
assertSuccess(WHERE, status);
}
+
+void RegexTest::TestBug20863() {
+ // Test that patterns with a large number of named capture groups work correctly.
+ //
+ // The ticket was not for a bug per se, but to reduce memory usage by using lazy
+ // construction of the map from capture names to numbers, and decreasing the
+ // default size of the map.
+
+ constexpr int GROUP_COUNT = 2000;
+ std::vector<UnicodeString> groupNames;
+ for (int32_t i=0; i<GROUP_COUNT; ++i) {
+ UnicodeString name;
+ name.append(u"name");
+ name.append(Int64ToUnicodeString(i));
+ groupNames.push_back(name);
+ }
+
+ UnicodeString patternString;
+ for (UnicodeString name: groupNames) {
+ patternString.append(u"(?<");
+ patternString.append(name);
+ patternString.append(u">.)");
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ UParseError pe;
+ LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
+ if (!assertSuccess(WHERE, status)) {
+ return;
+ }
+
+ for (int32_t i=0; i<GROUP_COUNT; ++i) {
+ int32_t group = pattern->groupNumberFromName(groupNames[i], status);
+ if (!assertSuccess(WHERE, status)) {
+ return;
+ }
+ assertEquals(WHERE, i+1, group);
+ // Note: group 0 is the overall match; group 1 is the first separate capture group.
+ }
+
+ // Verify that assignment of patterns with various combinations of named capture work.
+ // Lazy creation of the internal named capture map changed the implementation logic here.
+ {
+ LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
+ LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
+ assertSuccess(WHERE, status);
+ assertFalse(WHERE, *pat1 == *pat2);
+ *pat1 = *pat2;
+ assertTrue(WHERE, *pat1 == *pat2);
+ assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
+ assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
+ assertSuccess(WHERE, status);
+ }
+
+ {
+ LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
+ LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
+ assertSuccess(WHERE, status);
+ assertFalse(WHERE, *pat1 == *pat2);
+ *pat2 = *pat1;
+ assertTrue(WHERE, *pat1 == *pat2);
+ assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
+ assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+ status = U_ZERO_ERROR;
+ assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
+ assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+ status = U_ZERO_ERROR;
+ }
+
+ {
+ LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
+ LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
+ assertSuccess(WHERE, status);
+ assertFalse(WHERE, *pat1 == *pat2);
+ *pat2 = *pat1;
+ assertTrue(WHERE, *pat1 == *pat2);
+ assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
+ assertSuccess(WHERE, status);
+ assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
+ assertSuccess(WHERE, status);
+ assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
+ assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+ status = U_ZERO_ERROR;
+ assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
+ assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+ status = U_ZERO_ERROR;
+ }
+
+}
+
+
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
virtual void TestBug13631();
virtual void TestBug13632();
virtual void TestBug20359();
+ virtual void TestBug20863();
// The following functions are internal to the regexp tests.
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);