return FALSE;
}
} else {
- // For now, let the matcher discover that it can't match on its own
- // We don't know how long the match len is in native characters
- testStartLimit = fActiveLimit;
+ // We don't know exactly how long the minimum match length is in native characters.
+ // Treat anything > 0 as 1.
+ testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
}
UChar32 c;
{
// Match may start on any char from a pre-computed set.
U_ASSERT(fPattern->fMinMatchLen > 0);
- int64_t pos;
UTEXT_SETNATIVEINDEX(fInputText, startPos);
for (;;) {
+ int64_t pos = startPos;
c = UTEXT_NEXT32(fInputText);
- pos = UTEXT_GETNATIVEINDEX(fInputText);
+ startPos = UTEXT_GETNATIVEINDEX(fInputText);
// c will be -1 (U_SENTINEL) at end of text, in which case we
// skip this next block (so we don't have a negative array index)
// and handle end of text in the following block.
if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
(c>=256 && fPattern->fInitialChars->contains(c)))) {
- MatchAt(startPos, FALSE, fDeferredStatus);
+ MatchAt(pos, FALSE, fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return FALSE;
}
}
UTEXT_SETNATIVEINDEX(fInputText, pos);
}
- if (startPos >= testStartLimit) {
+ if (startPos > testStartLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
- startPos = pos;
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
// Match starts on exactly one char.
U_ASSERT(fPattern->fMinMatchLen > 0);
UChar32 theChar = fPattern->fInitialChar;
- int64_t pos;
UTEXT_SETNATIVEINDEX(fInputText, startPos);
for (;;) {
+ int64_t pos = startPos;
c = UTEXT_NEXT32(fInputText);
- pos = UTEXT_GETNATIVEINDEX(fInputText);
+ startPos = UTEXT_GETNATIVEINDEX(fInputText);
if (c == theChar) {
- MatchAt(startPos, FALSE, fDeferredStatus);
+ MatchAt(pos, FALSE, fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return FALSE;
}
}
UTEXT_SETNATIVEINDEX(fInputText, pos);
}
- if (startPos >= testStartLimit) {
+ if (startPos > testStartLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
- startPos = pos;
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
// the minimum length match would extend past the end of the input.
// Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
// Be aware of possible overflows if making changes here.
+ // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
if (startPos > testLen) {
fMatch = FALSE;
return TRUE;
}
}
- if (pos >= testLen) {
+ if (startPos > testLen) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
return FALSE;
}
}
- U_ASSERT(FALSE);
+ U_ASSERT(FALSE);
case START_LINE:
{
// To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
// detect the bad read.
- UnicodeString patternString("A|B|C");
- UnicodeString txtString = UnicodeString("a string \\ud800\\udc00").unescape();
- UChar *exactBuffer = new UChar[txtString.length()];
+ TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
+ TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
+
+ // Test again with a pattern starting with a single character,
+ // which takes a different code path than starting with an OR expression,
+ // but with similar logic.
+ TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
+ TestCase11049("C", "string matches at end C", TRUE, __LINE__);
+}
+
+// Run a single test case from TestBug11049(). Internal function.
+void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
UErrorCode status = U_ZERO_ERROR;
- txtString.extract(exactBuffer, txtString.length(), status);
- UText *ut = utext_openUChars(NULL, exactBuffer, txtString.length(), &status);
+ UnicodeString patternString = UnicodeString(pattern).unescape();
+ LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
- LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, 0, status));
+ UnicodeString dataString = UnicodeString(data).unescape();
+ UChar *exactBuffer = new UChar[dataString.length()];
+ dataString.extract(exactBuffer, dataString.length(), status);
+ UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
+
+ LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
REGEX_CHECK_STATUS;
- LocalPointer<RegexMatcher> matcher(pattern->matcher(status));
matcher->reset(ut);
- REGEX_CHECK_STATUS;
UBool result = matcher->find();
- REGEX_ASSERT(result == FALSE);
-
- // Verify that match starting on the last char in input will be found.
- txtString = UnicodeString("string matches at end C");
- matcher->reset(txtString);
- result = matcher->find();
- REGEX_ASSERT(result == TRUE);
+ if (result != expectMatch) {
+ errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
+ __FILE__, lineNumber, expectMatch, result, pattern, data);
+ }
- // Put an unpaired surrogate at the end of the input text,
- // let valgrind verify that find() doesn't look off the end.
- txtString = UnicodeString("a string \\ud800").unescape();
- delete [] exactBuffer;
- exactBuffer = new UChar[txtString.length()];
- txtString.extract(exactBuffer, txtString.length(), status);
- utext_openUChars(ut, exactBuffer, txtString.length(), &status);
+ // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
+ // off-by-one on find() with match at the last code point.
+ // The equivalent UTF-8 will be no longer than the original char * data (invariant charset),
+ // because string.unescape() will only shrink it, so a buffer of strlen(data)+1 bytes suffices.
+ char * utf8Buffer = new char[uprv_strlen(data)+1];
+ u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
+ REGEX_CHECK_STATUS;
+ ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
+ REGEX_CHECK_STATUS;
matcher->reset(ut);
result = matcher->find();
- REGEX_ASSERT(result == FALSE);
- REGEX_CHECK_STATUS;
+ if (result != expectMatch) {
+ errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
+ __FILE__, lineNumber, expectMatch, result, pattern, data);
+ }
+ delete [] utf8Buffer;
utext_close(ut);
delete [] exactBuffer;
}
+
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */