static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
+ static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
+ UBool escapeUnprintable);
+
//----------------------------------------------------------------
// Implementation: Fundamental operators
//----------------------------------------------------------------
* Append the <code>toPattern()</code> representation of a
* string to the given <code>StringBuffer</code>.
*/
-void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
-escapeUnprintable) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
UChar32 cp;
for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
*/
-void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
-escapeUnprintable) {
- if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
+ if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
// unprintable
- if (ICU_Utility::escapeUnprintable(buf, c)) {
- return;
- }
+ ICU_Utility::escape(buf, c);
+ return;
}
// Okay to let ':' pass through
switch (c) {
buf.append(c);
}
+void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
+ UBool escapeUnprintable) {
+ _appendToPat(result, start, escapeUnprintable);
+ if (start != end) {
+ if ((start+1) != end ||
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ start == 0xdbff) {
+ result.append(u'-');
+ }
+ _appendToPat(result, end, escapeUnprintable);
+ }
+}
+
/**
* Append a string representation of this set to result. This will be
* a cleaned version of the string passed to applyPattern(), if there
for (i=0; i<patLen; ) {
UChar32 c;
U16_NEXT(pat, i, patLen, c);
- if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+ if (escapeUnprintable ?
+ ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped.
// Before unescaping it, we delete the final
if ((backslashCount % 2) == 1) {
result.truncate(result.length() - 1);
}
- ICU_Utility::escapeUnprintable(result, c);
+ ICU_Utility::escape(result, c);
backslashCount = 0;
} else {
result.append(c);
{
result.append(u'[');
-// // Check against the predefined categories. We implicitly build
-// // up ALL category sets the first time toPattern() is called.
-// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
-// if (*this == getCategorySet(cat)) {
-// result.append(u':');
-// result.append(CATEGORY_NAMES, cat*2, 2);
-// return result.append(CATEGORY_CLOSE);
-// }
-// }
-
- int32_t count = getRangeCount();
+ int32_t i = 0;
+ int32_t limit = len & ~1; // = 2 * getRangeCount()
// If the set contains at least 2 intervals and includes both
// MIN_VALUE and MAX_VALUE, then the inverse representation will
// be more economical.
- if (count > 1 &&
- getRangeStart(0) == MIN_VALUE &&
- getRangeEnd(count-1) == MAX_VALUE) {
-
+ // if (getRangeCount() >= 2 &&
+ // getRangeStart(0) == MIN_VALUE &&
+ // getRangeEnd(last) == MAX_VALUE)
+ // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
+ // If limit == len then len is even and the last range ends with MAX_VALUE.
+ if (len >= 4 && list[0] == 0 && limit == len) {
// Emit the inverse
result.append(u'^');
-
- for (int32_t i = 1; i < count; ++i) {
- UChar32 start = getRangeEnd(i-1)+1;
- UChar32 end = getRangeStart(i)-1;
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append(u'-');
- }
- _appendToPat(result, end, escapeUnprintable);
+ // Offsetting the inversion list index by one lets us
+ // iterate over the ranges of the set complement.
+ i = 1;
+ --limit;
+ }
+
+ // Emit the ranges as pairs.
+ while (i < limit) {
+ UChar32 start = list[i]; // getRangeStart()
+ UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
+ if (!(0xd800 <= end && end <= 0xdbff)) {
+ _appendToPat(result, start, end, escapeUnprintable);
+ i += 2;
+ } else {
+ // The range ends with a lead surrogate.
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ // 1. Postpone ranges that start with a lead surrogate code point.
+ int32_t firstLead = i;
+ while ((i += 2) < limit && list[i] <= 0xdbff) {}
+ int32_t firstAfterLead = i;
+ // 2. Write following ranges that start with a trail surrogate code point.
+ while (i < limit && (start = list[i]) <= 0xdfff) {
+ _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
+ i += 2;
}
- }
- }
-
- // Default; emit the ranges as pairs
- else {
- for (int32_t i = 0; i < count; ++i) {
- UChar32 start = getRangeStart(i);
- UChar32 end = getRangeEnd(i);
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append(u'-');
- }
- _appendToPat(result, end, escapeUnprintable);
+ // 3. Now write the postponed ranges.
+ for (int j = firstLead; j < firstAfterLead; j += 2) {
+ _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
}
}
}
return result;
}
-/**
- * Return true if the character is NOT printable ASCII.
- */
UBool ICU_Utility::isUnprintable(UChar32 c) {
return !(c >= 0x20 && c <= 0x7E);
}
-/**
- * Escape unprintable characters using \uxxxx notation for U+0000 to
- * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
- * printable ASCII, then do nothing and return FALSE. Otherwise,
- * append the escaped notation and return TRUE.
- */
+UBool ICU_Utility::shouldAlwaysBeEscaped(UChar32 c) {
+ if (c < 0x20) {
+ return true; // C0 control codes
+ } else if (c <= 0x7e) {
+ return false; // printable ASCII
+ } else if (c <= 0x9f) {
+ return true; // C1 control codes
+ } else if (c < 0xd800) {
+ return false; // most of the BMP
+ } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
+ return true; // surrogate or noncharacter code points
+ } else if (c <= 0x10ffff) {
+ return false; // all else
+ } else {
+ return true; // not a code point
+ }
+}
+
UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
if (isUnprintable(c)) {
- result.append(BACKSLASH);
- if (c & ~0xFFFF) {
- result.append(UPPER_U);
- result.append(DIGITS[0xF&(c>>28)]);
- result.append(DIGITS[0xF&(c>>24)]);
- result.append(DIGITS[0xF&(c>>20)]);
- result.append(DIGITS[0xF&(c>>16)]);
- } else {
- result.append(LOWER_U);
- }
- result.append(DIGITS[0xF&(c>>12)]);
- result.append(DIGITS[0xF&(c>>8)]);
- result.append(DIGITS[0xF&(c>>4)]);
- result.append(DIGITS[0xF&c]);
- return TRUE;
+ escape(result, c);
+ return true;
+ }
+ return false;
+}
+
+UnicodeString &ICU_Utility::escape(UnicodeString& result, UChar32 c) {
+ result.append(BACKSLASH);
+ if (c & ~0xFFFF) {
+ result.append(UPPER_U);
+ result.append(DIGITS[0xF&(c>>28)]);
+ result.append(DIGITS[0xF&(c>>24)]);
+ result.append(DIGITS[0xF&(c>>20)]);
+ result.append(DIGITS[0xF&(c>>16)]);
+ } else {
+ result.append(LOWER_U);
}
- return FALSE;
+ result.append(DIGITS[0xF&(c>>12)]);
+ result.append(DIGITS[0xF&(c>>8)]);
+ result.append(DIGITS[0xF&(c>>4)]);
+ result.append(DIGITS[0xF&c]);
+ return result;
}
/**
/**
* Return true if the character is NOT printable ASCII.
- *
- * This method should really be in UnicodeString (or similar). For
- * now, we implement it here and share it with friend classes.
+ * The tab, newline and linefeed characters are considered unprintable.
*/
static UBool isUnprintable(UChar32 c);
/**
- * Escape unprintable characters using \uxxxx notation for U+0000 to
+ * @return true for control codes and for surrogate and noncharacter code points
+ */
+ static UBool shouldAlwaysBeEscaped(UChar32 c);
+
+ /**
+ * Escapes one unprintable code point using \uxxxx notation for U+0000 to
* U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
* printable ASCII, then do nothing and return false. Otherwise,
* append the escaped notation and return true.
*/
static UBool escapeUnprintable(UnicodeString& result, UChar32 c);
+ /**
+ * Escapes one code point using \uxxxx notation
+ * for U+0000 to U+FFFF and \Uxxxxxxxx for U+10000 and above.
+ * @return result
+ */
+ static UnicodeString &escape(UnicodeString& result, UChar32 c);
+
/**
* Returns the index of a character, ignoring quoted text.
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
TESTCASE_AUTO(TestFreezable);
TESTCASE_AUTO(TestSpan);
TESTCASE_AUTO(TestStringSpan);
- TESTCASE_AUTO(TestUCAUnsafeBackwards);
+ TESTCASE_AUTO(TestPatternWithSurrogates);
TESTCASE_AUTO(TestIntOverflow);
TESTCASE_AUTO(TestUnusedCcc);
TESTCASE_AUTO(TestDeepPattern);
}
}
-/**
- * Including collationroot.h fails here with
-1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
- * .. so, we skip this test on Windows.
- *
- * the cause is that intltest builds with /Za which disables language extensions - which means
- * windows header files can't be used.
- */
-#if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
-#include "collationroot.h"
-#include "collationtailoring.h"
-#endif
-
-void UnicodeSetTest::TestUCAUnsafeBackwards() {
-#if U_PLATFORM_HAS_WIN32_API
- infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
-#elif !UCONFIG_NO_COLLATION
- UErrorCode errorCode = U_ZERO_ERROR;
-
- // Get the unsafeBackwardsSet
- const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
- if(U_FAILURE(errorCode)) {
- dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
- return;
- }
- //const UVersionInfo &version = rootEntry->tailoring->version;
- const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
-
- checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
-
- if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
- // simple test case
- // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
- // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
- UnicodeSet surrogates;
- surrogates.add(0xd83a); // a lead surrogate
- surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
- UnicodeString pat;
- surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
- // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
- // so that at least one type of surrogate code points are escaped,
- // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
- errorCode = U_ZERO_ERROR;
- UnicodeSet s2;
- s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
- if(U_FAILURE(errorCode)) {
- errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
- } else {
- checkEqual(surrogates, s2, "surrogates to/from pattern");
- }
- // This occurs in the UCA unsafe-backwards set.
- checkRoundTrip(*unsafeBackwardSet);
- }
-#endif
+void UnicodeSetTest::TestPatternWithSurrogates() {
+ IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
+ // Regression test for ICU-11891
+ UnicodeSet surrogates;
+ surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
+ surrogates.add(0xd83a); // a lead surrogate
+ surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
+ UnicodeString pat;
+ surrogates.toPattern(pat, false); // bad if U+D83A is immediately followed by U+DC00
+ UnicodeSet s2;
+ // was: U_MALFORMED_SET
+ // Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
+ s2.applyPattern(pat, errorCode);
+ if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
+ checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
+
+ // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
+ surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
+ // add a beyond-surrogates range, up to the last code point
+ surrogates.add(0x10affe, 0x10ffff);
+ surrogates.toPattern(pat, false); // bad if U+DBFF is immediately followed by U+DC00
+ s2.applyPattern(pat, errorCode);
+ if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
+ checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
+
+ // Test the toPattern() code path when the pattern is shorter in complement form:
+ // [^opposite-ranges]
+ surrogates.add(0, 0x6789);
+ surrogates.toPattern(pat, false);
+ s2.applyPattern(pat, errorCode);
+ if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
+ checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
+
+ // Start with a pattern, in case the original pattern is kept but
+ // without the extra white space.
+ surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
+ if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
+ surrogates.toPattern(pat, false);
+ s2.applyPattern(pat, errorCode);
+ if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
+ checkEqual(surrogates, s2, "surrogates from/to/from pattern");
}
void UnicodeSetTest::TestIntOverflow() {
void TestStringSpan();
- void TestUCAUnsafeBackwards();
+ void TestPatternWithSurrogates();
void TestIntOverflow();
void TestUnusedCcc();
void TestDeepPattern();
}
/**
- * Escape unprintable characters using <backslash>uxxxx notation
+ * @return true for control codes and for surrogate and noncharacter code points
+ */
+ public static boolean shouldAlwaysBeEscaped(int c) {
+ if (c < 0x20) {
+ return true; // C0 control codes
+ } else if (c <= 0x7e) {
+ return false; // printable ASCII
+ } else if (c <= 0x9f) {
+ return true; // C1 control codes
+ } else if (c < 0xd800) {
+ return false; // most of the BMP
+ } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
+ return true; // surrogate or noncharacter code points
+ } else if (c <= 0x10ffff) {
+ return false; // all else
+ } else {
+ return true; // not a code point
+ }
+ }
+
+ /**
+ * Escapes one unprintable code point using <backslash>uxxxx notation
* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
* above. If the character is printable ASCII, then do nothing
* and return FALSE. Otherwise, append the escaped notation and
* return TRUE.
*/
public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
+ if (isUnprintable(c)) {
+ escape(result, c);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Escapes one code point using <backslash>uxxxx notation
+ * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and above.
+ * @return result
+ */
+ public static <T extends Appendable> T escape(T result, int c) {
try {
- if (isUnprintable(c)) {
- result.append('\\');
- if ((c & ~0xFFFF) != 0) {
- result.append('U');
- result.append(DIGITS[0xF&(c>>28)]);
- result.append(DIGITS[0xF&(c>>24)]);
- result.append(DIGITS[0xF&(c>>20)]);
- result.append(DIGITS[0xF&(c>>16)]);
- } else {
- result.append('u');
- }
- result.append(DIGITS[0xF&(c>>12)]);
- result.append(DIGITS[0xF&(c>>8)]);
- result.append(DIGITS[0xF&(c>>4)]);
- result.append(DIGITS[0xF&c]);
- return true;
+ result.append('\\');
+ if ((c & ~0xFFFF) != 0) {
+ result.append('U');
+ result.append(DIGITS[0xF&(c>>28)]);
+ result.append(DIGITS[0xF&(c>>24)]);
+ result.append(DIGITS[0xF&(c>>20)]);
+ result.append(DIGITS[0xF&(c>>16)]);
+ } else {
+ result.append('u');
}
- return false;
+ result.append(DIGITS[0xF&(c>>12)]);
+ result.append(DIGITS[0xF&(c>>8)]);
+ result.append(DIGITS[0xF&(c>>4)]);
+ result.append(DIGITS[0xF&c]);
+ return result;
} catch (IOException e) {
- throw new IllegalIcuArgumentException(e);
+ throw new ICUUncheckedIOException(e);
}
}
*/
private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
try {
- if (escapeUnprintable && Utility.isUnprintable(c)) {
+ if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) {
// Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
// unprintable
- if (Utility.escapeUnprintable(buf, c)) {
- return buf;
- }
+ return Utility.escape(buf, c);
}
// Okay to let ':' pass through
switch (c) {
}
}
+ private static <T extends Appendable> T _appendToPat(
+ T result, int start, int end, boolean escapeUnprintable) {
+ _appendToPat(result, start, escapeUnprintable);
+ if (start != end) {
+ if ((start+1) != end ||
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ start == 0xdbff) {
+ try {
+ result.append('-');
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
+ }
+ _appendToPat(result, end, escapeUnprintable);
+ }
+ return result;
+ }
+
/**
* Returns a string representation of this set. If the result of
* calling this function is passed to a UnicodeSet constructor, it
}
try {
if (!escapeUnprintable) {
+ // TODO: The C++ version does not have this shortcut, and instead
+ // always cleans up the pattern string,
+ // which also escapes Utility.shouldAlwaysBeEscaped(c).
+ // We should sync these implementations.
result.append(pat);
return result;
}
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped
// and we omit the last backslash.
- Utility.escapeUnprintable(result, c);
+ Utility.escape(result, c);
oddNumberOfBackslashes = false;
} else if (!oddNumberOfBackslashes && c == '\\') {
// Temporarily withhold an odd-numbered backslash.
* Generate and append a string representation of this set to result.
* This does not use this.pat, the cleaned up copy of the string
* passed to applyPattern().
+ *
* @param result the buffer into which to generate the pattern
* @param escapeUnprintable escape unprintable characters if true
* @stable ICU 2.0
* Generate and append a string representation of this set to result.
* This does not use this.pat, the cleaned up copy of the string
* passed to applyPattern().
+ *
+ * @param result the buffer into which to generate the pattern
+ * @param escapeUnprintable escape unprintable characters if true
* @param includeStrings if false, doesn't include the strings.
* @stable ICU 3.8
*/
return appendNewPattern(result, escapeUnprintable, includeStrings);
}
+ // Implementation of public _generatePattern().
+ // Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer.
private <T extends Appendable> T appendNewPattern(
T result, boolean escapeUnprintable, boolean includeStrings) {
try {
result.append('[');
- int count = getRangeCount();
+ int i = 0;
+ int limit = len & ~1; // = 2 * getRangeCount()
// If the set contains at least 2 intervals and includes both
// MIN_VALUE and MAX_VALUE, then the inverse representation will
// be more economical.
- if (count > 1 &&
- getRangeStart(0) == MIN_VALUE &&
- getRangeEnd(count-1) == MAX_VALUE) {
-
+ // if (getRangeCount() >= 2 &&
+ // getRangeStart(0) == MIN_VALUE &&
+ // getRangeEnd(last) == MAX_VALUE)
+ // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
+ // If limit == len then len is even and the last range ends with MAX_VALUE.
+ if (len >= 4 && list[0] == 0 && limit == len) {
// Emit the inverse
result.append('^');
-
- for (int i = 1; i < count; ++i) {
- int start = getRangeEnd(i-1)+1;
- int end = getRangeStart(i)-1;
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append('-');
- }
- _appendToPat(result, end, escapeUnprintable);
- }
- }
+ // Offsetting the inversion list index by one lets us
+ // iterate over the ranges of the set complement.
+ i = 1;
+ --limit;
}
- // Default; emit the ranges as pairs
- else {
- for (int i = 0; i < count; ++i) {
- int start = getRangeStart(i);
- int end = getRangeEnd(i);
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append('-');
- }
- _appendToPat(result, end, escapeUnprintable);
+ // Emit the ranges as pairs.
+ while (i < limit) {
+ int start = list[i]; // getRangeStart()
+ int end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
+ if (!(0xd800 <= end && end <= 0xdbff)) {
+ _appendToPat(result, start, end, escapeUnprintable);
+ i += 2;
+ } else {
+ // The range ends with a lead surrogate.
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ // 1. Postpone ranges that start with a lead surrogate code point.
+ int firstLead = i;
+ while ((i += 2) < limit && list[i] <= 0xdbff) {}
+ int firstAfterLead = i;
+ // 2. Write following ranges that start with a trail surrogate code point.
+ while (i < limit && (start = list[i]) <= 0xdfff) {
+ _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
+ i += 2;
+ }
+ // 3. Now write the postponed ranges.
+ for (int j = firstLead; j < firstAfterLead; j += 2) {
+ _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
}
}
}
UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0);
}
+ @Test
+ public void TestPatternWithSurrogates() {
+ // Regression test for ICU-11891
+ UnicodeSet surrogates = new UnicodeSet();
+ surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
+ surrogates.add(0xd83a); // a lead surrogate
+ surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
+ String pat = surrogates.toPattern(false); // bad if U+D83A is immediately followed by U+DC00
+ UnicodeSet s2 = new UnicodeSet();
+ // was: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
+ s2.applyPattern(pat);
+ checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
+
+ // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
+ surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
+ // add a beyond-surrogates range, up to the last code point
+ surrogates.add(0x10affe, 0x10ffff);
+ pat = surrogates.toPattern(false); // bad if U+DBFF is immediately followed by U+DC00
+ s2.applyPattern(pat);
+ checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
+
+ // Test the toPattern() code path when the pattern is shorter in complement form:
+ // [^opposite-ranges]
+ surrogates.add(0, 0x6789);
+ pat = surrogates.toPattern(false);
+ s2.applyPattern(pat);
+ checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
+
+ // Start with a pattern, in case the original pattern is kept but
+ // without the extra white space.
+ surrogates.applyPattern("[\\uD83A \\uDC00-\\uDFFF]");
+ pat = surrogates.toPattern(false);
+ s2.applyPattern(pat);
+ checkEqual(surrogates, s2, "surrogates from/to/from pattern");
+ }
+
@Test
public void TestUnusedCcc() {
// All numeric ccc values 0..255 are valid, but many are unused.
errln(e.getMessage());
continue;
}
- final String expected = "[" + s + "]";
- assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
+ String expected = "[" + s + "]"; // Try this first for faster testing.
+ boolean ok = pattern.equals(expected);
+ if (!ok) {
+ expected = new UnicodeSet(expected).toPattern(false);
+ ok = pattern.equals(expected);
+ }
+ assertTrue("Doubled character works " + hex.transform(s), ok);
// verify that we can create a regex pattern and use as expected
String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);