From: Markus Scherer Date: Fri, 17 Sep 2021 02:51:56 +0000 (-0700) Subject: ICU-11891 make UnicodeSet.toPattern() well-formed & round-trip X-Git-Tag: release-70-rc~29 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3f0e003901c155f88a99e2fc660c12e769e2f8f1;p=icu ICU-11891 make UnicodeSet.toPattern() well-formed & round-trip --- diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index d0cc86e02d2..b5fd13a686f 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1597,6 +1597,9 @@ private: static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable); + static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end, + UBool escapeUnprintable); + //---------------------------------------------------------------- // Implementation: Fundamental operators //---------------------------------------------------------------- diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp index 47993361f2f..967ea2ecdb2 100644 --- a/icu4c/source/common/uniset.cpp +++ b/icu4c/source/common/uniset.cpp @@ -1967,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) * Append the toPattern() representation of a * string to the given StringBuffer. */ -void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool -escapeUnprintable) { +void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) { UChar32 cp; for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); @@ -1979,14 +1978,12 @@ escapeUnprintable) { * Append the toPattern() representation of a * character to the given StringBuffer. */ -void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool -escapeUnprintable) { - if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { +void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) { + if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything // unprintable - if (ICU_Utility::escapeUnprintable(buf, c)) { - return; - } + ICU_Utility::escape(buf, c); + return; } // Okay to let ':' pass through switch (c) { @@ -2012,6 +2009,19 @@ escapeUnprintable) { buf.append(c); } +void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end, + UBool escapeUnprintable) { + _appendToPat(result, start, escapeUnprintable); + if (start != end) { + if ((start+1) != end || + // Avoid writing what looks like a lead+trail surrogate pair. + start == 0xdbff) { + result.append(u'-'); + } + _appendToPat(result, end, escapeUnprintable); + } +} + /** * Append a string representation of this set to result. This will be * a cleaned version of the string passed to applyPattern(), if there @@ -2026,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, for (i=0; i 1 && - getRangeStart(0) == MIN_VALUE && - getRangeEnd(count-1) == MAX_VALUE) { - + // if (getRangeCount() >= 2 && + // getRangeStart(0) == MIN_VALUE && + // getRangeEnd(last) == MAX_VALUE) + // Invariant: list[len-1] == HIGH == MAX_VALUE + 1 + // If limit == len then len is even and the last range ends with MAX_VALUE. + if (len >= 4 && list[0] == 0 && limit == len) { // Emit the inverse result.append(u'^'); - - for (int32_t i = 1; i < count; ++i) { - UChar32 start = getRangeEnd(i-1)+1; - UChar32 end = getRangeStart(i)-1; - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append(u'-'); - } - _appendToPat(result, end, escapeUnprintable); + // Offsetting the inversion list index by one lets us + // iterate over the ranges of the set complement. + i = 1; + --limit; + } + + // Emit the ranges as pairs. + while (i < limit) { + UChar32 start = list[i]; // getRangeStart() + UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one + if (!(0xd800 <= end && end <= 0xdbff)) { + _appendToPat(result, start, end, escapeUnprintable); + i += 2; + } else { + // The range ends with a lead surrogate. + // Avoid writing what looks like a lead+trail surrogate pair. + // 1. Postpone ranges that start with a lead surrogate code point. + int32_t firstLead = i; + while ((i += 2) < limit && list[i] <= 0xdbff) {} + int32_t firstAfterLead = i; + // 2. Write following ranges that start with a trail surrogate code point. + while (i < limit && (start = list[i]) <= 0xdfff) { + _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable); + i += 2; } - } - } - - // Default; emit the ranges as pairs - else { - for (int32_t i = 0; i < count; ++i) { - UChar32 start = getRangeStart(i); - UChar32 end = getRangeEnd(i); - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append(u'-'); - } - _appendToPat(result, end, escapeUnprintable); + // 3. Now write the postponed ranges. + for (int j = firstLead; j < firstAfterLead; j += 2) { + _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable); } } } diff --git a/icu4c/source/common/util.cpp b/icu4c/source/common/util.cpp index 86e5c791bad..f3421722599 100644 --- a/icu4c/source/common/util.cpp +++ b/icu4c/source/common/util.cpp @@ -65,38 +65,52 @@ UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n, return result; } -/** - * Return true if the character is NOT printable ASCII. - */ UBool ICU_Utility::isUnprintable(UChar32 c) { return !(c >= 0x20 && c <= 0x7E); } -/** - * Escape unprintable characters using \uxxxx notation for U+0000 to - * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is - * printable ASCII, then do nothing and return FALSE. Otherwise, - * append the escaped notation and return TRUE. - */ +UBool ICU_Utility::shouldAlwaysBeEscaped(UChar32 c) { + if (c < 0x20) { + return true; // C0 control codes + } else if (c <= 0x7e) { + return false; // printable ASCII + } else if (c <= 0x9f) { + return true; // C1 control codes + } else if (c < 0xd800) { + return false; // most of the BMP + } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) { + return true; // surrogate or noncharacter code points + } else if (c <= 0x10ffff) { + return false; // all else + } else { + return true; // not a code point + } +} + UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) { if (isUnprintable(c)) { - result.append(BACKSLASH); - if (c & ~0xFFFF) { - result.append(UPPER_U); - result.append(DIGITS[0xF&(c>>28)]); - result.append(DIGITS[0xF&(c>>24)]); - result.append(DIGITS[0xF&(c>>20)]); - result.append(DIGITS[0xF&(c>>16)]); - } else { - result.append(LOWER_U); - } - result.append(DIGITS[0xF&(c>>12)]); - result.append(DIGITS[0xF&(c>>8)]); - result.append(DIGITS[0xF&(c>>4)]); - result.append(DIGITS[0xF&c]); - return TRUE; + escape(result, c); + return true; + } + return false; +} + +UnicodeString &ICU_Utility::escape(UnicodeString& result, UChar32 c) { + result.append(BACKSLASH); + if (c & ~0xFFFF) { + result.append(UPPER_U); + result.append(DIGITS[0xF&(c>>28)]); + result.append(DIGITS[0xF&(c>>24)]); + result.append(DIGITS[0xF&(c>>20)]); + result.append(DIGITS[0xF&(c>>16)]); + } else { + result.append(LOWER_U); } - return FALSE; + result.append(DIGITS[0xF&(c>>12)]); + result.append(DIGITS[0xF&(c>>8)]); + result.append(DIGITS[0xF&(c>>4)]); + result.append(DIGITS[0xF&c]); + return result; } /** diff --git a/icu4c/source/common/util.h b/icu4c/source/common/util.h index a748d915d47..7e6d356899f 100644 --- a/icu4c/source/common/util.h +++ b/icu4c/source/common/util.h @@ -55,20 +55,30 @@ class U_COMMON_API ICU_Utility /* not : public UObject because all methods are s /** * Return true if the character is NOT printable ASCII. - * - * This method should really be in UnicodeString (or similar). For - * now, we implement it here and share it with friend classes. + * The tab, newline and linefeed characters are considered unprintable. */ static UBool isUnprintable(UChar32 c); /** - * Escape unprintable characters using \uxxxx notation for U+0000 to + * @return true for control codes and for surrogate and noncharacter code points + */ + static UBool shouldAlwaysBeEscaped(UChar32 c); + + /** + * Escapes one unprintable code point using \uxxxx notation for U+0000 to * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is * printable ASCII, then do nothing and return false. Otherwise, * append the escaped notation and return true. */ static UBool escapeUnprintable(UnicodeString& result, UChar32 c); + /** + * Escapes one code point using \uxxxx notation + * for U+0000 to U+FFFF and \Uxxxxxxxx for U+10000 and above. + * @return result + */ + static UnicodeString &escape(UnicodeString& result, UChar32 c); + /** * Returns the index of a character, ignoring quoted text. * For example, in the string "abc'hide'h", the 'h' in "hide" will not be diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 194182998b7..b2241031944 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -94,7 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestFreezable); TESTCASE_AUTO(TestSpan); TESTCASE_AUTO(TestStringSpan); - TESTCASE_AUTO(TestUCAUnsafeBackwards); + TESTCASE_AUTO(TestPatternWithSurrogates); TESTCASE_AUTO(TestIntOverflow); TESTCASE_AUTO(TestUnusedCcc); TESTCASE_AUTO(TestDeepPattern); @@ -3908,60 +3908,47 @@ void UnicodeSetTest::TestStringSpan() { } } -/** - * Including collationroot.h fails here with -1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition - * .. so, we skip this test on Windows. - * - * the cause is that intltest builds with /Za which disables language extensions - which means - * windows header files can't be used. - */ -#if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API -#include "collationroot.h" -#include "collationtailoring.h" -#endif - -void UnicodeSetTest::TestUCAUnsafeBackwards() { -#if U_PLATFORM_HAS_WIN32_API - infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!"); -#elif !UCONFIG_NO_COLLATION - UErrorCode errorCode = U_ZERO_ERROR; - - // Get the unsafeBackwardsSet - const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode); - if(U_FAILURE(errorCode)) { - dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode)); - return; - } - //const UVersionInfo &version = rootEntry->tailoring->version; - const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet; - - checkSerializeRoundTrip(*unsafeBackwardSet, errorCode); - - if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) { - // simple test case - // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately. - // TODO(ticket #11891): Port test to Java. Is this a bug there, too? - UnicodeSet surrogates; - surrogates.add(0xd83a); // a lead surrogate - surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates - UnicodeString pat; - surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ] - // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat() - // so that at least one type of surrogate code points are escaped, - // or (minimally) so that adjacent lead+trail surrogate code points are escaped. - errorCode = U_ZERO_ERROR; - UnicodeSet s2; - s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ] - if(U_FAILURE(errorCode)) { - errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode)); - } else { - checkEqual(surrogates, s2, "surrogates to/from pattern"); - } - // This occurs in the UCA unsafe-backwards set. - checkRoundTrip(*unsafeBackwardSet); - } -#endif +void UnicodeSetTest::TestPatternWithSurrogates() { + IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates"); + // Regression test for ICU-11891 + UnicodeSet surrogates; + surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point + surrogates.add(0xd83a); // a lead surrogate + surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates + UnicodeString pat; + surrogates.toPattern(pat, false); // bad if U+D83A is immediately followed by U+DC00 + UnicodeSet s2; + // was: U_MALFORMED_SET + // Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]" + s2.applyPattern(pat, errorCode); + if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; } + checkEqual(surrogates, s2, "surrogates (1) to/from pattern"); + + // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03 + surrogates.add(0xdbff).remove(0xdc01, 0xdc03); + // add a beyond-surrogates range, up to the last code point + surrogates.add(0x10affe, 0x10ffff); + surrogates.toPattern(pat, false); // bad if U+DBFF is immediately followed by U+DC00 + s2.applyPattern(pat, errorCode); + if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; } + checkEqual(surrogates, s2, "surrogates (2) to/from pattern"); + + // Test the toPattern() code path when the pattern is shorter in complement form: + // [^opposite-ranges] + surrogates.add(0, 0x6789); + surrogates.toPattern(pat, false); + s2.applyPattern(pat, errorCode); + if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; } + checkEqual(surrogates, s2, "surrogates (3) to/from pattern"); + + // Start with a pattern, in case the original pattern is kept but + // without the extra white space. + surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode); + if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; } + surrogates.toPattern(pat, false); + s2.applyPattern(pat, errorCode); + if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; } + checkEqual(surrogates, s2, "surrogates from/to/from pattern"); } void UnicodeSetTest::TestIntOverflow() { diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index f999d45017a..bba91ba2beb 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -91,7 +91,7 @@ private: void TestStringSpan(); - void TestUCAUnsafeBackwards(); + void TestPatternWithSurrogates(); void TestIntOverflow(); void TestUnusedCcc(); void TestDeepPattern(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java index 5308cad9eb9..634d89e364a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java @@ -1536,34 +1536,65 @@ public final class Utility { } /** - * Escape unprintable characters using uxxxx notation + * @return true for control codes and for surrogate and noncharacter code points + */ + public static boolean shouldAlwaysBeEscaped(int c) { + if (c < 0x20) { + return true; // C0 control codes + } else if (c <= 0x7e) { + return false; // printable ASCII + } else if (c <= 0x9f) { + return true; // C1 control codes + } else if (c < 0xd800) { + return false; // most of the BMP + } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) { + return true; // surrogate or noncharacter code points + } else if (c <= 0x10ffff) { + return false; // all else + } else { + return true; // not a code point + } + } + + /** + * Escapes one unprintable code point using uxxxx notation * for U+0000 to U+FFFF and Uxxxxxxxx for U+10000 and * above. If the character is printable ASCII, then do nothing * and return FALSE. Otherwise, append the escaped notation and * return TRUE. */ public static boolean escapeUnprintable(T result, int c) { + if (isUnprintable(c)) { + escape(result, c); + return true; + } + return false; + } + + /** + * Escapes one code point using uxxxx notation + * for U+0000 to U+FFFF and Uxxxxxxxx for U+10000 and above. + * @return result + */ + public static T escape(T result, int c) { try { - if (isUnprintable(c)) { - result.append('\\'); - if ((c & ~0xFFFF) != 0) { - result.append('U'); - result.append(DIGITS[0xF&(c>>28)]); - result.append(DIGITS[0xF&(c>>24)]); - result.append(DIGITS[0xF&(c>>20)]); - result.append(DIGITS[0xF&(c>>16)]); - } else { - result.append('u'); - } - result.append(DIGITS[0xF&(c>>12)]); - result.append(DIGITS[0xF&(c>>8)]); - result.append(DIGITS[0xF&(c>>4)]); - result.append(DIGITS[0xF&c]); - return true; + result.append('\\'); + if ((c & ~0xFFFF) != 0) { + result.append('U'); + result.append(DIGITS[0xF&(c>>28)]); + result.append(DIGITS[0xF&(c>>24)]); + result.append(DIGITS[0xF&(c>>20)]); + result.append(DIGITS[0xF&(c>>16)]); + } else { + result.append('u'); } - return false; + result.append(DIGITS[0xF&(c>>12)]); + result.append(DIGITS[0xF&(c>>8)]); + result.append(DIGITS[0xF&(c>>4)]); + result.append(DIGITS[0xF&c]); + return result; } catch (IOException e) { - throw new IllegalIcuArgumentException(e); + throw new ICUUncheckedIOException(e); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index cda5bf1839e..88b9464a82e 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -650,12 +650,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa */ private static T _appendToPat(T buf, int c, boolean escapeUnprintable) { try { - if (escapeUnprintable && Utility.isUnprintable(c)) { + if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) { // Use hex escape notation (uxxxx or Uxxxxxxxx) for anything // unprintable - if (Utility.escapeUnprintable(buf, c)) { - return buf; - } + return Utility.escape(buf, c); } // Okay to let ':' pass through switch (c) { @@ -685,6 +683,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } } + private static T _appendToPat( + T result, int start, int end, boolean escapeUnprintable) { + _appendToPat(result, start, escapeUnprintable); + if (start != end) { + if ((start+1) != end || + // Avoid writing what looks like a lead+trail surrogate pair. + start == 0xdbff) { + try { + result.append('-'); + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } + } + _appendToPat(result, end, escapeUnprintable); + } + return result; + } + /** * Returns a string representation of this set. If the result of * calling this function is passed to a UnicodeSet constructor, it @@ -712,6 +728,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } try { if (!escapeUnprintable) { + // TODO: The C++ version does not have this shortcut, and instead + // always cleans up the pattern string, + // which also escapes Utility.shouldAlwaysBeEscaped(c). + // We should sync these implementations. result.append(pat); return result; } @@ -723,7 +743,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // If the unprintable character is preceded by an odd // number of backslashes, then it has been escaped // and we omit the last backslash. - Utility.escapeUnprintable(result, c); + Utility.escape(result, c); oddNumberOfBackslashes = false; } else if (!oddNumberOfBackslashes && c == '\\') { // Temporarily withhold an odd-numbered backslash. @@ -749,6 +769,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * Generate and append a string representation of this set to result. * This does not use this.pat, the cleaned up copy of the string * passed to applyPattern(). + * * @param result the buffer into which to generate the pattern * @param escapeUnprintable escape unprintable characters if true * @stable ICU 2.0 @@ -761,6 +782,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * Generate and append a string representation of this set to result. * This does not use this.pat, the cleaned up copy of the string * passed to applyPattern(). + * + * @param result the buffer into which to generate the pattern + * @param escapeUnprintable escape unprintable characters if true * @param includeStrings if false, doesn't include the strings. * @stable ICU 3.8 */ @@ -769,47 +793,55 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa return appendNewPattern(result, escapeUnprintable, includeStrings); } + // Implementation of public _generatePattern(). + // Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer. private T appendNewPattern( T result, boolean escapeUnprintable, boolean includeStrings) { try { result.append('['); - int count = getRangeCount(); + int i = 0; + int limit = len & ~1; // = 2 * getRangeCount() // If the set contains at least 2 intervals and includes both // MIN_VALUE and MAX_VALUE, then the inverse representation will // be more economical. - if (count > 1 && - getRangeStart(0) == MIN_VALUE && - getRangeEnd(count-1) == MAX_VALUE) { - + // if (getRangeCount() >= 2 && + // getRangeStart(0) == MIN_VALUE && + // getRangeEnd(last) == MAX_VALUE) + // Invariant: list[len-1] == HIGH == MAX_VALUE + 1 + // If limit == len then len is even and the last range ends with MAX_VALUE. + if (len >= 4 && list[0] == 0 && limit == len) { // Emit the inverse result.append('^'); - - for (int i = 1; i < count; ++i) { - int start = getRangeEnd(i-1)+1; - int end = getRangeStart(i)-1; - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append('-'); - } - _appendToPat(result, end, escapeUnprintable); - } - } + // Offsetting the inversion list index by one lets us + // iterate over the ranges of the set complement. + i = 1; + --limit; } - // Default; emit the ranges as pairs - else { - for (int i = 0; i < count; ++i) { - int start = getRangeStart(i); - int end = getRangeEnd(i); - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append('-'); - } - _appendToPat(result, end, escapeUnprintable); + // Emit the ranges as pairs. + while (i < limit) { + int start = list[i]; // getRangeStart() + int end = list[i + 1] - 1; // getRangeEnd() = range limit minus one + if (!(0xd800 <= end && end <= 0xdbff)) { + _appendToPat(result, start, end, escapeUnprintable); + i += 2; + } else { + // The range ends with a lead surrogate. + // Avoid writing what looks like a lead+trail surrogate pair. + // 1. Postpone ranges that start with a lead surrogate code point. + int firstLead = i; + while ((i += 2) < limit && list[i] <= 0xdbff) {} + int firstAfterLead = i; + // 2. Write following ranges that start with a trail surrogate code point. + while (i < limit && (start = list[i]) <= 0xdfff) { + _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable); + i += 2; + } + // 3. Now write the postponed ranges. + for (int j = firstLead; j < firstAfterLead; j += 2) { + _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable); } } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java index 8b0121cedb4..bf44762c957 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java @@ -2752,6 +2752,42 @@ public class UnicodeSetTest extends TestFmwk { UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0); } + @Test + public void TestPatternWithSurrogates() { + // Regression test for ICU-11891 + UnicodeSet surrogates = new UnicodeSet(); + surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point + surrogates.add(0xd83a); // a lead surrogate + surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates + String pat = surrogates.toPattern(false); // bad if U+D83A is immediately followed by U+DC00 + UnicodeSet s2 = new UnicodeSet(); + // was: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]" + s2.applyPattern(pat); + checkEqual(surrogates, s2, "surrogates (1) to/from pattern"); + + // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03 + surrogates.add(0xdbff).remove(0xdc01, 0xdc03); + // add a beyond-surrogates range, up to the last code point + surrogates.add(0x10affe, 0x10ffff); + pat = surrogates.toPattern(false); // bad if U+DBFF is immediately followed by U+DC00 + s2.applyPattern(pat); + checkEqual(surrogates, s2, "surrogates (2) to/from pattern"); + + // Test the toPattern() code path when the pattern is shorter in complement form: + // [^opposite-ranges] + surrogates.add(0, 0x6789); + pat = surrogates.toPattern(false); + s2.applyPattern(pat); + checkEqual(surrogates, s2, "surrogates (3) to/from pattern"); + + // Start with a pattern, in case the original pattern is kept but + // without the extra white space. + surrogates.applyPattern("[\\uD83A \\uDC00-\\uDFFF]"); + pat = surrogates.toPattern(false); + s2.applyPattern(pat); + checkEqual(surrogates, s2, "surrogates from/to/from pattern"); + } + @Test public void TestUnusedCcc() { // All numeric ccc values 0..255 are valid, but many are unused. diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java index 4131fd4a53f..88e18b160a2 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java @@ -79,8 +79,13 @@ public class RegexUtilitiesTest extends TestFmwk { errln(e.getMessage()); continue; } - final String expected = "[" + s + "]"; - assertEquals("Doubled character works" + hex.transform(s), expected, pattern); + String expected = "[" + s + "]"; // Try this first for faster testing. + boolean ok = pattern.equals(expected); + if (!ok) { + expected = new UnicodeSet(expected).toPattern(false); + ok = pattern.equals(expected); + } + assertTrue("Doubled character works " + hex.transform(s), ok); // verify that we can create a regex pattern and use as expected String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);