ICU-11891 make UnicodeSet.toPattern() well-formed & round-trip

author Markus Scherer <markus.icu@gmail.com>

Fri, 17 Sep 2021 02:51:56 +0000 (19:51 -0700)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 17 Sep 2021 20:49:13 +0000 (13:49 -0700)
author Markus Scherer <markus.icu@gmail.com>
Fri, 17 Sep 2021 02:51:56 +0000 (19:51 -0700)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 17 Sep 2021 20:49:13 +0000 (13:49 -0700)
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h

index d0cc86e02d27250d4c4327741e0102111a208ff1..b5fd13a686fbe418c1c8d4334dabed4979f46ec0 100644 (file)
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1597,6 +1597,9 @@ private:
  
      static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
  
+    static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
+                             UBool escapeUnprintable);
+
      //----------------------------------------------------------------
      // Implementation: Fundamental operators
      //----------------------------------------------------------------
diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp

index 47993361f2f5ecee710dd943281ac9c9690227c6..967ea2ecdb22c241c8d58bcbe5dcb96468b469d8 100644 (file)
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@@ -1967,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
   * Append the <code>toPattern()</code> representation of a
   * string to the given <code>StringBuffer</code>.
   */
-void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
-escapeUnprintable) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
      UChar32 cp;
      for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
          _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
@@ -1979,14 +1978,12 @@ escapeUnprintable) {
   * Append the <code>toPattern()</code> representation of a
   * character to the given <code>StringBuffer</code>.
   */
-void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
-escapeUnprintable) {
-    if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
+    if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
          // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
          // unprintable
-        if (ICU_Utility::escapeUnprintable(buf, c)) {
-            return;
-        }
+        ICU_Utility::escape(buf, c);
+        return;
      }
      // Okay to let ':' pass through
      switch (c) {
@@ -2012,6 +2009,19 @@ escapeUnprintable) {
      buf.append(c);
  }
  
+void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
+                              UBool escapeUnprintable) {
+    _appendToPat(result, start, escapeUnprintable);
+    if (start != end) {
+        if ((start+1) != end ||
+                // Avoid writing what looks like a lead+trail surrogate pair.
+                start == 0xdbff) {
+            result.append(u'-');
+        }
+        _appendToPat(result, end, escapeUnprintable);
+    }
+}
+
  /**
   * Append a string representation of this set to result.  This will be
   * a cleaned version of the string passed to applyPattern(), if there
@@ -2026,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
          for (i=0; i<patLen; ) {
              UChar32 c;
              U16_NEXT(pat, i, patLen, c);
-            if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+            if (escapeUnprintable ?
+                    ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
                  // If the unprintable character is preceded by an odd
                  // number of backslashes, then it has been escaped.
                  // Before unescaping it, we delete the final
@@ -2034,7 +2045,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
                  if ((backslashCount % 2) == 1) {
                      result.truncate(result.length() - 1);
                  }
-                ICU_Utility::escapeUnprintable(result, c);
+                ICU_Utility::escape(result, c);
                  backslashCount = 0;
              } else {
                  result.append(c);
@@ -2073,52 +2084,48 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
  {
      result.append(u'[');
  
-//  // Check against the predefined categories.  We implicitly build
-//  // up ALL category sets the first time toPattern() is called.
-//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
-//      if (*this == getCategorySet(cat)) {
-//          result.append(u':');
-//          result.append(CATEGORY_NAMES, cat*2, 2);
-//          return result.append(CATEGORY_CLOSE);
-//      }
-//  }
-
-    int32_t count = getRangeCount();
+    int32_t i = 0;
+    int32_t limit = len & ~1;  // = 2 * getRangeCount()
  
      // If the set contains at least 2 intervals and includes both
      // MIN_VALUE and MAX_VALUE, then the inverse representation will
      // be more economical.
-    if (count > 1 &&
-        getRangeStart(0) == MIN_VALUE &&
-        getRangeEnd(count-1) == MAX_VALUE) {
-
+    //     if (getRangeCount() >= 2 &&
+    //             getRangeStart(0) == MIN_VALUE &&
+    //             getRangeEnd(last) == MAX_VALUE)
+    // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
+    // If limit == len then len is even and the last range ends with MAX_VALUE.
+    if (len >= 4 && list[0] == 0 && limit == len) {
          // Emit the inverse
          result.append(u'^');
-
-        for (int32_t i = 1; i < count; ++i) {
-            UChar32 start = getRangeEnd(i-1)+1;
-            UChar32 end = getRangeStart(i)-1;
-            _appendToPat(result, start, escapeUnprintable);
-            if (start != end) {
-                if ((start+1) != end) {
-                    result.append(u'-');
-                }
-                _appendToPat(result, end, escapeUnprintable);
+        // Offsetting the inversion list index by one lets us
+        // iterate over the ranges of the set complement.
+        i = 1;
+        --limit;
+    }
+
+    // Emit the ranges as pairs.
+    while (i < limit) {
+        UChar32 start = list[i];  // getRangeStart()
+        UChar32 end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
+        if (!(0xd800 <= end && end <= 0xdbff)) {
+            _appendToPat(result, start, end, escapeUnprintable);
+            i += 2;
+        } else {
+            // The range ends with a lead surrogate.
+            // Avoid writing what looks like a lead+trail surrogate pair.
+            // 1. Postpone ranges that start with a lead surrogate code point.
+            int32_t firstLead = i;
+            while ((i += 2) < limit && list[i] <= 0xdbff) {}
+            int32_t firstAfterLead = i;
+            // 2. Write following ranges that start with a trail surrogate code point.
+            while (i < limit && (start = list[i]) <= 0xdfff) {
+                _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
+                i += 2;
              }
-        }
-    }
-
-    // Default; emit the ranges as pairs
-    else {
-        for (int32_t i = 0; i < count; ++i) {
-            UChar32 start = getRangeStart(i);
-            UChar32 end = getRangeEnd(i);
-            _appendToPat(result, start, escapeUnprintable);
-            if (start != end) {
-                if ((start+1) != end) {
-                    result.append(u'-');
-                }
-                _appendToPat(result, end, escapeUnprintable);
+            // 3. Now write the postponed ranges.
+            for (int j = firstLead; j < firstAfterLead; j += 2) {
+                _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
              }
          }
      }
diff --git a/icu4c/source/common/util.cpp b/icu4c/source/common/util.cpp

index 86e5c791bad98e111a0d1d49343199e5d855084f..f342172259980d5b6f132e2a6a20bf0252008f50 100644 (file)
--- a/icu4c/source/common/util.cpp
+++ b/icu4c/source/common/util.cpp
@@ -65,38 +65,52 @@ UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
      return result;
  }
  
-/**
- * Return true if the character is NOT printable ASCII.
- */
  UBool ICU_Utility::isUnprintable(UChar32 c) {
      return !(c >= 0x20 && c <= 0x7E);
  }
  
-/**
- * Escape unprintable characters using \uxxxx notation for U+0000 to
- * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
- * printable ASCII, then do nothing and return FALSE.  Otherwise,
- * append the escaped notation and return TRUE.
- */
+UBool ICU_Utility::shouldAlwaysBeEscaped(UChar32 c) {
+    if (c < 0x20) {
+        return true;  // C0 control codes
+    } else if (c <= 0x7e) {
+        return false;  // printable ASCII
+    } else if (c <= 0x9f) {
+        return true;  // C1 control codes
+    } else if (c < 0xd800) {
+        return false;  // most of the BMP
+    } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
+        return true;  // surrogate or noncharacter code points
+    } else if (c <= 0x10ffff) {
+        return false;  // all else
+    } else {
+        return true;  // not a code point
+    }
+}
+
  UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
      if (isUnprintable(c)) {
-        result.append(BACKSLASH);
-        if (c & ~0xFFFF) {
-            result.append(UPPER_U);
-            result.append(DIGITS[0xF&(c>>28)]);
-            result.append(DIGITS[0xF&(c>>24)]);
-            result.append(DIGITS[0xF&(c>>20)]);
-            result.append(DIGITS[0xF&(c>>16)]);
-        } else {
-            result.append(LOWER_U);
-        }
-        result.append(DIGITS[0xF&(c>>12)]);
-        result.append(DIGITS[0xF&(c>>8)]);
-        result.append(DIGITS[0xF&(c>>4)]);
-        result.append(DIGITS[0xF&c]);
-        return TRUE;
+        escape(result, c);
+        return true;
+    }
+    return false;
+}
+
+UnicodeString &ICU_Utility::escape(UnicodeString& result, UChar32 c) {
+    result.append(BACKSLASH);
+    if (c & ~0xFFFF) {
+        result.append(UPPER_U);
+        result.append(DIGITS[0xF&(c>>28)]);
+        result.append(DIGITS[0xF&(c>>24)]);
+        result.append(DIGITS[0xF&(c>>20)]);
+        result.append(DIGITS[0xF&(c>>16)]);
+    } else {
+        result.append(LOWER_U);
      }
-    return FALSE;
+    result.append(DIGITS[0xF&(c>>12)]);
+    result.append(DIGITS[0xF&(c>>8)]);
+    result.append(DIGITS[0xF&(c>>4)]);
+    result.append(DIGITS[0xF&c]);
+    return result;
  }
  
  /**
diff --git a/icu4c/source/common/util.h b/icu4c/source/common/util.h

index a748d915d47ed85e88ff927db69f719ff6391750..7e6d356899f9240a91f91e7fc3b9794d46368c75 100644 (file)
--- a/icu4c/source/common/util.h
+++ b/icu4c/source/common/util.h
@@ -55,20 +55,30 @@ class U_COMMON_API ICU_Utility /* not : public UObject because all methods are s
  
      /**
       * Return true if the character is NOT printable ASCII.
-     *
-     * This method should really be in UnicodeString (or similar).  For
-     * now, we implement it here and share it with friend classes.
+     * The tab, newline and linefeed characters are considered unprintable.
       */
      static UBool isUnprintable(UChar32 c);
  
      /**
-     * Escape unprintable characters using \uxxxx notation for U+0000 to
+     * @return true for control codes and for surrogate and noncharacter code points
+     */
+    static UBool shouldAlwaysBeEscaped(UChar32 c);
+
+    /**
+     * Escapes one unprintable code point using \uxxxx notation for U+0000 to
       * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
       * printable ASCII, then do nothing and return false.  Otherwise,
       * append the escaped notation and return true.
       */
      static UBool escapeUnprintable(UnicodeString& result, UChar32 c);
  
+    /**
+     * Escapes one code point using \uxxxx notation
+     * for U+0000 to U+FFFF and \Uxxxxxxxx for U+10000 and above.
+     * @return result
+     */
+    static UnicodeString &escape(UnicodeString& result, UChar32 c);
+
      /**
       * Returns the index of a character, ignoring quoted text.
       * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp

index 194182998b7de1fad724607a34b5b650e08b510a..b2241031944d11c206ca43ce8ae275de06b2a546 100644 (file)
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -94,7 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
      TESTCASE_AUTO(TestFreezable);
      TESTCASE_AUTO(TestSpan);
      TESTCASE_AUTO(TestStringSpan);
-    TESTCASE_AUTO(TestUCAUnsafeBackwards);
+    TESTCASE_AUTO(TestPatternWithSurrogates);
      TESTCASE_AUTO(TestIntOverflow);
      TESTCASE_AUTO(TestUnusedCcc);
      TESTCASE_AUTO(TestDeepPattern);
@@ -3908,60 +3908,47 @@ void UnicodeSetTest::TestStringSpan() {
      }
  }
  
-/**
- * Including collationroot.h fails here with
-1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
- *  .. so, we skip this test on Windows.
- * 
- * the cause is that  intltest builds with /Za which disables language extensions - which means
- *  windows header files can't be used.
- */
-#if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
-#include "collationroot.h"
-#include "collationtailoring.h"
-#endif
-
-void UnicodeSetTest::TestUCAUnsafeBackwards() {
-#if U_PLATFORM_HAS_WIN32_API
-    infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
-#elif !UCONFIG_NO_COLLATION
-    UErrorCode errorCode = U_ZERO_ERROR;
-
-    // Get the unsafeBackwardsSet
-    const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
-    if(U_FAILURE(errorCode)) {
-      dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
-      return;
-    }
-    //const UVersionInfo &version = rootEntry->tailoring->version;
-    const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
-
-    checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
-
-    if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
-        // simple test case
-        // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
-        // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
-        UnicodeSet surrogates;
-        surrogates.add(0xd83a);  // a lead surrogate
-        surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
-        UnicodeString pat;
-        surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
-        // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
-        // so that at least one type of surrogate code points are escaped,
-        // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
-        errorCode = U_ZERO_ERROR;
-        UnicodeSet s2;
-        s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
-        if(U_FAILURE(errorCode)) {
-            errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
-        } else {
-            checkEqual(surrogates, s2, "surrogates to/from pattern");
-        }
-        // This occurs in the UCA unsafe-backwards set.
-        checkRoundTrip(*unsafeBackwardSet);
-    }
-#endif
+void UnicodeSetTest::TestPatternWithSurrogates() {
+    IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
+    // Regression test for ICU-11891
+    UnicodeSet surrogates;
+    surrogates.add(0xd000, 0xd82f);  // a range ending with a lead surrogate code point
+    surrogates.add(0xd83a);  // a lead surrogate
+    surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
+    UnicodeString pat;
+    surrogates.toPattern(pat, false);  // bad if U+D83A is immediately followed by U+DC00
+    UnicodeSet s2;
+    // was: U_MALFORMED_SET
+    // Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
+    s2.applyPattern(pat, errorCode);
+    if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
+    checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
+
+    // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
+    surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
+    // add a beyond-surrogates range, up to the last code point
+    surrogates.add(0x10affe, 0x10ffff);
+    surrogates.toPattern(pat, false);  // bad if U+DBFF is immediately followed by U+DC00
+    s2.applyPattern(pat, errorCode);
+    if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
+    checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
+
+    // Test the toPattern() code path when the pattern is shorter in complement form:
+    // [^opposite-ranges]
+    surrogates.add(0, 0x6789);
+    surrogates.toPattern(pat, false);
+    s2.applyPattern(pat, errorCode);
+    if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
+    checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
+
+    // Start with a pattern, in case the original pattern is kept but
+    // without the extra white space.
+    surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
+    if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
+    surrogates.toPattern(pat, false);
+    s2.applyPattern(pat, errorCode);
+    if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
+    checkEqual(surrogates, s2, "surrogates from/to/from pattern");
  }
  
  void UnicodeSetTest::TestIntOverflow() {
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h

index f999d45017a32e3812f7b64814e65fe5bc209ec7..bba91ba2bebf72545723a4374a4cfed443eaf749 100644 (file)
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -91,7 +91,7 @@ private:
  
      void TestStringSpan();
  
-    void TestUCAUnsafeBackwards();
+    void TestPatternWithSurrogates();
      void TestIntOverflow();
      void TestUnusedCcc();
      void TestDeepPattern();
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java

index 5308cad9eb905ddc0bb569d9e495c281c877926e..634d89e364a6b34e5de1ac5c12e5fee7ed7f8331 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@@ -1536,34 +1536,65 @@ public final class Utility {
      }
  
      /**
-     * Escape unprintable characters using <backslash>uxxxx notation
+     * @return true for control codes and for surrogate and noncharacter code points
+     */
+    public static boolean shouldAlwaysBeEscaped(int c) {
+        if (c < 0x20) {
+            return true;  // C0 control codes
+        } else if (c <= 0x7e) {
+            return false;  // printable ASCII
+        } else if (c <= 0x9f) {
+            return true;  // C1 control codes
+        } else if (c < 0xd800) {
+            return false;  // most of the BMP
+        } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
+            return true;  // surrogate or noncharacter code points
+        } else if (c <= 0x10ffff) {
+            return false;  // all else
+        } else {
+            return true;  // not a code point
+        }
+    }
+
+    /**
+     * Escapes one unprintable code point using <backslash>uxxxx notation
       * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
       * above.  If the character is printable ASCII, then do nothing
       * and return FALSE.  Otherwise, append the escaped notation and
       * return TRUE.
       */
      public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
+        if (isUnprintable(c)) {
+            escape(result, c);
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Escapes one code point using <backslash>uxxxx notation
+     * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and above.
+     * @return result
+     */
+    public static <T extends Appendable> T escape(T result, int c) {
          try {
-            if (isUnprintable(c)) {
-                result.append('\\');
-                if ((c & ~0xFFFF) != 0) {
-                    result.append('U');
-                    result.append(DIGITS[0xF&(c>>28)]);
-                    result.append(DIGITS[0xF&(c>>24)]);
-                    result.append(DIGITS[0xF&(c>>20)]);
-                    result.append(DIGITS[0xF&(c>>16)]);
-                } else {
-                    result.append('u');
-                }
-                result.append(DIGITS[0xF&(c>>12)]);
-                result.append(DIGITS[0xF&(c>>8)]);
-                result.append(DIGITS[0xF&(c>>4)]);
-                result.append(DIGITS[0xF&c]);
-                return true;
+            result.append('\\');
+            if ((c & ~0xFFFF) != 0) {
+                result.append('U');
+                result.append(DIGITS[0xF&(c>>28)]);
+                result.append(DIGITS[0xF&(c>>24)]);
+                result.append(DIGITS[0xF&(c>>20)]);
+                result.append(DIGITS[0xF&(c>>16)]);
+            } else {
+                result.append('u');
              }
-            return false;
+            result.append(DIGITS[0xF&(c>>12)]);
+            result.append(DIGITS[0xF&(c>>8)]);
+            result.append(DIGITS[0xF&(c>>4)]);
+            result.append(DIGITS[0xF&c]);
+            return result;
          } catch (IOException e) {
-            throw new IllegalIcuArgumentException(e);
+            throw new ICUUncheckedIOException(e);
          }
      }
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

index cda5bf1839ea78128c7f2a4ce6c0e16f99983358..88b9464a82edb29447256237961dd599190548ca 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -650,12 +650,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       */
      private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
          try {
-            if (escapeUnprintable && Utility.isUnprintable(c)) {
+            if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) {
                  // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
                  // unprintable
-                if (Utility.escapeUnprintable(buf, c)) {
-                    return buf;
-                }
+                return Utility.escape(buf, c);
              }
              // Okay to let ':' pass through
              switch (c) {
@@ -685,6 +683,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          }
      }
  
+    private static <T extends Appendable> T _appendToPat(
+            T result, int start, int end, boolean escapeUnprintable) {
+        _appendToPat(result, start, escapeUnprintable);
+        if (start != end) {
+            if ((start+1) != end ||
+                    // Avoid writing what looks like a lead+trail surrogate pair.
+                    start == 0xdbff) {
+                try {
+                    result.append('-');
+                } catch (IOException e) {
+                    throw new ICUUncheckedIOException(e);
+                }
+            }
+            _appendToPat(result, end, escapeUnprintable);
+        }
+        return result;
+    }
+
      /**
       * Returns a string representation of this set.  If the result of
       * calling this function is passed to a UnicodeSet constructor, it
@@ -712,6 +728,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          }
          try {
              if (!escapeUnprintable) {
+                // TODO: The C++ version does not have this shortcut, and instead
+                // always cleans up the pattern string,
+                // which also escapes Utility.shouldAlwaysBeEscaped(c).
+                // We should sync these implementations.
                  result.append(pat);
                  return result;
              }
@@ -723,7 +743,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
                      // If the unprintable character is preceded by an odd
                      // number of backslashes, then it has been escaped
                      // and we omit the last backslash.
-                    Utility.escapeUnprintable(result, c);
+                    Utility.escape(result, c);
                      oddNumberOfBackslashes = false;
                  } else if (!oddNumberOfBackslashes && c == '\\') {
                      // Temporarily withhold an odd-numbered backslash.
@@ -749,6 +769,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * Generate and append a string representation of this set to result.
       * This does not use this.pat, the cleaned up copy of the string
       * passed to applyPattern().
+     *
       * @param result the buffer into which to generate the pattern
       * @param escapeUnprintable escape unprintable characters if true
       * @stable ICU 2.0
@@ -761,6 +782,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * Generate and append a string representation of this set to result.
       * This does not use this.pat, the cleaned up copy of the string
       * passed to applyPattern().
+     *
+     * @param result the buffer into which to generate the pattern
+     * @param escapeUnprintable escape unprintable characters if true
       * @param includeStrings if false, doesn't include the strings.
       * @stable ICU 3.8
       */
@@ -769,47 +793,55 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          return appendNewPattern(result, escapeUnprintable, includeStrings);
      }
  
+    // Implementation of public _generatePattern().
+    // Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer.
      private <T extends Appendable> T appendNewPattern(
              T result, boolean escapeUnprintable, boolean includeStrings) {
          try {
              result.append('[');
  
-            int count = getRangeCount();
+            int i = 0;
+            int limit = len & ~1;  // = 2 * getRangeCount()
  
              // If the set contains at least 2 intervals and includes both
              // MIN_VALUE and MAX_VALUE, then the inverse representation will
              // be more economical.
-            if (count > 1 &&
-                    getRangeStart(0) == MIN_VALUE &&
-                    getRangeEnd(count-1) == MAX_VALUE) {
-
+            //     if (getRangeCount() >= 2 &&
+            //             getRangeStart(0) == MIN_VALUE &&
+            //             getRangeEnd(last) == MAX_VALUE)
+            // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
+            // If limit == len then len is even and the last range ends with MAX_VALUE.
+            if (len >= 4 && list[0] == 0 && limit == len) {
                  // Emit the inverse
                  result.append('^');
-
-                for (int i = 1; i < count; ++i) {
-                    int start = getRangeEnd(i-1)+1;
-                    int end = getRangeStart(i)-1;
-                    _appendToPat(result, start, escapeUnprintable);
-                    if (start != end) {
-                        if ((start+1) != end) {
-                            result.append('-');
-                        }
-                        _appendToPat(result, end, escapeUnprintable);
-                    }
-                }
+                // Offsetting the inversion list index by one lets us
+                // iterate over the ranges of the set complement.
+                i = 1;
+                --limit;
              }
  
-            // Default; emit the ranges as pairs
-            else {
-                for (int i = 0; i < count; ++i) {
-                    int start = getRangeStart(i);
-                    int end = getRangeEnd(i);
-                    _appendToPat(result, start, escapeUnprintable);
-                    if (start != end) {
-                        if ((start+1) != end) {
-                            result.append('-');
-                        }
-                        _appendToPat(result, end, escapeUnprintable);
+            // Emit the ranges as pairs.
+            while (i < limit) {
+                int start = list[i];  // getRangeStart()
+                int end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
+                if (!(0xd800 <= end && end <= 0xdbff)) {
+                    _appendToPat(result, start, end, escapeUnprintable);
+                    i += 2;
+                } else {
+                    // The range ends with a lead surrogate.
+                    // Avoid writing what looks like a lead+trail surrogate pair.
+                    // 1. Postpone ranges that start with a lead surrogate code point.
+                    int firstLead = i;
+                    while ((i += 2) < limit && list[i] <= 0xdbff) {}
+                    int firstAfterLead = i;
+                    // 2. Write following ranges that start with a trail surrogate code point.
+                    while (i < limit && (start = list[i]) <= 0xdfff) {
+                        _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
+                        i += 2;
+                    }
+                    // 3. Now write the postponed ranges.
+                    for (int j = firstLead; j < firstAfterLead; j += 2) {
+                        _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
                      }
                  }
              }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java

index 8b0121cedb46f0d664363272aceb98dc1e057743..bf44762c957c4cc30595aa78239d767c43214bc4 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -2752,6 +2752,42 @@ public class UnicodeSetTest extends TestFmwk {
                  UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0);
      }
  
+    @Test
+    public void TestPatternWithSurrogates() {
+        // Regression test for ICU-11891
+        UnicodeSet surrogates = new UnicodeSet();
+        surrogates.add(0xd000, 0xd82f);  // a range ending with a lead surrogate code point
+        surrogates.add(0xd83a);  // a lead surrogate
+        surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
+        String pat = surrogates.toPattern(false);  // bad if U+D83A is immediately followed by U+DC00
+        UnicodeSet s2 = new UnicodeSet();
+        // was: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
+        s2.applyPattern(pat);
+        checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
+
+        // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
+        surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
+        // add a beyond-surrogates range, up to the last code point
+        surrogates.add(0x10affe, 0x10ffff);
+        pat = surrogates.toPattern(false);  // bad if U+DBFF is immediately followed by U+DC00
+        s2.applyPattern(pat);
+        checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
+
+        // Test the toPattern() code path when the pattern is shorter in complement form:
+        // [^opposite-ranges]
+        surrogates.add(0, 0x6789);
+        pat = surrogates.toPattern(false);
+        s2.applyPattern(pat);
+        checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
+
+        // Start with a pattern, in case the original pattern is kept but
+        // without the extra white space.
+        surrogates.applyPattern("[\\uD83A \\uDC00-\\uDFFF]");
+        pat = surrogates.toPattern(false);
+        s2.applyPattern(pat);
+        checkEqual(surrogates, s2, "surrogates from/to/from pattern");
+    }
+
      @Test
      public void TestUnusedCcc() {
          // All numeric ccc values 0..255 are valid, but many are unused.
diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java

index 4131fd4a53fe82f6bdb573ff4b588c74b7a49938..88e18b160a2d2f90cdf8172bc00677bac1459b26 100644 (file)
--- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java
+++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java
@@ -79,8 +79,13 @@ public class RegexUtilitiesTest extends TestFmwk {
                  errln(e.getMessage());
                  continue;
              }
-            final String expected = "[" + s + "]";
-            assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
+            String expected = "[" + s + "]";  // Try this first for faster testing.
+            boolean ok = pattern.equals(expected);
+            if (!ok) {
+                expected = new UnicodeSet(expected).toPattern(false);
+                ok = pattern.equals(expected);
+            }
+            assertTrue("Doubled character works " + hex.transform(s), ok);
  
              // verify that we can create a regex pattern and use as expected
              String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 17 Sep 2021 02:51:56 +0000 (19:51 -0700)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 17 Sep 2021 20:49:13 +0000 (13:49 -0700)
icu4c/source/common/unicode/uniset.h		patch \| blob \| history
icu4c/source/common/uniset.cpp		patch \| blob \| history
icu4c/source/common/util.cpp		patch \| blob \| history
icu4c/source/common/util.h		patch \| blob \| history
icu4c/source/test/intltest/usettest.cpp		patch \| blob \| history
icu4c/source/test/intltest/usettest.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java		patch \| blob \| history
icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java		patch \| blob \| history