ICU-21645 test & fix unescapeAt() for two consecutive supplementary code point escapes

author Markus Scherer <markus.icu@gmail.com>

Sat, 19 Jun 2021 02:01:58 +0000 (02:01 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Sat, 19 Jun 2021 16:49:56 +0000 (16:49 +0000)
author Markus Scherer <markus.icu@gmail.com>
Sat, 19 Jun 2021 02:01:58 +0000 (02:01 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Sat, 19 Jun 2021 16:49:56 +0000 (16:49 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java

index d74c231436085e34b67c4d2f7837a31f05938e41..cd90601155292a254d0e8ba690c7d24e4f5fee4e 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@@ -863,8 +863,7 @@ public final class Utility {
              // if there is a trail surrogate after it, either as an
              // escape or as a literal.  If so, join them up into a
              // supplementary.
-            if (offset < length &&
-                    UTF16.isLeadSurrogate((char) result)) {
+            if (offset < length && result <= 0xffff && UTF16.isLeadSurrogate((char) result)) {
                  int ahead = offset+1;
                  c = s.charAt(offset); // [sic] get 16-bit code unit
                  if (c == '\\' && ahead < length) {
@@ -872,7 +871,7 @@ public final class Utility {
                      c = unescapeAt(s, o);
                      ahead = o[0];
                  }
-                if (UTF16.isTrailSurrogate((char) c)) {
+                if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
                      offset = ahead;
                      result = Character.toCodePoint((char) result, (char) c);
                  }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java

index 15aebdfc045aba7b890eb9beb03c7a904a3018b0..a7ee1d072d3ceb76bab844f23454729a4532a1e9 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
@@ -49,6 +49,22 @@ public class UtilityTest extends TestFmwk {
          if (!result.equals(expect)) {
              errln("FAIL: Utility.unescape() returned " + result + ", exp. " + expect);
          }
+
+        // Regression test for ICU-21645
+        String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
+        int[] offset16 = new int[] { 1 };  // after the backslash
+        // This returned U+B2F00 for the first _two_ escapes.
+        int c = Utility.unescapeAt(s, offset16);
+        assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
+        assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
+        String pattern = "[" + s + "]";
+        // This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
+        // and saw an invalid range of B2F00..1DF1E (start >= end).
+        UnicodeSet set = new UnicodeSet(pattern);
+        assertEquals(pattern + " size", 32, set.size());
+        assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B));
+        assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E));
+        assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F));
      }
  
      @Test
author	Markus Scherer <markus.icu@gmail.com>
	Sat, 19 Jun 2021 02:01:58 +0000 (02:01 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Sat, 19 Jun 2021 16:49:56 +0000 (16:49 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java		patch | blob | history