From: Markus Scherer <markus.icu@gmail.com>
Date: Sat, 19 Jun 2021 02:01:58 +0000 (+0000)
Subject: ICU-21645 test & fix unescapeAt(2 supp escapes)
X-Git-Tag: cldr/2021-08-11~30
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d9aa1ecab0cc65e402ddc435db01cfce68940cda;p=icu

ICU-21645 test & fix unescapeAt() for two consecutive supplementary-code-point escapes
---

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
index d74c2314360..cd906011552 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@@ -863,8 +863,7 @@ public final class Utility {
             // if there is a trail surrogate after it, either as an
             // escape or as a literal.  If so, join them up into a
             // supplementary.
-            if (offset < length &&
-                    UTF16.isLeadSurrogate((char) result)) {
+            if (offset < length && result <= 0xffff && UTF16.isLeadSurrogate((char) result)) {
                 int ahead = offset+1;
                 c = s.charAt(offset); // [sic] get 16-bit code unit
                 if (c == '\\' && ahead < length) {
@@ -872,7 +871,7 @@ public final class Utility {
                     c = unescapeAt(s, o);
                     ahead = o[0];
                 }
-                if (UTF16.isTrailSurrogate((char) c)) {
+                if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
                     offset = ahead;
                     result = Character.toCodePoint((char) result, (char) c);
                 }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
index 15aebdfc045..a7ee1d072d3 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
@@ -49,6 +49,22 @@ public class UtilityTest extends TestFmwk {
         if (!result.equals(expect)) {
             errln("FAIL: Utility.unescape() returned " + result + ", exp. " + expect);
         }
+
+        // Regression test for ICU-21645
+        String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
+        int[] offset16 = new int[] { 1 };  // after the backslash
+        // This returned U+B2F00 for the first _two_ escapes.
+        int c = Utility.unescapeAt(s, offset16);
+        assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
+        assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
+        String pattern = "[" + s + "]";
+        // This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
+        // and saw an invalid range of B2F00..1DF1E (start >= end).
+        UnicodeSet set = new UnicodeSet(pattern);
+        assertEquals(pattern + " size", 32, set.size());
+        assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B));
+        assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E));
+        assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F));
     }
 
     @Test