ICU-21645 reduce heap allocations in unescape() and other parsing

author Markus Scherer <markus.icu@gmail.com>

Sat, 19 Jun 2021 20:25:33 +0000 (20:25 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Mon, 21 Jun 2021 19:40:28 +0000 (19:40 +0000)
author Markus Scherer <markus.icu@gmail.com>
Sat, 19 Jun 2021 20:25:33 +0000 (20:25 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Mon, 21 Jun 2021 19:40:28 +0000 (19:40 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java

index 5e981655015324ec82c7d8fba6a6c150fc8054d5..49c74f4d95bfe4743192f73c30ce0f7da0a85237 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java
@@ -52,7 +52,7 @@ public class RuleCharacterIterator {
      /**
       * Current variable expansion, or null if none.
       */
-    private char[] buf;
+    private String buf;
  
      /**
       * Position within buf[].  Meaningless if buf == null.
@@ -79,7 +79,7 @@ public class RuleCharacterIterator {
      /**
       * Bitmask option to enable parsing of escape sequences.  If (options &
       * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
-     * to its value.  Escapes are parsed using Utility.unescapeAt().
+     * to its value.  Escapes are parsed using Utility.unescapeAndLengthAt().
       */
      public static final int PARSE_ESCAPES   = 2;
  
@@ -90,6 +90,13 @@ public class RuleCharacterIterator {
       */
      public static final int SKIP_WHITESPACE = 4;
  
+    /** For use with {@link #getPos(Position)} & {@link #setPos(Position)}. */
+    public static final class Position {
+        private String buf;
+        private int bufPos;
+        private int posIndex;
+    };
+
      /**
       * Constructs an iterator over the given text, starting at the given
       * position.
@@ -144,15 +151,17 @@ public class RuleCharacterIterator {
                      break;
                  }
                  bufPos = 0;
-                buf = sym.lookup(name);
-                if (buf == null) {
+                char[] chars = sym.lookup(name);
+                if (chars == null) {
+                    buf = null;
                      throw new IllegalArgumentException(
                                  "Undefined variable: " + name);
                  }
                  // Handle empty variable value
-                if (buf.length == 0) {
+                if (chars.length == 0) {
                      buf = null;
                  }
+                buf = new String(chars);
                  continue;
              }
  
@@ -162,13 +171,14 @@ public class RuleCharacterIterator {
              }
  
              if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
-                int offset[] = new int[] { 0 };
-                c = Utility.unescapeAt(lookahead(), offset);
-                jumpahead(offset[0]);
-                isEscaped = true;
-                if (c < 0) {
+                int cpAndLength = Utility.unescapeAndLengthAt(
+                        getCurrentBuffer(), getCurrentBufferPos());
+                if (cpAndLength < 0) {
                      throw new IllegalArgumentException("Invalid escape");
                  }
+                c = Utility.cpFromCodePointAndLength(cpAndLength);
+                jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength));
+                isEscaped = true;
              }
  
              break;
@@ -199,7 +209,7 @@ public class RuleCharacterIterator {
       * restore this iterator's position.  Usage idiom:
       *
       * RuleCharacterIterator iterator = ...;
-     * Object pos = iterator.getPos(null); // allocate position object
+     * Position pos = iterator.getPos(null); // allocate position object
       * for (;;) {
       *   pos = iterator.getPos(pos); // reuse position object
       *   int c = iterator.next(...);
@@ -213,15 +223,13 @@ public class RuleCharacterIterator {
       * @return a position object which may be passed to setPos(),
       * either `p,' or if `p' == null, a newly-allocated object
       */
-    public Object getPos(Object p) {
+    public Position getPos(Position p) {
          if (p == null) {
-            return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
+            p = new Position();
          }
-        Object[] a = (Object[]) p;
-        a[0] = buf;
-        int[] v = (int[]) a[1];
-        v[0] = pos.getIndex();
-        v[1] = bufPos;
+        p.buf = buf;
+        p.bufPos = bufPos;
+        p.posIndex = pos.getIndex();
          return p;
      }
  
@@ -230,12 +238,10 @@ public class RuleCharacterIterator {
       * returned the given object.
       * @param p a position object previously returned by getPos()
       */
-    public void setPos(Object p) {
-        Object[] a = (Object[]) p;
-        buf = (char[]) a[0];
-        int[] v = (int[]) a[1];
-        pos.setIndex(v[0]);
-        bufPos = v[1];
+    public void setPos(Position p) {
+        buf = p.buf;
+        pos.setIndex(p.posIndex);
+        bufPos = p.bufPos;
      }
  
      /**
@@ -260,25 +266,35 @@ public class RuleCharacterIterator {
       * Returns a string containing the remainder of the characters to be
       * returned by this iterator, without any option processing.  If the
       * iterator is currently within a variable expansion, this will only
-     * extend to the end of the variable expansion.  This method is provided
-     * so that iterators may interoperate with string-based APIs.  The typical
-     * sequence of calls is to call skipIgnored(), then call lookahead(), then
-     * parse the string returned by lookahead(), then call jumpahead() to
+     * extend to the end of the variable expansion.
+     * This method, together with getCurrentBufferPos() (which replace the former lookahead()),
+     * is provided so that iterators may interoperate with string-based APIs. The typical
+     * sequence of calls is to call skipIgnored(), then call these methods, then
+     * parse that substring, then call jumpahead() to
       * resynchronize the iterator.
       * @return a string containing the characters to be returned by future
       * calls to next()
       */
-    public String lookahead() {
+    public String getCurrentBuffer() {
+        if (buf != null) {
+            return buf;
+        } else {
+            return text;
+        }
+    }
+
+    public int getCurrentBufferPos() {
          if (buf != null) {
-            return new String(buf, bufPos, buf.length - bufPos);
+            return bufPos;
          } else {
-            return text.substring(pos.getIndex());
+            return pos.getIndex();
          }
      }
  
      /**
       * Advances the position by the given number of 16-bit code units.
-     * This is useful in conjunction with the lookahead() method.
+     * This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos()
+     * (formerly lookahead()).
       * @param count the number of 16-bit code units to jump over
       */
      public void jumpahead(int count) {
@@ -287,10 +303,10 @@ public class RuleCharacterIterator {
          }
          if (buf != null) {
              bufPos += count;
-            if (bufPos > buf.length) {
+            if (bufPos > buf.length()) {
                  throw new IllegalArgumentException();
              }
-            if (bufPos == buf.length) {
+            if (bufPos == buf.length()) {
                  buf = null;
              }
          } else {
@@ -321,7 +337,7 @@ public class RuleCharacterIterator {
       */
      private int _current() {
          if (buf != null) {
-            return UTF16.charAt(buf, 0, buf.length, bufPos);
+            return UTF16.charAt(buf, bufPos);
          } else {
              int i = pos.getIndex();
              return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
@@ -335,7 +351,7 @@ public class RuleCharacterIterator {
      private void _advance(int count) {
          if (buf != null) {
              bufPos += count;
-            if (bufPos == buf.length) {
+            if (bufPos == buf.length()) {
                  buf = null;
              }
          } else {
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java

index cd90601155292a254d0e8ba690c7d24e4f5fee4e..25b02e506949f738c94f88c47fe50e5bdd55a247 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@@ -778,14 +778,16 @@ public final class Utility {
      };
  
      /**
-     * Convert an escape to a 32-bit code point value.  We attempt
+     * Converts an escape to a code point value. We attempt
       * to parallel the icu4c unescapeAt() function.
-     * @param offset16 an array containing offset to the character
-     * <em>after</em> the backslash.  Upon return offset16[0] will
-     * be updated to point after the escape sequence.
-     * @return character value from 0 to 10FFFF, or -1 on error.
+     * This function returns an integer with
+     * both the code point (bits 28..8) and the length of the escape sequence (bits 7..0).
+     * offset+length is the index after the escape sequence.
+     *
+     * @param offset the offset to the character <em>after</em> the backslash.
+     * @return the code point and length, or -1 on error.
       */
-    public static int unescapeAt(String s, int[] offset16) {
+    public static int unescapeAndLengthAt(CharSequence s, int offset) {
          int c;
          int result = 0;
          int n = 0;
@@ -797,11 +799,11 @@ public final class Utility {
          boolean braces = false;
  
          /* Check that offset is in range */
-        int offset = offset16[0];
          int length = s.length();
          if (offset < 0 || offset >= length) {
              return -1;
          }
+        int start = offset;
  
          /* Fetch first UChar after '\\' */
          c = Character.codePointAt(s, offset);
@@ -867,24 +869,24 @@ public final class Utility {
                  int ahead = offset+1;
                  c = s.charAt(offset); // [sic] get 16-bit code unit
                  if (c == '\\' && ahead < length) {
-                    int o[] = new int[] { ahead };
-                    c = unescapeAt(s, o);
-                    ahead = o[0];
+                    int cpAndLength = unescapeAndLengthAt(s, ahead);
+                    if (cpAndLength >= 0) {
+                        c = cpAndLength >> 8;
+                        ahead += cpAndLength & 0xff;
+                    }
                  }
                  if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
                      offset = ahead;
                      result = Character.toCodePoint((char) result, (char) c);
                  }
              }
-            offset16[0] = offset;
-            return result;
+            return codePointAndLength(result, start, offset);
          }
  
          /* Convert C-style escapes in table */
          for (i=0; i<UNESCAPE_MAP.length; i+=2) {
              if (c == UNESCAPE_MAP[i]) {
-                offset16[0] = offset;
-                return UNESCAPE_MAP[i+1];
+                return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
              } else if (c < UNESCAPE_MAP[i]) {
                  break;
              }
@@ -893,64 +895,94 @@ public final class Utility {
          /* Map \cX to control-X: X & 0x1F */
          if (c == 'c' && offset < length) {
              c = UTF16.charAt(s, offset);
-            offset16[0] = offset + UTF16.getCharCount(c);
-            return 0x1F & c;
+            return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
          }
  
          /* If no special forms are recognized, then consider
           * the backslash to generically escape the next character. */
-        offset16[0] = offset;
-        return c;
+        return codePointAndLength(c, start, offset);
+    }
+
+    private static int codePointAndLength(int c, int length) {
+        assert 0 <= c && c <= 0x10ffff;
+        assert 0 <= length && length <= 0xff;
+        return c << 8 | length;
+    }
+
+    private static int codePointAndLength(int c, int start, int limit) {
+        return codePointAndLength(c, limit - start);
+    }
+
+    public static int cpFromCodePointAndLength(int cpAndLength) {
+        assert cpAndLength >= 0;
+        return cpAndLength >> 8;
+    }
+
+    public static int lengthFromCodePointAndLength(int cpAndLength) {
+        assert cpAndLength >= 0;
+        return cpAndLength & 0xff;
      }
  
      /**
-     * Convert all escapes in a given string using unescapeAt().
+     * Convert all escapes in a given string using unescapeAndLengthAt().
       * @exception IllegalArgumentException if an invalid escape is
       * seen.
       */
-    public static String unescape(String s) {
-        StringBuilder buf = new StringBuilder();
-        int[] pos = new int[1];
+    public static String unescape(CharSequence s) {
+        StringBuilder buf = null;
          for (int i=0; i<s.length(); ) {
              char c = s.charAt(i++);
              if (c == '\\') {
-                pos[0] = i;
-                int e = unescapeAt(s, pos);
-                if (e < 0) {
+                if (buf == null) {
+                    buf = new StringBuilder(s.length()).append(s, 0, i - 1);
+                }
+                int cpAndLength = unescapeAndLengthAt(s, i);
+                if (cpAndLength < 0) {
                      throw new IllegalArgumentException("Invalid escape sequence " +
-                            s.substring(i-1, Math.min(i+8, s.length())));
+                            s.subSequence(i-1, Math.min(i+9, s.length())));
                  }
-                buf.appendCodePoint(e);
-                i = pos[0];
-            } else {
+                buf.appendCodePoint(cpAndLength >> 8);
+                i += cpAndLength & 0xff;
+            } else if (buf != null) {
+                // We could optimize this further by appending whole substrings between escapes.
                  buf.append(c);
              }
          }
+        if (buf == null) {
+            // No escapes in s.
+            return s.toString();
+        }
          return buf.toString();
      }
  
      /**
-     * Convert all escapes in a given string using unescapeAt().
+     * Convert all escapes in a given string using unescapeAndLengthAt().
       * Leave invalid escape sequences unchanged.
       */
-    public static String unescapeLeniently(String s) {
-        StringBuilder buf = new StringBuilder();
-        int[] pos = new int[1];
+    public static String unescapeLeniently(CharSequence s) {
+        StringBuilder buf = null;
          for (int i=0; i<s.length(); ) {
              char c = s.charAt(i++);
              if (c == '\\') {
-                pos[0] = i;
-                int e = unescapeAt(s, pos);
-                if (e < 0) {
+                if (buf == null) {
+                    buf = new StringBuilder(s.length()).append(s, 0, i - 1);
+                }
+                int cpAndLength = unescapeAndLengthAt(s, i);
+                if (cpAndLength < 0) {
                      buf.append(c);
                  } else {
-                    buf.appendCodePoint(e);
-                    i = pos[0];
+                    buf.appendCodePoint(cpAndLength >> 8);
+                    i += cpAndLength & 0xff;
                  }
-            } else {
+            } else if (buf != null) {
+                // We could optimize this further by appending whole substrings between escapes.
                  buf.append(c);
              }
          }
+        if (buf == null) {
+            // No escapes in s.
+            return s.toString();
+        }
          return buf.toString();
      }
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java

index d37ea468ff5534139fed4cdd18670d8ad160d23e..166948b417dd89558189827cdc298969f4ca2c22 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java
@@ -87,7 +87,7 @@ public class TokenIterator {
      public int getLineNumber() {
          return reader.getLineNumber();
      }
-    
+
      /**
       * Return a string description of the position of the last line
       * returned by readLine() or readLineSkippingComments().
@@ -95,7 +95,7 @@ public class TokenIterator {
      public String describePosition() {
          return reader.describePosition() + ':' + (lastpos+1);
      }
-    
+
      /**
       * Read the next token from 'this.line' and append it to
       * 'this.buf'.  Tokens are separated by Pattern_White_Space.  Tokens
@@ -127,22 +127,17 @@ public class TokenIterator {
              buf.append(c);
              break;
          }
-        int[] posref = null;
          while (position < line.length()) {
              c = line.charAt(position); // 16-bit ok
              if (c == '\\') {
-                if (posref == null) {
-                    posref = new int[1];
-                }
-                posref[0] = position+1;
-                int c32 = Utility.unescapeAt(line, posref);
-                if (c32 < 0) {
+                int cpAndLength = Utility.unescapeAndLengthAt(line, position + 1);
+                if (cpAndLength < 0) {
                      throw new RuntimeException("Invalid escape at " +
                                                 reader.describePosition() + ':' +
                                                 position);
                  }
-                UTF16.append(buf, c32);
-                position = posref[0];
+                UTF16.append(buf, Utility.cpFromCodePointAndLength(cpAndLength));
+                position += 1 + Utility.lengthFromCodePointAndLength(cpAndLength);
              } else if ((quote != 0 && c == quote) ||
                         (quote == 0 && PatternProps.isWhiteSpace(c))) {
                  return ++position;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java

index c9a8aff5a6d9bf4010b8ca78dd8574694bed8d91..9249ba86edcf5147a8b92985c3ff32d77978c59b 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
@@ -823,19 +823,18 @@ class RBBIRuleScanner {
  
              //
              //  check for backslash escaped characters.
-            //  Use String.unescapeAt() to handle them.
              //
              if (c.fChar == '\\') {
                  c.fEscaped = true;
-                int[] unescapeIndex = new int[1];
-                unescapeIndex[0] = fNextIndex;
-                c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
-                if (unescapeIndex[0] == fNextIndex) {
+                int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
+                if (cpAndLength < 0) {
                      error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
                  }
+                c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
+                int length = Utility.lengthFromCodePointAndLength(cpAndLength);
  
-                fCharNum += unescapeIndex[0] - fNextIndex;
-                fNextIndex = unescapeIndex[0];
+                fCharNum += length;
+                fNextIndex += length;
              }
          }
          // putc(c.fChar, stdout);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

index 04154054ff80ee26593deafe7cf369c9c6c7fdb6..d41ff99ea807f23943b68d54da796b703d3cbb97 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -2555,7 +2555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          StringBuilder patBuf = new StringBuilder(), buf = null;
          boolean usePat = false;
          UnicodeSet scratch = null;
-        Object backup = null;
+        RuleCharacterIterator.Position backup = null;
  
          // mode: 0=before [, 1=between [...], 2=after ]
          // lastItem: 0=none, 1=char, 2=set
@@ -3673,7 +3673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
              int iterOpts) {
          boolean result = false;
          iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
-        Object pos = chars.getPos(null);
+        RuleCharacterIterator.Position pos = chars.getPos(null);
          int c = chars.next(iterOpts);
          if (c == '[' || c == '\\') {
              int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
@@ -3784,14 +3784,16 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       */
      private void applyPropertyPattern(RuleCharacterIterator chars,
              Appendable rebuiltPat, SymbolTable symbols) {
-        String patStr = chars.lookahead();
-        ParsePosition pos = new ParsePosition(0);
+        String patStr = chars.getCurrentBuffer();
+        int start = chars.getCurrentBufferPos();
+        ParsePosition pos = new ParsePosition(start);
          applyPropertyPattern(patStr, pos, symbols);
-        if (pos.getIndex() == 0) {
+        int length = pos.getIndex() - start;
+        if (length == 0) {
              syntaxError(chars, "Invalid property pattern");
          }
-        chars.jumpahead(pos.getIndex());
-        append(rebuiltPat, patStr.substring(0, pos.getIndex()));
+        chars.jumpahead(length);
+        append(rebuiltPat, patStr.substring(start, pos.getIndex()));
      }
  
      //----------------------------------------------------------------
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java

index 9e300a729b2b4b24e0f9fa557b33d084750f82c2..f16a121e71b061e1683074f53e7dd55eb37c4e90 100644 (file)
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java
@@ -493,12 +493,12 @@ class TransliteratorParser {
                      if (pos == limit) {
                          syntaxError("Trailing backslash", rule, start);
                      }
-                    iref[0] = pos;
-                    int escaped = Utility.unescapeAt(rule, iref);
-                    pos = iref[0];
-                    if (escaped == -1) {
+                    int cpAndLength = Utility.unescapeAndLengthAt(rule, pos);
+                    if (cpAndLength < 0) {
                          syntaxError("Malformed escape", rule, start);
                      }
+                    int escaped = Utility.cpFromCodePointAndLength(cpAndLength);
+                    pos += Utility.lengthFromCodePointAndLength(cpAndLength);
                      parser.checkVariableRange(escaped, rule, start);
                      UTF16.append(buf, escaped);
                      continue;
@@ -902,16 +902,16 @@ class TransliteratorParser {
          boolean parsingIDs = true;
          int ruleCount = 0;
  
-        dataVector = new ArrayList<Data>();
-        idBlockVector = new ArrayList<String>();
+        dataVector = new ArrayList<>();
+        idBlockVector = new ArrayList<>();
          curData = null;
          direction = dir;
          compoundFilter = null;
-        variablesVector = new ArrayList<Object>();
-        variableNames = new HashMap<String, char[]>();
+        variablesVector = new ArrayList<>();
+        variableNames = new HashMap<>();
          parseData = new ParseData();
  
-        List<RuntimeException> errors = new ArrayList<RuntimeException>();
+        List<RuntimeException> errors = new ArrayList<>();
          int errorCount = 0;
  
          ruleArray.reset();
@@ -1079,7 +1079,7 @@ class TransliteratorParser {
              Data data = dataVector.get(i);
              data.variables = new Object[variablesVector.size()];
              variablesVector.toArray(data.variables);
-            data.variableNames = new HashMap<String, char[]>();
+            data.variableNames = new HashMap<>();
              data.variableNames.putAll(variableNames);
          }
          variablesVector = null;
@@ -1143,7 +1143,7 @@ class TransliteratorParser {
  
          // Set up segments data
          segmentStandins = new StringBuffer();
-        segmentObjects = new ArrayList<StringMatcher>();
+        segmentObjects = new ArrayList<>();
  
          RuleHalf left  = new RuleHalf();
          RuleHalf right = new RuleHalf();
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java

index 76c76b026806e54765459a38d55a4fa0ea37b8be..4bea6cc0ec886fda3cd1f4b98f1875eb6dff1ff5 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
@@ -344,14 +344,12 @@ public void TestExtended() {
                  }
  
                  // Let unescape handle the back slash.
-                int  charIdxAr[] = new int[1];
-                charIdxAr[0] = charIdx;
-                cp = Utility.unescapeAt(testString, charIdxAr);
-                if (cp != -1) {
+                int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
+                if (cpAndLength >= 0) {
                      // Escape sequence was recognized.  Insert the char
                      //   into the test data.
-                    charIdx = charIdxAr[0];
-                    tp.dataToBreak.appendCodePoint(cp);
+                    charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
+                    tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
                      for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
                          tp.srcLine[i] = lineNum;
                          tp.srcCol[i]  = column;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java

index a7ee1d072d3ceb76bab844f23454729a4532a1e9..15d1e6010498823b0a1ea489868778e6ffe5182a 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
@@ -52,11 +52,9 @@ public class UtilityTest extends TestFmwk {
  
          // Regression test for ICU-21645
          String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
-        int[] offset16 = new int[] { 1 };  // after the backslash
          // This returned U+B2F00 for the first _two_ escapes.
-        int c = Utility.unescapeAt(s, offset16);
-        assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
-        assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
+        int cpAndLength = Utility.unescapeAndLengthAt(s, 1);  // index 1 = after the backslash
+        assertEquals(s + " unescape at 1, cpAndLength", 0x1DA8B09, cpAndLength);
          String pattern = "[" + s + "]";
          // This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
          // and saw an invalid range of B2F00..1DF1E (start >= end).
author	Markus Scherer <markus.icu@gmail.com>
	Sat, 19 Jun 2021 20:25:33 +0000 (20:25 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Mon, 21 Jun 2021 19:40:28 +0000 (19:40 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java		patch \| blob \| history
icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java		patch \| blob \| history