ICU-21645 reduce heap allocations in unescape() and other parsing
authorMarkus Scherer <markus.icu@gmail.com>
Sat, 19 Jun 2021 20:25:33 +0000 (20:25 +0000)
committerMarkus Scherer <markus.icu@gmail.com>
Mon, 21 Jun 2021 19:40:28 +0000 (19:40 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/RuleCharacterIterator.java
icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
icu4j/main/classes/core/src/com/ibm/icu/impl/data/TokenIterator.java
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
icu4j/main/classes/translit/src/com/ibm/icu/text/TransliteratorParser.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java

index 5e981655015324ec82c7d8fba6a6c150fc8054d5..49c74f4d95bfe4743192f73c30ce0f7da0a85237 100644 (file)
@@ -52,7 +52,7 @@ public class RuleCharacterIterator {
     /**
      * Current variable expansion, or null if none.
      */
-    private char[] buf;
+    private String buf;
 
     /**
      * Position within buf[].  Meaningless if buf == null.
@@ -79,7 +79,7 @@ public class RuleCharacterIterator {
     /**
      * Bitmask option to enable parsing of escape sequences.  If (options &
      * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
-     * to its value.  Escapes are parsed using Utility.unescapeAt().
+     * to its value.  Escapes are parsed using Utility.unescapeAndLengthAt().
      */
     public static final int PARSE_ESCAPES   = 2;
 
@@ -90,6 +90,13 @@ public class RuleCharacterIterator {
      */
     public static final int SKIP_WHITESPACE = 4;
 
+    /** For use with {@link #getPos(Position)} & {@link #setPos(Position)}. */
+    public static final class Position {
+        private String buf;
+        private int bufPos;
+        private int posIndex;
+    };
+
     /**
      * Constructs an iterator over the given text, starting at the given
      * position.
@@ -144,15 +151,17 @@ public class RuleCharacterIterator {
                     break;
                 }
                 bufPos = 0;
-                buf = sym.lookup(name);
-                if (buf == null) {
+                char[] chars = sym.lookup(name);
+                if (chars == null) {
+                    buf = null;
                     throw new IllegalArgumentException(
                                 "Undefined variable: " + name);
                 }
                 // Handle empty variable value
-                if (buf.length == 0) {
+                if (chars.length == 0) {
                     buf = null;
                 }
+                buf = new String(chars);
                 continue;
             }
 
@@ -162,13 +171,14 @@ public class RuleCharacterIterator {
             }
 
             if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
-                int offset[] = new int[] { 0 };
-                c = Utility.unescapeAt(lookahead(), offset);
-                jumpahead(offset[0]);
-                isEscaped = true;
-                if (c < 0) {
+                int cpAndLength = Utility.unescapeAndLengthAt(
+                        getCurrentBuffer(), getCurrentBufferPos());
+                if (cpAndLength < 0) {
                     throw new IllegalArgumentException("Invalid escape");
                 }
+                c = Utility.cpFromCodePointAndLength(cpAndLength);
+                jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength));
+                isEscaped = true;
             }
 
             break;
@@ -199,7 +209,7 @@ public class RuleCharacterIterator {
      * restore this iterator's position.  Usage idiom:
      *
      * RuleCharacterIterator iterator = ...;
-     * Object pos = iterator.getPos(null); // allocate position object
+     * Position pos = iterator.getPos(null); // allocate position object
      * for (;;) {
      *   pos = iterator.getPos(pos); // reuse position object
      *   int c = iterator.next(...);
@@ -213,15 +223,13 @@ public class RuleCharacterIterator {
      * @return a position object which may be passed to setPos(),
      * either `p,' or if `p' == null, a newly-allocated object
      */
-    public Object getPos(Object p) {
+    public Position getPos(Position p) {
         if (p == null) {
-            return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
+            p = new Position();
         }
-        Object[] a = (Object[]) p;
-        a[0] = buf;
-        int[] v = (int[]) a[1];
-        v[0] = pos.getIndex();
-        v[1] = bufPos;
+        p.buf = buf;
+        p.bufPos = bufPos;
+        p.posIndex = pos.getIndex();
         return p;
     }
 
@@ -230,12 +238,10 @@ public class RuleCharacterIterator {
      * returned the given object.
      * @param p a position object previously returned by getPos()
      */
-    public void setPos(Object p) {
-        Object[] a = (Object[]) p;
-        buf = (char[]) a[0];
-        int[] v = (int[]) a[1];
-        pos.setIndex(v[0]);
-        bufPos = v[1];
+    public void setPos(Position p) {
+        buf = p.buf;
+        pos.setIndex(p.posIndex);
+        bufPos = p.bufPos;
     }
 
     /**
@@ -260,25 +266,35 @@ public class RuleCharacterIterator {
      * Returns a string containing the remainder of the characters to be
      * returned by this iterator, without any option processing.  If the
      * iterator is currently within a variable expansion, this will only
-     * extend to the end of the variable expansion.  This method is provided
-     * so that iterators may interoperate with string-based APIs.  The typical
-     * sequence of calls is to call skipIgnored(), then call lookahead(), then
-     * parse the string returned by lookahead(), then call jumpahead() to
+     * extend to the end of the variable expansion.
+     * This method, together with getCurrentBufferPos() (which replace the former lookahead()),
+     * is provided so that iterators may interoperate with string-based APIs. The typical
+     * sequence of calls is to call skipIgnored(), then call these methods, then
+     * parse that substring, then call jumpahead() to
      * resynchronize the iterator.
      * @return a string containing the characters to be returned by future
      * calls to next()
      */
-    public String lookahead() {
+    public String getCurrentBuffer() {
+        if (buf != null) {
+            return buf;
+        } else {
+            return text;
+        }
+    }
+
+    public int getCurrentBufferPos() {
         if (buf != null) {
-            return new String(buf, bufPos, buf.length - bufPos);
+            return bufPos;
         } else {
-            return text.substring(pos.getIndex());
+            return pos.getIndex();
         }
     }
 
     /**
      * Advances the position by the given number of 16-bit code units.
-     * This is useful in conjunction with the lookahead() method.
+     * This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos()
+     * (formerly lookahead()).
      * @param count the number of 16-bit code units to jump over
      */
     public void jumpahead(int count) {
@@ -287,10 +303,10 @@ public class RuleCharacterIterator {
         }
         if (buf != null) {
             bufPos += count;
-            if (bufPos > buf.length) {
+            if (bufPos > buf.length()) {
                 throw new IllegalArgumentException();
             }
-            if (bufPos == buf.length) {
+            if (bufPos == buf.length()) {
                 buf = null;
             }
         } else {
@@ -321,7 +337,7 @@ public class RuleCharacterIterator {
      */
     private int _current() {
         if (buf != null) {
-            return UTF16.charAt(buf, 0, buf.length, bufPos);
+            return UTF16.charAt(buf, bufPos);
         } else {
             int i = pos.getIndex();
             return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
@@ -335,7 +351,7 @@ public class RuleCharacterIterator {
     private void _advance(int count) {
         if (buf != null) {
             bufPos += count;
-            if (bufPos == buf.length) {
+            if (bufPos == buf.length()) {
                 buf = null;
             }
         } else {
index cd90601155292a254d0e8ba690c7d24e4f5fee4e..25b02e506949f738c94f88c47fe50e5bdd55a247 100644 (file)
@@ -778,14 +778,16 @@ public final class Utility {
     };
 
     /**
-     * Convert an escape to a 32-bit code point value.  We attempt
+     * Converts an escape to a code point value. We attempt
      * to parallel the icu4c unescapeAt() function.
-     * @param offset16 an array containing offset to the character
-     * <em>after</em> the backslash.  Upon return offset16[0] will
-     * be updated to point after the escape sequence.
-     * @return character value from 0 to 10FFFF, or -1 on error.
+     * This function returns an integer with
+     * both the code point (bits 28..8) and the length of the escape sequence (bits 7..0).
+     * offset+length is the index after the escape sequence.
+     *
+     * @param offset the offset to the character <em>after</em> the backslash.
+     * @return the code point and length, or -1 on error.
      */
-    public static int unescapeAt(String s, int[] offset16) {
+    public static int unescapeAndLengthAt(CharSequence s, int offset) {
         int c;
         int result = 0;
         int n = 0;
@@ -797,11 +799,11 @@ public final class Utility {
         boolean braces = false;
 
         /* Check that offset is in range */
-        int offset = offset16[0];
         int length = s.length();
         if (offset < 0 || offset >= length) {
             return -1;
         }
+        int start = offset;
 
         /* Fetch first UChar after '\\' */
         c = Character.codePointAt(s, offset);
@@ -867,24 +869,24 @@ public final class Utility {
                 int ahead = offset+1;
                 c = s.charAt(offset); // [sic] get 16-bit code unit
                 if (c == '\\' && ahead < length) {
-                    int o[] = new int[] { ahead };
-                    c = unescapeAt(s, o);
-                    ahead = o[0];
+                    int cpAndLength = unescapeAndLengthAt(s, ahead);
+                    if (cpAndLength >= 0) {
+                        c = cpAndLength >> 8;
+                        ahead += cpAndLength & 0xff;
+                    }
                 }
                 if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
                     offset = ahead;
                     result = Character.toCodePoint((char) result, (char) c);
                 }
             }
-            offset16[0] = offset;
-            return result;
+            return codePointAndLength(result, start, offset);
         }
 
         /* Convert C-style escapes in table */
         for (i=0; i<UNESCAPE_MAP.length; i+=2) {
             if (c == UNESCAPE_MAP[i]) {
-                offset16[0] = offset;
-                return UNESCAPE_MAP[i+1];
+                return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
             } else if (c < UNESCAPE_MAP[i]) {
                 break;
             }
@@ -893,64 +895,94 @@ public final class Utility {
         /* Map \cX to control-X: X & 0x1F */
         if (c == 'c' && offset < length) {
             c = UTF16.charAt(s, offset);
-            offset16[0] = offset + UTF16.getCharCount(c);
-            return 0x1F & c;
+            return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
         }
 
         /* If no special forms are recognized, then consider
          * the backslash to generically escape the next character. */
-        offset16[0] = offset;
-        return c;
+        return codePointAndLength(c, start, offset);
+    }
+
+    private static int codePointAndLength(int c, int length) {
+        assert 0 <= c && c <= 0x10ffff;
+        assert 0 <= length && length <= 0xff;
+        return c << 8 | length;
+    }
+
+    private static int codePointAndLength(int c, int start, int limit) {
+        return codePointAndLength(c, limit - start);
+    }
+
+    public static int cpFromCodePointAndLength(int cpAndLength) {
+        assert cpAndLength >= 0;
+        return cpAndLength >> 8;
+    }
+
+    public static int lengthFromCodePointAndLength(int cpAndLength) {
+        assert cpAndLength >= 0;
+        return cpAndLength & 0xff;
     }
 
     /**
-     * Convert all escapes in a given string using unescapeAt().
+     * Convert all escapes in a given string using unescapeAndLengthAt().
      * @exception IllegalArgumentException if an invalid escape is
      * seen.
      */
-    public static String unescape(String s) {
-        StringBuilder buf = new StringBuilder();
-        int[] pos = new int[1];
+    public static String unescape(CharSequence s) {
+        StringBuilder buf = null;
         for (int i=0; i<s.length(); ) {
             char c = s.charAt(i++);
             if (c == '\\') {
-                pos[0] = i;
-                int e = unescapeAt(s, pos);
-                if (e < 0) {
+                if (buf == null) {
+                    buf = new StringBuilder(s.length()).append(s, 0, i - 1);
+                }
+                int cpAndLength = unescapeAndLengthAt(s, i);
+                if (cpAndLength < 0) {
                     throw new IllegalArgumentException("Invalid escape sequence " +
-                            s.substring(i-1, Math.min(i+8, s.length())));
+                            s.subSequence(i-1, Math.min(i+9, s.length())));
                 }
-                buf.appendCodePoint(e);
-                i = pos[0];
-            } else {
+                buf.appendCodePoint(cpAndLength >> 8);
+                i += cpAndLength & 0xff;
+            } else if (buf != null) {
+                // We could optimize this further by appending whole substrings between escapes.
                 buf.append(c);
             }
         }
+        if (buf == null) {
+            // No escapes in s.
+            return s.toString();
+        }
         return buf.toString();
     }
 
     /**
-     * Convert all escapes in a given string using unescapeAt().
+     * Convert all escapes in a given string using unescapeAndLengthAt().
      * Leave invalid escape sequences unchanged.
      */
-    public static String unescapeLeniently(String s) {
-        StringBuilder buf = new StringBuilder();
-        int[] pos = new int[1];
+    public static String unescapeLeniently(CharSequence s) {
+        StringBuilder buf = null;
         for (int i=0; i<s.length(); ) {
             char c = s.charAt(i++);
             if (c == '\\') {
-                pos[0] = i;
-                int e = unescapeAt(s, pos);
-                if (e < 0) {
+                if (buf == null) {
+                    buf = new StringBuilder(s.length()).append(s, 0, i - 1);
+                }
+                int cpAndLength = unescapeAndLengthAt(s, i);
+                if (cpAndLength < 0) {
                     buf.append(c);
                 } else {
-                    buf.appendCodePoint(e);
-                    i = pos[0];
+                    buf.appendCodePoint(cpAndLength >> 8);
+                    i += cpAndLength & 0xff;
                 }
-            } else {
+            } else if (buf != null) {
+                // We could optimize this further by appending whole substrings between escapes.
                 buf.append(c);
             }
         }
+        if (buf == null) {
+            // No escapes in s.
+            return s.toString();
+        }
         return buf.toString();
     }
 
index d37ea468ff5534139fed4cdd18670d8ad160d23e..166948b417dd89558189827cdc298969f4ca2c22 100644 (file)
@@ -87,7 +87,7 @@ public class TokenIterator {
     public int getLineNumber() {
         return reader.getLineNumber();
     }
-    
+
     /**
      * Return a string description of the position of the last line
      * returned by readLine() or readLineSkippingComments().
@@ -95,7 +95,7 @@ public class TokenIterator {
     public String describePosition() {
         return reader.describePosition() + ':' + (lastpos+1);
     }
-    
+
     /**
      * Read the next token from 'this.line' and append it to
      * 'this.buf'.  Tokens are separated by Pattern_White_Space.  Tokens
@@ -127,22 +127,17 @@ public class TokenIterator {
             buf.append(c);
             break;
         }
-        int[] posref = null;
         while (position < line.length()) {
             c = line.charAt(position); // 16-bit ok
             if (c == '\\') {
-                if (posref == null) {
-                    posref = new int[1];
-                }
-                posref[0] = position+1;
-                int c32 = Utility.unescapeAt(line, posref);
-                if (c32 < 0) {
+                int cpAndLength = Utility.unescapeAndLengthAt(line, position + 1);
+                if (cpAndLength < 0) {
                     throw new RuntimeException("Invalid escape at " +
                                                reader.describePosition() + ':' +
                                                position);
                 }
-                UTF16.append(buf, c32);
-                position = posref[0];
+                UTF16.append(buf, Utility.cpFromCodePointAndLength(cpAndLength));
+                position += 1 + Utility.lengthFromCodePointAndLength(cpAndLength);
             } else if ((quote != 0 && c == quote) ||
                        (quote == 0 && PatternProps.isWhiteSpace(c))) {
                 return ++position;
index c9a8aff5a6d9bf4010b8ca78dd8574694bed8d91..9249ba86edcf5147a8b92985c3ff32d77978c59b 100644 (file)
@@ -823,19 +823,18 @@ class RBBIRuleScanner {
 
             //
             //  check for backslash escaped characters.
-            //  Use String.unescapeAt() to handle them.
             //
             if (c.fChar == '\\') {
                 c.fEscaped = true;
-                int[] unescapeIndex = new int[1];
-                unescapeIndex[0] = fNextIndex;
-                c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
-                if (unescapeIndex[0] == fNextIndex) {
+                int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
+                if (cpAndLength < 0) {
                     error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
                 }
+                c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
+                int length = Utility.lengthFromCodePointAndLength(cpAndLength);
 
-                fCharNum += unescapeIndex[0] - fNextIndex;
-                fNextIndex = unescapeIndex[0];
+                fCharNum += length;
+                fNextIndex += length;
             }
         }
         // putc(c.fChar, stdout);
index 04154054ff80ee26593deafe7cf369c9c6c7fdb6..d41ff99ea807f23943b68d54da796b703d3cbb97 100644 (file)
@@ -2555,7 +2555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
         StringBuilder patBuf = new StringBuilder(), buf = null;
         boolean usePat = false;
         UnicodeSet scratch = null;
-        Object backup = null;
+        RuleCharacterIterator.Position backup = null;
 
         // mode: 0=before [, 1=between [...], 2=after ]
         // lastItem: 0=none, 1=char, 2=set
@@ -3673,7 +3673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
             int iterOpts) {
         boolean result = false;
         iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
-        Object pos = chars.getPos(null);
+        RuleCharacterIterator.Position pos = chars.getPos(null);
         int c = chars.next(iterOpts);
         if (c == '[' || c == '\\') {
             int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
@@ -3784,14 +3784,16 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      */
     private void applyPropertyPattern(RuleCharacterIterator chars,
             Appendable rebuiltPat, SymbolTable symbols) {
-        String patStr = chars.lookahead();
-        ParsePosition pos = new ParsePosition(0);
+        String patStr = chars.getCurrentBuffer();
+        int start = chars.getCurrentBufferPos();
+        ParsePosition pos = new ParsePosition(start);
         applyPropertyPattern(patStr, pos, symbols);
-        if (pos.getIndex() == 0) {
+        int length = pos.getIndex() - start;
+        if (length == 0) {
             syntaxError(chars, "Invalid property pattern");
         }
-        chars.jumpahead(pos.getIndex());
-        append(rebuiltPat, patStr.substring(0, pos.getIndex()));
+        chars.jumpahead(length);
+        append(rebuiltPat, patStr.substring(start, pos.getIndex()));
     }
 
     //----------------------------------------------------------------
index 9e300a729b2b4b24e0f9fa557b33d084750f82c2..f16a121e71b061e1683074f53e7dd55eb37c4e90 100644 (file)
@@ -493,12 +493,12 @@ class TransliteratorParser {
                     if (pos == limit) {
                         syntaxError("Trailing backslash", rule, start);
                     }
-                    iref[0] = pos;
-                    int escaped = Utility.unescapeAt(rule, iref);
-                    pos = iref[0];
-                    if (escaped == -1) {
+                    int cpAndLength = Utility.unescapeAndLengthAt(rule, pos);
+                    if (cpAndLength < 0) {
                         syntaxError("Malformed escape", rule, start);
                     }
+                    int escaped = Utility.cpFromCodePointAndLength(cpAndLength);
+                    pos += Utility.lengthFromCodePointAndLength(cpAndLength);
                     parser.checkVariableRange(escaped, rule, start);
                     UTF16.append(buf, escaped);
                     continue;
@@ -902,16 +902,16 @@ class TransliteratorParser {
         boolean parsingIDs = true;
         int ruleCount = 0;
 
-        dataVector = new ArrayList<Data>();
-        idBlockVector = new ArrayList<String>();
+        dataVector = new ArrayList<>();
+        idBlockVector = new ArrayList<>();
         curData = null;
         direction = dir;
         compoundFilter = null;
-        variablesVector = new ArrayList<Object>();
-        variableNames = new HashMap<String, char[]>();
+        variablesVector = new ArrayList<>();
+        variableNames = new HashMap<>();
         parseData = new ParseData();
 
-        List<RuntimeException> errors = new ArrayList<RuntimeException>();
+        List<RuntimeException> errors = new ArrayList<>();
         int errorCount = 0;
 
         ruleArray.reset();
@@ -1079,7 +1079,7 @@ class TransliteratorParser {
             Data data = dataVector.get(i);
             data.variables = new Object[variablesVector.size()];
             variablesVector.toArray(data.variables);
-            data.variableNames = new HashMap<String, char[]>();
+            data.variableNames = new HashMap<>();
             data.variableNames.putAll(variableNames);
         }
         variablesVector = null;
@@ -1143,7 +1143,7 @@ class TransliteratorParser {
 
         // Set up segments data
         segmentStandins = new StringBuffer();
-        segmentObjects = new ArrayList<StringMatcher>();
+        segmentObjects = new ArrayList<>();
 
         RuleHalf left  = new RuleHalf();
         RuleHalf right = new RuleHalf();
index 76c76b026806e54765459a38d55a4fa0ea37b8be..4bea6cc0ec886fda3cd1f4b98f1875eb6dff1ff5 100644 (file)
@@ -344,14 +344,12 @@ public void TestExtended() {
                 }
 
                 // Let unescape handle the back slash.
-                int  charIdxAr[] = new int[1];
-                charIdxAr[0] = charIdx;
-                cp = Utility.unescapeAt(testString, charIdxAr);
-                if (cp != -1) {
+                int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
+                if (cpAndLength >= 0) {
                     // Escape sequence was recognized.  Insert the char
                     //   into the test data.
-                    charIdx = charIdxAr[0];
-                    tp.dataToBreak.appendCodePoint(cp);
+                    charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
+                    tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
                     for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
                         tp.srcLine[i] = lineNum;
                         tp.srcCol[i]  = column;
index a7ee1d072d3ceb76bab844f23454729a4532a1e9..15d1e6010498823b0a1ea489868778e6ffe5182a 100644 (file)
@@ -52,11 +52,9 @@ public class UtilityTest extends TestFmwk {
 
         // Regression test for ICU-21645
         String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
-        int[] offset16 = new int[] { 1 };  // after the backslash
         // This returned U+B2F00 for the first _two_ escapes.
-        int c = Utility.unescapeAt(s, offset16);
-        assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
-        assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
+        int cpAndLength = Utility.unescapeAndLengthAt(s, 1);  // index 1 = after the backslash
+        assertEquals(s + " unescape at 1, cpAndLength", 0x1DA8B09, cpAndLength);
         String pattern = "[" + s + "]";
         // This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
         // and saw an invalid range of B2F00..1DF1E (start >= end).