/**
* Current variable expansion, or null if none.
*/
- private char[] buf;
+ private String buf;
/**
* Position within buf[]. Meaningless if buf == null.
/**
* Bitmask option to enable parsing of escape sequences. If (options &
* PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
- * to its value. Escapes are parsed using Utility.unescapeAt().
+ * to its value. Escapes are parsed using Utility.unescapeAndLengthAt().
*/
public static final int PARSE_ESCAPES = 2;
*/
public static final int SKIP_WHITESPACE = 4;
+ /**
+ * Snapshot of an iterator's state, for use with
+ * {@link #getPos(Position)} and {@link #setPos(Position)}.
+ * getPos() fills in the fields; setPos() reads them back.
+ */
+ public static final class Position {
+ /** Variable expansion buffer at capture time, or null if none. */
+ private String buf;
+ /** Position within buf at capture time. Meaningless if buf == null. */
+ private int bufPos;
+ /** Value of pos.getIndex() at capture time. */
+ private int posIndex;
+ }
+
/**
* Constructs an iterator over the given text, starting at the given
* position.
break;
}
bufPos = 0;
- buf = sym.lookup(name);
- if (buf == null) {
+ char[] chars = sym.lookup(name);
+ if (chars == null) {
+ buf = null;
throw new IllegalArgumentException(
"Undefined variable: " + name);
}
// Handle empty variable value
- if (buf.length == 0) {
+ if (chars.length == 0) {
buf = null;
+ } else {
+ // Only a non-empty expansion becomes the current buffer. An empty
+ // String with bufPos == 0 would make _current() index past its end.
+ buf = new String(chars);
}
continue;
}
}
if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
- int offset[] = new int[] { 0 };
- c = Utility.unescapeAt(lookahead(), offset);
- jumpahead(offset[0]);
- isEscaped = true;
- if (c < 0) {
+ int cpAndLength = Utility.unescapeAndLengthAt(
+ getCurrentBuffer(), getCurrentBufferPos());
+ if (cpAndLength < 0) {
throw new IllegalArgumentException("Invalid escape");
}
+ c = Utility.cpFromCodePointAndLength(cpAndLength);
+ jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength));
+ isEscaped = true;
}
break;
* restore this iterator's position. Usage idiom:
*
* RuleCharacterIterator iterator = ...;
- * Object pos = iterator.getPos(null); // allocate position object
+ * Position pos = iterator.getPos(null); // allocate position object
* for (;;) {
* pos = iterator.getPos(pos); // reuse position object
* int c = iterator.next(...);
* @return a position object which may be passed to setPos(),
* either `p,' or if `p' == null, a newly-allocated object
*/
- public Object getPos(Object p) {
+ public Position getPos(Position p) {
if (p == null) {
- return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
+ p = new Position();
}
- Object[] a = (Object[]) p;
- a[0] = buf;
- int[] v = (int[]) a[1];
- v[0] = pos.getIndex();
- v[1] = bufPos;
+ p.buf = buf;
+ p.bufPos = bufPos;
+ p.posIndex = pos.getIndex();
return p;
}
* returned the given object.
* @param p a position object previously returned by getPos()
*/
- public void setPos(Object p) {
- Object[] a = (Object[]) p;
- buf = (char[]) a[0];
- int[] v = (int[]) a[1];
- pos.setIndex(v[0]);
- bufPos = v[1];
+ public void setPos(Position p) {
+ buf = p.buf;
+ pos.setIndex(p.posIndex);
+ bufPos = p.bufPos;
}
/**
* Returns a string containing the remainder of the characters to be
* returned by this iterator, without any option processing. If the
* iterator is currently within a variable expansion, this will only
- * extend to the end of the variable expansion. This method is provided
- * so that iterators may interoperate with string-based APIs. The typical
- * sequence of calls is to call skipIgnored(), then call lookahead(), then
- * parse the string returned by lookahead(), then call jumpahead() to
+ * extend to the end of the variable expansion.
+ * This method and getCurrentBufferPos() together replace the former lookahead();
+ * they are provided so that iterators may interoperate with string-based APIs. The typical
+ * sequence of calls is to call skipIgnored(), then call these two methods, then
+ * parse the buffer starting at the returned position, then call jumpahead() to
* resynchronize the iterator.
* @return a string containing the characters to be returned by future
* calls to next()
*/
- public String lookahead() {
+ public String getCurrentBuffer() {
+ if (buf != null) {
+ return buf;
+ } else {
+ return text;
+ }
+ }
+
+ public int getCurrentBufferPos() {
if (buf != null) {
- return new String(buf, bufPos, buf.length - bufPos);
+ return bufPos;
} else {
- return text.substring(pos.getIndex());
+ return pos.getIndex();
}
}
/**
* Advances the position by the given number of 16-bit code units.
- * This is useful in conjunction with the lookahead() method.
+ * This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos()
+ * (formerly lookahead()).
* @param count the number of 16-bit code units to jump over
*/
public void jumpahead(int count) {
}
if (buf != null) {
bufPos += count;
- if (bufPos > buf.length) {
+ if (bufPos > buf.length()) {
throw new IllegalArgumentException();
}
- if (bufPos == buf.length) {
+ if (bufPos == buf.length()) {
buf = null;
}
} else {
*/
private int _current() {
if (buf != null) {
- return UTF16.charAt(buf, 0, buf.length, bufPos);
+ return UTF16.charAt(buf, bufPos);
} else {
int i = pos.getIndex();
return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
private void _advance(int count) {
if (buf != null) {
bufPos += count;
- if (bufPos == buf.length) {
+ if (bufPos == buf.length()) {
buf = null;
}
} else {
};
/**
- * Convert an escape to a 32-bit code point value. We attempt
+ * Converts an escape to a code point value. We attempt
* to parallel the icu4c unescapeAt() function.
- * @param offset16 an array containing offset to the character
- * <em>after</em> the backslash. Upon return offset16[0] will
- * be updated to point after the escape sequence.
- * @return character value from 0 to 10FFFF, or -1 on error.
+ * This function returns an integer with
+ * both the code point (bits 28..8) and the length of the escape sequence (bits 7..0).
+ * offset+length is the index after the escape sequence.
+ *
+ * @param offset the offset to the character <em>after</em> the backslash.
+ * @return the code point and length, or -1 on error.
*/
- public static int unescapeAt(String s, int[] offset16) {
+ public static int unescapeAndLengthAt(CharSequence s, int offset) {
int c;
int result = 0;
int n = 0;
boolean braces = false;
/* Check that offset is in range */
- int offset = offset16[0];
int length = s.length();
if (offset < 0 || offset >= length) {
return -1;
}
+ int start = offset;
/* Fetch first UChar after '\\' */
c = Character.codePointAt(s, offset);
int ahead = offset+1;
c = s.charAt(offset); // [sic] get 16-bit code unit
if (c == '\\' && ahead < length) {
- int o[] = new int[] { ahead };
- c = unescapeAt(s, o);
- ahead = o[0];
+ int cpAndLength = unescapeAndLengthAt(s, ahead);
+ if (cpAndLength >= 0) {
+ c = cpAndLength >> 8;
+ ahead += cpAndLength & 0xff;
+ }
}
if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
offset = ahead;
result = Character.toCodePoint((char) result, (char) c);
}
}
- offset16[0] = offset;
- return result;
+ return codePointAndLength(result, start, offset);
}
/* Convert C-style escapes in table */
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
if (c == UNESCAPE_MAP[i]) {
- offset16[0] = offset;
- return UNESCAPE_MAP[i+1];
+ return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
} else if (c < UNESCAPE_MAP[i]) {
break;
}
/* Map \cX to control-X: X & 0x1F */
if (c == 'c' && offset < length) {
c = UTF16.charAt(s, offset);
- offset16[0] = offset + UTF16.getCharCount(c);
- return 0x1F & c;
+ return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
}
/* If no special forms are recognized, then consider
* the backslash to generically escape the next character. */
- offset16[0] = offset;
- return c;
+ return codePointAndLength(c, start, offset);
+ }
+
+ private static int codePointAndLength(int c, int length) {
+ assert 0 <= c && c <= 0x10ffff;
+ assert 0 <= length && length <= 0xff;
+ return c << 8 | length;
+ }
+
+ private static int codePointAndLength(int c, int start, int limit) {
+ return codePointAndLength(c, limit - start);
+ }
+
+ public static int cpFromCodePointAndLength(int cpAndLength) {
+ assert cpAndLength >= 0;
+ return cpAndLength >> 8;
+ }
+
+ public static int lengthFromCodePointAndLength(int cpAndLength) {
+ assert cpAndLength >= 0;
+ return cpAndLength & 0xff;
}
/**
- * Convert all escapes in a given string using unescapeAt().
+ * Convert all escapes in a given string using unescapeAndLengthAt().
* @exception IllegalArgumentException if an invalid escape is
* seen.
*/
- public static String unescape(String s) {
- StringBuilder buf = new StringBuilder();
- int[] pos = new int[1];
+ public static String unescape(CharSequence s) {
+ StringBuilder buf = null;
for (int i=0; i<s.length(); ) {
char c = s.charAt(i++);
if (c == '\\') {
- pos[0] = i;
- int e = unescapeAt(s, pos);
- if (e < 0) {
+ if (buf == null) {
+ buf = new StringBuilder(s.length()).append(s, 0, i - 1);
+ }
+ int cpAndLength = unescapeAndLengthAt(s, i);
+ if (cpAndLength < 0) {
throw new IllegalArgumentException("Invalid escape sequence " +
- s.substring(i-1, Math.min(i+8, s.length())));
+ s.subSequence(i-1, Math.min(i+9, s.length())));
}
- buf.appendCodePoint(e);
- i = pos[0];
- } else {
+ buf.appendCodePoint(cpAndLength >> 8);
+ i += cpAndLength & 0xff;
+ } else if (buf != null) {
+ // We could optimize this further by appending whole substrings between escapes.
buf.append(c);
}
}
+ if (buf == null) {
+ // No escapes in s.
+ return s.toString();
+ }
return buf.toString();
}
/**
- * Convert all escapes in a given string using unescapeAt().
+ * Convert all escapes in a given string using unescapeAndLengthAt().
* Leave invalid escape sequences unchanged.
*/
- public static String unescapeLeniently(String s) {
- StringBuilder buf = new StringBuilder();
- int[] pos = new int[1];
+ public static String unescapeLeniently(CharSequence s) {
+ StringBuilder buf = null;
for (int i=0; i<s.length(); ) {
char c = s.charAt(i++);
if (c == '\\') {
- pos[0] = i;
- int e = unescapeAt(s, pos);
- if (e < 0) {
+ if (buf == null) {
+ buf = new StringBuilder(s.length()).append(s, 0, i - 1);
+ }
+ int cpAndLength = unescapeAndLengthAt(s, i);
+ if (cpAndLength < 0) {
buf.append(c);
} else {
- buf.appendCodePoint(e);
- i = pos[0];
+ buf.appendCodePoint(cpAndLength >> 8);
+ i += cpAndLength & 0xff;
}
- } else {
+ } else if (buf != null) {
+ // We could optimize this further by appending whole substrings between escapes.
buf.append(c);
}
}
+ if (buf == null) {
+ // No escapes in s.
+ return s.toString();
+ }
return buf.toString();
}
public int getLineNumber() {
return reader.getLineNumber();
}
-
+
/**
* Return a string description of the position of the last line
* returned by readLine() or readLineSkippingComments().
public String describePosition() {
return reader.describePosition() + ':' + (lastpos+1);
}
-
+
/**
* Read the next token from 'this.line' and append it to
* 'this.buf'. Tokens are separated by Pattern_White_Space. Tokens
buf.append(c);
break;
}
- int[] posref = null;
while (position < line.length()) {
c = line.charAt(position); // 16-bit ok
if (c == '\\') {
- if (posref == null) {
- posref = new int[1];
- }
- posref[0] = position+1;
- int c32 = Utility.unescapeAt(line, posref);
- if (c32 < 0) {
+ int cpAndLength = Utility.unescapeAndLengthAt(line, position + 1);
+ if (cpAndLength < 0) {
throw new RuntimeException("Invalid escape at " +
reader.describePosition() + ':' +
position);
}
- UTF16.append(buf, c32);
- position = posref[0];
+ UTF16.append(buf, Utility.cpFromCodePointAndLength(cpAndLength));
+ position += 1 + Utility.lengthFromCodePointAndLength(cpAndLength);
} else if ((quote != 0 && c == quote) ||
(quote == 0 && PatternProps.isWhiteSpace(c))) {
return ++position;
//
// check for backslash escaped characters.
- // Use String.unescapeAt() to handle them.
//
if (c.fChar == '\\') {
c.fEscaped = true;
- int[] unescapeIndex = new int[1];
- unescapeIndex[0] = fNextIndex;
- c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
- if (unescapeIndex[0] == fNextIndex) {
+ int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
+ if (cpAndLength < 0) {
error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
}
+ c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
+ int length = Utility.lengthFromCodePointAndLength(cpAndLength);
- fCharNum += unescapeIndex[0] - fNextIndex;
- fNextIndex = unescapeIndex[0];
+ fCharNum += length;
+ fNextIndex += length;
}
}
// putc(c.fChar, stdout);
StringBuilder patBuf = new StringBuilder(), buf = null;
boolean usePat = false;
UnicodeSet scratch = null;
- Object backup = null;
+ RuleCharacterIterator.Position backup = null;
// mode: 0=before [, 1=between [...], 2=after ]
// lastItem: 0=none, 1=char, 2=set
int iterOpts) {
boolean result = false;
iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
- Object pos = chars.getPos(null);
+ RuleCharacterIterator.Position pos = chars.getPos(null);
int c = chars.next(iterOpts);
if (c == '[' || c == '\\') {
int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
*/
private void applyPropertyPattern(RuleCharacterIterator chars,
Appendable rebuiltPat, SymbolTable symbols) {
- String patStr = chars.lookahead();
- ParsePosition pos = new ParsePosition(0);
+ String patStr = chars.getCurrentBuffer();
+ int start = chars.getCurrentBufferPos();
+ ParsePosition pos = new ParsePosition(start);
applyPropertyPattern(patStr, pos, symbols);
- if (pos.getIndex() == 0) {
+ int length = pos.getIndex() - start;
+ if (length == 0) {
syntaxError(chars, "Invalid property pattern");
}
- chars.jumpahead(pos.getIndex());
- append(rebuiltPat, patStr.substring(0, pos.getIndex()));
+ chars.jumpahead(length);
+ append(rebuiltPat, patStr.substring(start, pos.getIndex()));
}
//----------------------------------------------------------------
if (pos == limit) {
syntaxError("Trailing backslash", rule, start);
}
- iref[0] = pos;
- int escaped = Utility.unescapeAt(rule, iref);
- pos = iref[0];
- if (escaped == -1) {
+ int cpAndLength = Utility.unescapeAndLengthAt(rule, pos);
+ if (cpAndLength < 0) {
syntaxError("Malformed escape", rule, start);
}
+ int escaped = Utility.cpFromCodePointAndLength(cpAndLength);
+ pos += Utility.lengthFromCodePointAndLength(cpAndLength);
parser.checkVariableRange(escaped, rule, start);
UTF16.append(buf, escaped);
continue;
boolean parsingIDs = true;
int ruleCount = 0;
- dataVector = new ArrayList<Data>();
- idBlockVector = new ArrayList<String>();
+ dataVector = new ArrayList<>();
+ idBlockVector = new ArrayList<>();
curData = null;
direction = dir;
compoundFilter = null;
- variablesVector = new ArrayList<Object>();
- variableNames = new HashMap<String, char[]>();
+ variablesVector = new ArrayList<>();
+ variableNames = new HashMap<>();
parseData = new ParseData();
- List<RuntimeException> errors = new ArrayList<RuntimeException>();
+ List<RuntimeException> errors = new ArrayList<>();
int errorCount = 0;
ruleArray.reset();
Data data = dataVector.get(i);
data.variables = new Object[variablesVector.size()];
variablesVector.toArray(data.variables);
- data.variableNames = new HashMap<String, char[]>();
+ data.variableNames = new HashMap<>();
data.variableNames.putAll(variableNames);
}
variablesVector = null;
// Set up segments data
segmentStandins = new StringBuffer();
- segmentObjects = new ArrayList<StringMatcher>();
+ segmentObjects = new ArrayList<>();
RuleHalf left = new RuleHalf();
RuleHalf right = new RuleHalf();
}
// Let unescape handle the back slash.
- int charIdxAr[] = new int[1];
- charIdxAr[0] = charIdx;
- cp = Utility.unescapeAt(testString, charIdxAr);
- if (cp != -1) {
+ int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
+ if (cpAndLength >= 0) {
// Escape sequence was recognized. Insert the char
// into the test data.
- charIdx = charIdxAr[0];
- tp.dataToBreak.appendCodePoint(cp);
+ charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
+ tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
tp.srcLine[i] = lineNum;
tp.srcCol[i] = column;
// Regression test for ICU-21645
String s = "\\U0001DA8B\\U0001DF00-\\U0001DF1E";
- int[] offset16 = new int[] { 1 }; // after the backslash
// This returned U+B2F00 for the first _two_ escapes.
- int c = Utility.unescapeAt(s, offset16);
- assertEquals(s + " unescape at 1, code point", 0x1DA8B, c);
- assertEquals(s + " unescape at 1, offset", 10, offset16[0]);
+ int cpAndLength = Utility.unescapeAndLengthAt(s, 1); // index 1 = after the backslash
+ assertEquals(s + " unescape at 1, cpAndLength", 0x1DA8B09, cpAndLength);
String pattern = "[" + s + "]";
// This threw an IllegalArgumentException because the parser called Utility.unescapeAt()
// and saw an invalid range of B2F00..1DF1E (start >= end).