From: Shane Carr Date: Wed, 29 Mar 2017 02:31:49 +0000 (+0000) Subject: ICU-13060 Documentation and parsing whitespace/bidi tweaks. X-Git-Tag: release-59-rc~21 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=08dcc7e82e0433576f74ddb7ff3fd5f2c97eef6e;p=icu ICU-13060 Documentation and parsing whitespace/bidi tweaks. X-SVN-Rev: 39951 --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java index ae8d6d91f20..a04c2433f38 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java @@ -11,6 +11,7 @@ import java.util.Iterator; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import com.ibm.icu.impl.PatternProps; import com.ibm.icu.impl.StandardPlural; import com.ibm.icu.impl.TextTrieMap; import com.ibm.icu.impl.number.formatters.BigDecimalMultiplier; @@ -1129,7 +1130,7 @@ public class Parse { } acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item); if (state.length > 0 && mode == ParseMode.FAST) break; - acceptBidi(cp, StateName.AFTER_INTEGER_DIGIT, state, item); + acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); if (state.length > 0 && mode == ParseMode.FAST) break; acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); if (state.length > 0 && mode == ParseMode.FAST) break; @@ -1155,7 +1156,7 @@ public class Parse { // We encountered a decimal point acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item); if (state.length > 0 && mode == ParseMode.FAST) break; - acceptBidi(cp, StateName.AFTER_FRACTION_DIGIT, state, item); + acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); if (state.length > 0 && mode == ParseMode.FAST) break; acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); if (state.length > 0 && mode == ParseMode.FAST) break; @@ -1184,7 +1185,7 @@ public class Parse { break; case AFTER_EXPONENT_DIGIT: - acceptBidi(cp, StateName.AFTER_EXPONENT_DIGIT, state, item); + acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item); if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { @@ -1257,17 +1258,17 @@ public class Parse { break; case INSIDE_STRING: - acceptStringOffset(cp, state, item); + long added0 = acceptStringOffset(cp, state, item); // Accept arbitrary bidi in the middle of strings. - if (state.length == 0 && UNISET_BIDI.contains(cp)) { + if (added0 == 0L && UNISET_BIDI.contains(cp)) { state.getNext().copyFrom(item, item.name, cp); } break; case INSIDE_AFFIX_PATTERN: - long added = acceptAffixPatternOffset(cp, state, item); - // Accept arbitrary bidi and whitespace (if lenient) in the middle of affixes. - if (added == 0L && isIgnorable(cp, state)) { + long added1 = acceptAffixPatternOffset(cp, state, item); + // Accept arbitrary bidi and whitespace (if lenient) in the middle of affix patterns. + if (added1 == 0L && isIgnorable(cp, state)) { state.getNext().copyFrom(item, item.name, cp); } break; @@ -1758,8 +1759,8 @@ public class Parse { if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5; } - private static void acceptStringOffset(int cp, ParserState state, StateItem item) { - acceptString( + private static long acceptStringOffset(int cp, ParserState state, StateItem item) { + return acceptString( cp, item.returnTo1, item.returnTo2, @@ -1813,20 +1814,20 @@ public class Parse { referenceCp = Character.codePointAt(str, offset); count = Character.charCount(referenceCp); equals = codePointEquals(cp, referenceCp, state); - if (!UNISET_BIDI.contains(cp)) break; + if (!UNISET_BIDI.contains(referenceCp)) break; } if (equals) { // Matches first code point of the string StateItem next = state.getNext().copyFrom(item, null, cp); - // Skip over ignorable code points in the middle of the string. + // Skip over bidi code points in the middle or end of the string. // They will be accepted in the main loop. offset += count; for (; offset < str.length(); offset += count) { referenceCp = Character.codePointAt(str, offset); count = Character.charCount(referenceCp); - if (!UNISET_BIDI.contains(cp)) break; + if (!UNISET_BIDI.contains(referenceCp)) break; } if (offset < str.length()) { @@ -1879,7 +1880,7 @@ public class Parse { tag = AffixPatternUtils.nextToken(tag, str); typeOrCp = AffixPatternUtils.getTypeOrCp(tag); hasNext = AffixPatternUtils.hasNext(tag, str); - if (typeOrCp < 0 || !isIgnorable(typeOrCp, state)) break; + if (typeOrCp < 0 || !isPatternIgnorable(typeOrCp, state)) break; } // Convert from the returned tag to a code point, string, or currency to check @@ -1926,18 +1927,16 @@ public class Parse { while (hasNext) { long futureTag = AffixPatternUtils.nextToken(tag, str); int futureTypeOrCp = AffixPatternUtils.getTypeOrCp(futureTag); - if (futureTypeOrCp < 0 || !isIgnorable(futureTypeOrCp, state)) break; + if (futureTypeOrCp < 0 || !isPatternIgnorable(futureTypeOrCp, state)) break; tag = futureTag; typeOrCp = futureTypeOrCp; hasNext = AffixPatternUtils.hasNext(tag, str); } long added = 0L; - if (resolvedCp >= 0) { + if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) { // Code point - if (!codePointEquals(cp, resolvedCp, state)) return 0L; StateItem next = state.getNext().copyFrom(item, null, cp); - if (hasNext) { // Additional tokens in affix string. next.name = StateName.INSIDE_AFFIX_PATTERN; @@ -1948,6 +1947,7 @@ public class Parse { next.trailingCount = 0; next.returnTo1 = null; } + next.score += 1; // reward for consuming code point from pattern added |= 1L << state.lastInsertedIndex(); } if (resolvedMinusSign) { @@ -2180,11 +2180,26 @@ public class Parse { * * @param cp The code point to test. Returns false if cp is negative. * @param state The current {@link ParserState}, used for determining strict mode. - * @return true if cp is bidi or whitespace in lenient mode; false otherwise. + * @return true if cp is ignorable; false otherwise. */ private static boolean isIgnorable(int cp, ParserState state) { if (cp < 0) return false; if (UNISET_BIDI.contains(cp)) return true; return state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp); } + + /** + * Checks whether the given code point is "ignorable" in pattern syntax. This includes all + * characters that are normally ignorable in {@link #isIgnorable} plus characters having the + * Pattern_White_Space property. + * + * @param cp The code point to test. Returns false if cp is negative. + * @param state The current {@link ParserState}, used for determining strict mode. + * @return true if cp is ignorable; false otherwise. + */ + private static boolean isPatternIgnorable(int cp, ParserState state) { + if (cp < 0) return false; + if (state.mode == ParseMode.LENIENT && PatternProps.isWhiteSpace(cp)) return true; + return isIgnorable(cp, state); + } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java index fbe9915b548..dbdb41fe578 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java @@ -198,6 +198,15 @@ import com.ibm.icu.util.ULocale.Category; * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates * parse failure by throwing a {@link java.text.ParseException}. * + *

Under the hood, a state table parsing engine is used. To debug a parsing failure during + * development, use the following pattern to print details about the state table transitions: + * + *

+ * com.ibm.icu.impl.number.Parse.DEBUGGING = true;
+ * df.parse("123.45", ppos);
+ * com.ibm.icu.impl.number.Parse.DEBUGGING = false;
+ * 
+ * *

Thread Safety and Best Practices

* *

Starting with ICU 59, instance of DecimalFormat are thread-safe. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt index 0b4d412a89c..83ece37db3a 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt @@ -1439,3 +1439,62 @@ parse output breaks // K and J return null 9 999 9999 JK +test parse ignorables +set locale ar +// Note: Prefixes contain RLMs, as do some of the test cases. +set pattern x a‎b0c df +set negativePrefix y g‎h +set negativeSuffix i jk +begin +parse output breaks +x a‎b56c df 56 +x a‎b56c df 56 K +x ab56c df 56 K +x ab56c df 56 JK +x ab56c df 56 K +x ab56 56 JK +x a b56 56 JK +56cdf 56 JK +56c df 56 JK +56cd f 56 JK +56c‎d‎f 56 JK +56cdf 56 JK +56c d‎f 56 JK +56‎c df 56 JK +y g‎h56i jk -56 +y g‎h56i jk -56 KS +y gh56i jk -56 K +y gh56i jk -56 JK +y gh56i jk -56 K +y gh56 -56 JK +y g h56 -56 JKS +// S stops parsing at the 'i' for these and returns -56 +56ijk -56 JK +56i jk -56 JK +56ij k -56 JK +56i‎j‎k -56 JK +56ijk -56 JK +56i j‎k -56 JK +56‎i jk -56 JK +// S gets 56 (accepts ' ' gs grouping); J and K get null +5 6 5 JKS +5‎6 5 JK + +test parse spaces in grouping +// This test gives the ideal behavior of these cases, which +// none of the implementations currently support. +set locale en +set pattern #,##0 +begin +parse output breaks +// J and S get "12" here +1 2 1 JS +1 23 1 JS +// K gets 1 here; doesn't pick up the grouping separator +1 234 1234 K + + + + + + diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java index 0ac2e58bacc..1ad5884c5cd 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java @@ -4850,13 +4850,16 @@ public class NumberFormatTest extends TestFmwk { @Test public void Test11839() { DecimalFormatSymbols dfs = new DecimalFormatSymbols(ULocale.ENGLISH); - dfs.setMinusSign('∸'); - dfs.setPlusSign('∔'); // ∔ U+2214 DOT PLUS + dfs.setMinusSignString("a∸"); + dfs.setPlusSignString("b∔"); // ∔ U+2214 DOT PLUS DecimalFormat df = new DecimalFormat("0.00+;0.00-", dfs); String result = df.format(-1.234); - assertEquals("Locale-specific minus sign should be used", "1.23∸", result); + assertEquals("Locale-specific minus sign should be used", "1.23a∸", result); result = df.format(1.234); - assertEquals("Locale-specific plus sign should be used", "1.23∔", result); + assertEquals("Locale-specific plus sign should be used", "1.23b∔", result); + // Test round-trip with parse + expect2(df, -456, "456.00a∸"); + expect2(df, 456, "456.00b∔"); } @Test @@ -5171,6 +5174,24 @@ public class NumberFormatTest extends TestFmwk { } } + @Test + public void testParseIgnorables() { + // Also see the test case "test parse ignorables" in numberformattestspecification.txt + DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(); + dfs.setPercentString("\u200E%\u200E"); + DecimalFormat df = new DecimalFormat("0%;-0a", dfs); + ParsePosition ppos = new ParsePosition(0); + Number result = df.parse("42\u200E%\u200E ", ppos); + assertEquals("Should parse as percentage", new BigDecimal("0.42"), result); + // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future. + assertEquals("Should not consume the trailing bidi even though it is in the symbol", 4, ppos.getIndex()); + ppos.setIndex(0); + result = df.parse("-42a\u200E ", ppos); + assertEquals("Should parse as percent", new BigDecimal("-0.42"), result); + // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future. + assertEquals("Should not consume the trailing bidi or whitespace", 4, ppos.getIndex()); + } + @Test public void testSignificantDigitsMode() { String[][] allExpected = {