From 48a633f41f36a42f9331fdeb6813bd6181d07313 Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Tue, 6 Feb 2018 09:43:37 +0000 Subject: [PATCH] ICU-13574 Defining more fundamental parsing types. X-SVN-Rev: 40843 --- icu4c/source/i18n/Makefile.in | 2 +- icu4c/source/i18n/number_decimalquantity.cpp | 30 +++ icu4c/source/i18n/number_decimalquantity.h | 13 +- icu4c/source/i18n/numparse_parsednumber.cpp | 75 ++++++ icu4c/source/i18n/numparse_stringsegment.h | 55 ----- icu4c/source/i18n/numparse_types.h | 216 +++++++++++++++++- icu4c/source/i18n/plurrule.cpp | 2 + icu4c/source/i18n/plurrule_impl.h | 126 +++++----- .../icu/impl/number/parse/AffixMatcher.java | 2 +- .../impl/number/parse/NumberParseMatcher.java | 2 + .../impl/number/parse/NumberParserImpl.java | 16 +- .../icu/impl/number/parse/ParsedNumber.java | 6 +- .../number/parse/UnicodeSetStaticCache.java | 5 - .../icu/dev/test/number/NumberParserTest.java | 17 +- .../number/UnicodeSetStaticCacheTest.java | 2 +- 15 files changed, 423 insertions(+), 146 deletions(-) create mode 100644 icu4c/source/i18n/numparse_parsednumber.cpp diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index 2b9cca70556..a5752781f2f 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -108,7 +108,7 @@ number_decimfmtprops.o number_fluent.o number_formatimpl.o number_grouping.o \ number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \ number_padding.o number_patternmodifier.o number_patternstring.o \ number_rounding.o number_scientific.o number_stringbuilder.o \ -numparse_stringsegment.o numparse_unisets.o +numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o ## Header files to install diff --git a/icu4c/source/i18n/number_decimalquantity.cpp b/icu4c/source/i18n/number_decimalquantity.cpp index 6f6ac9def65..3342771b987 100644 --- a/icu4c/source/i18n/number_decimalquantity.cpp +++ b/icu4c/source/i18n/number_decimalquantity.cpp @@ -103,6 +103,7 @@ 
DecimalQuantity &DecimalQuantity::operator=(const DecimalQuantity &other) { return *this; } copyBcdFrom(other); + bogus = other.bogus; lOptPos = other.lOptPos; lReqPos = other.lReqPos; rReqPos = other.rReqPos; @@ -466,6 +467,35 @@ int64_t DecimalQuantity::toFractionLong(bool includeTrailingZeros) const { return result; } +bool DecimalQuantity::fitsInLong() const { + if (isZero()) { + return true; + } + if (scale < 0) { + return false; + } + int magnitude = getMagnitude(); + if (magnitude < 18) { + return true; + } + if (magnitude > 18) { + return false; + } + // Hard case: the magnitude is 10^18. + // The largest int64 is: 9,223,372,036,854,775,807 + for (int p = 0; p < precision; p++) { + int8_t digit = getDigit(18 - p); + static int8_t INT64_BCD[] = { 9, 2, 2, 3, 3, 7, 2, 0, 3, 6, 8, 5, 4, 7, 7, 5, 8, 0, 7 }; + if (digit < INT64_BCD[p]) { + return true; + } else if (digit > INT64_BCD[p]) { + return false; + } + } + // Exactly equal to max long. + return true; +} + double DecimalQuantity::toDouble() const { if (isApproximate) { return toDoubleFromOriginal(); diff --git a/icu4c/source/i18n/number_decimalquantity.h b/icu4c/source/i18n/number_decimalquantity.h index 3ff9fbeffef..aea66fdb7cd 100644 --- a/icu4c/source/i18n/number_decimalquantity.h +++ b/icu4c/source/i18n/number_decimalquantity.h @@ -35,7 +35,7 @@ class U_I18N_API DecimalQuantity : public IFixedDecimal, public UMemory { DecimalQuantity(); - ~DecimalQuantity(); + ~DecimalQuantity() override; /** * Sets this instance to be equal to another instance. @@ -128,6 +128,12 @@ class U_I18N_API DecimalQuantity : public IFixedDecimal, public UMemory { int64_t toFractionLong(bool includeTrailingZeros) const; + /** + * Returns whether or not a Long can fully represent the value stored in this DecimalQuantity. + * Assumes that the DecimalQuantity is positive. + */ + bool fitsInLong() const; + /** @return The value contained in this {@link DecimalQuantity} approximated as a double. 
*/ double toDouble() const; @@ -235,6 +241,11 @@ class U_I18N_API DecimalQuantity : public IFixedDecimal, public UMemory { /** Visible for testing */ inline bool isExplicitExactDouble() { return explicitExactDouble; }; + /** + * Bogus flag for when a DecimalQuantity is stored on the stack. + */ + bool bogus = false; + private: /** * The power of ten corresponding to the least significant digit in the BCD. For example, if this diff --git a/icu4c/source/i18n/numparse_parsednumber.cpp b/icu4c/source/i18n/numparse_parsednumber.cpp new file mode 100644 index 00000000000..9db933502a3 --- /dev/null +++ b/icu4c/source/i18n/numparse_parsednumber.cpp @@ -0,0 +1,75 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT + +#include "numparse_types.h" +#include <cmath> + +using namespace icu; +using namespace icu::numparse; +using namespace icu::numparse::impl; + + +ParsedNumber::ParsedNumber() { + clear(); +} + +void ParsedNumber::clear() { + quantity.bogus = true; + charEnd = 0; + flags = 0; + prefix.setToBogus(); + suffix.setToBogus(); + currencyCode.setToBogus(); +} + +void ParsedNumber::setCharsConsumed(const StringSegment& segment) { + charEnd = segment.getOffset(); +} + +bool ParsedNumber::success() const { + return charEnd > 0 && 0 == (flags & FLAG_FAIL); +} + +bool ParsedNumber::seenNumber() const { + return !quantity.bogus || 0 != (flags & FLAG_NAN) || 0 != (flags & FLAG_INFINITY); +} + +double ParsedNumber::getDouble() const { + bool sawNegative = 0 != (flags & FLAG_NEGATIVE); + bool sawNaN = 0 != (flags & FLAG_NAN); + bool sawInfinity = 0 != (flags & FLAG_INFINITY); + + // Check for NaN, infinity, and -0.0 + if (sawNaN) { + return NAN; + } + if (sawInfinity) { + if (sawNegative) { + return -INFINITY; + } else { + return INFINITY; + } + } + if (quantity.isZero() && sawNegative) { + return -0.0; + } + + if
(quantity.fitsInLong()) { + long l = quantity.toLong(); + if (0 != (flags & FLAG_NEGATIVE)) { + l *= -1; + } + return l; + } + + // TODO: MIN_LONG + return quantity.toDouble(); +} + + + +#endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_stringsegment.h b/icu4c/source/i18n/numparse_stringsegment.h index 30f11af7a19..d2e6154cd3d 100644 --- a/icu4c/source/i18n/numparse_stringsegment.h +++ b/icu4c/source/i18n/numparse_stringsegment.h @@ -15,61 +15,6 @@ U_NAMESPACE_BEGIN namespace numparse { namespace impl { -/** - * A mutable class allowing for a String with a variable offset and length. The charAt, length, and - * subSequence methods all operate relative to the fixed offset into the String. - * - * @author sffc - */ -class StringSegment : public UMemory, public ::icu::number::impl::CharSequence { - public: - explicit StringSegment(const UnicodeString &str); - - int32_t getOffset() const; - - void setOffset(int32_t start); - - /** - * Equivalent to setOffset(getOffset()+delta). - * - *

- * This method is usually called by a Matcher to register that a char was consumed. If the char is - * strong (it usually is, except for things like whitespace), follow this with a call to - * {@link ParsedNumber#setCharsConsumed}. For more information on strong chars, see that method. - */ - void adjustOffset(int32_t delta); - - void setLength(int32_t length); - - void resetLength(); - - int32_t length() const override; - - char16_t charAt(int32_t index) const override; - - UChar32 codePointAt(int32_t index) const override; - - UnicodeString toUnicodeString() const override; - - /** - * Returns the first code point in the string segment, or -1 if the string starts with an invalid - * code point. - */ - UChar32 getCodePoint() const; - - /** - * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For - * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2, - * since the first 2 characters are the same. - */ - int32_t getCommonPrefixLength(const UnicodeString &other); - - private: - const UnicodeString fStr; - int32_t fStart; - int32_t fEnd; -}; - } // namespace impl } // namespace numparse diff --git a/icu4c/source/i18n/numparse_types.h b/icu4c/source/i18n/numparse_types.h index b607f36cc99..92957204baa 100644 --- a/icu4c/source/i18n/numparse_types.h +++ b/icu4c/source/i18n/numparse_types.h @@ -8,11 +8,223 @@ #define __NUMPARSE_TYPES_H__ #include "unicode/uobject.h" +#include "number_decimalquantity.h" -U_NAMESPACE_BEGIN -namespace numparse { +U_NAMESPACE_BEGIN namespace numparse { namespace impl { +// Forward-declarations +class StringSegment; +class ParsedNumber; + + +/** + * Struct-like class to hold the results of a parsing routine. 
+ * + * @author sffc + */ +class ParsedNumber { + public: + enum ParsedNumberFlags { + FLAG_NEGATIVE = 0x0001, + FLAG_PERCENT = 0x0002, + FLAG_PERMILLE = 0x0004, + FLAG_HAS_EXPONENT = 0x0008, + FLAG_HAS_DEFAULT_CURRENCY = 0x0010, + FLAG_HAS_DECIMAL_SEPARATOR = 0x0020, + FLAG_NAN = 0x0040, + FLAG_INFINITY = 0x0080, + FLAG_FAIL = 0x0100, + }; + + /** + * The numerical value that was parsed. + */ + ::icu::number::impl::DecimalQuantity quantity; + + /** + * The index immediately after the last char consumed during parsing. If parsing started at index 0, this is equal + * to the number of chars consumed. This is NOT necessarily the same as the StringSegment offset; + * "weak" chars, like whitespace, change the offset, but charEnd is not touched until a + * "strong" char is encountered. + */ + int32_t charEnd; + + /** + * Boolean flags (see the ParsedNumberFlags constants above). + */ + int32_t flags; + + /** + * The pattern string corresponding to the prefix that got consumed. + */ + UnicodeString prefix; + + /** + * The pattern string corresponding to the suffix that got consumed. + */ + UnicodeString suffix; + + /** + * The currency that got consumed. + */ + UnicodeString currencyCode; + + ParsedNumber(); + + ParsedNumber(const ParsedNumber& other) = default; + + ParsedNumber& operator=(const ParsedNumber& other) = default; + + void clear(); + + /** + * Call this method to register that a "strong" char was consumed. This should be done after calling + * {@link StringSegment#setOffset} or {@link StringSegment#adjustOffset} except when the char is + * "weak", like whitespace. + * + *

+ * What is a strong versus weak char? The behavior of number parsing is to "stop" + * after reading the number, even if there is other content following the number. For example, after + * parsing the string "123 " (123 followed by a space), the cursor should be set to 3, not 4, even + * though there are matchers that accept whitespace. In this example, the digits are strong, whereas + * the whitespace is weak. Grouping separators are weak, whereas decimal separators are strong. Most + * other chars are strong. + * + * @param segment + * The current StringSegment, usually immediately following a call to setOffset. + */ + void setCharsConsumed(const StringSegment& segment); + + /** + * Returns whether the parse was successful. To be successful, at least one char must have been + * consumed, and the failure flag must not be set. + */ + bool success() const; + + bool seenNumber() const; + + double getDouble() const; +}; + + +/** + * A mutable class allowing for a String with a variable offset and length. The charAt, length, and + * subSequence methods all operate relative to the fixed offset into the String. + * + * @author sffc + */ +class StringSegment : public UMemory, public ::icu::number::impl::CharSequence { + public: + explicit StringSegment(const UnicodeString& str); + + int32_t getOffset() const; + + void setOffset(int32_t start); + + /** + * Equivalent to setOffset(getOffset()+delta). + * + *

+ * This method is usually called by a Matcher to register that a char was consumed. If the char is + * strong (it usually is, except for things like whitespace), follow this with a call to + * {@link ParsedNumber#setCharsConsumed}. For more information on strong chars, see that method. + */ + void adjustOffset(int32_t delta); + + void setLength(int32_t length); + + void resetLength(); + + int32_t length() const override; + + char16_t charAt(int32_t index) const override; + + UChar32 codePointAt(int32_t index) const override; + + UnicodeString toUnicodeString() const override; + + /** + * Returns the first code point in the string segment, or -1 if the string starts with an invalid + * code point. + */ + UChar32 getCodePoint() const; + + /** + * Returns the length of the prefix shared by this StringSegment and the given UnicodeString. For + * example, if this string segment is "aab", and the other string is "aac", this method returns 2, + * since the first 2 characters are the same. + */ + int32_t getCommonPrefixLength(const UnicodeString& other); + + private: + const UnicodeString fStr; + int32_t fStart; + int32_t fEnd; +}; + + +/** + * The core interface implemented by all matchers used for number parsing. + * + * Given a string, there should NOT be more than one way to consume the string with the same matcher + * applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an + * exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given + * the string "AAAA", there are 2^(N-1) = 8 ways to apply the A Matcher to this string: you could have the A + * Matcher apply 4 times, once to each character; you could have it apply just once to all the characters; you + * could have it apply to the first 2 characters and the second 2 characters; and so on.
A better version + * of the "A Matcher" would be for it to accept exactly one A, and allow the algorithm to run it + * repeatedly to consume a string of multiple As. The A Matcher can override the isFlexible() method + * below to signal that it can be applied multiple times in a row. + * + * @author sffc + */ +class NumberParseMatcher { + public: + virtual ~NumberParseMatcher() = default; + + /** + * Matchers can override this method to return true to indicate that they are optional and can be run + * repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher. + */ + virtual bool isFlexible() const { + return false; + } + + /** + * Runs this matcher starting at the beginning of the given StringSegment. If this matcher finds + * something interesting in the StringSegment, it should update the offset of the StringSegment + * corresponding to how many chars were matched. + * + * @param segment + * The StringSegment to match against. Matches always start at the beginning of the + * segment. The segment is guaranteed to contain at least one char. + * @param result + * The data structure to store results if the match succeeds. + * @return Whether this matcher thinks there may be more interesting chars beyond the end of the + * string segment. + */ + virtual bool match(StringSegment& segment, ParsedNumber& result) const = 0; + + /** + * Should return a set representing all possible chars (UTF-16 code units) that could be the first + * char that this matcher can consume. This method is only called during construction phase, and its + * return value is used to skip this matcher unless a segment begins with a char in this set. To make + * this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}. + */ + virtual UnicodeSet getLeadCodePoints() const = 0; + + /** + * Method called at the end of a parse, after all matchers have failed to consume any more chars.
+ * Allows a matcher to make final modifications to the result given the knowledge that no more + * matches are possible. + * + * @param result + * The data structure to store results. + */ + virtual void postProcess(ParsedNumber& result) const = 0; +}; + } // namespace impl } // namespace numparse diff --git a/icu4c/source/i18n/plurrule.cpp b/icu4c/source/i18n/plurrule.cpp index dcf28b2bc1a..14b5fe6d9d1 100644 --- a/icu4c/source/i18n/plurrule.cpp +++ b/icu4c/source/i18n/plurrule.cpp @@ -42,6 +42,8 @@ U_NAMESPACE_BEGIN +using namespace icu::pluralimpl; + static const UChar PLURAL_KEYWORD_OTHER[]={LOW_O,LOW_T,LOW_H,LOW_E,LOW_R,0}; static const UChar PLURAL_DEFAULT_RULE[]={LOW_O,LOW_T,LOW_H,LOW_E,LOW_R,COLON,SPACE,LOW_N,0}; static const UChar PK_IN[]={LOW_I,LOW_N,0}; diff --git a/icu4c/source/i18n/plurrule_impl.h b/icu4c/source/i18n/plurrule_impl.h index b93fc501bac..152c33e862d 100644 --- a/icu4c/source/i18n/plurrule_impl.h +++ b/icu4c/source/i18n/plurrule_impl.h @@ -40,67 +40,71 @@ class DigitInterval; class PluralRules; class VisibleDigits; -static const UChar DOT = ((UChar)0x002E); -static const UChar SINGLE_QUOTE = ((UChar)0x0027); -static const UChar SLASH = ((UChar)0x002F); -static const UChar BACKSLASH = ((UChar)0x005C); -static const UChar SPACE = ((UChar)0x0020); -static const UChar EXCLAMATION = ((UChar)0x0021); -static const UChar QUOTATION_MARK = ((UChar)0x0022); -static const UChar NUMBER_SIGN = ((UChar)0x0023); -static const UChar PERCENT_SIGN = ((UChar)0x0025); -static const UChar ASTERISK = ((UChar)0x002A); -static const UChar COMMA = ((UChar)0x002C); -static const UChar HYPHEN = ((UChar)0x002D); -static const UChar U_ZERO = ((UChar)0x0030); -static const UChar U_ONE = ((UChar)0x0031); -static const UChar U_TWO = ((UChar)0x0032); -static const UChar U_THREE = ((UChar)0x0033); -static const UChar U_FOUR = ((UChar)0x0034); -static const UChar U_FIVE = ((UChar)0x0035); -static const UChar U_SIX = ((UChar)0x0036); -static const UChar U_SEVEN = 
((UChar)0x0037); -static const UChar U_EIGHT = ((UChar)0x0038); -static const UChar U_NINE = ((UChar)0x0039); -static const UChar COLON = ((UChar)0x003A); -static const UChar SEMI_COLON = ((UChar)0x003B); -static const UChar EQUALS = ((UChar)0x003D); -static const UChar AT = ((UChar)0x0040); -static const UChar CAP_A = ((UChar)0x0041); -static const UChar CAP_B = ((UChar)0x0042); -static const UChar CAP_R = ((UChar)0x0052); -static const UChar CAP_Z = ((UChar)0x005A); -static const UChar LOWLINE = ((UChar)0x005F); -static const UChar LEFTBRACE = ((UChar)0x007B); -static const UChar RIGHTBRACE = ((UChar)0x007D); -static const UChar TILDE = ((UChar)0x007E); -static const UChar ELLIPSIS = ((UChar)0x2026); - -static const UChar LOW_A = ((UChar)0x0061); -static const UChar LOW_B = ((UChar)0x0062); -static const UChar LOW_C = ((UChar)0x0063); -static const UChar LOW_D = ((UChar)0x0064); -static const UChar LOW_E = ((UChar)0x0065); -static const UChar LOW_F = ((UChar)0x0066); -static const UChar LOW_G = ((UChar)0x0067); -static const UChar LOW_H = ((UChar)0x0068); -static const UChar LOW_I = ((UChar)0x0069); -static const UChar LOW_J = ((UChar)0x006a); -static const UChar LOW_K = ((UChar)0x006B); -static const UChar LOW_L = ((UChar)0x006C); -static const UChar LOW_M = ((UChar)0x006D); -static const UChar LOW_N = ((UChar)0x006E); -static const UChar LOW_O = ((UChar)0x006F); -static const UChar LOW_P = ((UChar)0x0070); -static const UChar LOW_Q = ((UChar)0x0071); -static const UChar LOW_R = ((UChar)0x0072); -static const UChar LOW_S = ((UChar)0x0073); -static const UChar LOW_T = ((UChar)0x0074); -static const UChar LOW_U = ((UChar)0x0075); -static const UChar LOW_V = ((UChar)0x0076); -static const UChar LOW_W = ((UChar)0x0077); -static const UChar LOW_Y = ((UChar)0x0079); -static const UChar LOW_Z = ((UChar)0x007A); +namespace pluralimpl { + +static const UChar DOT = ((UChar) 0x002E); +static const UChar SINGLE_QUOTE = ((UChar) 0x0027); +static const UChar SLASH = ((UChar) 
0x002F); +static const UChar BACKSLASH = ((UChar) 0x005C); +static const UChar SPACE = ((UChar) 0x0020); +static const UChar EXCLAMATION = ((UChar) 0x0021); +static const UChar QUOTATION_MARK = ((UChar) 0x0022); +static const UChar NUMBER_SIGN = ((UChar) 0x0023); +static const UChar PERCENT_SIGN = ((UChar) 0x0025); +static const UChar ASTERISK = ((UChar) 0x002A); +static const UChar COMMA = ((UChar) 0x002C); +static const UChar HYPHEN = ((UChar) 0x002D); +static const UChar U_ZERO = ((UChar) 0x0030); +static const UChar U_ONE = ((UChar) 0x0031); +static const UChar U_TWO = ((UChar) 0x0032); +static const UChar U_THREE = ((UChar) 0x0033); +static const UChar U_FOUR = ((UChar) 0x0034); +static const UChar U_FIVE = ((UChar) 0x0035); +static const UChar U_SIX = ((UChar) 0x0036); +static const UChar U_SEVEN = ((UChar) 0x0037); +static const UChar U_EIGHT = ((UChar) 0x0038); +static const UChar U_NINE = ((UChar) 0x0039); +static const UChar COLON = ((UChar) 0x003A); +static const UChar SEMI_COLON = ((UChar) 0x003B); +static const UChar EQUALS = ((UChar) 0x003D); +static const UChar AT = ((UChar) 0x0040); +static const UChar CAP_A = ((UChar) 0x0041); +static const UChar CAP_B = ((UChar) 0x0042); +static const UChar CAP_R = ((UChar) 0x0052); +static const UChar CAP_Z = ((UChar) 0x005A); +static const UChar LOWLINE = ((UChar) 0x005F); +static const UChar LEFTBRACE = ((UChar) 0x007B); +static const UChar RIGHTBRACE = ((UChar) 0x007D); +static const UChar TILDE = ((UChar) 0x007E); +static const UChar ELLIPSIS = ((UChar) 0x2026); + +static const UChar LOW_A = ((UChar) 0x0061); +static const UChar LOW_B = ((UChar) 0x0062); +static const UChar LOW_C = ((UChar) 0x0063); +static const UChar LOW_D = ((UChar) 0x0064); +static const UChar LOW_E = ((UChar) 0x0065); +static const UChar LOW_F = ((UChar) 0x0066); +static const UChar LOW_G = ((UChar) 0x0067); +static const UChar LOW_H = ((UChar) 0x0068); +static const UChar LOW_I = ((UChar) 0x0069); +static const UChar LOW_J = ((UChar) 
0x006a); +static const UChar LOW_K = ((UChar) 0x006B); +static const UChar LOW_L = ((UChar) 0x006C); +static const UChar LOW_M = ((UChar) 0x006D); +static const UChar LOW_N = ((UChar) 0x006E); +static const UChar LOW_O = ((UChar) 0x006F); +static const UChar LOW_P = ((UChar) 0x0070); +static const UChar LOW_Q = ((UChar) 0x0071); +static const UChar LOW_R = ((UChar) 0x0072); +static const UChar LOW_S = ((UChar) 0x0073); +static const UChar LOW_T = ((UChar) 0x0074); +static const UChar LOW_U = ((UChar) 0x0075); +static const UChar LOW_V = ((UChar) 0x0076); +static const UChar LOW_W = ((UChar) 0x0077); +static const UChar LOW_Y = ((UChar) 0x0079); +static const UChar LOW_Z = ((UChar) 0x007A); + +}; static const int32_t PLURAL_RANGE_HIGH = 0x7fffffff; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java index 58bf69ac7d2..5104e29b9da 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java @@ -75,7 +75,7 @@ public class AffixMatcher implements NumberParseMatcher { return true; } - public static void newGenerate( + public static void createMatchers( AffixPatternProvider patternInfo, NumberParserImpl output, MatcherFactory factory, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java index 1d576cee2cc..cd7b04ade65 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java @@ -5,6 +5,8 @@ package com.ibm.icu.impl.number.parse; import com.ibm.icu.text.UnicodeSet; /** + * The core interface implemented by all matchers used for number parsing. 
+ * * Given a string, there should NOT be more than one way to consume the string with the same matcher * applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an * exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java index ff59ca052cc..55a046a43ad 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java @@ -66,18 +66,10 @@ public class NumberParserImpl { STRICT, } - @Deprecated - public static NumberParserImpl createParserFromPattern( + public static NumberParserImpl createSimpleParser( ULocale locale, String pattern, - boolean strictGrouping) { - // Temporary frontend for testing. - - int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE - | ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES; - if (strictGrouping) { - parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE; - } + int parseFlags) { NumberParserImpl parser = new NumberParserImpl(parseFlags, true); DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale); @@ -91,7 +83,7 @@ public class NumberParserImpl { factory.parseFlags = parseFlags; ParsedPatternInfo patternInfo = PatternStringParser.parseToPatternInfo(pattern); - AffixMatcher.newGenerate(patternInfo, parser, factory, ignorables, parseFlags); + AffixMatcher.createMatchers(patternInfo, parser, factory, ignorables, parseFlags); Grouper grouper = Grouper.forStrategy(GroupingStrategy.AUTO).withLocaleData(locale, patternInfo); @@ -209,7 +201,7 @@ public class NumberParserImpl { ////////////////////// // Set up a pattern modifier with mostly defaults to generate AffixMatchers. 
- AffixMatcher.newGenerate(patternInfo, parser, factory, ignorables, parseFlags); + AffixMatcher.createMatchers(patternInfo, parser, factory, ignorables, parseFlags); //////////////////////// /// CURRENCY MATCHER /// diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java index 27ce15df1f6..2bd45cc08be 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java @@ -8,11 +8,15 @@ import java.util.Comparator; import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD; /** - * @author sffc + * Struct-like class to hold the results of a parsing routine. * + * @author sffc */ public class ParsedNumber { + /** + * The numerical value that was parsed. + */ public DecimalQuantity_DualStorageBCD quantity; /** diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java index bf0593e1230..3839301cea5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java @@ -72,10 +72,6 @@ public class UnicodeSetStaticCache { return get(key1).contains(str) ? key1 : chooseFrom(str, key2); } - public static Key chooseFrom(String str, Key key1, Key key2, Key key3) { - return get(key1).contains(str) ? key1 : chooseFrom(str, key2, key3); - } - private static UnicodeSet computeUnion(Key k1, Key k2) { return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze(); } @@ -110,7 +106,6 @@ public class UnicodeSetStaticCache { unicodeSets.put(Key.MINUS_SIGN, new UnicodeSet("[-⁻₋−➖﹣-]").freeze()); unicodeSets.put(Key.PLUS_SIGN, new UnicodeSet("[+⁺₊➕﬩﹢+]").freeze()); - // TODO: Fill in the next three sets. 
unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze()); unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze()); unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze()); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java index 4e69a762581..5bb123968b6 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java @@ -13,6 +13,7 @@ import com.ibm.icu.impl.number.parse.IgnorablesMatcher; import com.ibm.icu.impl.number.parse.MinusSignMatcher; import com.ibm.icu.impl.number.parse.NumberParserImpl; import com.ibm.icu.impl.number.parse.ParsedNumber; +import com.ibm.icu.impl.number.parse.ParsingUtils; import com.ibm.icu.impl.number.parse.PercentMatcher; import com.ibm.icu.impl.number.parse.PlusSignMatcher; import com.ibm.icu.impl.number.parse.SeriesMatcher; @@ -58,8 +59,8 @@ public class NumberParserTest { { 3, "-𝟱𝟭𝟰𝟮𝟯-", "0", 11, -51423. }, { 3, "a51423US dollars", "a0¤¤¤", 16, 51423. }, { 3, "a 51423 US dollars", "a0¤¤¤", 18, 51423. }, - { 3, "514.23 USD", "0", 10, 514.23 }, - { 3, "514.23 GBP", "0", 10, 514.23 }, + { 3, "514.23 USD", "¤0", 10, 514.23 }, + { 3, "514.23 GBP", "¤0", 10, 514.23 }, { 3, "a 𝟱𝟭𝟰𝟮𝟯 b", "a0b", 14, 51423. }, { 3, "-a 𝟱𝟭𝟰𝟮𝟯 b", "a0b", 15, -51423. }, { 3, "a -𝟱𝟭𝟰𝟮𝟯 b", "a0b", 15, -51423. }, @@ -79,7 +80,7 @@ public class NumberParserTest { { 3, "𝟱.𝟭𝟰𝟮E𝟯", "0", 12, 5142. 
}, { 3, "𝟱.𝟭𝟰𝟮E-𝟯", "0", 13, 0.005142 }, { 3, "𝟱.𝟭𝟰𝟮e-𝟯", "0", 13, 0.005142 }, - { 7, "5,142.50 Canadian dollars", "#,##,##0", 25, 5142.5 }, + { 7, "5,142.50 Canadian dollars", "#,##,##0 ¤¤¤", 25, 5142.5 }, { 3, "a$ b5", "a ¤ b0", 5, 5.0 }, { 3, "📺1.23", "📺0;📻0", 6, 1.23 }, { 3, "📻1.23", "📺0;📻0", 6, -1.23 }, @@ -87,6 +88,8 @@ public class NumberParserTest { { 3, " 0", "a0", 31, 0.0 }, // should not hang { 3, "0", "0", 1, 0.0 } }; + int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE + | ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES; for (Object[] cas : cases) { int flags = (Integer) cas[0]; String input = (String) cas[1]; @@ -94,7 +97,7 @@ public class NumberParserTest { int expectedCharsConsumed = (Integer) cas[3]; double resultDouble = (Double) cas[4]; NumberParserImpl parser = NumberParserImpl - .createParserFromPattern(ULocale.ENGLISH, pattern, false); + .createSimpleParser(ULocale.ENGLISH, pattern, parseFlags); String message = "Input <" + input + "> Parser " + parser; if (0 != (flags & 0x01)) { @@ -127,7 +130,9 @@ public class NumberParserTest { if (0 != (flags & 0x04)) { // Test with strict separators - parser = NumberParserImpl.createParserFromPattern(ULocale.ENGLISH, pattern, true); + parser = NumberParserImpl.createSimpleParser(ULocale.ENGLISH, + pattern, + parseFlags | ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE); ParsedNumber resultObject = new ParsedNumber(); parser.parse(input, true, resultObject); assertNotNull("Strict Parse failed: " + message, resultObject.quantity); @@ -146,7 +151,7 @@ public class NumberParserTest { public void testLocaleFi() { // This case is interesting because locale fi has NaN starting with 'e', the same as scientific NumberParserImpl parser = NumberParserImpl - .createParserFromPattern(new ULocale("fi"), "0", false); + .createSimpleParser(new ULocale("fi"), "0", ParsingUtils.PARSE_FLAG_IGNORE_CASE); ParsedNumber resultObject = new ParsedNumber(); parser.parse("epäluku", false, resultObject); diff --git 
a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java index 97283a1400e..7aec4f77f17 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java @@ -75,7 +75,7 @@ public class UnicodeSetStaticCacheTest { static void assertInSet(ULocale locale, UnicodeSet set, int cp) { // If this test case fails, add the specified code point to the corresponding set in - // UnicodeSetStaticCache.java + // UnicodeSetStaticCache.java and numparse_unisets.cpp assertTrue( locale + " U+" -- 2.40.0