ICU-13574 Basic parsing tests are passing on the pieces of code written so far, Decim...

author Shane Carr <shane@unicode.org>

Thu, 8 Feb 2018 09:59:35 +0000 (09:59 +0000)

committer Shane Carr <shane@unicode.org>

Thu, 8 Feb 2018 09:59:35 +0000 (09:59 +0000)
author Shane Carr <shane@unicode.org>
Thu, 8 Feb 2018 09:59:35 +0000 (09:59 +0000)
committer Shane Carr <shane@unicode.org>
Thu, 8 Feb 2018 09:59:35 +0000 (09:59 +0000)
diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp

index d93c0173f403ab6a00323426193efb1412ac0be8..4348d86c6d6e8ab904d32b143ad2b833ae24fb84 100644 (file)
--- a/icu4c/source/i18n/numparse_impl.cpp
+++ b/icu4c/source/i18n/numparse_impl.cpp
@@ -13,6 +13,8 @@
  #include "numparse_decimal.h"
  #include "unicode/numberformatter.h"
  
+#include <typeinfo>
+
  using namespace icu;
  using namespace icu::number;
  using namespace icu::number::impl;
@@ -92,22 +94,121 @@ void NumberParserImpl::freeze() {
      fFrozen = true;
  }
  
-//void
-//NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result,
-//                        UErrorCode& status) const {
-//    U_ASSERT(frozen);
-//    // TODO: Check start >= 0 and start < input.length()
-//    StringSegment segment(utils::maybeFold(input, parseFlags));
-//    segment.adjustOffset(start);
-//    if (greedy) {
-//        parseGreedyRecursive(segment, result);
-//    } else {
-//        parseLongestRecursive(segment, result);
-//    }
-//    for (NumberParseMatcher matcher : matchers) {
-//        matcher.postProcess(result);
-//    }
-//}
+void NumberParserImpl::parse(const UnicodeString& input, bool greedy, ParsedNumber& result,
+                             UErrorCode& status) const {
+    return parse(input, 0, greedy, result, status);  // Convenience overload: start at offset 0.
+}
+
+void
+NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result,
+                        UErrorCode& status) const {
+    U_ASSERT(fFrozen);  // Parsing is only expected on a parser that has been frozen.
+    // TODO: Check start >= 0 and start < input.length()
+    StringSegment segment(input, fParseFlags);
+    segment.adjustOffset(start);  // Begin matching at the caller-supplied offset.
+    if (greedy) {
+        parseGreedyRecursive(segment, result, status);  // Recurses on only the first match.
+    } else {
+        parseLongestRecursive(segment, result, status);  // Attempts all matches, keeps the best.
+    }
+    for (int32_t i = 0; i < fNumMatchers; i++) {
+        fMatchers[i]->postProcess(result);  // NOTE(review): runs even on failed status -- confirm.
+    }
+}
+
+void NumberParserImpl::parseGreedyRecursive(StringSegment& segment, ParsedNumber& result,
+                                            UErrorCode& status) const {
+    // Base case: the segment is empty, so there is nothing left to match.
+    if (segment.length() == 0) {
+        return;
+    }
+
+    int initialOffset = segment.getOffset();  // Remembered to detect whether a matcher consumed input.
+    int leadCp = segment.getCodePoint();  // First code point; tested against each matcher's lead set.
+    for (int32_t i = 0; i < fNumMatchers; i++) {
+        if (fComputeLeads && !fLeads[i]->contains(leadCp)) {
+            continue;  // Lead sets are in use and this matcher's set excludes the first code point.
+        }
+        const NumberParseMatcher* matcher = fMatchers[i];
+        matcher->match(segment, result, status);
+        if (U_FAILURE(status)) {
+            return;  // Propagate the error; the offset is not restored on this path.
+        }
+        if (segment.getOffset() != initialOffset) {
+            // In a greedy parse, recurse on only the first match.
+            parseGreedyRecursive(segment, result, status);
+            // The following line resets the offset so that the StringSegment stays the same
+            // across the function call boundary. Since we recurse only once, this line is not
+            // strictly necessary.
+            segment.setOffset(initialOffset);
+            return;
+        }
+    }
+
+    // NOTE: If we get here, the greedy parse completed without consuming the entire string.
+}
+
+void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumber& result,
+                                             UErrorCode& status) const {
+    // Base case: the segment is empty, so there is nothing left to match.
+    if (segment.length() == 0) {
+        return;
+    }
+
+    // TODO: Give a nice way for the matcher to reset the ParsedNumber?
+    ParsedNumber initial(result);  // Copy of the incoming state; every attempt restarts from it.
+    ParsedNumber candidate;  // Scratch result for the attempt currently being evaluated.
+
+    int initialOffset = segment.getOffset();  // Restored after every attempt (see below).
+    for (int32_t i = 0; i < fNumMatchers; i++) {
+        // TODO: Check leadChars here?
+        const NumberParseMatcher* matcher = fMatchers[i];
+
+        // In a non-greedy parse, we attempt all possible matches and pick the best.
+        for (int32_t charsToConsume = 0; charsToConsume < segment.length();) {
+            charsToConsume += U16_LENGTH(segment.codePointAt(charsToConsume));  // Grow by one code point.
+
+            // Run the matcher on a segment of the current length.
+            candidate = initial;
+            segment.setLength(charsToConsume);
+            bool maybeMore = matcher->match(segment, candidate, status);
+            segment.resetLength();
+            if (U_FAILURE(status)) {
+                return;  // Propagate the error; the offset is not restored on this path.
+            }
+
+            // If the entire segment was consumed, recurse.
+            if (segment.getOffset() - initialOffset == charsToConsume) {
+                parseLongestRecursive(segment, candidate, status);
+                if (U_FAILURE(status)) {
+                    return;
+                }
+                if (candidate.isBetterThan(result)) {
+                    result = candidate;  // Keep the best candidate seen so far.
+                }
+            }
+
+            // Since the segment can be re-used, reset the offset.
+            // This does not have an effect if the matcher did not consume any chars.
+            segment.setOffset(initialOffset);
+
+            // Unless the matcher wants to see the next char, continue to the next matcher.
+            if (!maybeMore) {
+                break;
+            }
+        }
+    }
+}
+
+UnicodeString NumberParserImpl::toString() const {
+    UnicodeString result(u"<NumberParserImpl matchers:[");
+    for (int32_t i = 0; i < fNumMatchers; i++) {
+        result.append(u' ');
+        result.append(UnicodeString(typeid(*fMatchers[i]).name()));  // name() is implementation-defined.
+    }
+    result.append(u" ]>", -1);  // -1: the length is taken from the NUL terminator.
+    return result;
+}
  
  
  #endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h

index 2ded607d829345efca67d5fb8fe0df4701078484..adb929468945eb6ed0d4adc6700f41ae6ed44193 100644 (file)
--- a/icu4c/source/i18n/numparse_impl.h
+++ b/icu4c/source/i18n/numparse_impl.h
@@ -42,9 +42,9 @@ class NumberParserImpl {
  
      ~NumberParserImpl();
  
-    void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result) const;
+    void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
  
-    void parseLongestRecursive(StringSegment& segment, ParsedNumber& result) const;
+    void parseLongestRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
  };
  
  
diff --git a/icu4c/source/i18n/numparse_parsednumber.cpp b/icu4c/source/i18n/numparse_parsednumber.cpp

index 9db933502a3e0ccd40bb90291be17fcd5b951a89..203383692f24d7efcfaaf90762f32509cd108de3 100644 (file)
--- a/icu4c/source/i18n/numparse_parsednumber.cpp
+++ b/icu4c/source/i18n/numparse_parsednumber.cpp
@@ -70,6 +70,11 @@ double ParsedNumber::getDouble() const {
      return quantity.toDouble();
  }
  
+bool ParsedNumber::isBetterThan(const ParsedNumber& other) {  // NOTE(review): could be const.
+    // Favor results with strictly more characters consumed; equal charEnd is not "better".
+    return charEnd > other.charEnd;
+}
+
  
  
  #endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_stringsegment.cpp b/icu4c/source/i18n/numparse_stringsegment.cpp

index ecabab5faa8ec7ac0079d0d35e089b8425ed2b4e..368389009063f3ca6d5dc546b2dc3114bb2c5f88 100644 (file)
--- a/icu4c/source/i18n/numparse_stringsegment.cpp
+++ b/icu4c/source/i18n/numparse_stringsegment.cpp
@@ -9,13 +9,16 @@
  #include "numparse_stringsegment.h"
  #include "putilimp.h"
  #include "unicode/utf16.h"
+#include "unicode/uniset.h"
  
  using namespace icu;
  using namespace icu::numparse;
  using namespace icu::numparse::impl;
  
  
-StringSegment::StringSegment(const UnicodeString &str) : fStr(str), fStart(0), fEnd(str.length()) {}
+StringSegment::StringSegment(const UnicodeString& str, parse_flags_t parseFlags)
+        : fStr(str), fStart(0), fEnd(str.length()),  // The segment initially spans the whole string.
+          fFoldCase(0 != (parseFlags & PARSE_FLAG_IGNORE_CASE)) {}  // Fold case iff IGNORE_CASE is set.
  
  int32_t StringSegment::getOffset() const {
      return fStart;
@@ -29,6 +32,10 @@ void StringSegment::adjustOffset(int32_t delta) {
      fStart += delta;
  }
  
+void StringSegment::adjustOffsetByCodePoint() {
+    fStart += U16_LENGTH(getCodePoint());  // NOTE(review): getCodePoint() may be -1; confirm step.
+}
+
  void StringSegment::setLength(int32_t length) {
      fEnd = fStart + length;
  }
@@ -64,10 +71,35 @@ UChar32 StringSegment::getCodePoint() const {
      }
  }
  
-int32_t StringSegment::getCommonPrefixLength(const UnicodeString &other) {
+bool StringSegment::matches(UChar32 otherCp) const {
+    return codePointsEqual(getCodePoint(), otherCp, fFoldCase);  // Case folding follows fFoldCase.
+}
+
+bool StringSegment::matches(const UnicodeSet& uniset) const {
+    // TODO: Move UnicodeSet case-folding logic here.
+    // TODO: Handle string matches here instead of separately.
+    UChar32 cp = getCodePoint();
+    if (cp == -1) {
+        return false;  // The -1 result from getCodePoint() is never treated as a set member.
+    }
+    return uniset.contains(cp);  // Plain membership test; fFoldCase is not consulted here.
+}
+
+int32_t StringSegment::getCommonPrefixLength(const UnicodeString& other) {
+    return getPrefixLengthInternal(other, fFoldCase);  // Case folding follows this segment's setting.
+}
+
+int32_t StringSegment::getCaseSensitivePrefixLength(const UnicodeString& other) {
+    return getPrefixLengthInternal(other, false);  // Never folds case, whatever fFoldCase says.
+}
+
+int32_t StringSegment::getPrefixLengthInternal(const UnicodeString& other, bool foldCase) {
      int32_t offset = 0;
      for (; offset < uprv_min(length(), other.length());) {
-        if (charAt(offset) != other.charAt(offset)) {
+        // TODO: case-fold code points, not chars
+        char16_t c1 = charAt(offset);
+        char16_t c2 = other.charAt(offset);
+        if (!codePointsEqual(c1, c2, foldCase)) {
              break;
          }
          offset++;
@@ -75,5 +107,17 @@ int32_t StringSegment::getCommonPrefixLength(const UnicodeString &other) {
      return offset;
  }
  
+bool StringSegment::codePointsEqual(UChar32 cp1, UChar32 cp2, bool foldCase) {
+    if (cp1 == cp2) {
+        return true;  // Identical code points are equal whether or not case folding is on.
+    }
+    if (!foldCase) {
+        return false;  // Case-sensitive comparison: different code points never match.
+    }
+    // u_foldCase() takes an options bit set, not a boolean. Passing TRUE (1) would select
+    // U_FOLD_CASE_EXCLUDE_SPECIAL_I (Turkic: 'I' folds to dotless i), so 'I' would not match 'i'.
+    return u_foldCase(cp1, U_FOLD_CASE_DEFAULT) == u_foldCase(cp2, U_FOLD_CASE_DEFAULT);
+}
+
  
  #endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_types.h b/icu4c/source/i18n/numparse_types.h

index fe8a5652476cc8eca9c5cc58a29c3ecf68657eed..5280c41fece000a1f5f3f75cdf1acd3059b30b3a 100644 (file)
--- a/icu4c/source/i18n/numparse_types.h
+++ b/icu4c/source/i18n/numparse_types.h
@@ -130,6 +130,8 @@ class ParsedNumber {
      bool seenNumber() const;
  
      double getDouble() const;
+
+    bool isBetterThan(const ParsedNumber& other);
  };
  
  
@@ -141,7 +143,7 @@ class ParsedNumber {
   */
  class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
    public:
-    explicit StringSegment(const UnicodeString& str);
+    explicit StringSegment(const UnicodeString& str, parse_flags_t parseFlags);
  
      int32_t getOffset() const;
  
@@ -157,6 +159,11 @@ class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
       */
      void adjustOffset(int32_t delta);
  
+    /**
+     * Adjusts the offset by the width of the current code point, either 1 or 2 chars.
+     */
+    void adjustOffsetByCodePoint();
+
      void setLength(int32_t length);
  
      void resetLength();
@@ -172,20 +179,51 @@ class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
      /**
       * Returns the first code point in the string segment, or -1 if the string starts with an invalid
       * code point.
+     *
+     * <p>
+     * <strong>Important:</strong> Most of the time, you should use {@link #matches}, which handles case
+     * folding logic, instead of this method.
       */
      UChar32 getCodePoint() const;
  
+    /**
+     * Returns true if the first code point of this StringSegment equals the given code point.
+     *
+     * <p>
+     * This method will perform case folding if case folding is enabled for the parser.
+     */
+    bool matches(UChar32 otherCp) const;
+
+    /**
+     * Returns true if the first code point of this StringSegment is in the given UnicodeSet.
+     */
+    bool matches(const UnicodeSet& uniset) const;
+
      /**
       * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
       * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
       * since the first 2 characters are the same.
+     *
+     * <p>
+     * This method will perform case folding if case folding is enabled for the parser.
       */
      int32_t getCommonPrefixLength(const UnicodeString& other);
  
+    /**
+     * Like {@link #getCommonPrefixLength}, but never performs case folding, even if case folding is
+     * enabled for the parser.
+     */
+    int32_t getCaseSensitivePrefixLength(const UnicodeString& other);
+
    private:
      const UnicodeString fStr;
      int32_t fStart;
      int32_t fEnd;
+    bool fFoldCase;
+
+    int32_t getPrefixLengthInternal(const UnicodeString& other, bool foldCase);
+
+    static bool codePointsEqual(UChar32 cp1, UChar32 cp2, bool foldCase);
  };
  
  
diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp

index b0d2fe8cf1d8951af55023b4461cc0d6d66514eb..c594a493adc871a76b210891288e54edd0fd8f0c 100644 (file)
--- a/icu4c/source/test/intltest/numbertest_parse.cpp
+++ b/icu4c/source/test/intltest/numbertest_parse.cpp
@@ -50,38 +50,39 @@ void NumberParserTest::testBasic() {
                   {7, u"𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", u"#,##,##0", 11, 78.},
                   {3, u"-𝟱𝟭𝟰𝟮𝟯", u"0", 11, -51423.},
                   {3, u"-𝟱𝟭𝟰𝟮𝟯-", u"0", 11, -51423.},
-                 {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.},
-                 {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.},
-                 {3, u"514.23 USD", u"¤0", 10, 514.23},
-                 {3, u"514.23 GBP", u"¤0", 10, 514.23},
-                 {3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.},
-                 {3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
-                 {3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
-                 {3, u"𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 10, 51423.},
-                 {3, u"[𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, 51423.},
-                 {3, u"𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 11, 51423.},
-                 {3, u"[𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 12, 51423.},
-                 {3, u"(𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, -51423.},
-                 {3, u"𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 11, -51423.},
-                 {3, u"(𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 12, -51423.},
-                 {3, u"𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 10, 51423.},
-                 {3, u"{𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 11, 51423.},
-                 {3, u"𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 11, 51423.},
-                 {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.},
-                 {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
-                 {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
-                 {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
-                 {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
-                 {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
-                 {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
-                 {3, u"a$ b5", u"a ¤ b0", 5, 5.0},
-                 {3, u"📺1.23", u"📺0;📻0", 6, 1.23},
-                 {3, u"📻1.23", u"📺0;📻0", 6, -1.23},
-                 {3, u".00", u"0", 3, 0.0},
-                 {3, u"                              0", u"a0", 31, 0.0}, // should not hang
-                 {3, u"NaN", u"0", 3, NAN},
-                 {3, u"NaN E5", u"0", 3, NAN},
-                 {3, u"0", u"0", 1, 0.0}};
+//                 {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.},
+//                 {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.},
+//                 {3, u"514.23 USD", u"¤0", 10, 514.23},
+//                 {3, u"514.23 GBP", u"¤0", 10, 514.23},
+//                 {3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.},
+//                 {3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
+//                 {3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
+//                 {3, u"𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 10, 51423.},
+//                 {3, u"[𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, 51423.},
+//                 {3, u"𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 11, 51423.},
+//                 {3, u"[𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 12, 51423.},
+//                 {3, u"(𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, -51423.},
+//                 {3, u"𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 11, -51423.},
+//                 {3, u"(𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 12, -51423.},
+//                 {3, u"𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 10, 51423.},
+//                 {3, u"{𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 11, 51423.},
+//                 {3, u"𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 11, 51423.},
+//                 {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.},
+//                 {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
+//                 {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
+//                 {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
+//                 {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
+//                 {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
+//                 {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
+//                 {3, u"a$ b5", u"a ¤ b0", 5, 5.0},
+//                 {3, u"📺1.23", u"📺0;📻0", 6, 1.23},
+//                 {3, u"📻1.23", u"📺0;📻0", 6, -1.23},
+//                 {3, u".00", u"0", 3, 0.0},
+//                 {3, u"                              0", u"a0", 31, 0.0}, // should not hang
+//                 {3, u"NaN", u"0", 3, NAN},
+//                 {3, u"NaN E5", u"0", 3, NAN},
+//                 {3, u"0", u"0", 1, 0.0}
+    };
  
      parse_flags_t parseFlags = PARSE_FLAG_IGNORE_CASE | PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
      for (auto cas : cases) {
@@ -123,10 +124,7 @@ void NumberParserTest::testBasic() {
          if (0 != (cas.flags & 0x04)) {
              // Test with strict separators
              parser = NumberParserImpl::createSimpleParser(
-                    Locale("en"),
-                    patternString,
-                    parseFlags | PARSE_FLAG_STRICT_GROUPING_SIZE,
-                    status);
+                    Locale("en"), patternString, parseFlags | PARSE_FLAG_STRICT_GROUPING_SIZE, status);
              ParsedNumber resultObject;
              parser->parse(inputString, true, resultObject, status);
              assertTrue("Strict Parse failed: " + message, resultObject.success());
diff --git a/icu4c/source/test/intltest/numbertest_stringsegment.cpp b/icu4c/source/test/intltest/numbertest_stringsegment.cpp

index 519642e49a239765838eb4dec4afc7ecd9717add..665bc7c52b0225dcd4fdc09cb11f0a40521db515 100644 (file)
--- a/icu4c/source/test/intltest/numbertest_stringsegment.cpp
+++ b/icu4c/source/test/intltest/numbertest_stringsegment.cpp
@@ -24,7 +24,7 @@ void StringSegmentTest::runIndexedTest(int32_t index, UBool exec, const char*&na
  }
  
  void StringSegmentTest::testOffset() {
-    StringSegment segment(SAMPLE_STRING);
+    StringSegment segment(SAMPLE_STRING, 0);
      assertEquals("Initial Offset", 0, segment.getOffset());
      segment.adjustOffset(3);
      assertEquals("Adjust A", 3, segment.getOffset());
@@ -35,7 +35,7 @@ void StringSegmentTest::testOffset() {
  }
  
  void StringSegmentTest::testLength() {
-    StringSegment segment(SAMPLE_STRING);
+    StringSegment segment(SAMPLE_STRING, 0);
      assertEquals("Initial length", 11, segment.length());
      segment.adjustOffset(3);
      assertEquals("Adjust", 8, segment.length());
@@ -48,7 +48,7 @@ void StringSegmentTest::testLength() {
  }
  
  void StringSegmentTest::testCharAt() {
-    StringSegment segment(SAMPLE_STRING);
+    StringSegment segment(SAMPLE_STRING, 0);
      assertEquals("Initial", SAMPLE_STRING, segment.toUnicodeString());
      segment.adjustOffset(3);
      assertEquals("After adjust-offset", UnicodeString(u"radio 📻"), segment.toUnicodeString());
@@ -57,7 +57,7 @@ void StringSegmentTest::testCharAt() {
  }
  
  void StringSegmentTest::testGetCodePoint() {
-    StringSegment segment(SAMPLE_STRING);
+    StringSegment segment(SAMPLE_STRING, 0);
      assertEquals("Double-width code point", 0x1F4FB, segment.getCodePoint());
      segment.setLength(1);
      assertEquals("Inalid A", -1, segment.getCodePoint());
@@ -69,7 +69,7 @@ void StringSegmentTest::testGetCodePoint() {
  }
  
  void StringSegmentTest::testCommonPrefixLength() {
-    StringSegment segment(SAMPLE_STRING);
+    StringSegment segment(SAMPLE_STRING, 0);
      assertEquals("", 11, segment.getCommonPrefixLength(SAMPLE_STRING));
      assertEquals("", 4, segment.getCommonPrefixLength(u"📻 r"));
      assertEquals("", 3, segment.getCommonPrefixLength(u"📻 x"));
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java

index 6fd6050442f7738da7bbf3409a4c4ae56428630c..4f9d6c0f325fad38066c1685b59b92c2e404e8bf 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
@@ -5,7 +5,6 @@ package com.ibm.icu.impl.number.parse;
  import java.text.ParsePosition;
  import java.util.ArrayList;
  import java.util.Collection;
-import java.util.Comparator;
  import java.util.List;
  
  import com.ibm.icu.impl.number.AffixPatternProvider;
@@ -268,7 +267,6 @@ public class NumberParserImpl {
      private final int parseFlags;
      private final List<NumberParseMatcher> matchers;
      private final List<UnicodeSet> leads;
-    private Comparator<ParsedNumber> comparator;
      private boolean frozen;
  
      /**
@@ -284,7 +282,6 @@ public class NumberParserImpl {
          } else {
              leads = null;
          }
-        comparator = ParsedNumber.COMPARATOR; // default value
          this.parseFlags = parseFlags;
          frozen = false;
      }
@@ -318,11 +315,6 @@ public class NumberParserImpl {
          this.leads.add(leadCodePoints);
      }
  
-    public void setComparator(Comparator<ParsedNumber> comparator) {
-        assert !frozen;
-        this.comparator = comparator;
-    }
-
      public void freeze() {
          frozen = true;
      }
@@ -400,11 +392,12 @@ public class NumberParserImpl {
  
          int initialOffset = segment.getOffset();
          for (int i = 0; i < matchers.size(); i++) {
+            // TODO: Check leadChars here?
              NumberParseMatcher matcher = matchers.get(i);
  
              // In a non-greedy parse, we attempt all possible matches and pick the best.
              for (int charsToConsume = 0; charsToConsume < segment.length();) {
-                charsToConsume += Character.charCount(Character.codePointAt(segment, charsToConsume));
+                charsToConsume += Character.charCount(segment.codePointAt(charsToConsume));
  
                  // Run the matcher on a segment of the current length.
                  candidate.copyFrom(initial);
@@ -415,7 +408,7 @@ public class NumberParserImpl {
                  // If the entire segment was consumed, recurse.
                  if (segment.getOffset() - initialOffset == charsToConsume) {
                      parseLongestRecursive(segment, candidate);
-                    if (comparator.compare(candidate, result) > 0) {
+                    if (candidate.isBetterThan(result)) {
                          result.copyFrom(candidate);
                      }
                  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java

index 2bd45cc08beefaa9ed439cdaf7b51e96d5ff7ae1..d1b6751834ab147a065fed75188fd1d4a2fcdefb 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java
@@ -166,4 +166,8 @@ public class ParsedNumber {
          return d;
  
      }
+
+    boolean isBetterThan(ParsedNumber other) {
+        return COMPARATOR.compare(this, other) > 0;
+    }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java

index bc0cab0c5d02bd733dd8e09e9ed418146c922384..39416fd7535d3430c97f30c7cc0cd91d9c8f0750 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java
@@ -74,6 +74,10 @@ public class StringSegment implements CharSequence {
          return str.charAt(index + start);
      }
  
+    public int codePointAt(int index) {
+        return str.codePointAt(index + start);
+    }
+
      @Override
      public CharSequence subSequence(int start, int end) {
          throw new AssertionError(); // Never used
author	Shane Carr <shane@unicode.org>
	Thu, 8 Feb 2018 09:59:35 +0000 (09:59 +0000)
committer	Shane Carr <shane@unicode.org>
	Thu, 8 Feb 2018 09:59:35 +0000 (09:59 +0000)
icu4c/source/i18n/numparse_impl.cpp		patch \| blob \| history
icu4c/source/i18n/numparse_impl.h		patch \| blob \| history
icu4c/source/i18n/numparse_parsednumber.cpp		patch \| blob \| history
icu4c/source/i18n/numparse_stringsegment.cpp		patch \| blob \| history
icu4c/source/i18n/numparse_types.h		patch \| blob \| history
icu4c/source/test/intltest/numbertest_parse.cpp		patch \| blob \| history
icu4c/source/test/intltest/numbertest_stringsegment.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsedNumber.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java		patch \| blob \| history