#include "numparse_decimal.h"
#include "unicode/numberformatter.h"
+#include <typeinfo>
+
using namespace icu;
using namespace icu::number;
using namespace icu::number::impl;
fFrozen = true;
}
-//void
-//NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result,
-// UErrorCode& status) const {
-// U_ASSERT(frozen);
-// // TODO: Check start >= 0 and start < input.length()
-// StringSegment segment(utils::maybeFold(input, parseFlags));
-// segment.adjustOffset(start);
-// if (greedy) {
-// parseGreedyRecursive(segment, result);
-// } else {
-// parseLongestRecursive(segment, result);
-// }
-// for (NumberParseMatcher matcher : matchers) {
-// matcher.postProcess(result);
-// }
-//}
+/**
+ * Convenience overload: parses {@code input} from its first code unit.
+ *
+ * Simply forwards to the five-argument parse() with a start index of 0.
+ */
+void NumberParserImpl::parse(const UnicodeString& input, bool greedy, ParsedNumber& result,
+                             UErrorCode& status) const {
+    const int32_t startOfInput = 0;
+    parse(input, startOfInput, greedy, result, status);
+}
+
+/**
+ * Parses {@code input}, beginning at code unit index {@code start}, into {@code result}.
+ *
+ * @param input  The string to parse.
+ * @param start  Index of the first code unit to consider; must lie in [0, input.length()].
+ * @param greedy If true, use the greedy strategy (accept the first matcher that consumes
+ *               something); otherwise try all matchers and keep the best candidate.
+ * @param result Receives the parse outcome; every matcher's postProcess() is applied to it.
+ * @param status Standard ICU error code. Nothing is done if it already indicates a failure;
+ *               set to U_ILLEGAL_ARGUMENT_ERROR if {@code start} is out of range.
+ */
+void
+NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result,
+                        UErrorCode& status) const {
+    // ICU convention: a function taking a UErrorCode must be a no-op on a pre-existing failure.
+    if (U_FAILURE(status)) {
+        return;
+    }
+    U_ASSERT(fFrozen);
+    // An out-of-range start would leave the segment with a negative offset or negative length.
+    // start == input.length() is allowed: it simply yields an empty segment.
+    if (start < 0 || start > input.length()) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    StringSegment segment(input, fParseFlags);
+    segment.adjustOffset(start);
+    if (greedy) {
+        parseGreedyRecursive(segment, result, status);
+    } else {
+        parseLongestRecursive(segment, result, status);
+    }
+    // Give every matcher a chance to finalize the result.
+    for (int32_t i = 0; i < fNumMatchers; i++) {
+        fMatchers[i]->postProcess(result);
+    }
+}
+
+/**
+ * Greedy parse: repeatedly runs the matchers in order and accepts the first one that
+ * consumes input, then starts over at the new offset.
+ *
+ * Despite the historical name, this is implemented as a loop rather than by recursion, so
+ * the stack depth does not grow with the number of tokens in the input.
+ *
+ * @param segment The remaining input. On return its offset is the same as on entry, unless
+ *                the very first matcher invocation failed (in which case it is left as the
+ *                matcher left it).
+ * @param result  Accumulates whatever the matchers record.
+ * @param status  Set by a matcher on failure; parsing stops as soon as a failure is seen.
+ */
+void NumberParserImpl::parseGreedyRecursive(StringSegment& segment, ParsedNumber& result,
+                                            UErrorCode& status) const {
+    const int32_t entryOffset = segment.getOffset();
+    bool consumedAny = false;
+
+    // Each pass of this loop corresponds to one level of the former recursion.
+    while (segment.length() != 0) {
+        const int32_t roundOffset = segment.getOffset();
+        const UChar32 leadCp = segment.getCodePoint();
+        bool advanced = false;
+
+        for (int32_t i = 0; i < fNumMatchers; i++) {
+            // Skip matchers whose lead set rules out the current code point.
+            if (fComputeLeads && !fLeads[i]->contains(leadCp)) {
+                continue;
+            }
+            const NumberParseMatcher* matcher = fMatchers[i];
+            matcher->match(segment, result, status);
+            if (U_FAILURE(status)) {
+                break;
+            }
+            if (segment.getOffset() != roundOffset) {
+                // In a greedy parse, continue from only the first match.
+                advanced = true;
+                break;
+            }
+        }
+
+        if (U_FAILURE(status) || !advanced) {
+            // Either an error occurred, or the greedy parse completed without consuming
+            // the entire string.
+            break;
+        }
+        consumedAny = true;
+    }
+
+    // Restore the offset so that the StringSegment stays the same across the function call
+    // boundary.
+    if (consumedAny) {
+        segment.setOffset(entryOffset);
+    }
+}
+
+/**
+ * Non-greedy parse: offers every matcher each successively longer prefix of the segment,
+ * recurses whenever a matcher consumes its whole prefix, and keeps in {@code result} the
+ * candidate for which isBetterThan() reports an improvement.
+ *
+ * The segment's offset is restored after every attempt, so it is unchanged on a
+ * successful return.
+ */
+void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumber& result,
+                                             UErrorCode& status) const {
+    // Nothing left to match: this branch of the search is finished.
+    if (segment.length() == 0) {
+        return;
+    }
+
+    // TODO: Give a nice way for the matcher to reset the ParsedNumber?
+    ParsedNumber baseline(result);
+    ParsedNumber attempt;
+
+    int startOffset = segment.getOffset();
+    for (int32_t m = 0; m < fNumMatchers; m++) {
+        // TODO: Check leadChars here?
+        const NumberParseMatcher* current = fMatchers[m];
+
+        // Try all possible prefix lengths for this matcher, growing one code point at a time.
+        int32_t prefixLen = 0;
+        while (prefixLen < segment.length()) {
+            prefixLen += U16_LENGTH(segment.codePointAt(prefixLen));
+
+            // Every attempt starts from the state we had on entry.
+            attempt = baseline;
+            segment.setLength(prefixLen);
+            bool wantsMore = current->match(segment, attempt, status);
+            segment.resetLength();
+            if (U_FAILURE(status)) {
+                return;
+            }
+
+            // Only a matcher that ate the whole prefix gets to continue down this path.
+            const bool consumedWholePrefix = (segment.getOffset() - startOffset == prefixLen);
+            if (consumedWholePrefix) {
+                parseLongestRecursive(segment, attempt, status);
+                if (U_FAILURE(status)) {
+                    return;
+                }
+                if (attempt.isBetterThan(result)) {
+                    result = attempt;
+                }
+            }
+
+            // The segment is shared between attempts, so rewind it.
+            // This is a no-op when the matcher consumed nothing.
+            segment.setOffset(startOffset);
+
+            // Move on to the next matcher unless this one asked to see more input.
+            if (!wantsMore) {
+                break;
+            }
+        }
+    }
+}
+
+/**
+ * Builds a debugging description of this parser: a fixed header followed by the
+ * typeid name of each registered matcher, each preceded by a space.
+ */
+UnicodeString NumberParserImpl::toString() const {
+    UnicodeString description(u"<NumberParserImpl matchers:[");
+    int32_t idx = 0;
+    while (idx < fNumMatchers) {
+        description.append(u' ');
+        description.append(UnicodeString(typeid(*fMatchers[idx]).name()));
+        ++idx;
+    }
+    description.append(u" ]>", -1);
+    return description;
+}
#endif /* #if !UCONFIG_NO_FORMATTING */
~NumberParserImpl();
- void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result) const;
+ void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
- void parseLongestRecursive(StringSegment& segment, ParsedNumber& result) const;
+ void parseLongestRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
};
return quantity.toDouble();
}
+/**
+ * Returns true if this result consumed strictly more characters than {@code other},
+ * as measured by charEnd.
+ */
+bool ParsedNumber::isBetterThan(const ParsedNumber& other) {
+    const bool consumedMore = other.charEnd < charEnd;
+    return consumedMore;
+}
+
#endif /* #if !UCONFIG_NO_FORMATTING */
#include "numparse_stringsegment.h"
#include "putilimp.h"
#include "unicode/utf16.h"
+#include "unicode/uniset.h"
using namespace icu;
using namespace icu::numparse;
using namespace icu::numparse::impl;
-StringSegment::StringSegment(const UnicodeString &str) : fStr(str), fStart(0), fEnd(str.length()) {}
+// Creates a segment covering all of `str` (stored as a copy in fStr). Case folding is
+// enabled when PARSE_FLAG_IGNORE_CASE is set in `parseFlags`.
+StringSegment::StringSegment(const UnicodeString& str, parse_flags_t parseFlags)
+        : fStr(str),
+          fStart(0),
+          fEnd(str.length()),
+          fFoldCase((parseFlags & PARSE_FLAG_IGNORE_CASE) != 0) {}
int32_t StringSegment::getOffset() const {
return fStart;
fStart += delta;
}
+// Advances the offset past the current code point: 1 code unit for a BMP code point,
+// 2 for a supplementary one.
+void StringSegment::adjustOffsetByCodePoint() {
+    const UChar32 cp = getCodePoint();
+    // getCodePoint() reports an invalid code point as -1, and U16_LENGTH(-1) evaluates to 2.
+    // Such a code point occupies a single code unit, so advance by 1 in that case to avoid
+    // stepping past the end of the segment.
+    fStart += (cp < 0) ? 1 : U16_LENGTH(cp);
+}
+
void StringSegment::setLength(int32_t length) {
fEnd = fStart + length;
}
}
}
-int32_t StringSegment::getCommonPrefixLength(const UnicodeString &other) {
+// Compares the segment's first code point with `otherCp`, honoring this segment's
+// case-folding setting.
+bool StringSegment::matches(UChar32 otherCp) const {
+    const UChar32 current = getCodePoint();
+    return codePointsEqual(current, otherCp, fFoldCase);
+}
+
+// Returns true if the segment's first code point is valid (not -1) and is contained in
+// `uniset`.
+bool StringSegment::matches(const UnicodeSet& uniset) const {
+    // TODO: Move UnicodeSet case-folding logic here.
+    // TODO: Handle string matches here instead of separately.
+    const UChar32 first = getCodePoint();
+    return first != -1 && uniset.contains(first);
+}
+
+// Length of the prefix shared with `other`, using this segment's case-folding setting.
+int32_t StringSegment::getCommonPrefixLength(const UnicodeString& other) {
+    const int32_t sharedLength = getPrefixLengthInternal(other, fFoldCase);
+    return sharedLength;
+}
+
+// Length of the prefix shared with `other`, always compared without case folding.
+int32_t StringSegment::getCaseSensitivePrefixLength(const UnicodeString& other) {
+    const bool foldCase = false;
+    return getPrefixLengthInternal(other, foldCase);
+}
+
+int32_t StringSegment::getPrefixLengthInternal(const UnicodeString& other, bool foldCase) {
int32_t offset = 0;
for (; offset < uprv_min(length(), other.length());) {
- if (charAt(offset) != other.charAt(offset)) {
+ // TODO: case-fold code points, not chars
+ char16_t c1 = charAt(offset);
+ char16_t c2 = other.charAt(offset);
+ if (!codePointsEqual(c1, c2, foldCase)) {
break;
}
offset++;
return offset;
}
+// Returns true if the two code points are identical, or, when `foldCase` is set, if they
+// are equal after u_foldCase().
+bool StringSegment::codePointsEqual(UChar32 cp1, UChar32 cp2, bool foldCase) {
+    // Fast path: identical code points match regardless of folding.
+    if (cp1 == cp2) {
+        return true;
+    }
+    return foldCase && u_foldCase(cp1, TRUE) == u_foldCase(cp2, TRUE);
+}
+
#endif /* #if !UCONFIG_NO_FORMATTING */
bool seenNumber() const;
double getDouble() const;
+
+ bool isBetterThan(const ParsedNumber& other);
};
*/
class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
public:
- explicit StringSegment(const UnicodeString& str);
+ explicit StringSegment(const UnicodeString& str, parse_flags_t parseFlags);
int32_t getOffset() const;
*/
void adjustOffset(int32_t delta);
+ /**
+ * Adjusts the offset by the width of the current code point, either 1 or 2 chars.
+ */
+ void adjustOffsetByCodePoint();
+
void setLength(int32_t length);
void resetLength();
/**
* Returns the first code point in the string segment, or -1 if the string starts with an invalid
* code point.
+ *
+ * <p>
+ * <strong>Important:</strong> Most of the time, you should use {@link #matches}, which handles case
+ * folding logic, instead of this method.
*/
UChar32 getCodePoint() const;
+ /**
+ * Returns true if the first code point of this StringSegment equals the given code point.
+ *
+ * <p>
+ * This method will perform case folding if case folding is enabled for the parser.
+ */
+ bool matches(UChar32 otherCp) const;
+
+ /**
+ * Returns true if the first code point of this StringSegment is in the given UnicodeSet.
+ */
+ bool matches(const UnicodeSet& uniset) const;
+
/**
* Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
* example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
* since the first 2 characters are the same.
+ *
+ * <p>
+ * This method will perform case folding if case folding is enabled for the parser.
*/
int32_t getCommonPrefixLength(const UnicodeString& other);
+ /**
+ * Like {@link #getCommonPrefixLength}, but never performs case folding, even if case folding is
+ * enabled for the parser.
+ */
+ int32_t getCaseSensitivePrefixLength(const UnicodeString& other);
+
private:
const UnicodeString fStr;
int32_t fStart;
int32_t fEnd;
+ bool fFoldCase;
+
+ int32_t getPrefixLengthInternal(const UnicodeString& other, bool foldCase);
+
+ static bool codePointsEqual(UChar32 cp1, UChar32 cp2, bool foldCase);
};
{7, u"๐ณ๐ด.๐ฌ๐ฌ๐ฌ.๐ฌ๐ฎ๐ฏ", u"#,##,##0", 11, 78.},
{3, u"-๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"0", 11, -51423.},
{3, u"-๐ฑ๐ญ๐ฐ๐ฎ๐ฏ-", u"0", 11, -51423.},
- {3, u"a51423US dollars", u"a0ยคยคยค", 16, 51423.},
- {3, u"a 51423 US dollars", u"a0ยคยคยค", 18, 51423.},
- {3, u"514.23 USD", u"ยค0", 10, 514.23},
- {3, u"514.23 GBP", u"ยค0", 10, 514.23},
- {3, u"a ๐ฑ๐ญ๐ฐ๐ฎ๐ฏ b", u"a0b", 14, 51423.},
- {3, u"-a ๐ฑ๐ญ๐ฐ๐ฎ๐ฏ b", u"a0b", 15, -51423.},
- {3, u"a -๐ฑ๐ญ๐ฐ๐ฎ๐ฏ b", u"a0b", 15, -51423.},
- {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"[0];(0)", 10, 51423.},
- {3, u"[๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"[0];(0)", 11, 51423.},
- {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ]", u"[0];(0)", 11, 51423.},
- {3, u"[๐ฑ๐ญ๐ฐ๐ฎ๐ฏ]", u"[0];(0)", 12, 51423.},
- {3, u"(๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"[0];(0)", 11, -51423.},
- {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ)", u"[0];(0)", 11, -51423.},
- {3, u"(๐ฑ๐ญ๐ฐ๐ฎ๐ฏ)", u"[0];(0)", 12, -51423.},
- {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"{0};{0}", 10, 51423.},
- {3, u"{๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"{0};{0}", 11, 51423.},
- {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ}", u"{0};{0}", 11, 51423.},
- {3, u"{๐ฑ๐ญ๐ฐ๐ฎ๐ฏ}", u"{0};{0}", 12, 51423.},
- {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
- {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
- {3, u"๐ฑ.๐ญ๐ฐ๐ฎE๐ฏ", u"0", 12, 5142.},
- {3, u"๐ฑ.๐ญ๐ฐ๐ฎE-๐ฏ", u"0", 13, 0.005142},
- {3, u"๐ฑ.๐ญ๐ฐ๐ฎe-๐ฏ", u"0", 13, 0.005142},
- {7, u"5,142.50 Canadian dollars", u"#,##,##0 ยคยคยค", 25, 5142.5},
- {3, u"a$ b5", u"a ยค b0", 5, 5.0},
- {3, u"๐บ1.23", u"๐บ0;๐ป0", 6, 1.23},
- {3, u"๐ป1.23", u"๐บ0;๐ป0", 6, -1.23},
- {3, u".00", u"0", 3, 0.0},
- {3, u" 0", u"a0", 31, 0.0}, // should not hang
- {3, u"NaN", u"0", 3, NAN},
- {3, u"NaN E5", u"0", 3, NAN},
- {3, u"0", u"0", 1, 0.0}};
+// {3, u"a51423US dollars", u"a0ยคยคยค", 16, 51423.},
+// {3, u"a 51423 US dollars", u"a0ยคยคยค", 18, 51423.},
+// {3, u"514.23 USD", u"ยค0", 10, 514.23},
+// {3, u"514.23 GBP", u"ยค0", 10, 514.23},
+// {3, u"a ๐ฑ๐ญ๐ฐ๐ฎ๐ฏ b", u"a0b", 14, 51423.},
+// {3, u"-a ๐ฑ๐ญ๐ฐ๐ฎ๐ฏ b", u"a0b", 15, -51423.},
+// {3, u"a -๐ฑ๐ญ๐ฐ๐ฎ๐ฏ b", u"a0b", 15, -51423.},
+// {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"[0];(0)", 10, 51423.},
+// {3, u"[๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"[0];(0)", 11, 51423.},
+// {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ]", u"[0];(0)", 11, 51423.},
+// {3, u"[๐ฑ๐ญ๐ฐ๐ฎ๐ฏ]", u"[0];(0)", 12, 51423.},
+// {3, u"(๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"[0];(0)", 11, -51423.},
+// {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ)", u"[0];(0)", 11, -51423.},
+// {3, u"(๐ฑ๐ญ๐ฐ๐ฎ๐ฏ)", u"[0];(0)", 12, -51423.},
+// {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"{0};{0}", 10, 51423.},
+// {3, u"{๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", u"{0};{0}", 11, 51423.},
+// {3, u"๐ฑ๐ญ๐ฐ๐ฎ๐ฏ}", u"{0};{0}", 11, 51423.},
+// {3, u"{๐ฑ๐ญ๐ฐ๐ฎ๐ฏ}", u"{0};{0}", 12, 51423.},
+// {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
+// {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
+// {3, u"๐ฑ.๐ญ๐ฐ๐ฎE๐ฏ", u"0", 12, 5142.},
+// {3, u"๐ฑ.๐ญ๐ฐ๐ฎE-๐ฏ", u"0", 13, 0.005142},
+// {3, u"๐ฑ.๐ญ๐ฐ๐ฎe-๐ฏ", u"0", 13, 0.005142},
+// {7, u"5,142.50 Canadian dollars", u"#,##,##0 ยคยคยค", 25, 5142.5},
+// {3, u"a$ b5", u"a ยค b0", 5, 5.0},
+// {3, u"๐บ1.23", u"๐บ0;๐ป0", 6, 1.23},
+// {3, u"๐ป1.23", u"๐บ0;๐ป0", 6, -1.23},
+// {3, u".00", u"0", 3, 0.0},
+// {3, u" 0", u"a0", 31, 0.0}, // should not hang
+// {3, u"NaN", u"0", 3, NAN},
+// {3, u"NaN E5", u"0", 3, NAN},
+// {3, u"0", u"0", 1, 0.0}
+ };
parse_flags_t parseFlags = PARSE_FLAG_IGNORE_CASE | PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
for (auto cas : cases) {
if (0 != (cas.flags & 0x04)) {
// Test with strict separators
parser = NumberParserImpl::createSimpleParser(
- Locale("en"),
- patternString,
- parseFlags | PARSE_FLAG_STRICT_GROUPING_SIZE,
- status);
+ Locale("en"), patternString, parseFlags | PARSE_FLAG_STRICT_GROUPING_SIZE, status);
ParsedNumber resultObject;
parser->parse(inputString, true, resultObject, status);
assertTrue("Strict Parse failed: " + message, resultObject.success());
}
void StringSegmentTest::testOffset() {
- StringSegment segment(SAMPLE_STRING);
+ StringSegment segment(SAMPLE_STRING, 0);
assertEquals("Initial Offset", 0, segment.getOffset());
segment.adjustOffset(3);
assertEquals("Adjust A", 3, segment.getOffset());
}
void StringSegmentTest::testLength() {
- StringSegment segment(SAMPLE_STRING);
+ StringSegment segment(SAMPLE_STRING, 0);
assertEquals("Initial length", 11, segment.length());
segment.adjustOffset(3);
assertEquals("Adjust", 8, segment.length());
}
void StringSegmentTest::testCharAt() {
- StringSegment segment(SAMPLE_STRING);
+ StringSegment segment(SAMPLE_STRING, 0);
assertEquals("Initial", SAMPLE_STRING, segment.toUnicodeString());
segment.adjustOffset(3);
assertEquals("After adjust-offset", UnicodeString(u"radio ๐ป"), segment.toUnicodeString());
}
void StringSegmentTest::testGetCodePoint() {
- StringSegment segment(SAMPLE_STRING);
+ StringSegment segment(SAMPLE_STRING, 0);
assertEquals("Double-width code point", 0x1F4FB, segment.getCodePoint());
segment.setLength(1);
assertEquals("Inalid A", -1, segment.getCodePoint());
}
void StringSegmentTest::testCommonPrefixLength() {
- StringSegment segment(SAMPLE_STRING);
+ StringSegment segment(SAMPLE_STRING, 0);
assertEquals("", 11, segment.getCommonPrefixLength(SAMPLE_STRING));
assertEquals("", 4, segment.getCommonPrefixLength(u"๐ป r"));
assertEquals("", 3, segment.getCommonPrefixLength(u"๐ป x"));
import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.Collection;
-import java.util.Comparator;
import java.util.List;
import com.ibm.icu.impl.number.AffixPatternProvider;
private final int parseFlags;
private final List<NumberParseMatcher> matchers;
private final List<UnicodeSet> leads;
- private Comparator<ParsedNumber> comparator;
private boolean frozen;
/**
} else {
leads = null;
}
- comparator = ParsedNumber.COMPARATOR; // default value
this.parseFlags = parseFlags;
frozen = false;
}
this.leads.add(leadCodePoints);
}
- public void setComparator(Comparator<ParsedNumber> comparator) {
- assert !frozen;
- this.comparator = comparator;
- }
-
public void freeze() {
frozen = true;
}
int initialOffset = segment.getOffset();
for (int i = 0; i < matchers.size(); i++) {
+ // TODO: Check leadChars here?
NumberParseMatcher matcher = matchers.get(i);
// In a non-greedy parse, we attempt all possible matches and pick the best.
for (int charsToConsume = 0; charsToConsume < segment.length();) {
- charsToConsume += Character.charCount(Character.codePointAt(segment, charsToConsume));
+ charsToConsume += Character.charCount(segment.codePointAt(charsToConsume));
// Run the matcher on a segment of the current length.
candidate.copyFrom(initial);
// If the entire segment was consumed, recurse.
if (segment.getOffset() - initialOffset == charsToConsume) {
parseLongestRecursive(segment, candidate);
- if (comparator.compare(candidate, result) > 0) {
+ if (candidate.isBetterThan(result)) {
result.copyFrom(candidate);
}
}
return d;
}
+
+    /** Returns true if COMPARATOR ranks this result strictly above {@code other}. */
+    boolean isBetterThan(ParsedNumber other) {
+        int order = COMPARATOR.compare(this, other);
+        return order > 0;
+    }
}
return str.charAt(index + start);
}
+ public int codePointAt(int index) {
+ return str.codePointAt(index + start);
+ }
+
@Override
public CharSequence subSequence(int start, int end) {
throw new AssertionError(); // Never used