]> granicus.if.org Git - icu/commitdiff
ICU-20735 simpler state saving for C++ string tries
authorMarkus Scherer <markus.icu@gmail.com>
Sat, 10 Aug 2019 17:51:28 +0000 (10:51 -0700)
committerMarkus Scherer <markus.icu@gmail.com>
Mon, 12 Aug 2019 21:49:10 +0000 (14:49 -0700)
icu4c/source/common/unicode/bytestrie.h
icu4c/source/common/unicode/ucharstrie.h
icu4c/source/test/intltest/bytestrietest.cpp
icu4c/source/test/intltest/ucharstrietest.cpp
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/BytesTrieTest.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/CharsTrieTest.java

index 9dc0ed2615785b9f397e8de5cbaf54f2b24874b4..8d5420f3fbca7ae1108ce318bd69c6c224e6ab25 100644 (file)
@@ -97,6 +97,39 @@ public:
         return *this;
     }
 
+    /**
+     * Returns the state of this trie as a 64-bit integer.
+     * The state value is never 0.
+     *
+     * @return opaque state value
+     * @see resetToState64
+     * @draft ICU 65
+     */
+    uint64_t getState64() const {
+        return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
+            (uint64_t)(pos_ - bytes_);
+    }
+
+    /**
+     * Resets this trie to the saved state.
+     * Unlike resetToState(State), the 64-bit state value
+     * must be from getState64() from the same trie object or
+     * from one initialized the exact same way.
+     * Because of no validation, this method is faster.
+     *
+     * @param state The opaque trie state value from getState64().
+     * @return *this
+     * @see getState64
+     * @see resetToState
+     * @see reset
+     * @draft ICU 65
+     */
+    BytesTrie &resetToState64(uint64_t state) {
+        remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
+        pos_ = bytes_ + (state & kState64PosMask);
+        return *this;
+    }
+
     /**
      * BytesTrie state object, for saving a trie's current state
      * and resetting the trie back to this state later.
@@ -505,6 +538,13 @@ private:
     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
 
+    // Bit layout of the 64-bit value built by getState64():
+    // the bits from kState64RemainingShift upward hold remainingMatchLength_+2,
+    // the bits below (kState64PosMask) hold the position.
+    // remainingMatchLength_ is -1..14 (14=kMaxLinearMatchLength-2, kMaxLinearMatchLength=0x10);
+    // adding 2 stores it as a positive value 1..16, which needs 5 bits.
+    static constexpr int32_t kState64RemainingShift = 64 - 5;
+    static constexpr uint64_t kState64PosMask = ~UINT64_C(0) >> (64 - kState64RemainingShift);
+
     uint8_t *ownedArray_;
 
     // Fixed value referencing the BytesTrie bytes.
index a702758beb31f93ce82e61da83bad2b98a88e72a..8bc914e24d6354e77e904bee32e9bb555db6b821 100644 (file)
@@ -97,6 +97,39 @@ public:
         return *this;
     }
 
+    /**
+     * Returns the state of this trie as a 64-bit integer.
+     * The state value is never 0.
+     *
+     * @return opaque state value
+     * @see resetToState64
+     * @draft ICU 65
+     */
+    uint64_t getState64() const {
+        return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
+            (uint64_t)(pos_ - uchars_);
+    }
+
+    /**
+     * Resets this trie to the saved state.
+     * Unlike resetToState(State), the 64-bit state value
+     * must be from getState64() from the same trie object or
+     * from one initialized the exact same way.
+     * Because of no validation, this method is faster.
+     *
+     * @param state The opaque trie state value from getState64().
+     * @return *this
+     * @see getState64
+     * @see resetToState
+     * @see reset
+     * @draft ICU 65
+     */
+    UCharsTrie &resetToState64(uint64_t state) {
+        remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
+        pos_ = uchars_ + (state & kState64PosMask);
+        return *this;
+    }
+
     /**
      * UCharsTrie state object, for saving a trie's current state
      * and resetting the trie back to this state later.
@@ -563,6 +596,13 @@ private:
 
     static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;  // 0x03feffff
 
+    // Bit layout of the 64-bit value built by getState64():
+    // the bits from kState64RemainingShift upward hold remainingMatchLength_+2,
+    // the bits below (kState64PosMask) hold the position.
+    // remainingMatchLength_ is -1..14 (14=kMaxLinearMatchLength-2, kMaxLinearMatchLength=0x10);
+    // adding 2 stores it as a positive value 1..16, which needs 5 bits.
+    static constexpr int32_t kState64RemainingShift = 64 - 5;
+    static constexpr uint64_t kState64PosMask = ~UINT64_C(0) >> (64 - kState64RemainingShift);
+
     char16_t *ownedArray_;
 
     // Fixed value referencing the UCharsTrie words.
index 8a2ac2cf5f47ab072a31efb8e72013dd8392cef4..bdf0b9003bb7976e856dbe3417cb01ae1d4a7a0b 100644 (file)
@@ -64,6 +64,7 @@ public:
     void checkFirst(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkNext(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkNextWithState(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
+    void checkNextWithState64(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkNextString(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkIterator(const BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkIterator(BytesTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
@@ -613,6 +614,7 @@ void BytesTrieTest::checkData(const StringAndValue data[], int32_t dataLength, U
     checkFirst(*trie, data, dataLength);
     checkNext(*trie, data, dataLength);
     checkNextWithState(*trie, data, dataLength);
+    checkNextWithState64(*trie, data, dataLength);
     checkNextString(*trie, data, dataLength);
     checkIterator(*trie, data, dataLength);
 }
@@ -825,6 +827,61 @@ void BytesTrieTest::checkNextWithState(BytesTrie &trie,
     }
 }
 
+void BytesTrieTest::checkNextWithState64(BytesTrie &trie,  // exercises getState64()/resetToState64() on each data[i].s
+                                         const StringAndValue data[], int32_t dataLength) {
+    assertTrue("trie(initial state).getState64()!=0", trie.getState64() != 0);  // checked before any input is consumed
+    for(int32_t i=0; i<dataLength; ++i) {
+        const char *expectedString=data[i].s;
+        int32_t stringLength= static_cast<int32_t>(strlen(expectedString));
+        int32_t partialLength = stringLength / 3;  // only the first third is consumed before the state is captured
+        for(int32_t j=0; j<partialLength; ++j) {
+            if(!USTRINGTRIE_MATCHES(trie.next(expectedString[j]))) {
+                errln("trie.next()=USTRINGTRIE_NO_MATCH for a prefix of %s", data[i].s);
+                return;  // gives up on the remaining strings; this path skips the trie.reset() at the loop end
+            }
+        }
+        uint64_t state = trie.getState64();  // snapshot taken after the partial match
+        assertTrue("trie.getState64()!=0", state != 0);
+        UStringTrieResult resultAtState=trie.current();
+        UStringTrieResult result;
+        int32_t valueAtState=-99;  // placeholder; compared below only if resultAtState has a value
+        if(USTRINGTRIE_HAS_VALUE(resultAtState)) {
+            valueAtState=trie.getValue();
+        }
+        result=trie.next(0);  // expected mismatch, verified by the check just below
+        if(result!=USTRINGTRIE_NO_MATCH || result!=trie.current()) {
+            errln("trie.next(0) matched after part of %s", data[i].s);
+        }
+        if( resultAtState!=trie.resetToState64(state).current() ||  // restore #1: must report the snapshot's result
+            (USTRINGTRIE_HAS_VALUE(resultAtState) && valueAtState!=trie.getValue())
+        ) {
+            errln("trie.next(part of %s) changes current()/getValue() after "
+                  "getState64/next(0)/resetToState64",
+                  data[i].s);
+        } else if(!USTRINGTRIE_HAS_VALUE(
+                      result=trie.next(expectedString+partialLength,  // continue with the rest of the string
+                                       stringLength-partialLength)) ||
+                  result!=trie.current()) {
+            errln("trie.next(rest of %s) does not seem to contain %s after "
+                  "getState64/next(0)/resetToState64",
+                  data[i].s, data[i].s);
+        } else if(!USTRINGTRIE_HAS_VALUE(
+                      result=trie.resetToState64(state).  // restore #2, then match the rest of the string again
+                                  next(expectedString+partialLength,
+                                       stringLength-partialLength)) ||
+                  result!=trie.current()) {
+            errln("trie does not seem to contain %s after getState64/next(rest)/resetToState64",
+                  data[i].s);
+        } else if(trie.getValue()!=data[i].value) {  // the full string must yield the expected value
+            errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
+                  data[i].s,
+                  (long)trie.getValue(), (long)trie.getValue(),
+                  (long)data[i].value, (long)data[i].value);
+        }
+        trie.reset();  // NOTE(review): presumably returns the trie to its initial state for the next string
+    }
+}
+
 // next(string) is also tested in other functions,
 // but here we try to go partway through the string, and then beyond it.
 void BytesTrieTest::checkNextString(BytesTrie &trie,
index 316880eb3f4328110ca6389a0f2beaddd606ff5a..394c48b3b59ffd7899575fd332a3ff60d56d9160 100644 (file)
@@ -71,6 +71,7 @@ public:
     void checkFirst(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkNext(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkNextWithState(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
+    void checkNextWithState64(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkNextString(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkIterator(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
     void checkIterator(UCharsTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
@@ -762,6 +763,7 @@ void UCharsTrieTest::checkData(const StringAndValue data[], int32_t dataLength,
     checkFirst(*trie, data, dataLength);
     checkNext(*trie, data, dataLength);
     checkNextWithState(*trie, data, dataLength);
+    checkNextWithState64(*trie, data, dataLength);
     checkNextString(*trie, data, dataLength);
     checkIterator(*trie, data, dataLength);
 }
@@ -987,6 +989,61 @@ void UCharsTrieTest::checkNextWithState(UCharsTrie &trie,
     }
 }
 
+void UCharsTrieTest::checkNextWithState64(UCharsTrie &trie,  // exercises getState64()/resetToState64() on each data[i].s
+                                          const StringAndValue data[], int32_t dataLength) {
+    assertTrue("trie(initial state).getState64()!=0", trie.getState64() != 0);  // checked before any input is consumed
+    for(int32_t i=0; i<dataLength; ++i) {
+        UnicodeString expectedString=UnicodeString(data[i].s, -1, US_INV).unescape();  // test string after unescape()
+        int32_t stringLength=expectedString.length();
+        int32_t partialLength = stringLength / 3;  // only the first third is consumed before the state is captured
+        for(int32_t j=0; j<partialLength; ++j) {
+            if(!USTRINGTRIE_MATCHES(trie.next(expectedString[j]))) {
+                errln("trie.next()=USTRINGTRIE_NO_MATCH for a prefix of %s", data[i].s);
+                return;  // gives up on the remaining strings; this path skips the trie.reset() at the loop end
+            }
+        }
+        uint64_t state = trie.getState64();  // snapshot taken after the partial match
+        assertTrue("trie.getState64()!=0", state != 0);
+        UStringTrieResult resultAtState=trie.current();
+        UStringTrieResult result;
+        int32_t valueAtState=-99;  // placeholder; compared below only if resultAtState has a value
+        if(USTRINGTRIE_HAS_VALUE(resultAtState)) {
+            valueAtState=trie.getValue();
+        }
+        result=trie.next(0);  // expected mismatch, verified by the check just below
+        if(result!=USTRINGTRIE_NO_MATCH || result!=trie.current()) {
+            errln("trie.next(0) matched after part of %s", data[i].s);
+        }
+        if( resultAtState!=trie.resetToState64(state).current() ||  // restore #1: must report the snapshot's result
+            (USTRINGTRIE_HAS_VALUE(resultAtState) && valueAtState!=trie.getValue())
+        ) {
+            errln("trie.next(part of %s) changes current()/getValue() after "
+                  "getState64/next(0)/resetToState64",
+                  data[i].s);
+        } else if(!USTRINGTRIE_HAS_VALUE(
+                      result=trie.next(expectedString.getTerminatedBuffer()+partialLength,  // continue with the rest
+                                       stringLength-partialLength)) ||
+                  result!=trie.current()) {
+            errln("trie.next(rest of %s) does not seem to contain %s after "
+                  "getState64/next(0)/resetToState64",
+                  data[i].s, data[i].s);
+        } else if(!USTRINGTRIE_HAS_VALUE(
+                      result=trie.resetToState64(state).  // restore #2, then match the rest of the string again
+                                  next(expectedString.getTerminatedBuffer()+partialLength,
+                                       stringLength-partialLength)) ||
+                  result!=trie.current()) {
+            errln("trie does not seem to contain %s after getState64/next(rest)/resetToState64",
+                  data[i].s);
+        } else if(trie.getValue()!=data[i].value) {  // the full string must yield the expected value
+            errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
+                  data[i].s,
+                  (long)trie.getValue(), (long)trie.getValue(),
+                  (long)data[i].value, (long)data[i].value);
+        }
+        trie.reset();  // NOTE(review): presumably returns the trie to its initial state for the next string
+    }
+}
+
 // next(string) is also tested in other functions,
 // but here we try to go partway through the string, and then beyond it.
 void UCharsTrieTest::checkNextString(UCharsTrie &trie,
index 6cca357f752983409f54588acfbd20c0c60f499b..8efea3a539c2278a1c9da767ed603d39f508e9a6 100644 (file)
@@ -768,16 +768,16 @@ public class BytesTrieTest extends TestFmwk {
                 (resultAtState.hasValue() && valueAtState!=trie.getValue())
             ) {
                 errln("trie.next(part of "+data[i].s+") changes current()/getValue() after "+
-                      "saveState/next(0)/resetToState");
+                      "getState64/next(0)/resetToState64");
             } else if(!(result=trie.next(expectedString, partialLength, stringLength)).hasValue() ||
                       result!=trie.current()) {
                 errln("trie.next(rest of "+data[i].s+") does not seem to contain "+data[i].s+" after "+
-                      "saveState/next(0)/resetToState");
+                      "getState64/next(0)/resetToState64");
             } else if(!(result=trie.resetToState64(state).
                                 next(expectedString, partialLength, stringLength)).hasValue() ||
                       result!=trie.current()) {
                 errln("trie does not seem to contain "+data[i].s+
-                      " after saveState/next(rest)/resetToState");
+                      " after getState64/next(rest)/resetToState64");
             } else if(trie.getValue()!=data[i].value) {
                 errln(String.format("trie value for %s is %d=0x%x instead of expected %d=0x%x",
                                     data[i].s,
index 149e7bfdddcfd9c4f4ca132b733704f928e0ba41..3fdc2ed43a02043065362fdd22d6cea0a3aa7151 100644 (file)
@@ -914,16 +914,16 @@ public class CharsTrieTest extends TestFmwk {
                 (resultAtState.hasValue() && valueAtState!=trie.getValue())
             ) {
                 errln("trie.next(part of "+data[i].s+") changes current()/getValue() after "+
-                      "saveState/next(0)/resetToState");
+                      "getState64/next(0)/resetToState64");
             } else if(!(result=trie.next(expectedString, partialLength, stringLength)).hasValue() ||
                       result!=trie.current()) {
                 errln("trie.next(rest of "+data[i].s+") does not seem to contain "+data[i].s+" after "+
-                      "saveState/next(0)/resetToState");
+                      "getState64/next(0)/resetToState64");
             } else if(!(result=trie.resetToState64(state).
                                 next(expectedString, partialLength, stringLength)).hasValue() ||
                       result!=trie.current()) {
                 errln("trie does not seem to contain "+data[i].s+
-                      " after saveState/next(rest)/resetToState");
+                      " after getState64/next(rest)/resetToState64");
             } else if(trie.getValue()!=data[i].value) {
                 errln(String.format("trie value for %s is %d=0x%x instead of expected %d=0x%x",
                                     data[i].s,