]> granicus.if.org Git - icu/commitdiff
ICU-12410 add Edits::Iterator and change UnicodeString case mappings to get & apply...
authorMarkus Scherer <markus.icu@gmail.com>
Wed, 4 Jan 2017 21:20:08 +0000 (21:20 +0000)
committerMarkus Scherer <markus.icu@gmail.com>
Wed, 4 Jan 2017 21:20:08 +0000 (21:20 +0000)
X-SVN-Rev: 39547

icu4c/source/common/unicode/ucasemap.h
icu4c/source/common/unistr_case.cpp
icu4c/source/common/unistr_titlecase_brkiter.cpp
icu4c/source/common/ustr_imp.h
icu4c/source/common/ustr_titlecase_brkiter.cpp
icu4c/source/common/ustrcase.cpp

index a97609d150e549758afff25e302e07b47d8f06b4..25a1af25dd514e50e8581dfb9f44a9a9c3983e6a 100644 (file)
@@ -117,13 +117,41 @@ public:
      */
     Edits() :
             array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
-            errorCode(U_ZERO_ERROR) {}
+            omit(FALSE), errorCode(U_ZERO_ERROR) {}
     ~Edits();
+
     /**
      * Resets the data but may not release memory.
      * @internal ICU 59 technology preview
      */
     void reset();
+
+    /**
+     * Controls whether the case mapping function is to write or omit
+     * characters that do not change.
+     * The complete result can be computed by applying just the changes
+     * to the original string.
+     * @see omitUnchanged
+     * @see writeUnchanged
+     * @internal ICU 59 technology preview
+     */
+    Edits &setWriteUnchanged(UBool write) {
+        omit = !write;  // stored inverted: the constructor's omit(FALSE) default means "write unchanged text"
+        return *this;  // returns this object so that calls can be chained
+    }
+    /**
+     * @return TRUE if the case mapping function is to omit characters that do not change.
+     * @see setWriteUnchanged
+     * @internal ICU 59 technology preview
+     */
+    UBool omitUnchanged() const { return omit; }
+    /**
+     * @return TRUE if the case mapping function is to write characters that do not change.
+     * @see setWriteUnchanged
+     * @internal ICU 59 technology preview
+     */
+    UBool writeUnchanged() const { return !omit; }
+
     /**
      * Adds a record for an unchanged segment of text.
      * @internal ICU 59 technology preview
@@ -148,6 +176,75 @@ public:
      * @internal ICU 59 technology preview
      */
     int32_t lengthDelta() const { return delta; }
+    /**
+     * @return TRUE if there are any change edits
+     * @internal ICU 59 technology preview
+     */
+    UBool hasChanges() const;
+
+    /**
+     * Access to the list of edits.
+     * @see getCoarseIterator
+     * @see getFineIterator
+     * @internal ICU 59 technology preview
+     */
+    struct Iterator final : public UMemory {
+        /**
+         * Advances to the next edit.
+         * @return TRUE if there is another edit
+         * @internal ICU 59 technology preview
+         */
+        UBool next(UErrorCode &errorCode);
+
+        /**
+         * TRUE if this edit replaces oldLength units with newLength different ones.
+         * FALSE if oldLength units remain unchanged.
+         * @internal ICU 59 technology preview
+         */
+        UBool changed;
+        /**
+         * Number of units in the original string which are replaced or remain unchanged.
+         * @internal ICU 59 technology preview
+         */
+        int32_t oldLength;
+        /**
+         * Number of units in the modified string, if changed is TRUE.
+         * Same as oldLength if changed is FALSE.
+         * @internal ICU 59 technology preview
+         */
+        int32_t newLength;
+
+    private:
+        friend class Edits;
+
+        Iterator(const uint16_t *a, int32_t len, UBool crs) : changed(FALSE), oldLength(0),
+                newLength(0), array(a), index(0), length(len), width(0), remaining(0), coarse(crs) {}
+
+        int32_t readLength(int32_t head);  // decodes one length field, consuming trail units
+
+        const uint16_t *array;  // edit units of the creating Edits object; not copied
+        int32_t index, length;  // read position in array, and number of units in array
+        int32_t width, remaining;  // fine-grained only: width and count of pending equal-length changes
+        UBool coarse;  // TRUE: next() merges adjacent changes; FALSE: it reports them one by one
+    };
+
+    /**
+     * Returns an Iterator for coarse-grained changes for simple string updates.
+     * @return an Iterator that merges adjacent changes.
+     * @internal ICU 59 technology preview
+     */
+    Iterator getCoarseIterator() const {
+        return Iterator(array, length, TRUE);
+    }
+
+    /**
+     * Returns an Iterator for fine-grained changes for modifying text with metadata.
+     * @return an Iterator that separates adjacent changes.
+     * @internal ICU 59 technology preview
+     */
+    Iterator getFineIterator() const {
+        return Iterator(array, length, FALSE);
+    }
 
 private:
     Edits(const Edits &) = delete;
@@ -165,6 +262,7 @@ private:
     int32_t capacity;
     int32_t length;
     int32_t delta;
+    UBool omit;
     UErrorCode errorCode;
     uint16_t stackArray[STACK_CAPACITY];
 };
@@ -188,7 +286,9 @@ private:
  *
  * @internal ICU 59 technology preview
  */
-#define UCASEMAP_OMIT_UNCHANGED 0x4000
+// TODO: does not work well as an option because we would need to set/reset it on UCaseMaps
+// that are often const, replaced for now by Edits.setWriteUnchanged(UBool)
+// #define UCASEMAP_OMIT_UNCHANGED 0x4000
 
 #endif  // U_HIDE_INTERNAL_API
 
@@ -520,6 +620,8 @@ ucasemap_utf8FoldCase(const UCaseMap *csm,
                       const char *src, int32_t srcLength,
                       UErrorCode *pErrorCode);
 
+#if U_SHOW_CPLUSPLUS_API
+
 // Not #ifndef U_HIDE_INTERNAL_API because UnicodeString needs the UStringCaseMapper.
 /**
  * Internal string case mapping function type.
@@ -535,4 +637,5 @@ UStringCaseMapper(const UCaseMap *csm,
                   icu::Edits *edits,
                   UErrorCode *pErrorCode);
 
+#endif  // U_SHOW_CPLUSPLUS_API
 #endif
index 1715b6ec66e268d4784ee35af1e3d8f094bff637..32fb20e87e6f87d00426390eff4edf7483534c98 100644 (file)
@@ -25,6 +25,7 @@
 #include "unicode/ustring.h"
 #include "unicode/unistr.h"
 #include "unicode/uchar.h"
+#include "uassert.h"
 #include "uelement.h"
 #include "ustr_imp.h"
 
@@ -94,49 +95,104 @@ UnicodeString::caseMap(const UCaseMap *csm,
     return *this;
   }
 
-  // We need to allocate a new buffer for the internal string case mapping function.
-  // This is very similar to how doReplace() keeps the old array pointer
-  // and deletes the old array itself after it is done.
-  // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
-  UChar oldStackBuffer[US_STACKBUF_SIZE];
+  UChar oldBuffer[2 * US_STACKBUF_SIZE];
   UChar *oldArray;
-  int32_t oldLength;
-
-  if(fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) {
-    // copy the stack buffer contents because it will be overwritten
-    oldArray = oldStackBuffer;
-    oldLength = getShortLength();
-    u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
+  int32_t oldLength = length();
+  int32_t newLength;
+  UBool writable = isBufferWritable();
+  UErrorCode errorCode = U_ZERO_ERROR;
+
+  // Try to avoid heap-allocating a new character array for this string.
+  if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
+    // Short string: Copy the contents into a temporary buffer and
+    // case-map back into the current array, or into the stack buffer.
+    UChar *buffer = getArrayStart();
+    int32_t capacity;
+    oldArray = oldBuffer;
+    u_memcpy(oldBuffer, buffer, oldLength);
+    if (writable) {
+      capacity = getCapacity();
+    } else {
+      // Switch from the read-only alias or shared heap buffer to the stack buffer.
+      if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) {
+        return *this;
+      }
+      U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
+      buffer = fUnion.fStackFields.fBuffer;
+      capacity = US_STACKBUF_SIZE;
+    }
+    newLength = stringCaseMapper(csm, buffer, capacity, oldArray, oldLength, NULL, &errorCode);
+    if (U_SUCCESS(errorCode)) {
+      setLength(newLength);
+      return *this;
+    } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
+      // common overflow handling below
+    } else {
+      setToBogus();
+      return *this;
+    }
   } else {
+    // Longer string or read-only buffer:
+    // Collect only changes and then apply them to this string.
+    // Case mapping often changes only small parts of a string,
+    // and often does not change its length.
     oldArray = getArrayStart();
-    oldLength = length();
+    Edits edits;
+    edits.setWriteUnchanged(FALSE);
+    UChar replacementChars[200];
+    int32_t replacementLength = stringCaseMapper(
+            csm, replacementChars, UPRV_LENGTHOF(replacementChars),
+            oldArray, oldLength, &edits, &errorCode);
+    UErrorCode editsError = U_ZERO_ERROR;
+    if (edits.setErrorCode(editsError)) {
+      setToBogus();
+      return *this;
+    }
+    newLength = oldLength + edits.lengthDelta();
+    if (U_SUCCESS(errorCode)) {
+      if (!cloneArrayIfNeeded(newLength, newLength)) {
+        return *this;
+      }
+      int32_t index = 0;  // index into this string
+      int32_t replIndex = 0;  // index into replacementChars
+      for (Edits::Iterator iter = edits.getCoarseIterator(); iter.next(errorCode);) {
+        if (iter.changed) {
+          doReplace(index, iter.oldLength, replacementChars, replIndex, iter.newLength);
+          replIndex += iter.newLength;
+        }
+        index += iter.newLength;
+      }
+      if (U_FAILURE(errorCode)) {
+        setToBogus();
+      }
+      U_ASSERT(replIndex == replacementLength);
+      return *this;
+    } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
+      // common overflow handling below
+    } else {
+      setToBogus();
+      return *this;
+    }
   }
 
-  int32_t capacity;
-  if(oldLength <= US_STACKBUF_SIZE) {
-    capacity = US_STACKBUF_SIZE;
-  } else {
-    capacity = oldLength + 20;
-  }
+  // Handle buffer overflow, newLength is known.
+  // We need to allocate a new buffer for the internal string case mapping function.
+  // This is very similar to how doReplace() keeps the old array pointer
+  // and deletes the old array itself after it is done.
+  // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
   int32_t *bufferToDelete = 0;
-  if(!cloneArrayIfNeeded(capacity, capacity, FALSE, &bufferToDelete, TRUE)) {
+  if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
     return *this;
   }
-
-  // Case-map, and if the result is too long, then reallocate and repeat.
-  UErrorCode errorCode;
-  int32_t newLength;
-  do {
-    errorCode = U_ZERO_ERROR;
-    newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(),
-                                 oldArray, oldLength, &errorCode);
-    setLength(newLength);
-  } while(errorCode==U_BUFFER_OVERFLOW_ERROR && cloneArrayIfNeeded(newLength, newLength, FALSE));
-
+  errorCode = U_ZERO_ERROR;
+  newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(),
+                               oldArray, oldLength, NULL, &errorCode);
   if (bufferToDelete) {
     uprv_free(bufferToDelete);
   }
-  if(U_FAILURE(errorCode)) {
+  if (U_SUCCESS(errorCode)) {
+    setLength(newLength);
+  } else {
     setToBogus();
   }
   return *this;
index 3d6737cfc5e3d9601a3d9c48a1c70dd0916a6f41..c909133cdbe1346bc64f32e08dcb3d2284dee77d 100644 (file)
@@ -32,9 +32,10 @@ static int32_t U_CALLCONV
 unistr_case_internalToTitle(const UCaseMap *csm,
                             UChar *dest, int32_t destCapacity,
                             const UChar *src, int32_t srcLength,
+                            icu::Edits *edits,
                             UErrorCode *pErrorCode) {
   ubrk_setText(csm->iter, src, srcLength, pErrorCode);
-  return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, pErrorCode);
+  return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, edits, pErrorCode);
 }
 
 /*
index 12a9ac9ca317681f158e6eb56c3f21849896333b..544d05c376822b9dba7928fc745d0bb114de38f6 100644 (file)
@@ -102,6 +102,10 @@ uprv_haveProperties(UErrorCode *pErrorCode);
 /*U_CFUNC int8_t
 uprv_loadPropsData(UErrorCode *errorCode);*/
 
+#ifdef __cplusplus
+// TODO: Consider moving these case mapping definitions
+// into a new internal header like ucasemap_imp.h.
+
 /*
  * Internal string casing functions implementing
  * ustring.h/ustrcase.c and UnicodeString case mapping functions.
@@ -117,10 +121,6 @@ struct UCaseMap {
     uint32_t options;
 };
 
-#ifndef __UCASEMAP_H__
-typedef struct UCaseMap UCaseMap;
-#endif
-
 #if UCONFIG_NO_BREAK_ITERATION
 #   define UCASEMAP_INITIALIZER { NULL, { 0 }, 0, 0 }
 #else
@@ -209,8 +209,6 @@ ucasemap_mapUTF8(const UCaseMap *csm,
                  UTF8CaseMapper *stringCaseMapper,
                  UErrorCode *pErrorCode);
 
-#ifdef __cplusplus
-
 U_NAMESPACE_BEGIN
 namespace GreekUpper {
 
index 21a53be4ff72f086daa125e81f709f61c3ce8d23..d5e5a2c24152ac18f84de66b8868fc231eedd71f 100644 (file)
@@ -66,7 +66,7 @@ u_strToTitle(UChar *dest, int32_t destCapacity,
         &csm,
         dest, destCapacity,
         src, srcLength,
-        ustrcase_internalToTitle, pErrorCode);
+        ustrcase_internalToTitle, NULL, pErrorCode);
     if(titleIter==NULL && csm.iter!=NULL) {
         ubrk_close(csm.iter);
     }
index 7459e3572edbfa53262c726a79ce764948c9c4ae..c833345788a4f6a7469a87680d385687fec2d5c1 100644 (file)
@@ -208,12 +208,120 @@ UBool Edits::growArray() {
 }
 
 UBool Edits::setErrorCode(UErrorCode &outErrorCode) {
-    if(U_FAILURE(outErrorCode)) { return TRUE; }
-    if(U_SUCCESS(errorCode)) { return FALSE; }
+    if (U_FAILURE(outErrorCode)) { return TRUE; }
+    if (U_SUCCESS(errorCode)) { return FALSE; }
     outErrorCode = errorCode;
     return TRUE;
 }
 
+UBool Edits::hasChanges() const {  // TRUE if any recorded edit is a change rather than an unchanged span
+    if (delta != 0) {  // shortcut: the value reported by lengthDelta() is nonzero
+        return TRUE;
+    }
+    for (int32_t i = 0; i < length; ++i) {  // otherwise scan every stored unit
+        if (array[i] > MAX_UNCHANGED) {  // units up to MAX_UNCHANGED encode unchanged spans; anything above does not
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+UBool Edits::Iterator::next(UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return FALSE; }  // do nothing if the caller already has an error
+    // Always set all relevant public fields: Do not rely on them not having been touched.
+    if (remaining > 0) {
+        // Fine-grained iterator: Continue a sequence of equal-length changes.
+        changed = TRUE;
+        oldLength = newLength = width;
+        --remaining;
+        return TRUE;
+    }
+    if (index >= length) {
+        return FALSE;  // all units consumed: no more edits
+    }
+    int32_t u = array[index++];  // head unit of the next edit record
+    if (u <= MAX_UNCHANGED) {
+        // Combine adjacent unchanged ranges.
+        changed = FALSE;
+        oldLength = u + 1;  // an unchanged unit stores its span length minus one
+        while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
+            ++index;
+            if (u >= (INT32_MAX - oldLength)) {  // oldLength + u + 1 would exceed INT32_MAX
+                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+                return FALSE;
+            }
+            oldLength += u + 1;
+        }
+        newLength = oldLength;
+        return TRUE;
+    }
+    changed = TRUE;
+    if (u <= MAX_SHORT_CHANGE) {
+        if (coarse) {
+            int32_t w = u >> 12;  // width of each change, from the bits above the low 12
+            int32_t len = (u & 0xfff) + 1;  // number of changes: the low 12 bits hold count - 1
+            oldLength = newLength = w * len;
+        } else {
+            // Split a sequence of equal-length changes that was compressed into one unit.
+            oldLength = newLength = width = u >> 12;
+            remaining = u & 0xfff;  // this call reports the first change; the rest come from the remaining > 0 branch
+            return TRUE;
+        }
+    } else {
+        U_ASSERT(u <= 0x7fff);
+        oldLength = readLength((u >> 6) & 0x3f);  // old-length field: bits 11..6; may consume trail units
+        newLength = readLength(u & 0x3f);  // new-length field: bits 5..0; read after the old length's trail units
+        if (!coarse) {
+            return TRUE;  // fine-grained: report this single change without merging
+        }
+    }
+    // Combine adjacent changes.
+    while (index < length && (u = array[index]) > MAX_UNCHANGED) {
+        ++index;
+        if (u <= MAX_SHORT_CHANGE) {
+            int32_t w = u >> 12;
+            int32_t len = (u & 0xfff) + 1;
+            len = w * len;  // total units covered by this compressed run of changes
+            if (len > (INT32_MAX - oldLength) || len > (INT32_MAX - newLength)) {
+                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+                return FALSE;
+            }
+            oldLength += len;
+            newLength += len;
+        } else {
+            U_ASSERT(u <= 0x7fff);
+            int32_t oldLen = readLength((u >> 6) & 0x3f);
+            int32_t newLen = readLength(u & 0x3f);
+            if (oldLen > (INT32_MAX - oldLength) || newLen > (INT32_MAX - newLength)) {
+                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+                return FALSE;
+            }
+            oldLength += oldLen;
+            newLength += newLen;
+        }
+    }
+    return TRUE;
+}
+
+int32_t Edits::Iterator::readLength(int32_t head) {  // head: 6-bit length field taken from a change unit
+    if (head < LENGTH_IN_1TRAIL) {
+        return head;  // small length: the field is the length itself, no trail units
+    } else if (head < LENGTH_IN_2TRAIL) {
+        U_ASSERT(index < length);
+        U_ASSERT(array[index] >= 0x8000);
+        return array[index++] & 0x7fff;  // fix: strip the 0x8000 trail marker, as the two-trail branch does
+    } else {
+        U_ASSERT((index + 2) <= length);
+        U_ASSERT(array[index] >= 0x8000);
+        U_ASSERT(array[index + 1] >= 0x8000);
+        int32_t len = ((head & 1) << 30) |  // top bit comes from the head field
+                ((int32_t)(array[index] & 0x7fff) << 15) |  // next 15 bits from the first trail unit
+                (array[index + 1] & 0x7fff);  // low 15 bits from the second trail unit
+        index += 2;
+        return len;
+    }
+}
+
 U_NAMESPACE_END
 
 U_NAMESPACE_USE
@@ -224,7 +332,7 @@ U_NAMESPACE_USE
 static inline int32_t
 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
              int32_t result, const UChar *s,
-             uint32_t options, int32_t cpLength, icu::Edits *edits) {
+             int32_t cpLength, icu::Edits *edits) {
     UChar32 c;
     int32_t length;
 
@@ -233,9 +341,9 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
         /* (not) original code point */
         if(edits!=NULL) {
             edits->addUnchanged(cpLength);
-        }
-        if(options & UCASEMAP_OMIT_UNCHANGED) {
-            return destIndex;
+            if(edits->omitUnchanged()) {
+                return destIndex;
+            }
         }
         c=~result;
         if(destIndex<destCapacity && c<=0xffff) {  // BMP slightly-fastpath
@@ -308,6 +416,12 @@ static inline int32_t
 appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
                 const UChar *s, int32_t length, icu::Edits *edits) {
     if(length>0) {
+        if(edits!=NULL) {
+            edits->addUnchanged(length);
+            if(edits->omitUnchanged()) {
+                return destIndex;
+            }
+        }
         if(length>(INT32_MAX-destIndex)) {
             return -1;  // integer overflow
         }
@@ -315,9 +429,6 @@ appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
             u_memcpy(dest+destIndex, s, length);
         }
         destIndex+=length;
-        if(edits!=NULL) {
-            edits->addUnchanged(length);
-        }
     }
     return destIndex;
 }
@@ -379,7 +490,7 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
         const UChar *s;
         c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
         destIndex = appendResult(dest, destIndex, destCapacity, c, s,
-                                 csm->options, srcIndex - cpStart, edits);
+                                 srcIndex - cpStart, edits);
         if (destIndex < 0) {
             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
             return 0;
@@ -482,7 +593,7 @@ ustrcase_internalToTitle(const UCaseMap *csm,
                 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s,
                                     csm->locale, &locCache);
                 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
-                                       csm->options, titleLimit-titleStart, edits);
+                                       titleLimit-titleStart, edits);
                 if(destIndex<0) {
                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                     return 0;
@@ -1104,7 +1215,7 @@ int32_t toUpper(const UCaseMap *csm,
             }
 
             UBool change;
-            if ((csm->options & UCASEMAP_OMIT_UNCHANGED) == 0 && edits == NULL) {
+            if (edits == NULL) {
                 change = TRUE;  // common, simple usage
             } else {
                 // Find out first whether we are changing the text.
@@ -1130,7 +1241,7 @@ int32_t toUpper(const UCaseMap *csm,
                         edits->addUnchanged(oldLength);
                     }
                     // Write unchanged text?
-                    change |= (csm->options & UCASEMAP_OMIT_UNCHANGED) == 0;
+                    change = edits->writeUnchanged();
                 }
             }
 
@@ -1155,7 +1266,7 @@ int32_t toUpper(const UCaseMap *csm,
             const UChar *s;
             c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
             destIndex = appendResult(dest, destIndex, destCapacity, c, s,
-                                     csm->options, nextIndex - i, edits);
+                                     nextIndex - i, edits);
             if (destIndex < 0) {
                 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
                 return 0;
@@ -1228,7 +1339,7 @@ ustrcase_internalFold(const UCaseMap *csm,
         const UChar *s;
         c = ucase_toFullFolding(csm->csp, c, &s, csm->options);
         destIndex = appendResult(dest, destIndex, destCapacity, c, s,
-                                 csm->options, srcIndex - cpStart, edits);
+                                 srcIndex - cpStart, edits);
         if (destIndex < 0) {
             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
             return 0;