ICU-12410 move/re-port titlecasing, support Edits

author Markus Scherer <markus.icu@gmail.com>

Tue, 7 Feb 2017 00:58:52 +0000 (00:58 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Tue, 7 Feb 2017 00:58:52 +0000 (00:58 +0000)
author Markus Scherer <markus.icu@gmail.com>
Tue, 7 Feb 2017 00:58:52 +0000 (00:58 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Tue, 7 Feb 2017 00:58:52 +0000 (00:58 +0000)
diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp

index 2a44f6eba4c5dbcb2b443dc968ddfcd6ea7520c6..f08a1fae53fb98a6f1531ac8e92b11ab5035c0bd 100644 (file)
--- a/icu4c/source/common/ustrcase.cpp
+++ b/icu4c/source/common/ustrcase.cpp
@@ -577,15 +577,15 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
      /* titlecasing loop */
      while(prev<srcLength) {
          /* find next index where to titlecase */
-        int32_t idx;
+        int32_t index;
          if(isFirstIndex) {
              isFirstIndex=FALSE;
-            idx=iter->first();
+            index=iter->first();
          } else {
-            idx=iter->next();
+            index=iter->next();
          }
-        if(idx==UBRK_DONE || idx>srcLength) {
-            idx=srcLength;
+        if(index==UBRK_DONE || index>srcLength) {
+            index=srcLength;
          }
  
          /*
@@ -601,24 +601,24 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
           * b) first case letter (titlecase)         [titleStart..titleLimit[
           * c) subsequent characters (lowercase)                 [titleLimit..index[
           */
-        if(prev<idx) {
+        if(prev<index) {
              /* find and copy uncased characters [prev..titleStart[ */
              int32_t titleStart=prev;
              int32_t titleLimit=prev;
              UChar32 c;
-            U16_NEXT(src, titleLimit, idx, c);
+            U16_NEXT(src, titleLimit, index, c);
              if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(NULL, c)) {
                  /* Adjust the titlecasing index (titleStart) to the next cased character. */
                  for(;;) {
                      titleStart=titleLimit;
-                    if(titleLimit==idx) {
+                    if(titleLimit==index) {
                          /*
                           * only uncased characters in [prev..index[
                           * stop with titleStart==titleLimit==index
                           */
                          break;
                      }
-                    U16_NEXT(src, titleLimit, idx, c);
+                    U16_NEXT(src, titleLimit, index, c);
                      if(UCASE_NONE!=ucase_getType(NULL, c)) {
                          break; /* cased letter at [titleStart..titleLimit[ */
                      }
@@ -645,10 +645,10 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
                  }
  
                  /* Special case Dutch IJ titlecasing */
-                if (titleStart+1 < idx &&
+                if (titleStart+1 < index &&
                          caseLocale == UCASE_LOC_DUTCH &&
                          (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
-                        (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
+                        src[titleStart+1] == 0x006A) {
                      destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
                      if(destIndex<0) {
                          errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
@@ -661,7 +661,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
                  }
  
                  /* lowercase [titleLimit..index[ */
-                if(titleLimit<idx) {
+                if(titleLimit<index) {
                      if((options&U_TITLECASE_NO_LOWERCASE)==0) {
                          /* Normal operation: Lowercase the rest of the word. */
                          destIndex+=
@@ -669,7 +669,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
                                  caseLocale, options, ucase_toFullLower,
                                  dest+destIndex, destCapacity-destIndex,
                                  src, &csc,
-                                titleLimit, idx,
+                                titleLimit, index,
                                  edits, errorCode);
                          if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                              errorCode=U_ZERO_ERROR;
@@ -680,7 +680,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
                      } else {
                          /* Optionally just copy the rest of the word unchanged. */
                          destIndex=appendUnchanged(dest, destIndex, destCapacity,
-                                                  src+titleLimit, idx-titleLimit, options, edits);
+                                                  src+titleLimit, index-titleLimit, options, edits);
                          if(destIndex<0) {
                              errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                              return 0;
@@ -690,7 +690,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
              }
          }
  
-        prev=idx;
+        prev=index;
      }
  
      return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java

index 6c86968f46067f180b0305d162d3ba843e712156..f5822c85ef09aca861510e29b293538081c0b146 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
@@ -4,6 +4,8 @@ package com.ibm.icu.impl;
  
  import java.io.IOException;
  
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
  import com.ibm.icu.text.Edits;
  import com.ibm.icu.util.ICUUncheckedIOException;
  
@@ -191,6 +193,15 @@ public final class CaseMapImpl {
          }
      }
  
+    private static void internalToLower(int caseLocale, int options, StringContextIterator iter,
+            Appendable dest, Edits edits) throws IOException {
+        int c;
+        while ((c = iter.nextCaseMapCP()) >= 0) {
+            c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
+            appendResult(c, dest, iter.getCPLength(), options, edits);
+        }
+    }
+
      public static <A extends Appendable> A toLower(int caseLocale, int options,
              CharSequence src, A dest, Edits edits) {
          try {
@@ -198,11 +209,7 @@ public final class CaseMapImpl {
                  edits.reset();
              }
              StringContextIterator iter = new StringContextIterator(src);
-            int c;
-            while ((c = iter.nextCaseMapCP()) >= 0) {
-                c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
-                appendResult(c, dest, iter.getCPLength(), options, edits);
-            }
+            internalToLower(caseLocale, options, iter, dest, edits);
              return dest;
          } catch (IOException e) {
              throw new ICUUncheckedIOException(e);
@@ -230,6 +237,107 @@ public final class CaseMapImpl {
          }
      }
  
+    public static <A extends Appendable> A toTitle(
+            int caseLocale, int options, BreakIterator titleIter,
+            CharSequence src, A dest, Edits edits) {
+        try {
+            if (edits != null) {
+                edits.reset();
+            }
+
+            /* set up local variables */
+            StringContextIterator iter = new StringContextIterator(src);
+            int srcLength = src.length();
+            int prev=0;
+            boolean isFirstIndex=true;
+
+            /* titlecasing loop */
+            while(prev<srcLength) {
+                /* find next index where to titlecase */
+                int index;
+                if(isFirstIndex) {
+                    isFirstIndex=false;
+                    index=titleIter.first();
+                } else {
+                    index=titleIter.next();
+                }
+                if(index==BreakIterator.DONE || index>srcLength) {
+                    index=srcLength;
+                }
+
+                /*
+                 * Unicode 4 & 5 section 3.13 Default Case Operations:
+                 *
+                 * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
+                 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
+                 * cased character F. If F exists, map F to default_title(F); then map each
+                 * subsequent character C to default_lower(C).
+                 *
+                 * In this implementation, segment [prev..index[ into 3 parts:
+                 * a) uncased characters (copy as-is) [prev..titleStart[
+                 * b) first case letter (titlecase)         [titleStart..titleLimit[
+                 * c) subsequent characters (lowercase)                 [titleLimit..index[
+                 */
+                if(prev<index) {
+                    // find and copy uncased characters [prev..titleStart[
+                    int titleStart=prev;
+                    iter.setLimit(index);
+                    int c=iter.nextCaseMapCP();
+                    if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0
+                            && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
+                        // Adjust the titlecasing index (titleStart) to the next cased character.
+                        while((c=iter.nextCaseMapCP())>=0
+                                && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
+                        // If c<0 then we have only uncased characters in [prev..index[
+                        // and stopped with titleStart==titleLimit==index.
+                        titleStart=iter.getCPStart();
+                        appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
+                    }
+
+                    if(titleStart<index) {
+                        int titleLimit=iter.getCPLimit();
+                        // titlecase c which is from [titleStart..titleLimit[
+                        c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
+                        appendResult(c, dest, iter.getCPLength(), options, edits);
+
+                        // Special case Dutch IJ titlecasing
+                        if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
+                            char c1 = src.charAt(titleStart);
+                            char c2 = src.charAt(titleStart+1);
+                            if ((c1 == 'i' || c1 == 'I') && c2 == 'j') {
+                                dest.append('J');
+                                if (edits != null) {
+                                    edits.addReplace(1, 1);
+                                }
+                                c=iter.nextCaseMapCP();
+                                titleLimit++;
+                                assert c == c2;
+                                assert titleLimit == iter.getCPLimit();
+                            }
+                        }
+
+                        // lowercase [titleLimit..index[
+                        if(titleLimit<index) {
+                            if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
+                                // Normal operation: Lowercase the rest of the word.
+                                internalToLower(caseLocale, options, iter, dest, edits);
+                            } else {
+                                // Optionally just copy the rest of the word unchanged.
+                                appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
+                                iter.moveToLimit();
+                            }
+                        }
+                    }
+                }
+
+                prev=index;
+            }
+            return dest;
+        } catch (IOException e) {
+            throw new ICUUncheckedIOException(e);
+        }
+    }
+
      private static final class GreekUpper {
          // Data bits.
          private static final int UPPER_MASK = 0x3ff;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java

index 1da49283ac58d285edd34d1e70cd9d75aa33e899..6b5619d23cee0f41b6c0d5e09991fb7c2bd638f3 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
@@ -612,6 +612,7 @@ public final class UCaseProps {
      private static final int LOC_TURKISH=2;
      private static final int LOC_LITHUANIAN=3;
      static final int LOC_GREEK=4;
+    public static final int LOC_DUTCH=5;
  
      public static final int getCaseLocale(Locale locale) {
          return getCaseLocale(locale.getLanguage());
@@ -634,6 +635,8 @@ public final class UCaseProps {
                  return LOC_GREEK;
              } else if(language.equals("lt")) {
                  return LOC_LITHUANIAN;
+            } else if(language.equals("nl")) {
+                return LOC_DUTCH;
              }
          } else if(language.length()==3) {
              if(language.equals("tur") || language.equals("aze")) {
@@ -642,6 +645,8 @@ public final class UCaseProps {
                  return LOC_GREEK;
              } else if(language.equals("lit")) {
                  return LOC_LITHUANIAN;
+            } else if(language.equals("nld")) {
+                return LOC_DUTCH;
              }
          }
          return LOC_ROOT;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java

index 5ebde55596c5556cb847df4b8f8d28946a360f31..ff6d7a5daac2afd84149eb773d4b7f8652f5a5fd 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
@@ -16,7 +16,6 @@ import java.util.Locale;
  import java.util.Map;
  
  import com.ibm.icu.impl.CaseMapImpl;
-import com.ibm.icu.impl.CaseMapImpl.StringContextIterator;
  import com.ibm.icu.impl.IllegalIcuArgumentException;
  import com.ibm.icu.impl.Trie2;
  import com.ibm.icu.impl.UBiDiProps;
@@ -4944,7 +4943,8 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
                      caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits);
              return applyEdits(str, replacementChars, edits);
          } else {
-            return CaseMapImpl.toLower(caseLocale, 0, str, new StringBuilder(), null).toString();
+            return CaseMapImpl.toLower(caseLocale, 0, str,
+                    new StringBuilder(str.length()), null).toString();
          }
      }
  
@@ -4960,7 +4960,26 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
                      caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits);
              return applyEdits(str, replacementChars, edits);
          } else {
-            return CaseMapImpl.toUpper(caseLocale, 0, str, new StringBuilder(), null).toString();
+            return CaseMapImpl.toUpper(caseLocale, 0, str,
+                    new StringBuilder(str.length()), null).toString();
+        }
+    }
+
+    private static String toTitleCase(int caseLocale, int options, BreakIterator titleIter, String str) {
+        if (str.length() <= 100) {
+            if (str.isEmpty()) {
+                return str;
+            }
+            // Collect and apply only changes.
+            // Good if no or few changes. Bad (slow) if many changes.
+            Edits edits = new Edits();
+            StringBuilder replacementChars = CaseMapImpl.toTitle(
+                    caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, titleIter, str,
+                    new StringBuilder(), edits);
+            return applyEdits(str, replacementChars, edits);
+        } else {
+            return CaseMapImpl.toTitle(caseLocale, 0, titleIter, str,
+                    new StringBuilder(str.length()), null).toString();
          }
      }
  
@@ -5053,7 +5072,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
      public static String toTitleCase(Locale locale, String str,
              BreakIterator breakiter)
      {
-        return toTitleCase(ULocale.forLocale(locale), str, breakiter);
+        return toTitleCase(locale, str, breakiter, 0);
      }
  
      /**
@@ -5103,124 +5122,12 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
       * @see #TITLECASE_NO_BREAK_ADJUSTMENT
       */
      public static String toTitleCase(ULocale locale, String str,
-            BreakIterator titleIter,
-            int options) {
-        StringContextIterator iter = new StringContextIterator(str);
-        StringBuilder result = new StringBuilder(str.length());
-        int c, nc, srcLength = str.length();
-
-        if (locale == null) {
-            locale = ULocale.getDefault();
-        }
-        int caseLocale = UCaseProps.getCaseLocale(locale);
-
+            BreakIterator titleIter, int options) {
          if(titleIter == null) {
              titleIter = BreakIterator.getWordInstance(locale);
          }
          titleIter.setText(str);
-
-        int prev, titleStart, index;
-        boolean isFirstIndex;
-        boolean isDutch = locale.getLanguage().equals("nl");
-        boolean FirstIJ = true;
-
-        /* set up local variables */
-        prev=0;
-        isFirstIndex=true;
-
-        /* titlecasing loop */
-        while(prev<srcLength) {
-            /* find next index where to titlecase */
-            if(isFirstIndex) {
-                isFirstIndex=false;
-                index=titleIter.first();
-            } else {
-                index=titleIter.next();
-            }
-            if(index==BreakIterator.DONE || index>srcLength) {
-                index=srcLength;
-            }
-
-            /*
-             * Unicode 4 & 5 section 3.13 Default Case Operations:
-             *
-             * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
-             * #29, "Text Boundaries." Between each pair of word boundaries, find the first
-             * cased character F. If F exists, map F to default_title(F); then map each
-             * subsequent character C to default_lower(C).
-             *
-             * In this implementation, segment [prev..index[ into 3 parts:
-             * a) uncased characters (copy as-is) [prev..titleStart[
-             * b) first case letter (titlecase)         [titleStart..titleLimit[
-             * c) subsequent characters (lowercase)                 [titleLimit..index[
-             */
-            if(prev<index) {
-                /* find and copy uncased characters [prev..titleStart[ */
-                iter.setLimit(index);
-                c=iter.nextCaseMapCP();
-                if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0
-                        && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
-                    while((c=iter.nextCaseMapCP())>=0
-                            && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
-                    titleStart=iter.getCPStart();
-                    if(prev<titleStart) {
-                        result.append(str, prev, titleStart);
-                    }
-                } else {
-                    titleStart=prev;
-                }
-
-                if(titleStart<index) {
-                    FirstIJ = true;
-                    /* titlecase c which is from titleStart */
-                    c = UCaseProps.INSTANCE.toFullTitle(c, iter, result, caseLocale);
-
-                    /* decode the result and lowercase up to index */
-                    for(;;) {
-                        if(c<0) {
-                            /* (not) original code point */
-                            c=~c;
-                            result.appendCodePoint(c);
-                        } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
-                            /* mapping already appended to result */
-                        } else {
-                            /* append single-code point mapping */
-                            result.appendCodePoint(c);
-                        }
-
-                        if((options&TITLECASE_NO_LOWERCASE)!=0) {
-                            /* Optionally just copy the rest of the word unchanged. */
-
-                            int titleLimit=iter.getCPLimit();
-                            if(titleLimit<index) {
-                                /* Special Case - Dutch IJ Titlecasing */
-                                if (isDutch && c == 0x0049 && str.charAt(titleLimit) == 'j') {
-                                    result.append('J').append(str, titleLimit + 1, index);
-                                } else {
-                                    result.append(str, titleLimit, index);
-                                }
-                            }
-                            iter.moveToLimit();
-                            break;
-                        } else if((nc=iter.nextCaseMapCP())>=0) {
-                            if (isDutch && (nc == 0x004A ||  nc == 0x006A)
-                                    && (c == 0x0049) && (FirstIJ == true)) {
-                                c = 0x004A; /* J */
-                                FirstIJ = false;
-                            } else {
-                                /* Normal operation: Lowercase the rest of the word. */
-                                c = UCaseProps.INSTANCE.toFullLower(nc, iter, result, caseLocale);
-                            }
-                        } else {
-                            break;
-                        }
-                    }
-                }
-            }
-
-            prev=index;
-        }
-        return result.toString();
+        return toTitleCase(getCaseLocale(locale), options, titleIter, str);
      }
  
  
@@ -5323,7 +5230,11 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
      public static String toTitleCase(Locale locale, String str,
              BreakIterator titleIter,
              int options) {
-        return toTitleCase(ULocale.forLocale(locale), str, titleIter, options);
+        if(titleIter == null) {
+            titleIter = BreakIterator.getWordInstance(locale);
+        }
+        titleIter.setText(str);
+        return toTitleCase(getCaseLocale(locale), options, titleIter, str);
      }
  
      /**
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java

index e1747b635c40dc893630d736fe65c7d99b7f979f..6b0cea0540d9182a733037bc2ca7bc22a8def41b 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java
@@ -9,9 +9,6 @@ import com.ibm.icu.impl.UCaseProps;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.util.ULocale;
  
-// TODO: issues/questions
-// - optimizing strategies for unstyled text: stop after number of changes or length of replacement?
-
  /**
   * Low-level case mapping options and methods. Immutable.
   * "Setters" return instances with the union of the current and new options set.
@@ -262,7 +259,12 @@ public abstract class CaseMap {
           */
           public <A extends Appendable> A apply(
                   Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) {
-             return null;
+             if (iter == null) {
+                 iter = BreakIterator.getWordInstance(locale);
+             }
+             iter.setText(src.toString());
+             return CaseMapImpl.toTitle(
+                     getCaseLocale(locale), internalOptions, iter, src, dest, edits);
           }
      }
author	Markus Scherer <markus.icu@gmail.com>
	Tue, 7 Feb 2017 00:58:52 +0000 (00:58 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Tue, 7 Feb 2017 00:58:52 +0000 (00:58 +0000)
icu4c/source/common/ustrcase.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java		patch \| blob \| history