From: Markus Scherer Date: Wed, 25 Jan 2017 23:52:55 +0000 (+0000) Subject: ICU-12410 start to port class CaseMap to Java; make case mapping impl code work with... X-Git-Tag: release-59-rc~145^2~15 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2fe503c981a3b216bf391d1dac2bd83751ed20c7;p=icu ICU-12410 start to port class CaseMap to Java; make case mapping impl code work with CharSequence & Appendable X-SVN-Rev: 39603 --- diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp index d213d039df1..2a44f6eba4c 100644 --- a/icu4c/source/common/ustrcase.cpp +++ b/icu4c/source/common/ustrcase.cpp @@ -1177,7 +1177,7 @@ UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i, * for each character. * TODO: Try to re-consolidate one way or another with the non-Greek function. */ -int32_t toUpper(int32_t caseLocale, uint32_t options, +int32_t toUpper(uint32_t options, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, Edits *edits, @@ -1305,7 +1305,7 @@ int32_t toUpper(int32_t caseLocale, uint32_t options, } } else { const UChar *s; - c=ucase_toFullUpper(NULL, c, NULL, NULL, &s, caseLocale); + c=ucase_toFullUpper(NULL, c, NULL, NULL, &s, UCASE_LOC_GREEK); destIndex = appendResult(dest, destIndex, destCapacity, c, s, nextIndex - i, options, edits); if (destIndex < 0) { @@ -1349,7 +1349,7 @@ ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT icu::Edits *edits, UErrorCode &errorCode) { if (caseLocale == UCASE_LOC_GREEK) { - return GreekUpper::toUpper(caseLocale, options, dest, destCapacity, src, srcLength, edits, errorCode); + return GreekUpper::toUpper(options, dest, destCapacity, src, srcLength, edits, errorCode); } UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java index c9b0206747a..8ec51544b6f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java @@ -2,6 +2,10 @@ // License & terms of use: http://www.unicode.org/copyright.html#License package com.ibm.icu.impl; +import java.io.IOException; + +import com.ibm.icu.text.Edits; +import com.ibm.icu.util.ICUUncheckedIOException; import com.ibm.icu.util.ULocale; // TODO: rename to CaseMapImpl @@ -13,11 +17,11 @@ public final class CaseMap { public static final class StringContextIterator implements UCaseProps.ContextIterator { /** * Constructor. - * @param s String to iterate over. + * @param src String to iterate over. */ - public StringContextIterator(String s) { - this.s=s; - limit=s.length(); + public StringContextIterator(CharSequence src) { + this.s=src; + limit=src.length(); cpStart=cpLimit=index=0; dir=0; } @@ -61,7 +65,7 @@ public final class CaseMap { public int nextCaseMapCP() { cpStart=cpLimit; if(cpLimit0 && index0) { - c=s.codePointBefore(index); + c=Character.codePointBefore(s, index); index-=Character.charCount(c); return c; } @@ -121,44 +129,107 @@ public final class CaseMap { } // variables - protected String s; + protected CharSequence s; protected int index, limit, cpStart, cpLimit; protected int dir; // 0=initial state >0=forward <0=backward } - /** Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. */ - private static final void appendResult(int c, StringBuilder result) { + private static int appendCodePoint(Appendable a, int c) throws IOException { + if (c <= Character.MAX_VALUE) { + a.append((char)c); + return 1; + } else { + a.append((char)(0xd7c0 + (c >> 10))); + a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff))); + return 2; + } + } + + /** + * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. + * @throws IOException + */ + private static void appendResult(int result, Appendable dest, + int cpLength, int options, Edits edits) throws IOException { // Decode the result. - if (c < 0) { + if (result < 0) { // (not) original code point - result.appendCodePoint(~c); - } else if (c <= UCaseProps.MAX_STRING_LENGTH) { + if (edits != null) { + edits.addUnchanged(cpLength); + // TODO: remove package path + if ((options & com.ibm.icu.text.CaseMap.OMIT_UNCHANGED_TEXT) != 0) { + return; + } + } + appendCodePoint(dest, ~result); + } else if (result <= UCaseProps.MAX_STRING_LENGTH) { // The mapping has already been appended to result. + if (edits != null) { + edits.addReplace(cpLength, result); + } } else { // Append the single-code point mapping. - result.appendCodePoint(c); + int length = appendCodePoint(dest, result); + if (edits != null) { + edits.addReplace(cpLength, length); + } } } - // TODO: Move the other string case mapping functions from UCharacter to here, too. - - public static String toUpper(ULocale locale, String str) { - if (locale == null) { - locale = ULocale.getDefault(); + private static final void appendUnchanged(CharSequence src, int start, int length, + Appendable dest, int options, Edits edits) throws IOException { + if (length > 0) { + if (edits != null) { + edits.addUnchanged(length); + // TODO: remove package path + if ((options & com.ibm.icu.text.CaseMap.OMIT_UNCHANGED_TEXT) != 0) { + return; + } + } + dest.append(src, start, start + length); } - int[] locCache = new int[] { UCaseProps.getCaseLocale(locale, null) }; - if (locCache[0] == UCaseProps.LOC_GREEK) { - return GreekUpper.toUpper(str, locCache); + } + + public static A toLower(int caseLocale, int options, + CharSequence src, A dest, Edits edits) { + try { + if (edits != null) { + edits.reset(); + } + StringContextIterator iter = new StringContextIterator(src); + int c; + while ((c = iter.nextCaseMapCP()) >= 0) { + c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); + appendResult(c, dest, iter.getCPLength(), options, edits); + } + return dest; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); } + } - StringContextIterator iter = new StringContextIterator(str); - StringBuilder result = new StringBuilder(str.length()); - int c; - while((c=iter.nextCaseMapCP())>=0) { - c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache); - appendResult(c, result); + public static String toUpper(ULocale locale, String str) { + try { + int options = 0; Edits edits = null; // TODO + if (locale == null) { + locale = ULocale.getDefault(); + } + int caseLocale = UCaseProps.getCaseLocale(locale); + if (caseLocale == UCaseProps.LOC_GREEK) { + return GreekUpper.toUpper(str); + } + + StringContextIterator iter = new StringContextIterator(str); + StringBuilder result = new StringBuilder(str.length()); + int c; + while((c=iter.nextCaseMapCP())>=0) { + c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, caseLocale); + appendResult(c, result, iter.getCPLength(), options, edits); + } + return result.toString(); + } catch (IOException e) { + throw new ICUUncheckedIOException(e); } - return result.toString(); } private static final class GreekUpper { @@ -662,8 +733,10 @@ public final class CaseMap { * TODO: Try to re-consolidate one way or another with the non-Greek function. * *

Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8). + * @throws IOException */ - private static String toUpper(CharSequence s, int[] locCache) { + private static String toUpper(CharSequence s) throws IOException { + int options = 0; Edits edits = null; // TODO StringBuilder result = new StringBuilder(s.length()); int state = 0; for (int i = 0; i < s.length();) { @@ -747,8 +820,8 @@ public final class CaseMap { --numYpogegrammeni; } } else { - c = UCaseProps.INSTANCE.toFullUpper(c, null, result, null, locCache); - appendResult(c, result); + c = UCaseProps.INSTANCE.toFullUpper(c, null, result, UCaseProps.LOC_GREEK); + appendResult(c, result, nextIndex - i, options, edits); } i = nextIndex; state = nextState; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java index 927cdc03cdd..1da49283ac5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java @@ -24,6 +24,7 @@ package com.ibm.icu.impl; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; +import java.util.Locale; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; @@ -71,7 +72,7 @@ public final class UCaseProps { // read exceptions[] count=indexes[IX_EXC_LENGTH]; if(count>0) { - exceptions=ICUBinary.getChars(bytes, count, 0); + exceptions=ICUBinary.getString(bytes, count, 0); } // read unfold[] @@ -150,7 +151,7 @@ public final class UCaseProps { * * @param excWord (in) initial exceptions word * @param index (in) desired slot index - * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++]; + * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++); * @return bits 31..0: slot value * 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot */ @@ -158,11 +159,11 @@ public final class UCaseProps { long value; if((excWord&EXC_DOUBLE_SLOTS)==0) { excOffset+=slotOffset(excWord, index); - value=exceptions[excOffset]; + value=exceptions.charAt(excOffset); } else { excOffset+=2*slotOffset(excWord, index); - value=exceptions[excOffset++]; - value=(value<<16)|exceptions[excOffset]; + value=exceptions.charAt(excOffset++); + value=(value<<16)|exceptions.charAt(excOffset); } return value |((long)excOffset<<32); } @@ -172,11 +173,11 @@ public final class UCaseProps { int value; if((excWord&EXC_DOUBLE_SLOTS)==0) { excOffset+=slotOffset(excWord, index); - value=exceptions[excOffset]; + value=exceptions.charAt(excOffset); } else { excOffset+=2*slotOffset(excWord, index); - value=exceptions[excOffset++]; - value=(value<<16)|exceptions[excOffset]; + value=exceptions.charAt(excOffset++); + value=(value<<16)|exceptions.charAt(excOffset); } return value; } @@ -191,7 +192,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); if(hasSlot(excWord, EXC_LOWER)) { c=getSlotValue(excWord, EXC_LOWER, excOffset); } @@ -207,7 +208,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); if(hasSlot(excWord, EXC_UPPER)) { c=getSlotValue(excWord, EXC_UPPER, excOffset); } @@ -223,7 +224,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int index; if(hasSlot(excWord, EXC_TITLE)) { index=EXC_TITLE; @@ -291,7 +292,7 @@ public final class UCaseProps { */ int excOffset0, excOffset=getExceptionsOffset(props); int closureOffset; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int index, closureLength, fullLength, length; excOffset0=excOffset; @@ -334,7 +335,7 @@ public final class UCaseProps { /* add the full case folding string */ length=fullLength&0xf; if(length!=0) { - set.add(new String(exceptions, excOffset, length)); + set.add(exceptions.substring(excOffset, excOffset+length)); excOffset+=length; } @@ -348,8 +349,9 @@ public final class UCaseProps { } /* add each code point in the closure string */ - for(index=0; index>EXC_DOT_SHIFT)&DOT_MASK; + return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK; } } @@ -605,38 +607,44 @@ public final class UCaseProps { */ public static final int MAX_STRING_LENGTH=0x1f; - private static final int LOC_UNKNOWN=0; - private static final int LOC_ROOT=1; + //ivate static final int LOC_UNKNOWN=0; + public static final int LOC_ROOT=1; private static final int LOC_TURKISH=2; private static final int LOC_LITHUANIAN=3; static final int LOC_GREEK=4; - /* - * Checks and caches the type of locale ID as it is relevant for case mapping. - * If the locCache is not null, then it must be initialized with locCache[0]=0 . - */ - static final int getCaseLocale(ULocale locale, int[] locCache) { - int result; - - if(locCache!=null && (result=locCache[0])!=LOC_UNKNOWN) { - return result; - } - - result=LOC_ROOT; - - String language=locale.getLanguage(); - if(language.equals("tr") || language.equals("tur") || language.equals("az") || language.equals("aze")) { - result=LOC_TURKISH; - } else if(language.equals("el") || language.equals("ell")) { - result=LOC_GREEK; - } else if(language.equals("lt") || language.equals("lit")) { - result=LOC_LITHUANIAN; - } - - if(locCache!=null) { - locCache[0]=result; + public static final int getCaseLocale(Locale locale) { + return getCaseLocale(locale.getLanguage()); + } + public static final int getCaseLocale(ULocale locale) { + return getCaseLocale(locale.getLanguage()); + } + /** Accepts both 2- and 3-letter language subtags. */ + private static final int getCaseLocale(String language) { + // Check the subtag length to reduce the number of comparisons + // for locales without special behavior. + // Fastpath for English "en" which is often used for default (=root locale) case mappings, + // and for Chinese "zh": Very common but no special case mapping behavior. + if(language.length()==2) { + if(language.equals("en") || language.charAt(0)>'t') { + return LOC_ROOT; + } else if(language.equals("tr") || language.equals("az")) { + return LOC_TURKISH; + } else if(language.equals("el")) { + return LOC_GREEK; + } else if(language.equals("lt")) { + return LOC_LITHUANIAN; + } + } else if(language.length()==3) { + if(language.equals("tur") || language.equals("aze")) { + return LOC_TURKISH; + } else if(language.equals("ell")) { + return LOC_GREEK; + } else if(language.equals("lit")) { + return LOC_LITHUANIAN; + } } - return result; + return LOC_ROOT; } /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */ @@ -797,19 +805,14 @@ public final class UCaseProps { * See ContextIterator for details. * If iter==null then a context-independent result is returned. * @param out If the mapping result is a string, then it is appended to out. - * @param locale Locale ID for locale-dependent mappings. - * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing - * the locale ID for subsequent calls. - * Can be null. + * @param caseLocale Case locale value from ucase_getCaseLocale(). * @return Output code point or string length, see MAX_STRING_LENGTH. * * @see ContextIterator * @see #MAX_STRING_LENGTH * @internal */ - public final int toFullLower(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache) { + public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) { int result, props; result=c; @@ -820,22 +823,20 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props), excOffset2; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int full; excOffset2=excOffset; if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) { /* use hardcoded conditions and mappings */ - int loc=getCaseLocale(locale, locCache); - /* * Test for conditional mappings first * (otherwise the unconditional default mappings are always taken), * then test for characters that have unconditional mappings in SpecialCasing.txt, * then get the UnicodeData.txt mappings. */ - if( loc==LOC_LITHUANIAN && + if( caseLocale==LOC_LITHUANIAN && /* base characters, find accents above */ (((c==0x49 || c==0x4a || c==0x12e) && isFollowedByMoreAbove(iter)) || @@ -858,30 +859,34 @@ public final class UCaseProps { 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE */ - switch(c) { - case 0x49: /* LATIN CAPITAL LETTER I */ - out.append(iDot); - return 2; - case 0x4a: /* LATIN CAPITAL LETTER J */ - out.append(jDot); - return 2; - case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ - out.append(iOgonekDot); - return 2; - case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ - out.append(iDotGrave); - return 3; - case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ - out.append(iDotAcute); - return 3; - case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ - out.append(iDotTilde); - return 3; - default: - return 0; /* will not occur */ + try { + switch(c) { + case 0x49: /* LATIN CAPITAL LETTER I */ + out.append(iDot); + return 2; + case 0x4a: /* LATIN CAPITAL LETTER J */ + out.append(jDot); + return 2; + case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ + out.append(iOgonekDot); + return 2; + case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ + out.append(iDotGrave); + return 3; + case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ + out.append(iDotAcute); + return 3; + case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ + out.append(iDotTilde); + return 3; + default: + return 0; /* will not occur */ + } + } catch (IOException e) { + throw new ICUUncheckedIOException(e); } /* # Turkish and Azeri */ - } else if(loc==LOC_TURKISH && c==0x130) { + } else if(caseLocale==LOC_TURKISH && c==0x130) { /* # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri # The following rules handle those cases. @@ -890,7 +895,7 @@ public final class UCaseProps { 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE */ return 0x69; - } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) { + } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) { /* # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. # This matches the behavior of the canonically equivalent I-dot_above @@ -899,7 +904,7 @@ public final class UCaseProps { 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE */ return 0; /* remove the dot (continue without output) */ - } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) { + } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) { /* # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. @@ -913,8 +918,12 @@ public final class UCaseProps { 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ - out.append(iDot); - return 2; + try { + out.append(iDot); + return 2; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } else if( c==0x3a3 && !isFollowedByCasedLetter(iter, 1) && isFollowedByCasedLetter(iter, -1) /* -1=preceded */ @@ -936,11 +945,15 @@ public final class UCaseProps { /* start of full case mapping strings */ excOffset=(int)(value>>32)+1; - /* set the output pointer to the lowercase mapping */ - out.append(exceptions, excOffset, full); + try { + // append the lowercase mapping + out.append(exceptions, excOffset, excOffset+full); - /* return the string length */ - return full; + /* return the string length */ + return full; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } @@ -954,8 +967,8 @@ public final class UCaseProps { /* internal */ private final int toUpperOrTitle(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache, + Appendable out, + int loc, boolean upperNotTitle) { int result; int props; @@ -968,15 +981,13 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props), excOffset2; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int full, index; excOffset2=excOffset; if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) { /* use hardcoded conditions and mappings */ - int loc=getCaseLocale(locale, locCache); - if(loc==LOC_TURKISH && c==0x69) { /* # Turkish and Azeri @@ -1026,11 +1037,15 @@ public final class UCaseProps { } if(full!=0) { - /* set the output pointer to the result string */ - out.append(exceptions, excOffset, full); - - /* return the string length */ - return full; + try { + // append the result string + out.append(exceptions, excOffset, excOffset+full); + + /* return the string length */ + return full; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } @@ -1049,15 +1064,15 @@ public final class UCaseProps { } public final int toFullUpper(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache) { - return toUpperOrTitle(c, iter, out, locale, locCache, true); + Appendable out, + int caseLocale) { + return toUpperOrTitle(c, iter, out, caseLocale, true); } public final int toFullTitle(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache) { - return toUpperOrTitle(c, iter, out, locale, locCache, false); + Appendable out, + int caseLocale) { + return toUpperOrTitle(c, iter, out, caseLocale, false); } /* case folding ------------------------------------------------------------- */ @@ -1117,7 +1132,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int index; if((excWord&EXC_CONDITIONAL_FOLD)!=0) { /* special case folding mappings, hardcoded */ @@ -1168,7 +1183,7 @@ public final class UCaseProps { * together in a way that they still fold to common result strings. */ - public final int toFullFolding(int c, StringBuilder out, int options) { + public final int toFullFolding(int c, Appendable out, int options) { int result; int props; @@ -1180,7 +1195,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props), excOffset2; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int full, index; excOffset2=excOffset; @@ -1194,8 +1209,12 @@ public final class UCaseProps { return 0x69; } else if(c==0x130) { /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ - out.append(iDot); - return 2; + try { + out.append(iDot); + return 2; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } else { /* Turkic mappings */ @@ -1219,11 +1238,15 @@ public final class UCaseProps { full=(full>>4)&0xf; if(full!=0) { - /* set the output pointer to the result string */ - out.append(exceptions, excOffset, full); - - /* return the string length */ - return full; + try { + // append the result string + out.append(exceptions, excOffset, excOffset+full); + + /* return the string length */ + return full; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } @@ -1242,7 +1265,6 @@ public final class UCaseProps { /* case mapping properties API ---------------------------------------------- */ - private static final int[] rootLocCache = { LOC_ROOT }; /* * We need a StringBuilder for multi-code point output from the * full case mapping functions. However, we do not actually use that output, @@ -1282,20 +1304,20 @@ public final class UCaseProps { */ case UProperty.CHANGES_WHEN_LOWERCASED: dummyStringBuilder.setLength(0); - return toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0; case UProperty.CHANGES_WHEN_UPPERCASED: dummyStringBuilder.setLength(0); - return toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0; case UProperty.CHANGES_WHEN_TITLECASED: dummyStringBuilder.setLength(0); - return toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0; /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */ case UProperty.CHANGES_WHEN_CASEMAPPED: dummyStringBuilder.setLength(0); return - toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 || - toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 || - toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 || + toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 || + toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0; default: return false; } @@ -1303,7 +1325,7 @@ public final class UCaseProps { // data members -------------------------------------------------------- *** private int indexes[]; - private char exceptions[]; + private String exceptions; private char unfold[]; private Trie2_16 trie; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java index 40fecc7b10b..c2385f84ff8 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java @@ -29,6 +29,7 @@ import com.ibm.icu.impl.UPropertyAliases; import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection; import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.Edits; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.util.RangeValueIterator; import com.ibm.icu.util.ULocale; @@ -4960,29 +4961,37 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * @stable ICU 3.2 */ public static String toLowerCase(ULocale locale, String str) { - StringContextIterator iter = new StringContextIterator(str); - StringBuilder result = new StringBuilder(str.length()); - int[] locCache = new int[1]; - int c; - - if (locale == null) { - locale = ULocale.getDefault(); + // TODO: remove package path + if (str.length() <= 100) { + if (str.isEmpty()) { + return str; + } + // Collect and apply only changes. + // Good if no or few changes. + // Bad (slow) if many changes. + Edits edits = new Edits(); + StringBuilder replacementChars = com.ibm.icu.text.CaseMap.toLower( + locale, com.ibm.icu.text.CaseMap.OMIT_UNCHANGED_TEXT, str, + new StringBuilder(), edits); + return applyEdits(str, replacementChars, edits); + } else { + return com.ibm.icu.text.CaseMap.toLower(locale, 0, str, new StringBuilder(), null).toString(); } - locCache[0]=0; - - while((c=iter.nextCaseMapCP())>=0) { - c = UCaseProps.INSTANCE.toFullLower(c, iter, result, locale, locCache); + } - /* decode the result */ - if(c<0) { - /* (not) original code point */ - c=~c; - } else if(c<=UCaseProps.MAX_STRING_LENGTH) { - /* mapping already appended to result */ - continue; - /* } else { append single-code point mapping */ + private static String applyEdits(String str, StringBuilder replacementChars, Edits edits) { + if (!edits.hasChanges()) { + return str; + } + StringBuilder result = new StringBuilder(str.length() + edits.lengthDelta()); + for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) { + if (ei.hasChange()) { + int i = ei.replacementIndex(); + result.append(replacementChars, i, i + ei.newLength()); + } else { + int i = ei.sourceIndex(); + result.append(str, i, i + ei.oldLength()); } - result.appendCodePoint(c); } return result.toString(); } @@ -5063,13 +5072,12 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection int options) { StringContextIterator iter = new StringContextIterator(str); StringBuilder result = new StringBuilder(str.length()); - int[] locCache = new int[1]; int c, nc, srcLength = str.length(); if (locale == null) { locale = ULocale.getDefault(); } - locCache[0]=0; + int caseLocale = UCaseProps.getCaseLocale(locale); if(titleIter == null) { titleIter = BreakIterator.getWordInstance(locale); @@ -5130,7 +5138,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection if(titleStart A toLower( + ULocale locale, int options, CharSequence src, A dest, Edits edits) { + if (locale == null) { + locale = ULocale.getDefault(); + } + int caseLocale = UCaseProps.getCaseLocale(locale); + // TODO: remove package path + return com.ibm.icu.impl.CaseMap.toLower(caseLocale, options, src, dest, edits); + } + + /** + * Uppercases a string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * @param locale The locale ID. + * @param options Options bit set, usually 0. See {@link #OMIT_UNCHANGED_TEXT}. + * @param src The original string. + * @param dest A buffer for the result string. Must not be null. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits.reset() first. edits can be null. + * @return dest with the result string (or only changes) appended. + * + * @see UCharacter#toUpperCase(ULocale, String) + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static A toUpper( + ULocale locale, int options, CharSequence src, A dest, Edits edits) { + return null; + } + + /** + * Titlecases a string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with options bits.) + * + * @param locale The locale ID. + * @param options Options bit set, usually 0. See {@link #OMIT_UNCHANGED_TEXT}, + * {@link UCharacter#TITLECASE_NO_LOWERCASE}, + * {@link UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT}. + * @param iter A break iterator to find the first characters of words that are to be titlecased. + * It is set to the source string (setText()) + * and used one or more times for iteration (first() and next()). + * If null, then a word break iterator for the locale is used + * (or something equivalent). + * @param src The original string. + * @param dest A buffer for the result string. Must not be null. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits.reset() first. edits can be null. + * @return dest with the result string (or only changes) appended. + * + * @see UCharacter#toTitleCase(ULocale, String, BreakIterator, int) + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static A toTitle( + ULocale locale, int options, BreakIterator iter, + CharSequence src, A dest, Edits edits) { + return null; + } + /** + * Case-folds a string and optionally records edits. + * + * Case-folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * The result may be longer or shorter than the original. + * + * @param options Options bit set, usually 0. See {@link #OMIT_UNCHANGED_TEXT}, + * {@link UCharacter#FOLD_CASE_DEFAULT}, + * {@link UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I}. + * @param src The original string. + * @param dest A buffer for the result string. Must not be null. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits.reset() first. edits can be null. + * @return dest with the result string (or only changes) appended. + * + * @see UCharacter#foldCase(String, int) + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static A foldCase( + int options, CharSequence src, A dest, Edits edits) { + return null; + } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java index 7d1ffd091cb..b1239527c14 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java @@ -10,9 +10,6 @@ import java.util.Arrays; * Supports replacements, insertions, deletions in linear progression. * Does not support moving/reordering of text. * - * An Edits object tracks a separate UErrorCode, but ICU string transformation functions - * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. - * * @draft ICU 59 * @provisional This API might change or be removed in a future release. */ diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index 38510c39899..4443beb8d80 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -3866,7 +3866,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa int n = getRangeCount(); int result; StringBuilder full = new StringBuilder(); - int locCache[] = new int[1]; for (int i=0; i, Compa // add case mappings // (does not add long s for regular s, or Kelvin for k, for example) for (int cp=start; cp<=end; ++cp) { - result = csp.toFullLower(cp, null, full, root, locCache); + result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); addCaseMapping(foldSet, result, full); - result = csp.toFullTitle(cp, null, full, root, locCache); + result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); addCaseMapping(foldSet, result, full); - result = csp.toFullUpper(cp, null, full, root, locCache); + result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); addCaseMapping(foldSet, result, full); result = csp.toFullFolding(cp, full, 0); @@ -3906,6 +3905,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } else { BreakIterator bi = BreakIterator.getWordInstance(root); for (String str : strings) { + // TODO: call lower-level functions foldSet.add(UCharacter.toLowerCase(root, str)); foldSet.add(UCharacter.toTitleCase(root, str, bi)); foldSet.add(UCharacter.toUpperCase(root, str)); diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java index 95bb60b32b3..dfed3526653 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java @@ -44,7 +44,7 @@ class LowercaseTransliterator extends Transliterator{ private final UCaseProps csp; private ReplaceableContextIterator iter; private StringBuilder result; - private int[] locCache; + private int caseLocale; /** * Constructs a transliterator. @@ -56,8 +56,7 @@ class LowercaseTransliterator extends Transliterator{ csp=UCaseProps.INSTANCE; iter=new ReplaceableContextIterator(); result = new StringBuilder(); - locCache = new int[1]; - locCache[0]=0; + caseLocale = UCaseProps.getCaseLocale(locale); } /** @@ -85,7 +84,7 @@ class LowercaseTransliterator extends Transliterator{ iter.setLimit(offsets.limit); iter.setContextLimits(offsets.contextStart, offsets.contextLimit); while((c=iter.nextCaseMapCP())>=0) { - c=csp.toFullLower(c, iter, result, locale, locCache); + c=csp.toFullLower(c, iter, result, caseLocale); if(iter.didReachLimit() && isIncremental) { // the case mapping function tried to look beyond the context limit diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java index d3dc2968105..96f11c8e293 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java @@ -42,7 +42,7 @@ class TitlecaseTransliterator extends Transliterator { private final UCaseProps csp; private ReplaceableContextIterator iter; private StringBuilder result; - private int[] locCache; + private int caseLocale; /** * Constructs a transliterator. @@ -55,8 +55,7 @@ class TitlecaseTransliterator extends Transliterator { csp=UCaseProps.INSTANCE; iter=new ReplaceableContextIterator(); result = new StringBuilder(); - locCache = new int[1]; - locCache[0]=0; + caseLocale = UCaseProps.getCaseLocale(locale); } /** @@ -119,9 +118,9 @@ class TitlecaseTransliterator extends Transliterator { type=csp.getTypeOrIgnorable(c); if(type>=0) { // not case-ignorable if(doTitle) { - c=csp.toFullTitle(c, iter, result, locale, locCache); + c=csp.toFullTitle(c, iter, result, caseLocale); } else { - c=csp.toFullLower(c, iter, result, locale, locCache); + c=csp.toFullLower(c, iter, result, caseLocale); } doTitle = type==0; // doTitle=isUncased diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java index 77e2dfd7073..bd9e3fed38a 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java @@ -41,7 +41,7 @@ class UppercaseTransliterator extends Transliterator { private final UCaseProps csp; private ReplaceableContextIterator iter; private StringBuilder result; - private int[] locCache; + private int caseLocale; /** * Constructs a transliterator. @@ -52,8 +52,7 @@ class UppercaseTransliterator extends Transliterator { csp=UCaseProps.INSTANCE; iter=new ReplaceableContextIterator(); result = new StringBuilder(); - locCache = new int[1]; - locCache[0]=0; + caseLocale = UCaseProps.getCaseLocale(locale); } /** @@ -81,7 +80,7 @@ class UppercaseTransliterator extends Transliterator { iter.setLimit(offsets.limit); iter.setContextLimits(offsets.contextStart, offsets.contextLimit); while((c=iter.nextCaseMapCP())>=0) { - c=csp.toFullUpper(c, iter, result, locale, locCache); + c=csp.toFullUpper(c, iter, result, caseLocale); if(iter.didReachLimit() && isIncremental) { // the case mapping function tried to look beyond the context limit