import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Normalizer2;
-import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.ValueIterator;
/**
* {@icuenhanced java.lang.Character}.{@icu _usage_}
*
- * <p>The UCharacter class provides extensions to the
- * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
- * java.lang.Character</a> class. These extensions provide support for
- * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
- * class, provide support for supplementary characters (those with code
- * points above U+FFFF).
+ * <p>The UCharacter class provides extensions to the {@link java.lang.Character} class.
+ * These extensions provide support for more Unicode properties.
* Each ICU release supports the latest version of Unicode available at that time.
*
+ * <p>For some time before Java 5 added support for supplementary Unicode code points,
+ * the ICU UCharacter class and many other ICU classes already supported them.
+ * Some UCharacter methods and constants were widened slightly differently than
+ * how the Character class methods and constants were widened later.
+ * In particular, {@link Character#MAX_VALUE} is still a char with the value U+FFFF,
+ * while {@link UCharacter#MAX_VALUE} is an int with the value U+10FFFF.
+ *
* <p>Code points are represented in these API using ints. While it would be
* more convenient in Java to have a separate primitive datatype for them,
* ints suffice in the meantime.
// public data members -----------------------------------------------
/**
- * The lowest Unicode code point value.
+ * The lowest Unicode code point value, constant 0.
+ * Same as {@link Character#MIN_CODE_POINT}, same integer value as {@link Character#MIN_VALUE}.
+ *
* @stable ICU 2.1
*/
- public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
+ public static final int MIN_VALUE = Character.MIN_CODE_POINT;
/**
- * The highest Unicode code point value (scalar value) according to the
- * Unicode Standard.
- * This is a 21-bit value (21 bits, rounded up).<br>
- * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
+ * The highest Unicode code point value (scalar value), constant U+10FFFF (uses 21 bits).
+ * Same as {@link Character#MAX_CODE_POINT}.
+ *
+ * <p>Up-to-date Unicode implementation of {@link Character#MAX_VALUE}
+ * which is still a char with the value U+FFFF.
+ *
* @stable ICU 2.1
*/
- public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
+ public static final int MAX_VALUE = Character.MAX_CODE_POINT;
/**
- * The minimum value for Supplementary code points
+ * The minimum value for Supplementary code points, constant U+10000.
+ * Same as {@link Character#MIN_SUPPLEMENTARY_CODE_POINT}.
+ *
* @stable ICU 2.1
*/
- public static final int SUPPLEMENTARY_MIN_VALUE =
- UTF16.SUPPLEMENTARY_MIN_VALUE;
+ public static final int SUPPLEMENTARY_MIN_VALUE = Character.MIN_SUPPLEMENTARY_CODE_POINT;
/**
* Unicode value used when translating into Unicode encoding form and there
/**
* Converts argument code point and returns a String object representing
- * the code point's value in UTF16 format.
- * The result is a string whose length is 1 for non-supplementary code
- * points, 2 otherwise.<br>
- * com.ibm.ibm.icu.UTF16 can be used to parse Strings generated by this
- * function.<br>
- * Up-to-date Unicode implementation of java.lang.Character.toString()
+ * the code point's value in UTF-16 format.
+ * The result is a string whose length is 1 for BMP code points, 2 for supplementary ones.
+ *
+ * <p>Up-to-date Unicode implementation of java.lang.Character.toString().
+ *
* @param ch code point
* @return string representation of the code point, null if code point is not
* defined in unicode
return String.valueOf((char)ch);
}
- StringBuilder result = new StringBuilder();
- result.append(UTF16.getLeadSurrogate(ch));
- result.append(UTF16.getTrailSurrogate(ch));
- return result.toString();
+ return new String(Character.toChars(ch));
}
/**
if (ch < MIN_VALUE) {
return false;
}
- if (ch < UTF16.SURROGATE_MIN_VALUE) {
+ if (ch < Character.MIN_SURROGATE) {
return true;
}
- if (ch <= UTF16.SURROGATE_MAX_VALUE) {
+ if (ch <= Character.MAX_SURROGATE) {
return false;
}
if (UCharacterUtility.isNonCharacter(ch)) {
{
int size = str.length();
int codepoint;
- for (int i = 0; i < size; i ++)
+ for (int i = 0; i < size; i += Character.charCount(codepoint))
{
- codepoint = UTF16.charAt(str, i);
+ codepoint = str.codePointAt(i);
if (!isLegal(codepoint)) {
return false;
}
- if (isSupplementary(codepoint)) {
- i ++;
- }
}
return true;
}
}
int cp;
StringBuilder sb = new StringBuilder();
- for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
- cp = UTF16.charAt(s,i);
+ for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
+ cp = s.codePointAt(i);
if (i != 0) sb.append(separator);
sb.append(UCharacter.getName(cp));
}
/**
- * {@icu} Returns a code point corresponding to the two UTF16 characters.
+ * {@icu} Returns a code point corresponding to the two surrogate code units.
+ *
* @param lead the lead char
* @param trail the trail char
* @return code point if surrogate characters are valid.
- * @exception IllegalArgumentException thrown when argument characters do
- * not form a valid codepoint
+ * @exception IllegalArgumentException thrown when the code units do
+ * not form a valid code point
* @stable ICU 2.1
*/
public static int getCodePoint(char lead, char trail)
{
- if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
- return UCharacterProperty.getRawSupplementary(lead, trail);
+ if (Character.isSurrogatePair(lead, trail)) {
+ return Character.toCodePoint(lead, trail);
}
throw new IllegalArgumentException("Illegal surrogate characters");
}
/**
- * {@icu} Returns the code point corresponding to the UTF16 character.
- * @param char16 the UTF16 character
+ * {@icu} Returns the code point corresponding to the given BMP char.
+ *
+ * @param char16 the BMP code point
* @return code point if argument is a valid character.
* @exception IllegalArgumentException thrown when char16 is not a valid
- * codepoint
+ * code point
* @stable ICU 2.1
*/
public static int getCodePoint(char char16)
* If the limit parameter is negative or past the string, then the
* string length is restored as the iteration limit.
*
- * This limit does not affect the next() function which always
+ * <p>This limit does not affect the next() function which always
* iterates to the very end of the string.
*
* @param lim The iteration limit.
/**
* Iterate forward through the string to fetch the next code point
* to be case-mapped, and set the context indexes for it.
- * Performance optimization, to save on function calls and redundant
- * tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
*
- * When the iteration limit is reached (and -1 is returned),
+ * <p>When the iteration limit is reached (and -1 is returned),
* getCPStart() will be at the iteration limit.
*
- * Iteration with next() does not affect the position for nextCaseMapCP().
+ * <p>Iteration with next() does not affect the position for nextCaseMapCP().
*
* @return The next code point to be case-mapped, or <0 when the iteration is done.
*/
public int nextCaseMapCP() {
cpStart=cpLimit;
if(cpLimit<limit) {
- int c=s.charAt(cpLimit++);
- if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
- char c2;
- if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
- UTF16.TRAIL_SURROGATE_MIN_VALUE<=(c2=s.charAt(cpLimit)) &&
- c2<=UTF16.TRAIL_SURROGATE_MAX_VALUE
- ) {
- // supplementary code point
- ++cpLimit;
- c=UCharacterProperty.getRawSupplementary((char)c, c2);
- // else unpaired surrogate code point
- }
- // else BMP code point
- }
+ int c=s.codePointAt(cpLimit);
+ if(cpLimit+Character.charCount(c)>limit) {
+ // The trail surrogate lies at or beyond the iteration limit:
+ // return the lead surrogate alone, as the pre-refactoring code did,
+ // so that cpLimit never moves past the limit.
+ c=s.charAt(cpLimit);
+ }
+ cpLimit+=Character.charCount(c);
return c;
} else {
return -1;
int c;
if(dir>0 && index<s.length()) {
- c=UTF16.charAt(s, index);
- index+=UTF16.getCharCount(c);
+ c=s.codePointAt(index);
+ index+=Character.charCount(c);
return c;
} else if(dir<0 && index>0) {
- c=UTF16.charAt(s, index-1);
- index-=UTF16.getCharCount(c);
+ c=s.codePointBefore(index);
+ index-=Character.charCount(c);
return c;
}
return -1;
int titleLimit=iter.getCPLimit();
if(titleLimit<index) {
- // TODO: With Java 5, this would want to be
- // result.append(str, titleLimit, index);
- String appendStr = str.substring(titleLimit,index);
/* Special Case - Dutch IJ Titlecasing */
- if ( isDutch && c == 0x0049 && appendStr.startsWith("j")) {
- appendStr = "J" + appendStr.substring(1);
+ if (isDutch && c == 0x0049 && str.charAt(titleLimit) == 'j') {
+ result.append('J').append(str, titleLimit + 1, index);
+ } else {
+ result.append(str, titleLimit, index);
}
- result.append(appendStr);
}
iter.moveToLimit();
break;
length = str.length();
for(i=0; i<length;) {
- c=UTF16.charAt(str, i);
- i+=UTF16.getCharCount(c);
+ c=str.codePointAt(i);
+ i+=Character.charCount(c);
c = UCaseProps.INSTANCE.toFullFolding(c, result, options);
/* decode the result */
switch (propertyEnum) {
case UProperty.AGE: return getAge(codepoint).toString();
case UProperty.ISO_COMMENT: return getISOComment(codepoint);
- case UProperty.BIDI_MIRRORING_GLYPH: return UTF16.valueOf(getMirror(codepoint));
- case UProperty.CASE_FOLDING: return foldCase(UTF16.valueOf(codepoint), true);
- case UProperty.LOWERCASE_MAPPING: return toLowerCase(UTF16.valueOf(codepoint));
+ case UProperty.BIDI_MIRRORING_GLYPH: return toString(getMirror(codepoint));
+ // Full (non-SIMPLE_) mappings can expand to several code points, so they must
+ // go through the String overloads rather than the int (simple mapping) ones.
+ case UProperty.CASE_FOLDING: return foldCase(new String(Character.toChars(codepoint)), true);
+ case UProperty.LOWERCASE_MAPPING: return toLowerCase(new String(Character.toChars(codepoint)));
case UProperty.NAME: return getName(codepoint);
- case UProperty.SIMPLE_CASE_FOLDING: return UTF16.valueOf(foldCase(codepoint,true));
- case UProperty.SIMPLE_LOWERCASE_MAPPING: return UTF16.valueOf(toLowerCase(codepoint));
- case UProperty.SIMPLE_TITLECASE_MAPPING: return UTF16.valueOf(toTitleCase(codepoint));
- case UProperty.SIMPLE_UPPERCASE_MAPPING: return UTF16.valueOf(toUpperCase(codepoint));
- case UProperty.TITLECASE_MAPPING: return toTitleCase(UTF16.valueOf(codepoint),null);
+ case UProperty.SIMPLE_CASE_FOLDING: return toString(foldCase(codepoint, true));
+ case UProperty.SIMPLE_LOWERCASE_MAPPING: return toString(toLowerCase(codepoint));
+ case UProperty.SIMPLE_TITLECASE_MAPPING: return toString(toTitleCase(codepoint));
+ case UProperty.SIMPLE_UPPERCASE_MAPPING: return toString(toUpperCase(codepoint));
+ case UProperty.TITLECASE_MAPPING: return toTitleCase(new String(Character.toChars(codepoint)), null);
case UProperty.UNICODE_1_NAME: return getName1_0(codepoint);
- case UProperty.UPPERCASE_MAPPING: return toUpperCase(UTF16.valueOf(codepoint));
+ case UProperty.UPPERCASE_MAPPING: return toUpperCase(new String(Character.toChars(codepoint)));
}
throw new IllegalArgumentException("Illegal Property Enum");
}
// JDK 1.5 API coverage
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#LEAD_SURROGATE_MIN_VALUE
+ * Constant U+D800, same as {@link Character#MIN_HIGH_SURROGATE}.
+ *
* @stable ICU 3.0
*/
- public static final char MIN_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MIN_VALUE;
+ public static final char MIN_HIGH_SURROGATE = Character.MIN_HIGH_SURROGATE;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#LEAD_SURROGATE_MAX_VALUE
+ * Constant U+DBFF, same as {@link Character#MAX_HIGH_SURROGATE}.
+ *
* @stable ICU 3.0
*/
- public static final char MAX_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MAX_VALUE;
+ public static final char MAX_HIGH_SURROGATE = Character.MAX_HIGH_SURROGATE;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#TRAIL_SURROGATE_MIN_VALUE
+ * Constant U+DC00, same as {@link Character#MIN_LOW_SURROGATE}.
+ *
* @stable ICU 3.0
*/
- public static final char MIN_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MIN_VALUE;
+ public static final char MIN_LOW_SURROGATE = Character.MIN_LOW_SURROGATE;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#TRAIL_SURROGATE_MAX_VALUE
+ * Constant U+DFFF, same as {@link Character#MAX_LOW_SURROGATE}.
+ *
* @stable ICU 3.0
*/
- public static final char MAX_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MAX_VALUE;
+ public static final char MAX_LOW_SURROGATE = Character.MAX_LOW_SURROGATE;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#SURROGATE_MIN_VALUE
+ * Constant U+D800, same as {@link Character#MIN_SURROGATE}.
+ *
* @stable ICU 3.0
*/
- public static final char MIN_SURROGATE = UTF16.SURROGATE_MIN_VALUE;
+ public static final char MIN_SURROGATE = Character.MIN_SURROGATE;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#SURROGATE_MAX_VALUE
+ * Constant U+DFFF, same as {@link Character#MAX_SURROGATE}.
+ *
* @stable ICU 3.0
*/
- public static final char MAX_SURROGATE = UTF16.SURROGATE_MAX_VALUE;
+ public static final char MAX_SURROGATE = Character.MAX_SURROGATE;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#SUPPLEMENTARY_MIN_VALUE
+ * Constant U+10000, same as {@link Character#MIN_SUPPLEMENTARY_CODE_POINT}.
+ *
* @stable ICU 3.0
*/
- public static final int MIN_SUPPLEMENTARY_CODE_POINT = UTF16.SUPPLEMENTARY_MIN_VALUE;
+ public static final int MIN_SUPPLEMENTARY_CODE_POINT = Character.MIN_SUPPLEMENTARY_CODE_POINT;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#CODEPOINT_MAX_VALUE
+ * Constant U+10FFFF, same as {@link Character#MAX_CODE_POINT}.
+ *
* @stable ICU 3.0
*/
- public static final int MAX_CODE_POINT = UTF16.CODEPOINT_MAX_VALUE;
+ public static final int MAX_CODE_POINT = Character.MAX_CODE_POINT;
/**
- * Cover the JDK 1.5 API, for convenience.
- * @see UTF16#CODEPOINT_MIN_VALUE
+ * Constant U+0000, same as {@link Character#MIN_CODE_POINT}.
+ *
* @stable ICU 3.0
*/
- public static final int MIN_CODE_POINT = UTF16.CODEPOINT_MIN_VALUE;
+ public static final int MIN_CODE_POINT = Character.MIN_CODE_POINT;
/**
* Cover the JDK 1.5 API, for convenience.
}
/**
- * Cover the JDK 1.5 API, for convenience.
+ * Same as {@link Character#isSupplementaryCodePoint}.
+ *
* @param cp the code point to check
* @return true if cp is a supplementary code point
* @stable ICU 3.0
*/
public static final boolean isSupplementaryCodePoint(int cp) {
- return cp >= UTF16.SUPPLEMENTARY_MIN_VALUE
- && cp <= UTF16.CODEPOINT_MAX_VALUE;
+ return Character.isSupplementaryCodePoint(cp);
}
/**
- * Cover the JDK 1.5 API, for convenience.
+ * Same as {@link Character#isHighSurrogate}.
+ *
* @param ch the char to check
* @return true if ch is a high (lead) surrogate
* @stable ICU 3.0
*/
public static boolean isHighSurrogate(char ch) {
- return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
+ return Character.isHighSurrogate(ch);
}
/**
- * Cover the JDK 1.5 API, for convenience.
+ * Same as {@link Character#isLowSurrogate}.
+ *
* @param ch the char to check
* @return true if ch is a low (trail) surrogate
* @stable ICU 3.0
*/
public static boolean isLowSurrogate(char ch) {
- return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
+ return Character.isLowSurrogate(ch);
}
/**
- * Cover the JDK 1.5 API, for convenience. Return true if the chars
- * form a valid surrogate pair.
+ * Same as {@link Character#isSurrogatePair}.
+ *
* @param high the high (lead) char
* @param low the low (trail) char
* @return true if high, low form a surrogate pair
* @stable ICU 3.0
*/
public static final boolean isSurrogatePair(char high, char low) {
- return isHighSurrogate(high) && isLowSurrogate(low);
+ return Character.isSurrogatePair(high, low);
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the number of chars needed
- * to represent the code point. This does not check the
- * code point for validity.
+ * Same as {@link Character#charCount}.
+ * Returns the number of chars needed to represent the code point (1 or 2).
+ * This does not check the code point for validity.
+ *
* @param cp the code point to check
* @return the number of chars needed to represent the code point
- * @see UTF16#getCharCount
* @stable ICU 3.0
*/
public static int charCount(int cp) {
- return UTF16.getCharCount(cp);
+ return Character.charCount(cp);
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the code point represented by
- * the characters. This does not check the surrogate pair for validity.
+ * Same as {@link Character#toCodePoint}.
+ * Returns the code point represented by the two surrogate code units.
+ * This does not check the surrogate pair for validity.
+ *
* @param high the high (lead) surrogate
* @param low the low (trail) surrogate
* @return the code point formed by the surrogate pair
* @stable ICU 3.0
*/
public static final int toCodePoint(char high, char low) {
- return UCharacterProperty.getRawSupplementary(high, low);
+ return Character.toCodePoint(high, low);
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the code point at index.
- * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
- * API. This examines only the characters at index and index+1.
+ * Same as {@link Character#codePointAt(CharSequence, int)}.
+ * Returns the code point at index.
+ * This examines only the characters at index and index+1.
+ *
* @param seq the characters to check
* @param index the index of the first or only char forming the code point
* @return the code point at the index
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the code point at index.
- * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
- * API. This examines only the characters at index and index+1.
+ * Same as {@link Character#codePointAt(char[], int)}.
+ * Returns the code point at index.
+ * This examines only the characters at index and index+1.
+ *
* @param text the characters to check
* @param index the index of the first or only char forming the code point
* @return the code point at the index
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the code point at index.
- * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
- * API. This examines only the characters at index and index+1.
+ * Same as {@link Character#codePointAt(char[], int, int)}.
+ * Returns the code point at index.
+ * This examines only the characters at index and index+1.
+ *
* @param text the characters to check
* @param index the index of the first or only char forming the code point
* @param limit the limit of the valid text
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the code point before index.
- * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
- * API. This examines only the characters at index-1 and index-2.
+ * Same as {@link Character#codePointBefore(CharSequence, int)}.
+ * Returns the code point before index.
+ * This examines only the characters at index-1 and index-2.
+ *
* @param seq the characters to check
* @param index the index after the last or only char forming the code point
* @return the code point before the index
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the code point before index.
- * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
- * API. This examines only the characters at index-1 and index-2.
+ * Same as {@link Character#codePointBefore(char[], int)}.
+ * Returns the code point before index.
+ * This examines only the characters at index-1 and index-2.
+ *
* @param text the characters to check
* @param index the index after the last or only char forming the code point
* @return the code point before the index
}
/**
- * Cover the JDK 1.5 API, for convenience. Return the code point before index.
- * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
- * API. This examines only the characters at index-1 and index-2.
+ * Same as {@link Character#codePointBefore(char[], int, int)}.
+ * Returns the code point before index.
+ * This examines only the characters at index-1 and index-2.
+ *
* @param text the characters to check
* @param index the index after the last or only char forming the code point
* @param limit the start of the valid text
}
/**
- * Cover the JDK 1.5 API, for convenience. Writes the chars representing the
+ * Same as {@link Character#toChars(int, char[], int)}.
+ * Writes the chars representing the
* code point into the destination at the given index.
+ *
* @param cp the code point to convert
* @param dst the destination array into which to put the char(s) representing the code point
* @param dstIndex the index at which to put the first (or only) char
* @stable ICU 3.0
*/
public static final int toChars(int cp, char[] dst, int dstIndex) {
- if (cp >= 0) {
- if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
- dst[dstIndex] = (char)cp;
- return 1;
- }
- if (cp <= MAX_CODE_POINT) {
- dst[dstIndex] = UTF16.getLeadSurrogate(cp);
- dst[dstIndex+1] = UTF16.getTrailSurrogate(cp);
- return 2;
- }
- }
- throw new IllegalArgumentException();
+ return Character.toChars(cp, dst, dstIndex);
}
/**
- * Cover the JDK 1.5 API, for convenience. Returns a char array
- * representing the code point.
+ * Same as {@link Character#toChars(int)}.
+ * Returns a char array representing the code point.
+ *
* @param cp the code point to convert
* @return an array containing the char(s) representing the code point
* @throws IllegalArgumentException if cp is not a valid code point
* @stable ICU 3.0
*/
public static final char[] toChars(int cp) {
- if (cp >= 0) {
- if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
- return new char[] { (char)cp };
- }
- if (cp <= MAX_CODE_POINT) {
- return new char[] {
- UTF16.getLeadSurrogate(cp),
- UTF16.getTrailSurrogate(cp)
- };
- }
- }
- throw new IllegalArgumentException();
+ return Character.toChars(cp);
}
/**