+++ /dev/null
-/**
- *******************************************************************************
- * Copyright (C) 2004-2012, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- *******************************************************************************
- */
-package com.ibm.icu.impl;
-
-/**
- * For generation of Implicit CEs
- * @author Mark Davis
- *
- * Cleaned up so that changes can be made more easily.
- * Old values:
-# First Implicit: E26A792D
-# Last Implicit: E3DC70C0
-# First CJK: E0030300
-# Last CJK: E0A9DD00
-# First CJK_A: E0A9DF00
-# Last CJK_A: E0DE3100
-@internal
- */
-public class ImplicitCEGenerator {
-
- /**
- * constants
- */
- static final boolean DEBUG = false;
-
- static final long topByte = 0xFF000000L;
- static final long bottomByte = 0xFFL;
- static final long fourBytes = 0xFFFFFFFFL;
-
- static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
-
-// public static final int CJK_BASE = 0x4E00;
-// public static final int CJK_LIMIT = 0x9FFF+1;
-// public static final int CJK_COMPAT_USED_BASE = 0xFA0E;
-// public static final int CJK_COMPAT_USED_LIMIT = 0xFA2F+1;
-// public static final int CJK_A_BASE = 0x3400;
-// public static final int CJK_A_LIMIT = 0x4DBF+1;
-// public static final int CJK_B_BASE = 0x20000;
-// public static final int CJK_B_LIMIT = 0x2A6DF+1;
-
- public static final int
- // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
- // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1)
- CJK_BASE = 0x4E00,
- CJK_LIMIT = 0x9FCC+1,
-
- CJK_COMPAT_USED_BASE = 0xFA0E,
- CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
-
- //3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
- //4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
-
- CJK_A_BASE = 0x3400,
- CJK_A_LIMIT = 0x4DB5+1,
-
- //20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
- //2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
-
- CJK_B_BASE = 0x20000,
- CJK_B_LIMIT = 0x2A6D6+1,
-
- //2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
- //2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
-
- CJK_C_BASE = 0x2A700,
- CJK_C_LIMIT = 0x2B734+1,
-
- //2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
- //2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
-
- CJK_D_BASE = 0x2B740,
- CJK_D_LIMIT = 0x2B81D+1
-
- // when adding to this list, look for all occurrences (in project) of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
- ;
-
-// private void throwError(String title, int cp) {
-// throw new IllegalArgumentException(title + "\t" + Utility.hex(cp, 6) + "\t" +
-// Utility.hex(getImplicitFromRaw(cp) & fourBytes));
-// }
-//
-// private void throwError(String title, long ce) {
-// throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes));
-// }
-//
-// private void show(int i) {
-// if (i >= 0 && i <= MAX_INPUT) {
-// System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes));
-// }
-// }
-
- /**
- * Precomputed by constructor
- */
- int final3Multiplier;
- int final4Multiplier;
- int final3Count;
- int final4Count;
- int medialCount;
- int min3Primary;
- int min4Primary;
- int max4Primary;
- int minTrail;
- int maxTrail;
- int max3Trail;
- int max4Trail;
- int min4Boundary;
-
- public int getGap4() {
- return final4Multiplier - 1;
- }
-
- public int getGap3() {
- return final3Multiplier - 1;
- }
-
- // old comment
- // we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
- // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
- // we shift so that HAN all has the same first primary, for compression.
- // for the 4 byte case, we make the gap as large as we can fit.
-
- /**
- * Supply parameters for generating implicit CEs
- */
- public ImplicitCEGenerator(int minPrimary, int maxPrimary) {
- // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
- this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
- }
-
- /**
- * Set up to generate implicits.
- * @param minPrimary The minimum primary value.
- * @param maxPrimary The maximum primary value.
- * @param minTrail final byte
- * @param maxTrail final byte
- * @param gap3 the gap we leave for tailoring for 3-byte forms
- * @param primaries3count number of 3-byte primarys we can use (normally 1)
- */
- public ImplicitCEGenerator(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
- // some simple parameter checks
- if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
- throw new IllegalArgumentException("bad lead bytes");
- }
- if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) {
- throw new IllegalArgumentException("bad trail bytes");
- }
- if (primaries3count < 1) {
- throw new IllegalArgumentException("bad three-byte primaries");
- }
-
- this.minTrail = minTrail;
- this.maxTrail = maxTrail;
-
- min3Primary = minPrimary;
- max4Primary = maxPrimary;
- // compute constants for use later.
- // number of values we can use in trailing bytes
- // leave room for empty values between AND above, e.g. if gap = 2
- // range 3..7 => +3 -4 -5 -6 -7: so 1 value
- // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
- // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
- final3Multiplier = gap3 + 1;
- final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
- max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
-
- // medials can use full range
- medialCount = (maxTrail - minTrail + 1);
- // find out how many values fit in each form
- int threeByteCount = medialCount * final3Count;
- // now determine where the 3/4 boundary is.
- // we use 3 bytes below the boundary, and 4 above
- int primariesAvailable = maxPrimary - minPrimary + 1;
- int primaries4count = primariesAvailable - primaries3count;
-
- int min3ByteCoverage = primaries3count * threeByteCount;
- min4Primary = minPrimary + primaries3count;
- min4Boundary = min3ByteCoverage;
- // Now expand out the multiplier for the 4 bytes, and redo.
-
- int totalNeeded = MAX_INPUT - min4Boundary;
- int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
- if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
-
- int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
- if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
-
- int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
- if (DEBUG) System.out.println("expandedGap: " + gap4);
- if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s");
-
- final4Multiplier = gap4 + 1;
- final4Count = neededPerFinalByte;
- max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
-
- if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
- throw new IllegalArgumentException("internal error");
- }
- if (DEBUG) {
- System.out.println("final4Count: " + final4Count);
- for (int counter = 0; counter < final4Count; ++counter) {
- int value = minTrail + (1 + counter)*final4Multiplier;
- System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
- }
- }
- }
-
- static public int divideAndRoundUp(int a, int b) {
- return 1 + (a-1)/b;
- }
-
- /**
- * Converts implicit CE into raw integer
- * @param implicit The implicit value passed.
- * @return -1 if illegal format
- */
- public int getRawFromImplicit(int implicit) {
- int result;
- int b3 = implicit & 0xFF;
- implicit >>= 8;
- int b2 = implicit & 0xFF;
- implicit >>= 8;
- int b1 = implicit & 0xFF;
- implicit >>= 8;
- int b0 = implicit & 0xFF;
-
- // simple parameter checks
- if (b0 < min3Primary || b0 > max4Primary
- || b1 < minTrail || b1 > maxTrail) return -1;
- // normal offsets
- b1 -= minTrail;
-
- // take care of the final values, and compose
- if (b0 < min4Primary) {
- if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
- b2 -= minTrail;
- int remainder = b2 % final3Multiplier;
- if (remainder != 0) return -1;
- b0 -= min3Primary;
- b2 /= final3Multiplier;
- result = ((b0 * medialCount) + b1) * final3Count + b2;
- } else {
- if (b2 < minTrail || b2 > maxTrail
- || b3 < minTrail || b3 > max4Trail) return -1;
- b2 -= minTrail;
- b3 -= minTrail;
- int remainder = b3 % final4Multiplier;
- if (remainder != 0) return -1;
- b3 /= final4Multiplier;
- b0 -= min4Primary;
- result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
- }
- // final check
- if (result < 0 || result > MAX_INPUT) return -1;
- return result;
- }
-
- /**
- * Generate the implicit CE, from raw integer.
- * Left shifted to put the first byte at the top of an int.
- * @param cp code point
- * @return Primary implicit weight
- */
- public int getImplicitFromRaw(int cp) {
- if (cp < 0 || cp > MAX_INPUT) {
- throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
- }
- int last0 = cp - min4Boundary;
- if (last0 < 0) {
- int last1 = cp / final3Count;
- last0 = cp % final3Count;
-
- int last2 = last1 / medialCount;
- last1 %= medialCount;
-
- last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
- last1 = minTrail + last1; // offset
- last2 = min3Primary + last2; // offset
-
- if (last2 >= min4Primary) {
- throw new IllegalArgumentException("4-byte out of range: " +
- Utility.hex(cp) + ", " + Utility.hex(last2));
- }
-
- return (last2 << 24) + (last1 << 16) + (last0 << 8);
- } else {
- int last1 = last0 / final4Count;
- last0 %= final4Count;
-
- int last2 = last1 / medialCount;
- last1 %= medialCount;
-
- int last3 = last2 / medialCount;
- last2 %= medialCount;
-
- last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
- last1 = minTrail + last1; // offset
- last2 = minTrail + last2; // offset
- last3 = min4Primary + last3; // offset
-
- if (last3 > max4Primary) {
- throw new IllegalArgumentException("4-byte out of range: " +
- Utility.hex(cp) + ", " + Utility.hex(last3));
- }
-
- return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
- }
- }
-
- /**
- * Gets an Implicit from a code point. Internally,
- * swaps (which produces a raw value 0..220000,
- * then converts raw to implicit.
- * @param cp The code point to convert to implicit.
- * @return Primary implicit weight
- */
- public int getImplicitFromCodePoint(int cp) {
- if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
-
- // Produce Raw value
- // note, we add 1 so that the first value is always empty!!
- cp = ImplicitCEGenerator.swapCJK(cp) + 1;
- // we now have a range of numbers from 0 to 220000.
-
- if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
-
- return getImplicitFromRaw(cp);
- }
-
- /**
- * Function used to:
- * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
- * b) bump any non-CJK characters by 10FFFF.
- * The relevant blocks are:
- * A: 4E00..9FFF; CJK Unified Ideographs
- * F900..FAFF; CJK Compatibility Ideographs
- * B: 3400..4DBF; CJK Unified Ideographs Extension A
- * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
- * As long as
- * no new B characters are allocated between 4E00 and FAFF, and
- * no new A characters are outside of this range,
- * (very high probability) this simple code will work.
- * The reordered blocks are:
- * Block1 is CJK
- * Block2 is CJK_COMPAT_USED
- * Block3 is CJK_A
- * (all contiguous)
- * Any other CJK gets its normal code point
- * Any non-CJK gets +10FFFF
- * When we reorder Block1, we make sure that it is at the very start,
- * so that it will use a 3-byte form.
- * Warning: the we only pick up the compatibility characters that are
- * NOT decomposed, so that block is smaller!
- */
-
- static int NON_CJK_OFFSET = 0x110000;
-
- public static int swapCJK(int i) {
-
- if (i >= CJK_BASE) {
- if (i < CJK_LIMIT) return i - CJK_BASE;
-
- if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
-
- if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
- + (CJK_LIMIT - CJK_BASE);
- if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
-
- if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
-
- if (i < CJK_C_BASE) return i + NON_CJK_OFFSET;
-
- if (i < CJK_C_LIMIT) return i; // non-BMP-CJK
-
- if (i < CJK_D_BASE) return i + NON_CJK_OFFSET;
-
- if (i < CJK_D_LIMIT) return i; // non-BMP-CJK
-
- return i + NON_CJK_OFFSET; // non-CJK
- }
- if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
-
- if (i < CJK_A_LIMIT) return i - CJK_A_BASE
- + (CJK_LIMIT - CJK_BASE)
- + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
- return i + NON_CJK_OFFSET; // non-CJK
- }
-
-
- /**
- * @return Minimal trail value
- */
- public int getMinTrail() {
- return minTrail;
- }
-
- /**
- * @return Maximal trail value
- */
- public int getMaxTrail() {
- return maxTrail;
- }
-
- public int getCodePointFromRaw(int i) {
- i--;
- int result = 0;
- if(i >= NON_CJK_OFFSET) {
- result = i - NON_CJK_OFFSET;
- } else if(i >= CJK_B_BASE) {
- result = i;
- } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
- // rest of CJKs, compacted
- if(i < CJK_LIMIT - CJK_BASE) {
- result = i + CJK_BASE;
- } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
- result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
- } else {
- result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
- }
- } else {
- result = -1;
- }
- return result;
- }
-
- public int getRawFromCodePoint(int i) {
- return swapCJK(i)+1;
- }
-}
/**
*******************************************************************************
-* Copyright (C) 1996-2009, International Business Machines Corporation and *
-* others. All Rights Reserved. *
+* Copyright (C) 1996-2014, International Business Machines Corporation and
+* others. All Rights Reserved.
*******************************************************************************
*/
-package com.ibm.icu.impl;
+package com.ibm.icu.impl.coll;
-import com.ibm.icu.text.UCharacterIterator;
+import com.ibm.icu.util.ByteArrayWrapper;
/**
- * <p>Binary Ordered Compression for Unicode</p>
+ * <p>Binary Ordered Compression Scheme for Unicode</p>
*
* <p>Users are strongly encouraged to read the ICU paper on
* <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html">
* @author Syn Wee Quek
* @since release 2.2, May 3rd 2002
*/
-public class BOCU
+public class BOCSU
{
- // public constructors --------------------------------------------------
-
// public methods -------------------------------------------------------
-
+
/**
- * <p>Encode the code points of a string as a sequence of bytes,
- * preserving lexical order.</p>
- * <p>The minimum size of buffer required for the compression can be
- * preflighted by getCompressionLength(String).</p>
- * @param source text source
- * @param buffer output buffer
- * @param offset to start writing to
- * @return end offset where the writing stopped
- * @see #getCompressionLength(String)
- * @exception ArrayIndexOutOfBoundsException thrown if size of buffer is
- * too small for the output.
+ * Encode the code points of a string as
+ * a sequence of byte-encoded differences (slope detection),
+ * preserving lexical order.
+ *
+ * <p>Optimize the difference-taking for runs of Unicode text within
+ * small scripts:
+ *
+ * <p>Most small scripts are allocated within aligned 128-blocks of Unicode
+ * code points. Lexical order is preserved if "prev" is always moved
+ * into the middle of such a block.
+ *
+ * <p>Additionally, "prev" is moved from anywhere in the Unihan
+ * area into the middle of that area.
+ * Note that the identical-level run in a sort key is generated from
+ * NFD text - there are never Hangul characters included.
*/
- public static int compress(String source, byte buffer[], int offset)
- {
- int prev = 0;
- UCharacterIterator iterator = UCharacterIterator.getInstance(source);
- int codepoint = iterator.nextCodePoint();
- while (codepoint != UCharacterIterator.DONE) {
- if (prev < 0x4e00 || prev >= 0xa000) {
- prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
- }
- else {
- // Unihan U+4e00..U+9fa5:
- // double-bytes down from the upper end
- prev = 0x9fff - SLOPE_REACH_POS_2_;
+ public static int writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink) {
+ while (i < length) {
+ // We must have capacity>=SLOPE_MAX_BYTES in case writeDiff() writes that much,
+ // but we do not want to force the sink to allocate
+ // for a large min_capacity because we might actually only write one byte.
+ ensureAppendCapacity(sink, 16, s.length() * 2);
+ byte[] buffer = sink.bytes;
+ int capacity = buffer.length;
+ int p = sink.size;
+ int lastSafe = capacity - SLOPE_MAX_BYTES_;
+ while (i < length && p <= lastSafe) {
+ if (prev < 0x4e00 || prev >= 0xa000) {
+ prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
+ } else {
+ // Unihan U+4e00..U+9fa5:
+ // double-bytes down from the upper end
+ prev = 0x9fff - SLOPE_REACH_POS_2_;
+ }
+
+ int c = Character.codePointAt(s, i);
+ i += Character.charCount(c);
+ if (c == 0xfffe) {
+ buffer[p++] = 2; // merge separator
+ prev = 0;
+ } else {
+ p = writeDiff(c - prev, buffer, p);
+ prev = c;
+ }
}
-
- offset = writeDiff(codepoint - prev, buffer, offset);
- prev = codepoint;
- codepoint = iterator.nextCodePoint();
+ sink.size = p;
}
- return offset;
+ return prev;
}
-
- /**
- * Return the number of bytes that compress() would write.
- * @param source text source string
- * @return the length of the BOCU result
- * @see #compress(String, byte[], int)
- */
- public static int getCompressionLength(String source)
- {
- int prev = 0;
- int result = 0;
- UCharacterIterator iterator = UCharacterIterator.getInstance(source);
- int codepoint = iterator.nextCodePoint();
- while (codepoint != UCharacterIterator.DONE) {
- if (prev < 0x4e00 || prev >= 0xa000) {
- prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
- }
- else {
- // Unihan U+4e00..U+9fa5:
- // double-bytes down from the upper end
- prev = 0x9fff - SLOPE_REACH_POS_2_;
- }
-
- codepoint = iterator.nextCodePoint();
- result += lengthOfDiff(codepoint - prev);
- prev = codepoint;
- }
- return result;
+
+ private static void ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity) {
+ int remainingCapacity = sink.bytes.length - sink.size;
+ if (remainingCapacity >= minCapacity) { return; }
+ if (desiredCapacity < minCapacity) { desiredCapacity = minCapacity; }
+ sink.ensureCapacity(sink.size + desiredCapacity);
}
- // public setter methods -------------------------------------------------
-
- // public getter methods ------------------------------------------------
-
- // public other methods -------------------------------------------------
-
- // protected constructor ------------------------------------------------
-
- // protected data members ------------------------------------------------
-
- // protected methods -----------------------------------------------------
-
// private data members --------------------------------------------------
-
+
/**
* Do not use byte values 0, 1, 2 because they are separators in sort keys.
*/
private static final int SLOPE_MAX_ = 0xff;
private static final int SLOPE_MIDDLE_ = 0x81;
private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
- //private static final int SLOPE_MAX_BYTES_ = 4;
+ private static final int SLOPE_MAX_BYTES_ = 4;
/**
* Number of lead bytes:
* Constructor private to prevent initialization
*/
///CLOVER:OFF
- private BOCU()
+ private BOCSU()
{
}
///CLOVER:ON
}
/**
- * Encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes,
+ * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
* preserving lexical order
* @param diff
* @param buffer byte buffer to append to
buffer[offset + 3] = (byte)(SLOPE_MIN_
+ diff % SLOPE_TAIL_COUNT_);
diff /= SLOPE_TAIL_COUNT_;
- buffer[offset] = (byte)(SLOPE_MIN_
+ buffer[offset + 2] = (byte)(SLOPE_MIN_
+ diff % SLOPE_TAIL_COUNT_);
diff /= SLOPE_TAIL_COUNT_;
buffer[offset + 1] = (byte)(SLOPE_MIN_
}
return offset;
}
-
- /**
- * How many bytes would writeDiff() write?
- * @param diff
- */
- private static final int lengthOfDiff(int diff)
- {
- if (diff >= SLOPE_REACH_NEG_1_) {
- if (diff <= SLOPE_REACH_POS_1_) {
- return 1;
- }
- else if (diff <= SLOPE_REACH_POS_2_) {
- return 2;
- }
- else if(diff <= SLOPE_REACH_POS_3_) {
- return 3;
- }
- else {
- return 4;
- }
- }
- else {
- if (diff >= SLOPE_REACH_NEG_2_) {
- return 2;
- }
- else if (diff >= SLOPE_REACH_NEG_3_) {
- return 3;
- }
- else {
- return 4;
- }
- }
- }
}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2010-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* Collation.java, ported from collation.h/.cpp
+*
+* C++ version created on: 2010oct27
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+/**
+ * Collation v2 basic definitions and static helper functions.
+ *
+ * Data structures except for expansion tables store 32-bit CEs which are
+ * either specials (see tags below) or are compact forms of 64-bit CEs.
+ */
+public final class Collation {
+ /** UChar32 U_SENTINEL.
+ * TODO: Create a common, public constant?
+ */
+ public static final int SENTINEL_CP = -1;
+
+ // ICU4C compare() API returns enum UCollationResult values (with UCOL_ prefix).
+ // ICU4J just returns int. We use these constants for ease of porting.
+ public static final int LESS = -1;
+ public static final int EQUAL = 0;
+ public static final int GREATER = 1;
+
+ // Special sort key bytes for all levels.
+ public static final int TERMINATOR_BYTE = 0;
+ public static final int LEVEL_SEPARATOR_BYTE = 1;
+ /**
+ * Merge-sort-key separator.
+ * Must not be used as the lead byte of any CE weight,
+ * nor as primary compression low terminator.
+ * Otherwise usable.
+ */
+ public static final int MERGE_SEPARATOR_BYTE = 2;
+ public static final long MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE
+ static final int MERGE_SEPARATOR_WEIGHT16 = 0x0200; // U+FFFE
+ public static final int MERGE_SEPARATOR_LOWER32 = 0x02000200; // U+FFFE
+ static final int MERGE_SEPARATOR_CE32 = 0x02000202; // U+FFFE
+
+ /**
+ * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE.
+ * Reserved value in primary second byte if the lead byte is compressible.
+ * Otherwise usable in all CE weight bytes.
+ */
+ public static final int PRIMARY_COMPRESSION_LOW_BYTE = 3;
+ /**
+ * Primary compression high terminator.
+ * Reserved value in primary second byte if the lead byte is compressible.
+ * Otherwise usable in all CE weight bytes.
+ */
+ public static final int PRIMARY_COMPRESSION_HIGH_BYTE = 0xff;
+
+ /** Default secondary/tertiary weight lead byte. */
+ static final int COMMON_BYTE = 5;
+ public static final int COMMON_WEIGHT16 = 0x0500;
+ /** Middle 16 bits of a CE with a common secondary weight. */
+ static final int COMMON_SECONDARY_CE = 0x05000000;
+ /** Lower 16 bits of a CE with a common tertiary weight. */
+ static final int COMMON_TERTIARY_CE = 0x0500;
+ /** Lower 32 bits of a CE with common secondary and tertiary weights. */
+ public static final int COMMON_SEC_AND_TER_CE = 0x05000500;
+
+ static final int SECONDARY_MASK = 0xffff0000;
+ public static final int CASE_MASK = 0xc000;
+ static final int SECONDARY_AND_CASE_MASK = SECONDARY_MASK | CASE_MASK;
+ /** Only the 2*6 bits for the pure tertiary weight. */
+ public static final int ONLY_TERTIARY_MASK = 0x3f3f;
+ /** Only the secondary & tertiary bits; no case, no quaternary. */
+ static final int ONLY_SEC_TER_MASK = SECONDARY_MASK | ONLY_TERTIARY_MASK;
+ /** Case bits and tertiary bits. */
+ static final int CASE_AND_TERTIARY_MASK = CASE_MASK | ONLY_TERTIARY_MASK;
+ public static final int QUATERNARY_MASK = 0xc0;
+ /** Case bits and quaternary bits. */
+ public static final int CASE_AND_QUATERNARY_MASK = CASE_MASK | QUATERNARY_MASK;
+
+ static final int UNASSIGNED_IMPLICIT_BYTE = 0xfe; // compressible
+ /**
+ * First unassigned: AlphabeticIndex overflow boundary.
+ * We want a 3-byte primary so that it fits into the root elements table.
+ *
+ * This 3-byte primary will not collide with
+ * any unassigned-implicit 4-byte primaries because
+ * the first few hundred Unicode code points all have real mappings.
+ */
+ static final long FIRST_UNASSIGNED_PRIMARY = 0xfe040200L;
+
+ static final int TRAIL_WEIGHT_BYTE = 0xff; // not compressible
+ static final long FIRST_TRAILING_PRIMARY = 0xff020200L; // [first trailing]
+ public static final long MAX_PRIMARY = 0xffff0000L; // U+FFFF
+ static final int MAX_REGULAR_CE32 = 0xffff0505; // U+FFFF
+
+ // CE32 value for U+FFFD as well as illegal UTF-8 byte sequences (which behave like U+FFFD).
+ // We use the third-highest primary weight for U+FFFD (as in UCA 6.3+).
+ public static final long FFFD_PRIMARY = MAX_PRIMARY - 0x20000;
+ static final int FFFD_CE32 = MAX_REGULAR_CE32 - 0x20000;
+
+ /**
+ * A CE32 is special if its low byte is this or greater.
+ * Impossible case bits 11 mark special CE32s.
+ * This value itself is used to indicate a fallback to the base collator.
+ */
+ static final int SPECIAL_CE32_LOW_BYTE = 0xc0;
+ static final int FALLBACK_CE32 = SPECIAL_CE32_LOW_BYTE;
+ /**
+ * Low byte of a long-primary special CE32.
+ */
+ static final int LONG_PRIMARY_CE32_LOW_BYTE = 0xc1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG
+
+ static final int UNASSIGNED_CE32 = 0xffffffff; // Compute an unassigned-implicit CE.
+
+ static final int NO_CE32 = 1;
+
+ /** No CE: End of input. Only used in runtime code, not stored in data. */
+ static final long NO_CE_PRIMARY = 1; // not a left-adjusted weight
+ static final int NO_CE_WEIGHT16 = 0x0100; // weight of LEVEL_SEPARATOR_BYTE
+ public static final long NO_CE = 0x101000100L; // NO_CE_PRIMARY, NO_CE_WEIGHT16, NO_CE_WEIGHT16
+
+ /** Sort key levels. */
+
+ /** Unspecified level. */
+ public static final int NO_LEVEL = 0;
+ public static final int PRIMARY_LEVEL = 1;
+ public static final int SECONDARY_LEVEL = 2;
+ public static final int CASE_LEVEL = 3;
+ public static final int TERTIARY_LEVEL = 4;
+ public static final int QUATERNARY_LEVEL = 5;
+ public static final int IDENTICAL_LEVEL = 6;
+ /** Beyond sort key bytes. */
+ public static final int ZERO_LEVEL = 7;
+
+ /**
+ * Sort key level flags: xx_FLAG = 1 << xx_LEVEL.
+ * In Java, use enum Level with flag() getters, or use EnumSet rather than hand-made bit sets.
+ */
+ static final int NO_LEVEL_FLAG = 1;
+ static final int PRIMARY_LEVEL_FLAG = 2;
+ static final int SECONDARY_LEVEL_FLAG = 4;
+ static final int CASE_LEVEL_FLAG = 8;
+ static final int TERTIARY_LEVEL_FLAG = 0x10;
+ static final int QUATERNARY_LEVEL_FLAG = 0x20;
+ static final int IDENTICAL_LEVEL_FLAG = 0x40;
+ static final int ZERO_LEVEL_FLAG = 0x80;
+
+ /**
+ * Special-CE32 tags, from bits 3..0 of a special 32-bit CE.
+ * Bits 31..8 are available for tag-specific data.
+ * Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0.
+ */
+
+ /**
+ * Fall back to the base collator.
+ * This is the tag value in SPECIAL_CE32_LOW_BYTE and FALLBACK_CE32.
+ * Bits 31..8: Unused, 0.
+ */
+ static final int FALLBACK_TAG = 0;
+ /**
+ * Long-primary CE with COMMON_SEC_AND_TER_CE.
+ * Bits 31..8: Three-byte primary.
+ */
+ static final int LONG_PRIMARY_TAG = 1;
+ /**
+ * Long-secondary CE with zero primary.
+ * Bits 31..16: Secondary weight.
+ * Bits 15.. 8: Tertiary weight.
+ */
+ static final int LONG_SECONDARY_TAG = 2;
+ /**
+ * Unused.
+ * May be used in the future for single-byte secondary CEs (SHORT_SECONDARY_TAG),
+ * storing the secondary in bits 31..24, the ccc in bits 23..16,
+ * and the tertiary in bits 15..8.
+ */
+ static final int RESERVED_TAG_3 = 3;
+ /**
+ * Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05].
+ * Bits 31..24: Single-byte primary weight pp of the first CE.
+ * Bits 23..16: Tertiary weight tt of the first CE.
+ * Bits 15.. 8: Secondary weight ss of the second CE.
+ */
+ static final int LATIN_EXPANSION_TAG = 4;
+ /**
+ * Points to one or more simple/long-primary/long-secondary 32-bit CE32s.
+ * Bits 31..13: Index into int table.
+ * Bits 12.. 8: Length=1..31.
+ */
+ static final int EXPANSION32_TAG = 5;
+ /**
+ * Points to one or more 64-bit CEs.
+ * Bits 31..13: Index into CE table.
+ * Bits 12.. 8: Length=1..31.
+ */
+ static final int EXPANSION_TAG = 6;
+ /**
+ * Builder data, used only in the CollationDataBuilder, not in runtime data.
+ *
+ * If bit 8 is 0: Builder context, points to a list of context-sensitive mappings.
+ * Bits 31..13: Index to the builder's list of ConditionalCE32 for this character.
+ * Bits 12.. 9: Unused, 0.
+ *
+ * If bit 8 is 1 (IS_BUILDER_JAMO_CE32): Builder-only jamoCE32 value.
+ * The builder fetches the Jamo CE32 from the trie.
+ * Bits 31..13: Jamo code point.
+ * Bits 12.. 9: Unused, 0.
+ */
+ static final int BUILDER_DATA_TAG = 7;
+ /**
+ * Points to prefix trie.
+ * Bits 31..13: Index into prefix/contraction data.
+ * Bits 12.. 8: Unused, 0.
+ */
+ static final int PREFIX_TAG = 8;
+ /**
+ * Points to contraction data.
+ * Bits 31..13: Index into prefix/contraction data.
+ * Bits 12..11: Unused, 0.
+ * Bit 10: CONTRACT_TRAILING_CCC flag.
+ * Bit 9: CONTRACT_NEXT_CCC flag.
+ * Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag.
+ */
+ static final int CONTRACTION_TAG = 9;
+ /**
+ * Decimal digit.
+ * Bits 31..13: Index into int table for non-numeric-collation CE32.
+ * Bit 12: Unused, 0.
+ * Bits 11.. 8: Digit value 0..9.
+ */
+ static final int DIGIT_TAG = 10;
+ /**
+ * Tag for U+0000, for moving the NUL-termination handling
+ * from the regular fastpath into specials-handling code.
+ * Bits 31..8: Unused, 0.
+ */
+ static final int U0000_TAG = 11;
+ /**
+ * Tag for a Hangul syllable.
+ * Bits 31..9: Unused, 0.
+ * Bit 8: HANGUL_NO_SPECIAL_JAMO flag.
+ */
+ static final int HANGUL_TAG = 12;
+ /**
+ * Tag for a lead surrogate code unit.
+ * Optional optimization for UTF-16 string processing.
+ * Bits 31..10: Unused, 0.
+     *              9.. 8: =0: All associated supplementary code points are unassigned-implicit.
+ * =1: All associated supplementary code points fall back to the base data.
+ * else: (Normally 2) Look up the data for the supplementary code point.
+ */
+ static final int LEAD_SURROGATE_TAG = 13;
+ /**
+ * Tag for CEs with primary weights in code point order.
+ * Bits 31..13: Index into CE table, for one data "CE".
+ * Bits 12.. 8: Unused, 0.
+ *
+ * This data "CE" has the following bit fields:
+ * Bits 63..32: Three-byte primary pppppp00.
+ * 31.. 8: Start/base code point of the in-order range.
+ * 7: Flag isCompressible primary.
+ * 6.. 0: Per-code point primary-weight increment.
+ */
+ static final int OFFSET_TAG = 14;
+ /**
+ * Implicit CE tag. Compute an unassigned-implicit CE.
+ * All bits are set (UNASSIGNED_CE32=0xffffffff).
+ */
+ static final int IMPLICIT_TAG = 15;
+
+ static boolean isAssignedCE32(int ce32) {
+ return ce32 != FALLBACK_CE32 && ce32 != UNASSIGNED_CE32;
+ }
+
+ /**
+ * We limit the number of CEs in an expansion
+ * so that we can use a small number of length bits in the data structure,
+ * and so that an implementation can copy CEs at runtime without growing a destination buffer.
+ */
+ static final int MAX_EXPANSION_LENGTH = 31;
+ static final int MAX_INDEX = 0x7ffff;
+
    /**
     * Set if there is no match for the single (no-suffix) character itself.
     * This is only possible if there is a prefix.
     * In this case, discontiguous contraction matching cannot add combining marks
     * starting from an empty suffix.
     * The default CE32 is used anyway if there is no suffix match.
     */
    static final int CONTRACT_SINGLE_CP_NO_MATCH = 0x100;
    /** Set if the first character of every contraction suffix has lccc!=0. */
    static final int CONTRACT_NEXT_CCC = 0x200;
    /** Set if any contraction suffix ends with lccc!=0. */
    static final int CONTRACT_TRAILING_CCC = 0x400;

    /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */
    static final int HANGUL_NO_SPECIAL_JAMO = 0x100;

    // Values for bits 9..8 of a LEAD_SURROGATE_TAG CE32 (see that tag's documentation).
    static final int LEAD_ALL_UNASSIGNED = 0;
    static final int LEAD_ALL_FALLBACK = 0x100;
    static final int LEAD_MIXED = 0x200;
    static final int LEAD_TYPE_MASK = 0x300;
+
+ static int makeLongPrimaryCE32(long p) { return (int)(p | LONG_PRIMARY_CE32_LOW_BYTE); }
+
+ /** Turns the long-primary CE32 into a primary weight pppppp00. */
+ static long primaryFromLongPrimaryCE32(int ce32) {
+ return (long)ce32 & 0xffffff00L;
+ }
    /** Expands a long-primary CE32 into a 64-bit CE: primary pppppp00 plus COMMON_SEC_AND_TER_CE. */
    static long ceFromLongPrimaryCE32(int ce32) {
        return ((long)(ce32 & 0xffffff00) << 32) | COMMON_SEC_AND_TER_CE;
    }
+
    /** Makes a long-secondary CE32 from the lower 32 CE bits (secondary+tertiary weights). */
    static int makeLongSecondaryCE32(int lower32) {
        return lower32 | SPECIAL_CE32_LOW_BYTE | LONG_SECONDARY_TAG;
    }
+ static long ceFromLongSecondaryCE32(int ce32) {
+ return (long)ce32 & 0xffffff00L;
+ }
+
    /**
     * Makes a special CE32 with tag, index and length.
     * Index goes into bits 31..13, length into bits 12..8 (see indexFromCE32()/lengthFromCE32()).
     */
    static int makeCE32FromTagIndexAndLength(int tag, int index, int length) {
        return (index << 13) | (length << 8) | SPECIAL_CE32_LOW_BYTE | tag;
    }
    /** Makes a special CE32 with only tag and index (bits 31..13); length bits stay 0. */
    static int makeCE32FromTagAndIndex(int tag, int index) {
        return (index << 13) | SPECIAL_CE32_LOW_BYTE | tag;
    }
+
    /** Returns true if ce32 is special: its low byte is at least SPECIAL_CE32_LOW_BYTE. */
    static boolean isSpecialCE32(int ce32) {
        return (ce32 & 0xff) >= SPECIAL_CE32_LOW_BYTE;
    }
+
+ static int tagFromCE32(int ce32) {
+ return ce32 & 0xf;
+ }
+
    /** Returns true if ce32 is special and carries the given tag. */
    static boolean hasCE32Tag(int ce32, int tag) {
        return isSpecialCE32(ce32) && tagFromCE32(ce32) == tag;
    }
+
    /** Returns true if ce32 is a long-primary special. */
    static boolean isLongPrimaryCE32(int ce32) {
        return hasCE32Tag(ce32, LONG_PRIMARY_TAG);
    }
+
+ static boolean isSimpleOrLongCE32(int ce32) {
+ return !isSpecialCE32(ce32) ||
+ tagFromCE32(ce32) == LONG_PRIMARY_TAG ||
+ tagFromCE32(ce32) == LONG_SECONDARY_TAG;
+ }
+
+ /**
+ * @return true if the ce32 yields one or more CEs without further data lookups
+ */
+ static boolean isSelfContainedCE32(int ce32) {
+ return !isSpecialCE32(ce32) ||
+ tagFromCE32(ce32) == LONG_PRIMARY_TAG ||
+ tagFromCE32(ce32) == LONG_SECONDARY_TAG ||
+ tagFromCE32(ce32) == LATIN_EXPANSION_TAG;
+ }
+
    /** Returns true if ce32 is a prefix (pre-context) special. */
    static boolean isPrefixCE32(int ce32) {
        return hasCE32Tag(ce32, PREFIX_TAG);
    }
+
    /** Returns true if ce32 is a contraction special. */
    static boolean isContractionCE32(int ce32) {
        return hasCE32Tag(ce32, CONTRACTION_TAG);
    }
+
+ static boolean ce32HasContext(int ce32) {
+ return isSpecialCE32(ce32) &&
+ (tagFromCE32(ce32) == PREFIX_TAG ||
+ tagFromCE32(ce32) == CONTRACTION_TAG);
+ }
+
    /**
     * Get the first of the two Latin-expansion CEs encoded in ce32.
     * Primary pp000000 from ce32 bits 31..24, common secondary,
     * tertiary tt00 from ce32 bits 23..16.
     * @see LATIN_EXPANSION_TAG
     */
    static long latinCE0FromCE32(int ce32) {
        return ((long)(ce32 & 0xff000000) << 32) | COMMON_SECONDARY_CE | ((ce32 & 0xff0000) >> 8);
    }
+
    /**
     * Get the second of the two Latin-expansion CEs encoded in ce32.
     * Zero primary, secondary ss00 from ce32 bits 15..8, common tertiary.
     * @see LATIN_EXPANSION_TAG
     */
    static long latinCE1FromCE32(int ce32) {
        return (((long)ce32 & 0xff00) << 16) | COMMON_TERTIARY_CE;
    }
+
+ /**
+ * Returns the data index from a special CE32.
+ */
+ static int indexFromCE32(int ce32) {
+ return ce32 >>> 13;
+ }
+
+ /**
+ * Returns the data length from a ce32.
+ */
+ static int lengthFromCE32(int ce32) {
+ return (ce32 >> 8) & 31;
+ }
+
+ /**
+ * Returns the digit value from a DIGIT_TAG ce32.
+ */
+ static char digitFromCE32(int ce32) {
+ return (char)((ce32 >> 8) & 0xf);
+ }
+
    /** Returns a 64-bit CE from a simple CE32 (not special). */
    static long ceFromSimpleCE32(int ce32) {
        // normal form ppppsstt -> pppp0000ss00tt00
        assert (ce32 & 0xff) < SPECIAL_CE32_LOW_BYTE;
        return ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | ((ce32 & 0xff) << 8);
    }
+
    /** Returns a 64-bit CE from a simple/long-primary/long-secondary CE32. */
    static long ceFromCE32(int ce32) {
        int tertiary = ce32 & 0xff;
        if(tertiary < SPECIAL_CE32_LOW_BYTE) {
            // normal form ppppsstt -> pppp0000ss00tt00
            return ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (tertiary << 8);
        } else {
            // Remove the special low byte; only the tag bits matter below.
            ce32 -= tertiary;
            if((tertiary & 0xf) == LONG_PRIMARY_TAG) {
                // long-primary form ppppppC1 -> primary pppppp00 + COMMON_SEC_AND_TER_CE
                return ((long)ce32 << 32) | COMMON_SEC_AND_TER_CE;
            } else {
                // long-secondary form ssssttC2 -> 00000000sssstt00
                assert (tertiary & 0xf) == LONG_SECONDARY_TAG;
                return ce32 & 0xffffffffL;
            }
        }
    }
+
    /** Creates a CE from a primary weight, with common secondary/tertiary weights. */
    public static long makeCE(long p) {
        return (p << 32) | COMMON_SEC_AND_TER_CE;
    }
+ /**
+ * Creates a CE from a primary weight,
+ * 16-bit secondary/tertiary weights, and a 2-bit quaternary.
+ */
+ static long makeCE(long p, int s, int t, int q) {
+ return (p << 32) | ((long)s << 16) | t | (q << 6);
+ }
+
+ /**
+ * Increments a 2-byte primary by a code point offset.
+ */
+ public static long incTwoBytePrimaryByOffset(long basePrimary, boolean isCompressible,
+ int offset) {
+ // Extract the second byte, minus the minimum byte value,
+ // plus the offset, modulo the number of usable byte values, plus the minimum.
+ // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
+ long primary;
+ if(isCompressible) {
+ offset += ((int)(basePrimary >> 16) & 0xff) - 4;
+ primary = ((offset % 251) + 4) << 16;
+ offset /= 251;
+ } else {
+ offset += ((int)(basePrimary >> 16) & 0xff) - 2;
+ primary = ((offset % 254) + 2) << 16;
+ offset /= 254;
+ }
+ // First byte, assume no further overflow.
+ return primary | ((basePrimary & 0xff000000L) + ((long)offset << 24));
+ }
+
+ /**
+ * Increments a 3-byte primary by a code point offset.
+ */
+ public static long incThreeBytePrimaryByOffset(long basePrimary, boolean isCompressible,
+ int offset) {
+ // Extract the third byte, minus the minimum byte value,
+ // plus the offset, modulo the number of usable byte values, plus the minimum.
+ offset += ((int)(basePrimary >> 8) & 0xff) - 2;
+ long primary = ((offset % 254) + 2) << 8;
+ offset /= 254;
+ // Same with the second byte,
+ // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
+ if(isCompressible) {
+ offset += ((int)(basePrimary >> 16) & 0xff) - 4;
+ primary |= ((offset % 251) + 4) << 16;
+ offset /= 251;
+ } else {
+ offset += ((int)(basePrimary >> 16) & 0xff) - 2;
+ primary |= ((offset % 254) + 2) << 16;
+ offset /= 254;
+ }
+ // First byte, assume no further overflow.
+ return primary | ((basePrimary & 0xff000000L) + ((long)offset << 24));
+ }
+
+ /**
+ * Decrements a 2-byte primary by one range step (1..0x7f).
+ */
+ static long decTwoBytePrimaryByOneStep(long basePrimary, boolean isCompressible, int step) {
+ // Extract the second byte, minus the minimum byte value,
+ // minus the step, modulo the number of usable byte values, plus the minimum.
+ // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
+ // Assume no further underflow for the first byte.
+ assert(0 < step && step <= 0x7f);
+ int byte2 = ((int)(basePrimary >> 16) & 0xff) - step;
+ if(isCompressible) {
+ if(byte2 < 4) {
+ byte2 += 251;
+ basePrimary -= 0x1000000;
+ }
+ } else {
+ if(byte2 < 2) {
+ byte2 += 254;
+ basePrimary -= 0x1000000;
+ }
+ }
+ return (basePrimary & 0xff000000L) | (byte2 << 16);
+ }
+
+ /**
+ * Decrements a 3-byte primary by one range step (1..0x7f).
+ */
+ static long decThreeBytePrimaryByOneStep(long basePrimary, boolean isCompressible, int step) {
+ // Extract the third byte, minus the minimum byte value,
+ // minus the step, modulo the number of usable byte values, plus the minimum.
+ assert(0 < step && step <= 0x7f);
+ int byte3 = ((int)(basePrimary >> 8) & 0xff) - step;
+ if(byte3 >= 2) {
+ return (basePrimary & 0xffff0000L) | (byte3 << 8);
+ }
+ byte3 += 254;
+ // Same with the second byte,
+ // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
+ int byte2 = ((int)(basePrimary >> 16) & 0xff) - 1;
+ if(isCompressible) {
+ if(byte2 < 4) {
+ byte2 = 0xfe;
+ basePrimary -= 0x1000000;
+ }
+ } else {
+ if(byte2 < 2) {
+ byte2 = 0xff;
+ basePrimary -= 0x1000000;
+ }
+ }
+ // First byte, assume no further underflow.
+ return (basePrimary & 0xff000000L) | (byte2 << 16) | (byte3 << 8);
+ }
+
    /**
     * Computes a 3-byte primary for c's OFFSET_TAG data "CE".
     * See OFFSET_TAG for the dataCE bit layout.
     */
    static long getThreeBytePrimaryForOffsetData(int c, long dataCE) {
        long p = dataCE >>> 32;  // three-byte primary pppppp00
        int lower32 = (int)dataCE;  // base code point b & step s: bbbbbbss (bit 7: isCompressible)
        int offset = (c - (lower32 >> 8)) * (lower32 & 0x7f);  // delta * increment
        boolean isCompressible = (lower32 & 0x80) != 0;
        return Collation.incThreeBytePrimaryByOffset(p, isCompressible, offset);
    }
+
    /**
     * Returns the unassigned-character implicit primary weight for any valid code point c.
     * The result is a four-byte primary under the single UNASSIGNED_IMPLICIT_BYTE lead byte,
     * strictly increasing with c.
     */
    static long unassignedPrimaryFromCodePoint(int c) {
        // Create a gap before U+0000. Use c=-1 for [first unassigned].
        ++c;
        // Fourth byte: 18 values, every 14th byte value (gap of 13).
        long primary = 2 + (c % 18) * 14;
        c /= 18;
        // Third byte: 254 values.
        primary |= (2 + (c % 254)) << 8;
        c /= 254;
        // Second byte: 251 values 04..FE excluding the primary compression bytes.
        primary |= (4 + (c % 251)) << 16;
        // One lead byte covers all code points (c < 0x1182B4 = 1*251*254*18).
        return primary | ((long)UNASSIGNED_IMPLICIT_BYTE << 24);
    }
+
    /** Returns the unassigned-implicit CE for c: implicit primary plus common secondary/tertiary. */
    static long unassignedCEFromCodePoint(int c) {
        return makeCE(unassignedPrimaryFromCodePoint(c));
    }
+
+ static long reorder(byte[] reorderTable, long primary) {
+ return ((reorderTable[(int)primary >>> 24] & 0xffL) << 24) | (primary & 0xffffff);
+ }
+
+ // private Collation() // No instantiation.
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationBuilder.java, ported from collationbuilder.h/.cpp
+*
+* C++ version created on: 2013may06
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.text.ParseException;
+
+import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.Normalizer2Impl;
+import com.ibm.icu.impl.Normalizer2Impl.Hangul;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.CanonicalIterator;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.VersionInfo;
+
+public final class CollationBuilder extends CollationRuleParser.Sink {
+ private static final boolean DEBUG = false;
    /**
     * Resolves [import] statements in rule strings
     * by loading the referenced rules from collation resource bundles.
     */
    private static final class BundleImporter implements CollationRuleParser.Importer {
        BundleImporter() {}
        public String getRules(String localeID, String collationType) {
            return CollationLoader.loadRules(new ULocale(localeID), collationType);
        }
    }
+
    /**
     * Creates a builder that tailors on top of the given base tailoring
     * (whose data must include root elements; see parseAndBuild()).
     */
    public CollationBuilder(CollationTailoring b) {
        nfd = Normalizer2.getNFDInstance();
        fcd = Norm2AllModes.getFCDNormalizer2();
        nfcImpl = Norm2AllModes.getNFCInstance().impl;
        base = b;
        baseData = b.data;
        rootElements = new CollationRootElements(b.data.rootElements);
        variableTop = 0;
        dataBuilder = new CollationDataBuilder();
        fastLatinEnabled = true;
        cesLength = 0;
        rootPrimaryIndexes = new UVector32();
        nodes = new UVector64();
        // Ensure the NFC canonical-iterator data is loaded before closure operations.
        nfcImpl.ensureCanonIterData();
        dataBuilder.initForTailoring(baseData);
    }
+
    /**
     * Parses the rule string and builds the resulting tailoring.
     *
     * @param ruleString collation rules in LDML/ICU syntax
     * @return the built CollationTailoring (shares the base data if the rules add no mappings)
     * @throws ParseException on rule syntax errors
     * @throws UnsupportedOperationException if the base data has no root elements
     */
    public CollationTailoring parseAndBuild(String ruleString) throws ParseException {
        if(baseData.rootElements == null) {
            // C++ U_MISSING_RESOURCE_ERROR
            throw new UnsupportedOperationException(
                    "missing root elements data, tailoring not supported");
        }
        CollationTailoring tailoring = new CollationTailoring(base.settings);
        CollationRuleParser parser = new CollationRuleParser(baseData);
        // Note: This always bases &[last variable] and &[first regular]
        // on the root collator's maxVariable/variableTop.
        // If we wanted this to change after [maxVariable x], then we would keep
        // the tailoring.settings pointer here and read its variableTop when we need it.
        // See http://unicode.org/cldr/trac/ticket/6070
        variableTop = base.settings.readOnly().variableTop;
        parser.setSink(this);
        // In Java, there is only one Importer implementation.
        // In C++, the importer is a parameter for this method.
        parser.setImporter(new BundleImporter());
        parser.parse(ruleString, tailoring.settings.copyOnWrite());
        if(dataBuilder.hasMappings()) {
            makeTailoredCEs();
            closeOverComposites();
            finalizeCEs();
            // Copy all of ASCII, and Latin-1 letters, into each tailoring.
            optimizeSet.add(0, 0x7f);
            optimizeSet.add(0xc0, 0xff);
            // Hangul is decomposed on the fly during collation,
            // and the tailoring data is always built with HANGUL_TAG specials.
            optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
            dataBuilder.optimize(optimizeSet);
            tailoring.ensureOwnedData();
            if(fastLatinEnabled) { dataBuilder.enableFastLatin(); }
            dataBuilder.build(tailoring.ownedData);
            // C++ tailoring.builder = dataBuilder;
            dataBuilder = null;
        } else {
            // No tailored mappings: reuse the base data as-is.
            tailoring.data = baseData;
        }
        tailoring.rules = ruleString;
        // In Java, we do not have a rules version.
        // In C++, the genrb build tool reads and supplies one,
        // and the rulesVersion is a parameter for this method.
        VersionInfo rulesVersion = VersionInfo.getInstance(0, 0, 0, 0);
        tailoring.setVersion(base.version, rulesVersion);
        return tailoring;
    }
+
    /**
     * Implements CollationRuleParser.Sink.
     * Handles a reset (&amp;position or &amp;[before strength]position):
     * sets ces/cesLength to the reset position's CEs and, for a before-reset,
     * replaces the last CE with a temporary CE pointing into the node graph.
     */
    @Override
    void addReset(int strength, CharSequence str) {
        assert(str.length() != 0);
        if(str.charAt(0) == CollationRuleParser.POS_LEAD) {
            // Special position like [first regular], encoded by the parser.
            ces[0] = getSpecialResetPosition(str);
            cesLength = 1;
            assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0);
        } else {
            // normal reset to a character or string
            String nfdString = nfd.normalize(str);
            cesLength = dataBuilder.getCEs(nfdString, ces, 0);
            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
                throw new IllegalArgumentException(
                        "reset position maps to too many collation elements (more than 31)");
            }
        }
        if(strength == Collator.IDENTICAL) { return; }  // simple reset-at-position

        // &[before strength]position
        assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY);
        int index = findOrInsertNodeForCEs(strength);

        long node = nodes.elementAti(index);
        // If the index is for a "weaker" tailored node,
        // then skip backwards over this and further "weaker" nodes.
        while(strengthFromNode(node) > strength) {
            index = previousIndexFromNode(node);
            node = nodes.elementAti(index);
        }

        // Find or insert a node whose index we will put into a temporary CE.
        if(strengthFromNode(node) == strength && isTailoredNode(node)) {
            // Reset to just before this same-strength tailored node.
            index = previousIndexFromNode(node);
        } else if(strength == Collator.PRIMARY) {
            // root primary node (has no previous index)
            long p = weight32FromNode(node);
            if(p == 0) {
                throw new UnsupportedOperationException(
                        "reset primary-before ignorable not possible");
            }
            if(p <= rootElements.getFirstPrimary()) {
                // There is no primary gap between ignorables and the space-first-primary.
                throw new UnsupportedOperationException(
                        "reset primary-before first non-ignorable not supported");
            }
            if(p == Collation.FIRST_TRAILING_PRIMARY) {
                // We do not support tailoring to an unassigned-implicit CE.
                throw new UnsupportedOperationException(
                        "reset primary-before [first trailing] not supported");
            }
            p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p));
            index = findOrInsertNodeForPrimary(p);
            // Go to the last node in this list:
            // Tailor after the last node between adjacent root nodes.
            for(;;) {
                node = nodes.elementAti(index);
                int nextIndex = nextIndexFromNode(node);
                if(nextIndex == 0) { break; }
                index = nextIndex;
            }
        } else {
            // &[before 2] or &[before 3]
            index = findCommonNode(index, Collator.SECONDARY);
            if(strength >= Collator.TERTIARY) {
                index = findCommonNode(index, Collator.TERTIARY);
            }
            node = nodes.elementAti(index);
            if(strengthFromNode(node) == strength) {
                // Found a same-strength node with an explicit weight.
                int weight16 = weight16FromNode(node);
                if(weight16 == 0) {
                    throw new UnsupportedOperationException(
                            (strength == Collator.SECONDARY) ?
                                    "reset secondary-before secondary ignorable not possible" :
                                    "reset tertiary-before completely ignorable not possible");
                }
                assert(weight16 >= Collation.COMMON_WEIGHT16);
                int previousIndex = previousIndexFromNode(node);
                if(weight16 == Collation.COMMON_WEIGHT16) {
                    // Reset to just before this same-strength common-weight node.
                    index = previousIndex;
                } else {
                    // A non-common weight is only possible from a root CE.
                    // Find the higher-level weights, which must all be explicit,
                    // and then find the preceding weight for this level.
                    long previousWeight16 = 0;
                    int previousWeightIndex = -1;
                    int i = index;
                    if(strength == Collator.SECONDARY) {
                        long p;
                        // Walk back to the primary node, remembering the nearest
                        // preceding root secondary weight on the way.
                        do {
                            i = previousIndexFromNode(node);
                            node = nodes.elementAti(i);
                            if(strengthFromNode(node) == Collator.SECONDARY && !isTailoredNode(node) &&
                                    previousWeightIndex < 0) {
                                previousWeightIndex = i;
                                previousWeight16 = weight16FromNode(node);
                            }
                        } while(strengthFromNode(node) > Collator.PRIMARY);
                        assert(!isTailoredNode(node));
                        p = weight32FromNode(node);
                        weight16 = rootElements.getSecondaryBefore(p, weight16);
                    } else {
                        long p;
                        int s;
                        // Walk back to the secondary (or primary) node, remembering
                        // the nearest preceding root tertiary weight on the way.
                        do {
                            i = previousIndexFromNode(node);
                            node = nodes.elementAti(i);
                            if(strengthFromNode(node) == Collator.TERTIARY && !isTailoredNode(node) &&
                                    previousWeightIndex < 0) {
                                previousWeightIndex = i;
                                previousWeight16 = weight16FromNode(node);
                            }
                        } while(strengthFromNode(node) > Collator.SECONDARY);
                        assert(!isTailoredNode(node));
                        if(strengthFromNode(node) == Collator.SECONDARY) {
                            s = weight16FromNode(node);
                            do {
                                i = previousIndexFromNode(node);
                                node = nodes.elementAti(i);
                            } while(strengthFromNode(node) > Collator.PRIMARY);
                            assert(!isTailoredNode(node));
                        } else {
                            assert(!nodeHasBefore2(node));
                            s = Collation.COMMON_WEIGHT16;
                        }
                        p = weight32FromNode(node);
                        weight16 = rootElements.getTertiaryBefore(p, s, weight16);
                        assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0);
                    }
                    // Find or insert the new explicit weight before the current one.
                    if(previousWeightIndex >= 0 && weight16 == previousWeight16) {
                        // Tailor after the last node between adjacent root nodes.
                        index = previousIndex;
                    } else {
                        node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
                        index = insertNodeBetween(previousIndex, index, node);
                    }
                }
            } else {
                // Found a stronger node with implied strength-common weight.
                long hasBefore3 = 0;
                if(strength == Collator.SECONDARY) {
                    assert(!nodeHasBefore2(node));
                    // Move the HAS_BEFORE3 flag from the parent node
                    // to the new secondary common node.
                    hasBefore3 = node & HAS_BEFORE3;
                    node = (node & ~(long)HAS_BEFORE3) | HAS_BEFORE2;
                } else {
                    assert(!nodeHasBefore3(node));
                    node |= HAS_BEFORE3;
                }
                nodes.setElementAt(node, index);
                int nextIndex = nextIndexFromNode(node);
                // Insert default nodes with weights 02 and 05, reset to the 02 node.
                node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength);
                index = insertNodeBetween(index, nextIndex, node);
                node = nodeFromWeight16(Collation.COMMON_WEIGHT16) | hasBefore3 |
                        nodeFromStrength(strength);
                insertNodeBetween(index, nextIndex, node);
            }
            // Strength of the temporary CE = strength of its reset position.
            // Code above raises an error if the before-strength is stronger.
            strength = ceStrength(ces[cesLength - 1]);
        }
        ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
    }
+
    /**
     * Returns the CE for a special reset position such as [first regular],
     * taking into account nodes already tailored around that position.
     * str is the parser's two-char encoding: POS_LEAD + position index.
     */
    private long getSpecialResetPosition(CharSequence str) {
        assert(str.length() == 2);
        long ce;
        int strength = Collator.PRIMARY;
        boolean isBoundary = false;
        CollationRuleParser.Position pos =
                CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE];
        switch(pos) {
        case FIRST_TERTIARY_IGNORABLE:
            // Quaternary CEs are not supported.
            // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
            return 0;
        case LAST_TERTIARY_IGNORABLE:
            return 0;
        case FIRST_SECONDARY_IGNORABLE: {
            // Look for a tailored tertiary node after [0, 0, 0].
            int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY);
            long node = nodes.elementAti(index);
            if((index = nextIndexFromNode(node)) != 0) {
                node = nodes.elementAti(index);
                assert(strengthFromNode(node) <= Collator.TERTIARY);
                if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) {
                    return tempCEFromIndexAndStrength(index, Collator.TERTIARY);
                }
            }
            return rootElements.getFirstTertiaryCE();
            // No need to look for nodeHasAnyBefore() on a tertiary node.
        }
        case LAST_SECONDARY_IGNORABLE:
            ce = rootElements.getLastTertiaryCE();
            strength = Collator.TERTIARY;
            break;
        case FIRST_PRIMARY_IGNORABLE: {
            // Look for a tailored secondary node after [0, 0, *].
            int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY);
            long node = nodes.elementAti(index);
            while((index = nextIndexFromNode(node)) != 0) {
                node = nodes.elementAti(index);
                strength = strengthFromNode(node);
                if(strength < Collator.SECONDARY) { break; }
                if(strength == Collator.SECONDARY) {
                    if(isTailoredNode(node)) {
                        if(nodeHasBefore3(node)) {
                            // Skip the before-tertiary pair to the first real tailored node.
                            index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
                            assert(isTailoredNode(nodes.elementAti(index)));
                        }
                        return tempCEFromIndexAndStrength(index, Collator.SECONDARY);
                    } else {
                        break;
                    }
                }
            }
            ce = rootElements.getFirstSecondaryCE();
            strength = Collator.SECONDARY;
            break;
        }
        case LAST_PRIMARY_IGNORABLE:
            ce = rootElements.getLastSecondaryCE();
            strength = Collator.SECONDARY;
            break;
        case FIRST_VARIABLE:
            ce = rootElements.getFirstPrimaryCE();
            isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
            break;
        case LAST_VARIABLE:
            ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
            break;
        case FIRST_REGULAR:
            ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
            isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
            break;
        case LAST_REGULAR:
            // Use the Hani-first-primary rather than the actual last "regular" CE before it,
            // for backward compatibility with behavior before the introduction of
            // script-first-primary CEs in the root collator.
            ce = rootElements.firstCEWithPrimaryAtLeast(
                    baseData.getFirstPrimaryForGroup(UScript.HAN));
            break;
        case FIRST_IMPLICIT: {
            int ce32 = baseData.getCE32(0x4e00);
            assert(Collation.hasCE32Tag(ce32, Collation.OFFSET_TAG));
            ce = baseData.getCEFromOffsetCE32(0x4e00, ce32);
            break;
        }
        case LAST_IMPLICIT:
            // We do not support tailoring to an unassigned-implicit CE.
            throw new UnsupportedOperationException(
                    "reset to [last implicit] not supported");
        case FIRST_TRAILING:
            ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY);
            isBoundary = true;  // trailing first primary (there is no mapping for it)
            break;
        case LAST_TRAILING:
            throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF");
        default:
            assert(false);
            return 0;
        }

        int index = findOrInsertNodeForRootCE(ce, strength);
        long node = nodes.elementAti(index);
        if((pos.ordinal() & 1) == 0) {
            // even pos = [first xyz]
            if(!nodeHasAnyBefore(node) && isBoundary) {
                // A <group> first primary boundary is artificially added to FractionalUCA.txt.
                // It is reachable via its special contraction, but is not normally used.
                // Find the first character tailored after the boundary CE,
                // or the first real root CE after it.
                if((index = nextIndexFromNode(node)) != 0) {
                    // If there is a following node, then it must be tailored
                    // because there are no root CEs with a boundary primary
                    // and non-common secondary/tertiary weights.
                    node = nodes.elementAti(index);
                    assert(isTailoredNode(node));
                    ce = tempCEFromIndexAndStrength(index, strength);
                } else {
                    assert(strength == Collator.PRIMARY);
                    long p = ce >>> 32;
                    int pIndex = rootElements.findPrimary(p);
                    boolean isCompressible = baseData.isCompressiblePrimary(p);
                    p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
                    ce = Collation.makeCE(p);
                    index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY);
                    node = nodes.elementAti(index);
                }
            }
            if(nodeHasAnyBefore(node)) {
                // Get the first node that was tailored before this one at a weaker strength.
                if(nodeHasBefore2(node)) {
                    index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
                    node = nodes.elementAti(index);
                }
                if(nodeHasBefore3(node)) {
                    index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
                }
                assert(isTailoredNode(nodes.elementAti(index)));
                ce = tempCEFromIndexAndStrength(index, strength);
            }
        } else {
            // odd pos = [last xyz]
            // Find the last node that was tailored after the [last xyz]
            // at a strength no greater than the position's strength.
            for(;;) {
                int nextIndex = nextIndexFromNode(node);
                if(nextIndex == 0) { break; }
                long nextNode = nodes.elementAti(nextIndex);
                if(strengthFromNode(nextNode) < strength) { break; }
                index = nextIndex;
                node = nextNode;
            }
            // Do not make a temporary CE for a root node.
            // This last node might be the node for the root CE itself,
            // or a node with a common secondary or tertiary weight.
            if(isTailoredNode(node)) {
                ce = tempCEFromIndexAndStrength(index, strength);
            }
        }
        return ce;
    }
+
    /**
     * Implements CollationRuleParser.Sink.
     * Adds a relation (&lt;, &lt;&lt;, &lt;&lt;&lt;, =) from the current reset/relation position
     * to prefix|str / extension, inserting a tailored node and new mappings.
     */
    // Java 6: @Override
    // TODO(review): addReset() above carries @Override; consider adding it here too for consistency.
    void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) {
        String nfdPrefix;
        if(prefix.length() == 0) {
            nfdPrefix = "";
        } else {
            nfdPrefix = nfd.normalize(prefix);
        }
        String nfdString = nfd.normalize(str);

        // The runtime code decomposes Hangul syllables on the fly,
        // with recursive processing but without making the Jamo pieces visible for matching.
        // It does not work with certain types of contextual mappings.
        int nfdLength = nfdString.length();
        if(nfdLength >= 2) {
            char c = nfdString.charAt(0);
            if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) {
                // While handling a Hangul syllable, contractions starting with Jamo L or V
                // would not see the following Jamo of that syllable.
                throw new UnsupportedOperationException(
                        "contractions starting with conjoining Jamo L or V not supported");
            }
            c = nfdString.charAt(nfdLength - 1);
            if(Hangul.isJamoL(c) ||
                    (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) {
                // A contraction ending with Jamo L or L+V would require
                // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
                // or decomposing a following Hangul syllable on the fly, during contraction matching.
                throw new UnsupportedOperationException(
                        "contractions ending with conjoining Jamo L or L+V not supported");
            }
            // A Hangul syllable completely inside a contraction is ok.
        }
        // Note: If there is a prefix, then the parser checked that
        // both the prefix and the string begin with NFC boundaries (not Jamo V or T).
        // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
        // (While handling a Hangul syllable, prefixes on Jamo V or T
        // would not see the previous Jamo of that syllable.)

        if(strength != Collator.IDENTICAL) {
            // Find the node index after which we insert the new tailored node.
            int index = findOrInsertNodeForCEs(strength);
            assert(cesLength > 0);
            long ce = ces[cesLength - 1];
            if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) {
                // There is no primary gap between ignorables and the space-first-primary.
                throw new UnsupportedOperationException(
                        "tailoring primary after ignorables not supported");
            }
            if(strength == Collator.QUATERNARY && ce == 0) {
                // The CE data structure does not support non-zero quaternary weights
                // on tertiary ignorables.
                throw new UnsupportedOperationException(
                        "tailoring quaternary after tertiary ignorables not supported");
            }
            // Insert the new tailored node.
            index = insertTailoredNodeAfter(index, strength);
            // Strength of the temporary CE:
            // The new relation may yield a stronger CE but not a weaker one.
            int tempStrength = ceStrength(ce);
            if(strength < tempStrength) { tempStrength = strength; }
            ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
        }

        setCaseBits(nfdString);

        int cesLengthBeforeExtension = cesLength;
        if(extension.length() != 0) {
            String nfdExtension = nfd.normalize(extension);
            cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength);
            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
                throw new IllegalArgumentException(
                        "extension string adds too many collation elements (more than 31 total)");
            }
        }
        int ce32 = Collation.UNASSIGNED_CE32;
        if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) &&
                !ignorePrefix(prefix) && !ignoreString(str)) {
            // Map from the original input to the CEs.
            // We do this in case the canonical closure is incomplete,
            // so that it is possible to explicitly provide the missing mappings.
            ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32);
        }
        addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32);
        cesLength = cesLengthBeforeExtension;
    }
+
    /**
     * Picks one of the current CEs and finds or inserts a node in the graph
     * for the CE + strength.
     * May shrink cesLength to drop trailing CEs weaker than the requested strength.
     */
    private int findOrInsertNodeForCEs(int strength) {
        assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY);

        // Find the last CE that is at least as "strong" as the requested difference.
        // Note: Stronger is smaller (Collator.PRIMARY=0).
        long ce;
        for(;; --cesLength) {
            if(cesLength == 0) {
                // All CEs were too weak; tailor relative to the completely-ignorable CE.
                ce = ces[0] = 0;
                cesLength = 1;
                break;
            } else {
                ce = ces[cesLength - 1];
            }
            if(ceStrength(ce) <= strength) { break; }
        }

        if(isTempCE(ce)) {
            // No need to findCommonNode() here for lower levels
            // because insertTailoredNodeAfter() will do that anyway.
            return indexFromTempCE(ce);
        }

        // root CE
        if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) {
            throw new UnsupportedOperationException(
                    "tailoring relative to an unassigned code point not supported");
        }
        return findOrInsertNodeForRootCE(ce, strength);
    }
+
    /**
     * Finds or inserts the node chain for a root CE,
     * down to the requested level/strength, and returns the deepest node's index.
     */
    private int findOrInsertNodeForRootCE(long ce, int strength) {
        assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE);

        // Find or insert the node for each of the root CE's weights,
        // down to the requested level/strength.
        // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
        assert((ce & 0xc0) == 0);
        int index = findOrInsertNodeForPrimary(ce >>> 32);
        if(strength >= Collator.SECONDARY) {
            int lower32 = (int)ce;
            index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY);
            if(strength >= Collator.TERTIARY) {
                index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK,
                        Collator.TERTIARY);
            }
        }
        return index;
    }
+
+ /**
+ * Like Java Collections.binarySearch(List, key, Comparator).
+ *
+ * @return the index>=0 where the item was found,
+ * or the index<0 for inserting the string at ~index in sorted order
+ * (index into rootPrimaryIndexes)
+ */
+ private static final int binarySearchForRootPrimaryNode(
+ int[] rootPrimaryIndexes, int length, long[] nodes, long p) {
+ if(length == 0) { return ~0; }
+ int start = 0;
+ int limit = length;
+ for (;;) {
+ int i = (start + limit) / 2;
+ long node = nodes[rootPrimaryIndexes[i]];
+ long nodePrimary = node >>> 32; // weight32FromNode(node)
+ if (p == nodePrimary) {
+ return i;
+ } else if (p < nodePrimary) {
+ if (i == start) {
+ return ~start; // insert s before i
+ }
+ limit = i;
+ } else {
+ if (i == start) {
+ return ~(start + 1); // insert s after i
+ }
+ start = i;
+ }
+ }
+ }
+
    /** Finds or inserts the node for a root CE's primary weight. */
    private int findOrInsertNodeForPrimary(long p) {
        int rootIndex = binarySearchForRootPrimaryNode(
                rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
        if(rootIndex >= 0) {
            return rootPrimaryIndexes.elementAti(rootIndex);
        } else {
            // Start a new list of nodes with this primary.
            int index = nodes.size();
            nodes.addElement(nodeFromWeight32(p));
            // ~rootIndex is the sorted insertion point returned by the binary search.
            rootPrimaryIndexes.insertElementAt(index, ~rootIndex);
            return index;
        }
    }
+
    /**
     * Finds or inserts the node for a secondary or tertiary weight of a root CE.
     *
     * @param index index of the node for the next-stronger level
     * @param weight16 the 16-bit secondary or tertiary weight to find or insert
     * @param level Collator.SECONDARY or Collator.TERTIARY
     * @return the index of the found or newly inserted node
     */
    private int findOrInsertWeakNode(int index, int weight16, int level) {
        assert(0 <= index && index < nodes.size());

        // Root CE weak weights are either zero or at least the common weight.
        assert(weight16 == 0 || weight16 >= Collation.COMMON_WEIGHT16);
        // Only reset-before inserts common weights.
        if(weight16 == Collation.COMMON_WEIGHT16) {
            return findCommonNode(index, level);
        }
        // Find the root CE's weight for this level.
        // Postpone insertion if not found:
        // Insert the new root node before the next stronger node,
        // or before the next root node with the same strength and a larger weight.
        long node = nodes.elementAti(index);
        int nextIndex;
        while((nextIndex = nextIndexFromNode(node)) != 0) {
            node = nodes.elementAti(nextIndex);
            int nextStrength = strengthFromNode(node);
            if(nextStrength <= level) {
                // Insert before a stronger node.
                if(nextStrength < level) { break; }
                // nextStrength == level
                if(!isTailoredNode(node)) {
                    int nextWeight16 = weight16FromNode(node);
                    if(nextWeight16 == weight16) {
                        // Found the node for the root CE up to this level.
                        return nextIndex;
                    }
                    // Insert before a node with a larger same-strength weight.
                    if(nextWeight16 > weight16) { break; }
                }
            }
            // Skip the next node.
            index = nextIndex;
        }
        // Not found: insert a new node with an explicit weight at this level,
        // between index and nextIndex.
        node = nodeFromWeight16(weight16) | nodeFromStrength(level);
        return insertNodeBetween(index, nextIndex, node);
    }
+
+ /**
+ * Makes and inserts a new tailored node into the list, after the one at index.
+ * Skips over nodes of weaker strength to maintain collation order
+ * ("postpone insertion").
+ * @return the new node's index
+ */
+ private int insertTailoredNodeAfter(int index, int strength) {
+ assert(0 <= index && index < nodes.size());
+ if(strength >= Collator.SECONDARY) {
+ index = findCommonNode(index, Collator.SECONDARY);
+ if(strength >= Collator.TERTIARY) {
+ index = findCommonNode(index, Collator.TERTIARY);
+ }
+ }
+ // Postpone insertion:
+ // Insert the new node before the next one with a strength at least as strong.
+ long node = nodes.elementAti(index);
+ int nextIndex;
+ while((nextIndex = nextIndexFromNode(node)) != 0) {
+ node = nodes.elementAti(nextIndex);
+ if(strengthFromNode(node) <= strength) { break; }
+ // Skip the next node which has a weaker (larger) strength than the new one.
+ index = nextIndex;
+ }
+ node = IS_TAILORED | nodeFromStrength(strength);
+ return insertNodeBetween(index, nextIndex, node);
+ }
+
+ /**
+ * Inserts a new node into the list, between list-adjacent items.
+ * The node's previous and next indexes must not be set yet.
+ * @return the new node's index
+ */
+ private int insertNodeBetween(int index, int nextIndex, long node) {
+ assert(previousIndexFromNode(node) == 0);
+ assert(nextIndexFromNode(node) == 0);
+ assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
+ // Append the new node and link it to the existing nodes.
+ int newIndex = nodes.size();
+ node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
+ nodes.addElement(node);
+ // nodes[index].nextIndex = newIndex
+ node = nodes.elementAti(index);
+ nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
+ // nodes[nextIndex].previousIndex = newIndex
+ if(nextIndex != 0) {
+ node = nodes.elementAti(nextIndex);
+ nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
+ }
+ return newIndex;
+ }
+
    /**
     * Finds the node which implies or contains a common=05 weight of the given strength
     * (secondary or tertiary).
     * Skips weaker nodes and tailored nodes if the current node is stronger
     * and is followed by an explicit-common-weight node.
     * Always returns the input index if that node is no stronger than the given strength.
     */
    private int findCommonNode(int index, int strength) {
        assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY);
        long node = nodes.elementAti(index);
        if(strengthFromNode(node) >= strength) {
            // The current node is no stronger.
            return index;
        }
        if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
            // The current node implies the strength-common weight.
            return index;
        }
        // The current node has a reset-before at this strength,
        // so it must be followed by a BEFORE_WEIGHT16 node of exactly this strength
        // (see the HAS_BEFORE2/HAS_BEFORE3 constants).
        index = nextIndexFromNode(node);
        node = nodes.elementAti(index);
        assert(!isTailoredNode(node) && strengthFromNode(node) == strength &&
                weight16FromNode(node) == BEFORE_WEIGHT16);
        // Skip to the explicit common node, passing over the tailored
        // lower-than-common nodes in between.
        do {
            index = nextIndexFromNode(node);
            node = nodes.elementAti(index);
            assert(strengthFromNode(node) >= strength);
        } while(isTailoredNode(node) || strengthFromNode(node) > strength);
        assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16);
        return index;
    }
+
    /**
     * Sets the case bits (CE bits 15..14) of the tailored CEs in ces[0..cesLength[:
     * primary CEs copy the case bits of the corresponding primary CEs
     * that the base collator produces for nfdString,
     * tertiary CEs get uppercase bits, and all others get zero case bits.
     */
    private void setCaseBits(CharSequence nfdString) {
        int numTailoredPrimaries = 0;
        for(int i = 0; i < cesLength; ++i) {
            if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; }
        }
        // We should not be able to get too many case bits because
        // cesLength<=31==MAX_EXPANSION_LENGTH.
        // 31 pairs of case bits fit into a long without setting its sign bit.
        assert(numTailoredPrimaries <= 31);

        // Pairs of case bits, least significant pair first.
        long cases = 0;
        if(numTailoredPrimaries > 0) {
            CharSequence s = nfdString;
            UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0);
            int baseCEsLength = baseCEs.fetchCEs() - 1;
            assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE);

            int lastCase = 0;
            int numBasePrimaries = 0;
            for(int i = 0; i < baseCEsLength; ++i) {
                long ce = baseCEs.getCE(i);
                if((ce >>> 32) != 0) {  // primary CE (non-zero primary weight)
                    ++numBasePrimaries;
                    int c = ((int)ce >> 14) & 3;  // this base CE's case bits
                    assert(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE
                    if(numBasePrimaries < numTailoredPrimaries) {
                        cases |= (long)c << ((numBasePrimaries - 1) * 2);
                    } else if(numBasePrimaries == numTailoredPrimaries) {
                        lastCase = c;
                    } else if(c != lastCase) {
                        // There are more base primary CEs than tailored primaries.
                        // Set mixed case if the case bits of the remainder differ.
                        lastCase = 1;
                        // Nothing more can change.
                        break;
                    }
                }
            }
            if(numBasePrimaries >= numTailoredPrimaries) {
                cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2);
            }
        }

        for(int i = 0; i < cesLength; ++i) {
            long ce = ces[i] & 0xffffffffffff3fffL; // clear old case bits
            int strength = ceStrength(ce);
            if(strength == Collator.PRIMARY) {
                // Consume the next pair of computed case bits.
                ce |= (cases & 3) << 14;
                cases >>>= 2;
            } else if(strength == Collator.TERTIARY) {
                // Tertiary CEs must have uppercase bits.
                // See the LDML spec, and comments in class CollationCompare.
                ce |= 0x8000;
            }
            // Tertiary ignorable CEs must have 0 case bits.
            // We set 0 case bits for secondary CEs too
            // since currently only U+0345 is cased and maps to a secondary CE,
            // and it is lowercase. Other secondaries are uncased.
            // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
            ces[i] = ce;
        }
    }
+
    /** Implements CollationRuleParser.Sink. Forwards the set to the data builder. */
    @Override
    void suppressContractions(UnicodeSet set) {
        dataBuilder.suppressContractions(set);
    }

    /** Implements CollationRuleParser.Sink. Accumulates code points to optimize for. */
    @Override
    void optimize(UnicodeSet set) {
        optimizeSet.addAll(set);
    }
+
    /**
     * Adds the mapping and its canonical closure.
     * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder
     * need not re-encode the CEs multiple times.
     *
     * @return the ce32 used for the mappings
     *         (encoded on demand by addIfDifferent() if it was UNASSIGNED_CE32)
     */
    private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString,
            long[] newCEs, int newCEsLength, int ce32) {
        // Map from the NFD input to the CEs.
        ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
        // Also map from canonically equivalent input.
        ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
        // And from strings that merge composites into the string's last starter.
        addTailComposites(nfdPrefix, nfdString);
        return ce32;
    }
+
+ private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString,
+ long[] newCEs, int newCEsLength, int ce32) {
+ // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
+ // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
+ if(nfdPrefix.length() == 0) {
+ CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
+ String prefix = "";
+ for(;;) {
+ String str = stringIter.next();
+ if(str == null) { break; }
+ if(ignoreString(str) || str == nfdString) { continue; }
+ ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
+ }
+ } else {
+ CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString());
+ CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
+ for(;;) {
+ String prefix = prefixIter.next();
+ if(prefix == null) { break; }
+ if(ignorePrefix(prefix)) { continue; }
+ boolean samePrefix = prefix == nfdPrefix;
+ for(;;) {
+ String str = stringIter.next();
+ if(str == null) { break; }
+ if(ignoreString(str) || (samePrefix && str == nfdString)) { continue; }
+ ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
+ }
+ stringIter.reset();
+ }
+ }
+ return ce32;
+ }
+
    /**
     * For each composite character whose decomposition starts with the nfdString's
     * last starter, tries to merge the composite into the string
     * (see mergeCompositeIntoString()) and, if that yields a new FCD string,
     * adds a mapping for it with the CEs it should collate with, plus its closure.
     */
    private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
        // Look for the last starter in the NFD string.
        int lastStarter;
        int indexAfterLastStarter = nfdString.length();
        for(;;) {
            if(indexAfterLastStarter == 0) { return; } // no starter at all
            lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter);
            if(nfd.getCombiningClass(lastStarter) == 0) { break; }
            indexAfterLastStarter -= Character.charCount(lastStarter);
        }
        // No closure to Hangul syllables since we decompose them on the fly.
        if(Hangul.isJamoL(lastStarter)) { return; }

        // Are there any composites whose decomposition starts with the lastStarter?
        // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
        // We might find some more equivalent mappings here if it did.
        UnicodeSet composites = new UnicodeSet();
        if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }

        StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder();
        long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH];
        UnicodeSetIterator iter = new UnicodeSetIterator(composites);
        while(iter.next()) {
            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
            int composite = iter.codepoint;
            String decomp = nfd.getDecomposition(composite);
            if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
                    newNFDString, newString)) {
                continue;
            }
            int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0);
            if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) {
                // Ignore mappings that we cannot store.
                continue;
            }
            // Note: It is possible that the newCEs do not make use of the mapping
            // for which we are adding the tail composites, in which case we might be adding
            // unnecessary mappings.
            // For example, when we add tail composites for ae^ (^=combining circumflex),
            // UCA discontiguous-contraction matching does not find any matches
            // for ae_^ (_=any combining diacritic below) *unless* there is also
            // a contraction mapping for ae.
            // Thus, if there is no ae contraction, then the ae^ mapping is ignored
            // while fetching the newCEs for ae_^.
            // TODO: Try to detect this effectively.
            // (Alternatively, print a warning when prefix contractions are missing.)

            // We do not need an explicit mapping for the NFD strings.
            // It is fine if the NFD input collates like this via a sequence of mappings.
            // It also saves a little bit of space, and may reduce the set of characters with contractions.
            int ce32 = addIfDifferent(nfdPrefix, newString,
                    newCEs, newCEsLength, Collation.UNASSIGNED_CE32);
            if(ce32 != Collation.UNASSIGNED_CE32) {
                // was different, was added
                addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32);
            }
        }
    }
+
    /**
     * Merges the composite character (with NFD decomposition decomp) into nfdString,
     * whose last starter ends at indexAfterLastStarter and must equal
     * the first code point of decomp.
     * On success, fills newNFDString with the merged NFD string and
     * newString with the equivalent string that contains the composite itself.
     *
     * @return true if the merge succeeded and both outputs are FCD and
     *         canonically equivalent to each other
     */
    private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter,
            int composite, CharSequence decomp,
            StringBuilder newNFDString, StringBuilder newString) {
        assert(Character.codePointBefore(nfdString, indexAfterLastStarter) ==
                Character.codePointAt(decomp, 0));
        int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1);
        if(lastStarterLength == decomp.length()) {
            // Singleton decompositions should be found by addWithClosure()
            // and the CanonicalIterator, so we can ignore them here.
            return false;
        }
        if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) {
            // same strings, nothing new to be found here
            return false;
        }

        // Make new FCD strings that combine a composite, or its decomposition,
        // into the nfdString's last starter and the combining marks following it.
        // Make an NFD version, and a version with the composite.
        newNFDString.setLength(0);
        newNFDString.append(nfdString, 0, indexAfterLastStarter);
        newString.setLength(0);
        newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength)
                .appendCodePoint(composite);

        // The following is related to discontiguous contraction matching,
        // but builds only FCD strings (or else returns false).
        // Merge the tails of nfdString and decomp in combining-class order.
        int sourceIndex = indexAfterLastStarter;
        int decompIndex = lastStarterLength;
        // Small optimization: We keep the source character across loop iterations
        // because we do not always consume it,
        // and then need not fetch it again nor look up its combining class again.
        int sourceChar = Collation.SENTINEL_CP;
        // The cc variables need to be declared before the loop so that at the end
        // they are set to the last combining classes seen.
        int sourceCC = 0;
        int decompCC = 0;
        for(;;) {
            if(sourceChar < 0) {
                if(sourceIndex >= nfdString.length()) { break; }
                sourceChar = Character.codePointAt(nfdString, sourceIndex);
                sourceCC = nfd.getCombiningClass(sourceChar);
                assert(sourceCC != 0);
            }
            // We consume a decomposition character in each iteration.
            if(decompIndex >= decomp.length()) { break; }
            int decompChar = Character.codePointAt(decomp, decompIndex);
            decompCC = nfd.getCombiningClass(decompChar);
            // Compare the two characters and their combining classes.
            if(decompCC == 0) {
                // Unable to merge because the source contains a non-zero combining mark
                // but the composite's decomposition contains another starter.
                // The strings would not be equivalent.
                return false;
            } else if(sourceCC < decompCC) {
                // Composite + sourceChar would not be FCD.
                return false;
            } else if(decompCC < sourceCC) {
                newNFDString.appendCodePoint(decompChar);
                decompIndex += Character.charCount(decompChar);
            } else if(decompChar != sourceChar) {
                // Blocked because same combining class.
                return false;
            } else { // match: decompChar == sourceChar
                newNFDString.appendCodePoint(decompChar);
                decompIndex += Character.charCount(decompChar);
                sourceIndex += Character.charCount(decompChar);
                sourceChar = Collation.SENTINEL_CP;
            }
        }
        // We are at the end of at least one of the two inputs.
        if(sourceChar >= 0) { // more characters from nfdString but not from decomp
            if(sourceCC < decompCC) {
                // Appending the next source character to the composite would not be FCD.
                return false;
            }
            newNFDString.append(nfdString, sourceIndex, nfdString.length());
            newString.append(nfdString, sourceIndex, nfdString.length());
        } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString
            newNFDString.append(decomp, decompIndex, decomp.length());
        }
        assert(nfd.isNormalized(newNFDString));
        assert(fcd.isNormalized(newString));
        assert(nfd.normalize(newString).equals(newNFDString.toString())); // canonically equivalent
        return true;
    }
+
+ private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) {
+ // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0
+ int leftLength = left.length();
+ if((leftLength - leftStart) != (right.length() - rightStart)) { return false; }
+ while(leftStart < leftLength) {
+ if(left.charAt(leftStart++) != right.charAt(rightStart++)) {
+ return false;
+ }
+ }
+ return true;
+ }
    /** Returns true if the prefix must not be mapped: we do not map non-FCD prefixes. */
    private boolean ignorePrefix(CharSequence s) {
        // Do not map non-FCD prefixes.
        return !isFCD(s);
    }
    /** Returns true if the string must not be mapped. */
    private boolean ignoreString(CharSequence s) {
        // Do not map non-FCD strings.
        // Do not map strings that start with Hangul syllables: We decompose those on the fly.
        return !isFCD(s) || Hangul.isHangul(s.charAt(0));
    }
    /** Returns true if s is in FCD form. */
    private boolean isFCD(CharSequence s) {
        return fcd.isNormalized(s);
    }
+
    /**
     * Characters with a non-trivial NFD decomposition (NFD_QC=No),
     * minus Hangul syllables; used by closeOverComposites().
     */
    private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]");
    static {
        // Hangul is decomposed on the fly during collation.
        COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
    }
+
+ private void closeOverComposites() {
+ String prefix = ""; // empty
+ UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES);
+ while(iter.next()) {
+ assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
+ String nfdString = nfd.getDecomposition(iter.codepoint);
+ cesLength = dataBuilder.getCEs(nfdString, ces, 0);
+ if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
+ // Too many CEs from the decomposition (unusual), ignore this composite.
+ // We could add a capacity parameter to getCEs() and reallocate if necessary.
+ // However, this can only really happen in contrived cases.
+ continue;
+ }
+ String composite = iter.getString();
+ addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32);
+ }
+ }
+
+ private int addIfDifferent(CharSequence prefix, CharSequence str,
+ long[] newCEs, int newCEsLength, int ce32) {
+ long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH];
+ int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0);
+ if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
+ if(ce32 == Collation.UNASSIGNED_CE32) {
+ ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength);
+ }
+ dataBuilder.addCE32(prefix, str, ce32);
+ }
+ return ce32;
+ }
+
+ private static boolean sameCEs(long[] ces1, int ces1Length,
+ long[] ces2, int ces2Length) {
+ if(ces1Length != ces2Length) {
+ return false;
+ }
+ assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH);
+ for(int i = 0; i < ces1Length; ++i) {
+ if(ces1[i] != ces2[i]) { return false; }
+ }
+ return true;
+ }
+
+ private static final int alignWeightRight(int w) {
+ if(w != 0) {
+ while((w & 0xff) == 0) { w >>>= 8; }
+ }
+ return w;
+ }
+
    /**
     * Walks the tailoring graph and overwrites tailored nodes with new CEs.
     * After this, the graph is destroyed.
     * The nodes array can then be used only as a source of tailored CEs.
     *
     * For each list (one per root primary), this tracks the current
     * primary/secondary/tertiary/quaternary weights (p, s, t, q) and,
     * at the first tailored node of each level, allocates the needed number of
     * weights from the gap up to the next root weight (CollationWeights).
     */
    private void makeTailoredCEs() {
        CollationWeights primaries = new CollationWeights();
        CollationWeights secondaries = new CollationWeights();
        CollationWeights tertiaries = new CollationWeights();
        long[] nodesArray = nodes.getBuffer();

        for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
            int i = rootPrimaryIndexes.elementAti(rpi);
            long node = nodesArray[i];
            long p = weight32FromNode(node);
            // p==0 is the list for completely-ignorable CEs; then s and t start at 0 too.
            int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16;
            int t = s;
            int q = 0;
            boolean pIsTailored = false;
            boolean sIsTailored = false;
            boolean tIsTailored = false;
            if(DEBUG) {
                System.out.printf("\nprimary %x\n", alignWeightRight((int)p));
            }
            int pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
            int nextIndex = nextIndexFromNode(node);
            // Walk the list and assign weights/CEs in collation order.
            while(nextIndex != 0) {
                i = nextIndex;
                node = nodesArray[i];
                nextIndex = nextIndexFromNode(node);
                int strength = strengthFromNode(node);
                if(strength == Collator.QUATERNARY) {
                    assert(isTailoredNode(node));
                    if(DEBUG) {
                        System.out.print(" quat+ ");
                    }
                    // Only two bits are available for the quaternary weight.
                    if(q == 3) {
                        // C++ U_BUFFER_OVERFLOW_ERROR
                        throw new UnsupportedOperationException("quaternary tailoring gap too small");
                    }
                    ++q;
                } else {
                    if(strength == Collator.TERTIARY) {
                        if(isTailoredNode(node)) {
                            if(DEBUG) {
                                System.out.print(" ter+ ");
                            }
                            if(!tIsTailored) {
                                // First tailored tertiary node for [p, s].
                                int tCount = countTailoredNodes(nodesArray, nextIndex,
                                        Collator.TERTIARY) + 1;
                                int tLimit;
                                if(t == 0) {
                                    // Gap at the beginning of the tertiary CE range.
                                    t = rootElements.getTertiaryBoundary() - 0x100;
                                    tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK;
                                } else if(t == BEFORE_WEIGHT16) {
                                    tLimit = Collation.COMMON_WEIGHT16;
                                } else if(!pIsTailored && !sIsTailored) {
                                    // p and s are root weights.
                                    tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
                                } else {
                                    // [p, s] is tailored.
                                    assert(t == Collation.COMMON_WEIGHT16);
                                    tLimit = rootElements.getTertiaryBoundary();
                                }
                                assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0);
                                tertiaries.initForTertiary();
                                if(!tertiaries.allocWeights(t, tLimit, tCount)) {
                                    // C++ U_BUFFER_OVERFLOW_ERROR
                                    throw new UnsupportedOperationException("tertiary tailoring gap too small");
                                }
                                tIsTailored = true;
                            }
                            t = (int)tertiaries.nextWeight();
                            assert(t != 0xffffffff);
                        } else {
                            // Explicit root/default tertiary weight.
                            t = weight16FromNode(node);
                            tIsTailored = false;
                            if(DEBUG) {
                                System.out.printf(" ter %x\n", alignWeightRight(t));
                            }
                        }
                    } else {
                        if(strength == Collator.SECONDARY) {
                            if(isTailoredNode(node)) {
                                if(DEBUG) {
                                    System.out.print(" sec+ ");
                                }
                                if(!sIsTailored) {
                                    // First tailored secondary node for p.
                                    int sCount = countTailoredNodes(nodesArray, nextIndex,
                                            Collator.SECONDARY) + 1;
                                    int sLimit;
                                    if(s == 0) {
                                        // Gap at the beginning of the secondary CE range.
                                        s = rootElements.getSecondaryBoundary() - 0x100;
                                        sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16);
                                    } else if(s == BEFORE_WEIGHT16) {
                                        sLimit = Collation.COMMON_WEIGHT16;
                                    } else if(!pIsTailored) {
                                        // p is a root primary.
                                        sLimit = rootElements.getSecondaryAfter(pIndex, s);
                                    } else {
                                        // p is a tailored primary.
                                        assert(s == Collation.COMMON_WEIGHT16);
                                        sLimit = rootElements.getSecondaryBoundary();
                                    }
                                    if(s == Collation.COMMON_WEIGHT16) {
                                        // Do not tailor into the getSortKey() range of
                                        // compressed common secondaries.
                                        s = rootElements.getLastCommonSecondary();
                                    }
                                    secondaries.initForSecondary();
                                    if(!secondaries.allocWeights(s, sLimit, sCount)) {
                                        // C++ U_BUFFER_OVERFLOW_ERROR
                                        throw new UnsupportedOperationException("secondary tailoring gap too small");
                                    }
                                    sIsTailored = true;
                                }
                                s = (int)secondaries.nextWeight();
                                assert(s != 0xffffffff);
                            } else {
                                // Explicit root/default secondary weight.
                                s = weight16FromNode(node);
                                sIsTailored = false;
                                if(DEBUG) {
                                    System.out.printf(" sec %x\n", alignWeightRight(s));
                                }
                            }
                        } else /* Collator.PRIMARY */ {
                            assert(isTailoredNode(node));
                            if(DEBUG) {
                                System.out.print("pri+ ");
                            }
                            if(!pIsTailored) {
                                // First tailored primary node in this list.
                                int pCount = countTailoredNodes(nodesArray, nextIndex,
                                        Collator.PRIMARY) + 1;
                                boolean isCompressible = baseData.isCompressiblePrimary(p);
                                long pLimit =
                                        rootElements.getPrimaryAfter(p, pIndex, isCompressible);
                                primaries.initForPrimary(isCompressible);
                                if(!primaries.allocWeights(p, pLimit, pCount)) {
                                    // C++ U_BUFFER_OVERFLOW_ERROR // TODO: introduce a more specific UErrorCode?
                                    throw new UnsupportedOperationException("primary tailoring gap too small");
                                }
                                pIsTailored = true;
                            }
                            p = primaries.nextWeight();
                            assert(p != 0xffffffffL);
                            // A new primary implies common secondary weight.
                            s = Collation.COMMON_WEIGHT16;
                            sIsTailored = false;
                        }
                        // A new primary or secondary implies common tertiary weight
                        // (or zero when the secondary is zero).
                        t = s == 0 ? 0 : Collation.COMMON_WEIGHT16;
                        tIsTailored = false;
                    }
                    // Any non-quaternary node resets the quaternary counter.
                    q = 0;
                }
                if(isTailoredNode(node)) {
                    // Overwrite the tailored node with its final CE.
                    nodesArray[i] = Collation.makeCE(p, s, t, q);
                    if(DEBUG) {
                        System.out.printf("%016x\n", nodesArray[i]);
                    }
                }
            }
        }
    }
+
+ /**
+ * Counts the tailored nodes of the given strength up to the next node
+ * which is either stronger or has an explicit weight of this strength.
+ */
+ private static int countTailoredNodes(long[] nodesArray, int i, int strength) {
+ int count = 0;
+ for(;;) {
+ if(i == 0) { break; }
+ long node = nodesArray[i];
+ if(strengthFromNode(node) < strength) { break; }
+ if(strengthFromNode(node) == strength) {
+ if(isTailoredNode(node)) {
+ ++count;
+ } else {
+ break;
+ }
+ }
+ i = nextIndexFromNode(node);
+ }
+ return count;
+ }
+
+ private static final class CEFinalizer implements CollationDataBuilder.CEModifier {
+ CEFinalizer(long[] ces) {
+ finalCEs = ces;
+ }
+ public long modifyCE32(int ce32) {
+ assert(!Collation.isSpecialCE32(ce32));
+ if(CollationBuilder.isTempCE32(ce32)) {
+ // retain case bits
+ return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
+ } else {
+ return Collation.NO_CE;
+ }
+ }
+ public long modifyCE(long ce) {
+ if(CollationBuilder.isTempCE(ce)) {
+ // retain case bits
+ return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000);
+ } else {
+ return Collation.NO_CE;
+ }
+ }
+
+ private long[] finalCEs;
+ };
+
+ /** Replaces temporary CEs with the final CEs they point to. */
+ private void finalizeCEs() {
+ CollationDataBuilder newBuilder = new CollationDataBuilder();
+ newBuilder.initForTailoring(baseData);
+ CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer());
+ newBuilder.copyFrom(dataBuilder, finalizer);
+ dataBuilder = newBuilder;
+ }
+
+ /**
+ * Encodes "temporary CE" data into a CE that fits into the CE32 data structure,
+ * with 2-byte primary, 1-byte secondary and 6-bit tertiary,
+ * with valid CE byte values.
+ *
+ * The index must not exceed 20 bits (0xfffff).
+ * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY).
+ *
+ * Temporary CEs are distinguished from real CEs by their use of
+ * secondary weights 06..45 which are otherwise reserved for compressed sort keys.
+ *
+ * The case bits are unused and available.
+ */
+ private static long tempCEFromIndexAndStrength(int index, int strength) {
+ return
+ // CE byte offsets, to ensure valid CE bytes, and case bits 11
+ 0x4040000006002000L +
+ // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF)
+ ((long)(index & 0xfe000) << 43) +
+ // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF)
+ ((long)(index & 0x1fc0) << 42) +
+ // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45)
+ ((index & 0x3f) << 24) +
+ // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23)
+ (strength << 8);
+ }
+ private static int indexFromTempCE(long tempCE) {
+ tempCE -= 0x4040000006002000L;
+ return
+ ((int)(tempCE >> 43) & 0xfe000) |
+ ((int)(tempCE >> 42) & 0x1fc0) |
+ ((int)(tempCE >> 24) & 0x3f);
+ }
+ private static int strengthFromTempCE(long tempCE) {
+ return ((int)tempCE >> 8) & 3;
+ }
+ private static boolean isTempCE(long ce) {
+ int sec = (int)ce >>> 24;
+ return 6 <= sec && sec <= 0x45;
+ }
+
+ private static int indexFromTempCE32(int tempCE32) {
+ tempCE32 -= 0x40400620;
+ return
+ ((int)(tempCE32 >> 11) & 0xfe000) |
+ ((int)(tempCE32 >> 10) & 0x1fc0) |
+ ((int)(tempCE32 >> 8) & 0x3f);
+ }
+ private static boolean isTempCE32(int ce32) {
+ return
+ (ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE32
+ 6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45;
+ }
+
+ private static int ceStrength(long ce) {
+ return
+ isTempCE(ce) ? strengthFromTempCE(ce) :
+ (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY :
+ ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY :
+ ce != 0 ? Collator.TERTIARY :
+ Collator.IDENTICAL;
+ }
+
    /**
     * The secondary/tertiary lower limit for tailoring before the common weight.
     * Nodes with this weight are inserted for &[before 2]/&[before 3] resets.
     */
    private static final int BEFORE_WEIGHT16 = Collation.MERGE_SEPARATOR_WEIGHT16;

    /** At most 1M nodes, limited by the 20 bits in node bit fields. */
    private static final int MAX_INDEX = 0xfffff;
    /**
     * Node bit 6 is set on a primary node if there are tailored nodes
     * with secondary values below the common secondary weight (05),
     * from a reset-secondary-before (&[before 2]).
     */
    private static final int HAS_BEFORE2 = 0x40;
    /**
     * Node bit 5 is set on a primary or secondary node if there are tailored nodes
     * with tertiary values below the common tertiary weight (05),
     * from a reset-tertiary-before (&[before 3]).
     */
    private static final int HAS_BEFORE3 = 0x20;
    /**
     * Node bit 3 distinguishes a tailored node, which has no weight value,
     * from a node with an explicit (root or default) weight.
     */
    private static final int IS_TAILORED = 8;
+
    // Node bit-field encoders. See the documentation of the nodes list for the layout.

    private static long nodeFromWeight32(long weight32) {
        // 32-bit primary weight in bits 63..32 (root primary list-head nodes only).
        return weight32 << 32;
    }
    private static long nodeFromWeight16(int weight16) {
        // 16-bit weight in bits 63..48 (secondary/tertiary nodes).
        return (long)weight16 << 48;
    }
    private static long nodeFromPreviousIndex(int previous) {
        // 20-bit previous-node index in bits 47..28.
        return (long)previous << 28;
    }
    private static long nodeFromNextIndex(int next) {
        // 20-bit next-node index in bits 27..8.
        // The int shift is safe: next <= MAX_INDEX (20 bits), so next<<8 stays non-negative.
        return next << 8;
    }
    private static long nodeFromStrength(int strength) {
        // 2-bit strength in bits 1..0.
        return strength;
    }

    // Node bit-field decoders, inverse of the encoders above.

    private static long weight32FromNode(long node) {
        return node >>> 32;
    }
    private static int weight16FromNode(long node) {
        return (int)(node >> 48) & 0xffff;
    }
    private static int previousIndexFromNode(long node) {
        return (int)(node >> 28) & MAX_INDEX;
    }
    private static int nextIndexFromNode(long node) {
        // The mask removes the arithmetic shift's sign-extension bits.
        return ((int)node >> 8) & MAX_INDEX;
    }
    private static int strengthFromNode(long node) {
        return (int)node & 3;
    }

    // Node flag tests (see the HAS_BEFORE2/HAS_BEFORE3/IS_TAILORED constants).

    private static boolean nodeHasBefore2(long node) {
        return (node & HAS_BEFORE2) != 0;
    }
    private static boolean nodeHasBefore3(long node) {
        return (node & HAS_BEFORE3) != 0;
    }
    private static boolean nodeHasAnyBefore(long node) {
        return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0;
    }
    private static boolean isTailoredNode(long node) {
        return (node & IS_TAILORED) != 0;
    }

    // Link-field updaters: clear the old index field, then set the new one.

    private static long changeNodePreviousIndex(long node, int previous) {
        return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous);
    }
    private static long changeNodeNextIndex(long node, int next) {
        return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next);
    }
+
    // Normalizers used for decompositions and FCD checks during closure.
    private Normalizer2 nfd, fcd;
    private Normalizer2Impl nfcImpl;

    // The base tailoring and its data; CEs are fetched from baseData
    // and weight gaps are looked up in rootElements.
    private CollationTailoring base;
    private CollationData baseData;
    private CollationRootElements rootElements;
    // NOTE(review): presumably the variable-top weight for this tailoring — confirm against callers.
    private long variableTop;

    // Accumulates the tailoring's mappings; replaced with a finalized copy in finalizeCEs().
    private CollationDataBuilder dataBuilder;
    private boolean fastLatinEnabled;
    // Code points collected via the optimize() rule-parser sink.
    private UnicodeSet optimizeSet = new UnicodeSet();

    // CEs of the current mapping being worked on
    // (written by closeOverComposites(), read/updated by setCaseBits()).
    private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
    private int cesLength;

    /**
     * Indexes of nodes with root primary weights, sorted by primary.
     * Compact form of a TreeMap from root primary to node index.
     *
     * This is a performance optimization for finding reset positions.
     * Without this, we would have to search through the entire nodes list.
     * It also allows storing root primary weights in list head nodes,
     * without previous index, leaving room in root primary nodes for 32-bit primary weights.
     */
    private UVector32 rootPrimaryIndexes;
+ /**
+ * Data structure for assigning tailored weights and CEs.
+ * Doubly-linked lists of nodes in mostly collation order.
+ * Each list starts with a root primary node and ends with a nextIndex of 0.
+ *
+ * When there are any nodes in the list, then there is always a root primary node at index 0.
+ * This allows some code not to have to check explicitly for nextIndex==0.
+ *
+ * Root primary nodes have 32-bit weights but do not have previous indexes.
+ * All other nodes have at most 16-bit weights and do have previous indexes.
+ *
+ * Nodes with explicit weights store root collator weights,
+ * or default weak weights (e.g., secondary 05) for stronger nodes.
+ * "Tailored" nodes, with the IS_TAILORED bit set,
+ * do not store explicit weights but rather
+ * create a difference of a certain strength from the preceding node.
+ *
+ * A root node is followed by either
+ * - a root/default node of the same strength, or
+ * - a root/default node of the next-weaker strength, or
+ * - a tailored node of the same strength.
+ *
+ * A node of a given strength normally implies "common" weights on weaker levels.
+ *
+ * A node with HAS_BEFORE2 must be immediately followed by
+ * a secondary node with BEFORE_WEIGHT16, then a secondary tailored node,
+ * and later an explicit common-secondary node.
+ * (&[before 2] resets to the BEFORE_WEIGHT16 node so that
+ * the following addRelation(secondary) tailors right after that.
+ * If we did not have this node and instead were to reset on the primary node,
+ * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)
+ *
+ * All secondary tailored nodes between these two explicit ones
+ * will be assigned lower-than-common secondary weights.
+ * If the flag is not set, then there are no explicit secondary nodes
+ * with the common or lower weights.
+ *
+ * Same for HAS_BEFORE3 for tertiary nodes and weights.
+ * A node must not have both flags set.
+ *
+ * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs
+ * which point to stable indexes in this list,
+ * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes.
+ *
+ * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node,
+ * until the next relation is added.
+ *
+ * At the end, the tailored weights are allocated as necessary,
+ * then the tailored nodes are replaced with final CEs,
+ * and the CollationData is rewritten by replacing temporary CEs with final ones.
+ *
+ * We cannot simply insert new nodes in the middle of the array
+ * because that would invalidate the indexes stored in existing temporary CEs.
+ * We need to use a linked graph with stable indexes to existing nodes.
+ * A doubly-linked list seems easiest to maintain.
+ *
+ * Each node is stored as an long, with its fields stored as bit fields.
+ *
+ * Root primary node:
+ * - primary weight: 32 bits 63..32
+ * - reserved/unused/zero: 4 bits 31..28
+ *
+ * Weaker root nodes & tailored nodes:
+ * - a weight: 16 bits 63..48
+ * + a root or default weight for a non-tailored node
+ * + unused/zero for a tailored node
+ * - index to the previous node: 20 bits 47..28
+ *
+ * All types of nodes:
+ * - index to the next node: 20 bits 27..8
+ * + nextIndex=0 in last node per root-primary list
+ * - reserved/unused/zero bits: bits 7, 4, 2
+ * - HAS_BEFORE2: bit 6
+ * - HAS_BEFORE3: bit 5
+ * - IS_TAILORED: bit 3
+ * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0
+ *
+ * We could allocate structs with pointers, but we would have to store them
+ * in a pointer list so that they can be indexed from temporary CEs,
+ * and they would require more memory allocations.
+ */
+ private UVector64 nodes;
+}
--- /dev/null
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ * CollationCompare.java, ported from collationcompare.h/.cpp
+ *
+ * C++ version created on: 2012feb14 with new and old collation code
+ * created by: Markus W. Scherer
+ */
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.text.Collator;
+
+/**
+ * Compares two streams of collation elements (CEs) level by level:
+ * primary, secondary, case, tertiary, quaternary.
+ * All methods are static; there is no instance state.
+ */
+public final class CollationCompare /* all static */ {
+ /**
+ * Fetches CEs from both iterators and compares them up to the quaternary level,
+ * honoring the strength, alternate-handling (shifted), backwards-secondary,
+ * case-level and case-first options in the settings.
+ *
+ * @param left CE iterator for the left-hand string
+ * @param right CE iterator for the right-hand string
+ * @param settings collation settings (options bits, variableTop, reorderTable)
+ * @return Collation.LESS, Collation.EQUAL or Collation.GREATER
+ */
+ public static int compareUpToQuaternary(CollationIterator left, CollationIterator right,
+ CollationSettings settings) {
+ int options = settings.options;
+ long variableTop;
+ if ((options & CollationSettings.ALTERNATE_MASK) == 0) {
+ variableTop = 0;
+ } else {
+ // +1 so that we can use "<" and primary ignorables test out early.
+ variableTop = settings.variableTop + 1;
+ }
+ boolean anyVariable = false;
+
+ // Fetch CEs, compare primaries, store secondary & tertiary weights.
+ for (;;) {
+ // We fetch CEs until we get a non-ignorable primary or reach the end.
+ long leftPrimary;
+ do {
+ long ce = left.nextCE();
+ leftPrimary = ce >>> 32;
+ if (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
+ // Variable CE, shift it to quaternary level.
+ // Ignore all following primary ignorables, and shift further variable CEs.
+ anyVariable = true;
+ do {
+ // Store only the primary of the variable CE.
+ left.setCurrentCE(ce & 0xffffffff00000000L);
+ for (;;) {
+ ce = left.nextCE();
+ leftPrimary = ce >>> 32;
+ if (leftPrimary == 0) {
+ left.setCurrentCE(0);
+ } else {
+ break;
+ }
+ }
+ } while (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
+ }
+ } while (leftPrimary == 0);
+
+ long rightPrimary;
+ do {
+ long ce = right.nextCE();
+ rightPrimary = ce >>> 32;
+ if (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
+ // Variable CE, shift it to quaternary level.
+ // Ignore all following primary ignorables, and shift further variable CEs.
+ anyVariable = true;
+ do {
+ // Store only the primary of the variable CE.
+ right.setCurrentCE(ce & 0xffffffff00000000L);
+ for (;;) {
+ ce = right.nextCE();
+ rightPrimary = ce >>> 32;
+ if (rightPrimary == 0) {
+ right.setCurrentCE(0);
+ } else {
+ break;
+ }
+ }
+ } while (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
+ }
+ } while (rightPrimary == 0);
+
+ if (leftPrimary != rightPrimary) {
+ // Return the primary difference, with script reordering.
+ byte[] reorderTable = settings.reorderTable;
+ if (reorderTable != null) {
+ leftPrimary = Collation.reorder(reorderTable, leftPrimary);
+ rightPrimary = Collation.reorder(reorderTable, rightPrimary);
+ }
+ return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
+ }
+ if (leftPrimary == Collation.NO_CE_PRIMARY) {
+ break;
+ }
+ }
+
+ // Compare the buffered secondary & tertiary weights.
+ // We might skip the secondary level but continue with the case level
+ // which is turned on separately.
+ if (CollationSettings.getStrength(options) >= Collator.SECONDARY) {
+ if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) {
+ int leftIndex = 0;
+ int rightIndex = 0;
+ for (;;) {
+ int leftSecondary;
+ do {
+ leftSecondary = ((int) left.getCE(leftIndex++)) >>> 16;
+ } while (leftSecondary == 0);
+
+ int rightSecondary;
+ do {
+ rightSecondary = ((int) right.getCE(rightIndex++)) >>> 16;
+ } while (rightSecondary == 0);
+
+ if (leftSecondary != rightSecondary) {
+ return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
+ }
+ if (leftSecondary == Collation.NO_CE_WEIGHT16) {
+ break;
+ }
+ }
+ } else {
+ // The backwards secondary level compares secondary weights backwards
+ // within segments separated by the merge separator (U+FFFE, weight 02).
+ int leftStart = 0;
+ int rightStart = 0;
+ for (;;) {
+ // Find the merge separator or the NO_CE terminator.
+ int leftLimit = leftStart;
+ long leftLower32;
+ while ((leftLower32 = left.getCE(leftLimit) & 0xffffffffL) > Collation.MERGE_SEPARATOR_LOWER32
+ || leftLower32 == 0) {
+ ++leftLimit;
+ }
+ int rightLimit = rightStart;
+ long rightLower32;
+ while ((rightLower32 = right.getCE(rightLimit) & 0xffffffffL) > Collation.MERGE_SEPARATOR_LOWER32
+ || rightLower32 == 0) {
+ ++rightLimit;
+ }
+
+ // Compare the segments.
+ int leftIndex = leftLimit;
+ int rightIndex = rightLimit;
+ for (;;) {
+ int leftSecondary = 0;
+ while (leftSecondary == 0 && leftIndex > leftStart) {
+ leftSecondary = ((int) left.getCE(--leftIndex)) >>> 16;
+ }
+
+ int rightSecondary = 0;
+ while (rightSecondary == 0 && rightIndex > rightStart) {
+ rightSecondary = ((int) right.getCE(--rightIndex)) >>> 16;
+ }
+
+ if (leftSecondary != rightSecondary) {
+ return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
+ }
+ if (leftSecondary == 0) {
+ break;
+ }
+ }
+
+ // Did we reach the end of either string?
+ // Both strings have the same number of merge separators,
+ // or else there would have been a primary-level difference.
+ assert (left.getCE(leftLimit) == right.getCE(rightLimit));
+ if (left.getCE(leftLimit) == Collation.NO_CE) {
+ break;
+ }
+ // Skip both merge separators and continue.
+ leftStart = leftLimit + 1;
+ rightStart = rightLimit + 1;
+ }
+ }
+ }
+
+ if ((options & CollationSettings.CASE_LEVEL) != 0) {
+ int strength = CollationSettings.getStrength(options);
+ int leftIndex = 0;
+ int rightIndex = 0;
+ for (;;) {
+ int leftCase, leftLower32, rightCase;
+ if (strength == Collator.PRIMARY) {
+ // Primary+caseLevel: Ignore case level weights of primary ignorables.
+ // Otherwise we would get a-umlaut > a
+ // which is not desirable for accent-insensitive sorting.
+ // Check for (lower 32 bits) == 0 as well because variable CEs are stored
+ // with only primary weights.
+ long ce;
+ do {
+ ce = left.getCE(leftIndex++);
+ leftCase = (int) ce;
+ } while ((ce >>> 32) == 0 || leftCase == 0);
+ leftLower32 = leftCase;
+ leftCase &= 0xc000;
+
+ do {
+ ce = right.getCE(rightIndex++);
+ rightCase = (int) ce;
+ } while ((ce >>> 32) == 0 || rightCase == 0);
+ rightCase &= 0xc000;
+ } else {
+ // Secondary+caseLevel: By analogy with the above,
+ // ignore case level weights of secondary ignorables.
+ //
+ // Note: A tertiary CE has uppercase case bits (0.0.ut)
+ // to keep tertiary+caseFirst well-formed.
+ //
+ // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
+ // Otherwise a tertiary CE's uppercase would be no greater than
+ // a primary/secondary CE's uppercase.
+ // (See UCA well-formedness condition 2.)
+ // We could construct a special case weight higher than uppercase,
+ // but it's simpler to always ignore case weights of secondary ignorables,
+ // turning 0.0.ut into 0.0.0.t.
+ // (See LDML Collation, Case Parameters.)
+ do {
+ leftCase = (int) left.getCE(leftIndex++);
+ } while ((leftCase & 0xffff0000) == 0);
+ leftLower32 = leftCase;
+ leftCase &= 0xc000;
+
+ do {
+ rightCase = (int) right.getCE(rightIndex++);
+ } while ((rightCase & 0xffff0000) == 0);
+ rightCase &= 0xc000;
+ }
+
+ // No need to handle NO_CE and MERGE_SEPARATOR specially:
+ // There is one case weight for each previous-level weight,
+ // so level length differences were handled there.
+ if (leftCase != rightCase) {
+ if ((options & CollationSettings.UPPER_FIRST) == 0) {
+ return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
+ } else {
+ // upperFirst: reverse the case-bit comparison.
+ return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
+ }
+ }
+ if ((leftLower32 >>> 16) == Collation.NO_CE_WEIGHT16) {
+ break;
+ }
+ }
+ }
+ if (CollationSettings.getStrength(options) <= Collator.SECONDARY) {
+ return Collation.EQUAL;
+ }
+
+ // Tertiary level: tertiaryMask folds the caseFirst setting into the weight comparison.
+ int tertiaryMask = CollationSettings.getTertiaryMask(options);
+
+ int leftIndex = 0;
+ int rightIndex = 0;
+ int anyQuaternaries = 0;
+ for (;;) {
+ int leftLower32, leftTertiary;
+ do {
+ leftLower32 = (int) left.getCE(leftIndex++);
+ anyQuaternaries |= leftLower32;
+ assert ((leftLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (leftLower32 & 0xc0c0) == 0);
+ leftTertiary = leftLower32 & tertiaryMask;
+ } while (leftTertiary == 0);
+
+ int rightLower32, rightTertiary;
+ do {
+ rightLower32 = (int) right.getCE(rightIndex++);
+ anyQuaternaries |= rightLower32;
+ assert ((rightLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (rightLower32 & 0xc0c0) == 0);
+ rightTertiary = rightLower32 & tertiaryMask;
+ } while (rightTertiary == 0);
+
+ if (leftTertiary != rightTertiary) {
+ if (CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
+ // Pass through NO_CE and MERGE_SEPARATOR
+ // and keep real tertiary weights larger than the MERGE_SEPARATOR.
+ // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
+ // to keep tertiary CEs well-formed.
+ // Their case+tertiary weights must be greater than those of
+ // primary and secondary CEs.
+ if (leftTertiary > Collation.MERGE_SEPARATOR_WEIGHT16) {
+ if ((leftLower32 & 0xffff0000) != 0) {
+ leftTertiary ^= 0xc000;
+ } else {
+ leftTertiary += 0x4000;
+ }
+ }
+ if (rightTertiary > Collation.MERGE_SEPARATOR_WEIGHT16) {
+ if ((rightLower32 & 0xffff0000) != 0) {
+ rightTertiary ^= 0xc000;
+ } else {
+ rightTertiary += 0x4000;
+ }
+ }
+ }
+ return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
+ }
+ if (leftTertiary == Collation.NO_CE_WEIGHT16) {
+ break;
+ }
+ }
+ if (CollationSettings.getStrength(options) <= Collator.TERTIARY) {
+ return Collation.EQUAL;
+ }
+
+ if (!anyVariable && (anyQuaternaries & 0xc0) == 0) {
+ // If there are no "variable" CEs and no non-zero quaternary weights,
+ // then there are no quaternary differences.
+ return Collation.EQUAL;
+ }
+
+ // Quaternary level: compare shifted variable primaries against quaternary weights.
+ leftIndex = 0;
+ rightIndex = 0;
+ for (;;) {
+ long leftQuaternary;
+ do {
+ long ce = left.getCE(leftIndex++);
+ leftQuaternary = ce & 0xffff;
+ if (leftQuaternary == 0) {
+ // Variable primary or completely ignorable.
+ leftQuaternary = ce >>> 32;
+ } else if (leftQuaternary <= Collation.MERGE_SEPARATOR_WEIGHT16) {
+ // Leave NO_CE or MERGE_SEPARATOR as is.
+ } else {
+ // Regular CE, not tertiary ignorable.
+ // Preserve the quaternary weight in bits 7..6.
+ leftQuaternary |= 0xffffff3fL;
+ }
+ } while (leftQuaternary == 0);
+
+ long rightQuaternary;
+ do {
+ long ce = right.getCE(rightIndex++);
+ rightQuaternary = ce & 0xffff;
+ if (rightQuaternary == 0) {
+ // Variable primary or completely ignorable.
+ rightQuaternary = ce >>> 32;
+ } else if (rightQuaternary <= Collation.MERGE_SEPARATOR_WEIGHT16) {
+ // Leave NO_CE or MERGE_SEPARATOR as is.
+ } else {
+ // Regular CE, not tertiary ignorable.
+ // Preserve the quaternary weight in bits 7..6.
+ rightQuaternary |= 0xffffff3fL;
+ }
+ } while (rightQuaternary == 0);
+
+ if (leftQuaternary != rightQuaternary) {
+ // Return the difference, with script reordering.
+ byte[] reorderTable = settings.reorderTable;
+ if (reorderTable != null) {
+ leftQuaternary = Collation.reorder(reorderTable, leftQuaternary);
+ rightQuaternary = Collation.reorder(reorderTable, rightQuaternary);
+ }
+ return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
+ }
+ if (leftQuaternary == Collation.NO_CE_WEIGHT16) {
+ break;
+ }
+ }
+ return Collation.EQUAL;
+ }
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2010-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationData.java, ported from collationdata.h/.cpp
+*
+* C++ version created on: 2010oct27
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.impl.Normalizer2Impl;
+import com.ibm.icu.impl.Trie2_32;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Collation data container.
+ * Immutable data created by a CollationDataBuilder, or loaded from a file,
+ * or deserialized from API-provided binary data.
+ *
+ * Includes data for the collation base (root/default), aliased if this is not the base.
+ */
+public final class CollationData {
+ // Package-private: instances are populated by a builder or a data reader.
+ CollationData(Normalizer2Impl nfc) {
+ nfcImpl = nfc;
+ }
+
+ /** Returns the CE32 mapped to code point c via the main trie. */
+ public int getCE32(int c) {
+ return trie.get(c);
+ }
+
+ /** Trie lookup for a supplementary code point (currently same as getCE32). */
+ int getCE32FromSupplementary(int c) {
+ return trie.get(c); // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
+ }
+
+ /**
+ * Returns true if c is a digit for numeric collation.
+ * Fast path below U+0660: only ASCII 0-9 qualify; otherwise check DIGIT_TAG.
+ */
+ boolean isDigit(int c) {
+ return c < 0x660 ? c <= 0x39 && 0x30 <= c :
+ Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
+ }
+
+ /**
+ * Returns true if c is unsafe for starting backward/resumed comparison;
+ * when numeric collation is on, all digits are also unsafe.
+ */
+ public boolean isUnsafeBackward(int c, boolean numeric) {
+ return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
+ }
+
+ /** Returns true if primary lead byte b allows sort-key compression. */
+ public boolean isCompressibleLeadByte(int b) {
+ return compressibleBytes[b];
+ }
+
+ /** Returns true if primary weight p (32 bits in the low half) has a compressible lead byte. */
+ public boolean isCompressiblePrimary(long p) {
+ // (int)p takes the low 32 bits; >>> 24 extracts the lead byte.
+ return isCompressibleLeadByte((int)p >>> 24);
+ }
+
+ /**
+ * Returns the CE32 from two contexts words.
+ * Access to the defaultCE32 for contraction and prefix matching.
+ */
+ int getCE32FromContexts(int index) {
+ return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
+ }
+
+ /**
+ * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
+ * Requires that ce32 is special.
+ */
+ int getIndirectCE32(int ce32) {
+ assert(Collation.isSpecialCE32(ce32));
+ int tag = Collation.tagFromCE32(ce32);
+ if(tag == Collation.DIGIT_TAG) {
+ // Fetch the non-numeric-collation CE32.
+ ce32 = ce32s[Collation.indexFromCE32(ce32)];
+ } else if(tag == Collation.LEAD_SURROGATE_TAG) {
+ // A lead surrogate by itself maps to the unassigned-character CE32.
+ ce32 = Collation.UNASSIGNED_CE32;
+ } else if(tag == Collation.U0000_TAG) {
+ // Fetch the normal ce32 for U+0000.
+ ce32 = ce32s[0];
+ }
+ return ce32;
+ }
+
+ /**
+ * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
+ * if ce32 is special.
+ */
+ int getFinalCE32(int ce32) {
+ if(Collation.isSpecialCE32(ce32)) {
+ ce32 = getIndirectCE32(ce32);
+ }
+ return ce32;
+ }
+
+ /**
+ * Computes a CE from c's ce32 which has the OFFSET_TAG.
+ */
+ long getCEFromOffsetCE32(int c, int ce32) {
+ long dataCE = ces[Collation.indexFromCE32(ce32)];
+ return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
+ }
+
+ /**
+ * Returns the FCD16 value for code point c. c must be >= 0.
+ */
+ int getFCD16(int c) {
+ return nfcImpl.getFCD16(c);
+ }
+
+ /**
+ * Returns the first primary for the script's reordering group.
+ * @return the primary with only the first primary lead byte of the group
+ * (not necessarily an actual root collator primary weight),
+ * or 0 if the script is unknown
+ */
+ long getFirstPrimaryForGroup(int script) {
+ int index = findScript(script);
+ if(index < 0) {
+ return 0;
+ }
+ // Bits 15..8 of the group head word hold the first lead byte; shift into primary position.
+ long head = scripts[index];
+ return (head & 0xff00) << 16;
+ }
+
+ /**
+ * Returns the last primary for the script's reordering group.
+ * @return the last primary of the group
+ * (not an actual root collator primary weight),
+ * or 0 if the script is unknown
+ */
+ public long getLastPrimaryForGroup(int script) {
+ int index = findScript(script);
+ if(index < 0) {
+ return 0;
+ }
+ // Bits 7..0 of the group head word hold the last lead byte;
+ // return the largest primary with that lead byte.
+ int head = scripts[index];
+ long lastByte = head & 0xff;
+ return ((lastByte + 1) << 24) - 1;
+ }
+
+ /**
+ * Finds the reordering group which contains the primary weight.
+ * @return the first script of the group, or -1 if the weight is beyond the last group
+ */
+ public int getGroupForPrimary(long p) {
+ p >>= 24; // Reordering groups are distinguished by primary lead bytes.
+ for(int i = 0; i < scripts.length; i = i + 2 + scripts[i + 1]) {
+ int lastByte = scripts[i] & 0xff;
+ if(p <= lastByte) {
+ return scripts[i + 2];
+ }
+ }
+ return -1;
+ }
+
+ // Returns the index of the group head word for script, or -1 if not found.
+ // See the scripts field doc for the group encoding.
+ private int findScript(int script) {
+ if(script < 0 || 0xffff < script) { return -1; }
+ for(int i = 0; i < scripts.length;) {
+ int limit = i + 2 + scripts[i + 1];
+ for(int j = i + 2; j < limit; ++j) {
+ if(script == scripts[j]) { return i; }
+ }
+ i = limit;
+ }
+ return -1;
+ }
+
+ /** Returns the sorted list of scripts/reorder codes equivalent to script (same group). */
+ public int[] getEquivalentScripts(int script) {
+ int i = findScript(script);
+ if(i < 0) { return EMPTY_INT_ARRAY; }
+ int length = scripts[i + 1];
+ assert(length != 0);
+ int dest[] = new int[length];
+ i += 2;
+ dest[0] = scripts[i++];
+ for(int j = 1; j < length; ++j) {
+ script = scripts[i++];
+ // Sorted insertion.
+ for(int k = j;; --k) {
+ // Invariant: dest[k] is free to receive either script or dest[k - 1].
+ if(k > 0 && script < dest[k - 1]) {
+ dest[k] = dest[k - 1];
+ } else {
+ dest[k] = script;
+ break;
+ }
+ }
+ }
+ return dest;
+ }
+
+ /**
+ * Writes the permutation table for the given reordering of scripts and groups,
+ * mapping from default-order primary-weight lead bytes to reordered lead bytes.
+ * The caller checks for illegal arguments and
+ * takes care of [DEFAULT] and memory allocation.
+ */
+ public void makeReorderTable(int[] reorder, byte[] table) {
+ int length = reorder.length;
+ // Initialize the table.
+ // Never reorder special low and high primary lead bytes.
+ int lowByte;
+ for(lowByte = 0; lowByte <= Collation.MERGE_SEPARATOR_BYTE; ++lowByte) {
+ table[lowByte] = (byte)lowByte;
+ }
+ // lowByte == 03
+
+ int highByte;
+ for(highByte = 0xff; highByte >= Collation.TRAIL_WEIGHT_BYTE; --highByte) {
+ table[highByte] = (byte)highByte;
+ }
+ // highByte == FE
+
+ // Set intermediate bytes to 0 to indicate that they have not been set yet.
+ for(int i = lowByte; i <= highByte; ++i) {
+ table[i] = 0;
+ }
+
+ // Get the set of special reorder codes in the input list.
+ // This supports up to 32 special reorder codes;
+ // it works for data with codes beyond Collator.ReorderCodes.LIMIT.
+ int specials = 0;
+ for(int i = 0; i < length; ++i) {
+ int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
+ if(0 <= reorderCode && reorderCode <= 31) {
+ specials |= 1 << reorderCode;
+ }
+ }
+
+ // Start the reordering with the special low reorder codes that do not occur in the input.
+ for(int i = 0;; i += 3) {
+ if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes.
+ int reorderCode = scripts[i + 2] - Collator.ReorderCodes.FIRST;
+ if(reorderCode < 0) { break; } // Went beyond special reorder codes.
+ if((specials & (1 << reorderCode)) == 0) {
+ int head = scripts[i];
+ int firstByte = head >> 8;
+ int lastByte = head & 0xff;
+ do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
+ }
+ }
+
+ // Reorder according to the input scripts, continuing from the bottom of the bytes range.
+ for(int i = 0; i < length;) {
+ int script = reorder[i++];
+ if(script == UScript.UNKNOWN) {
+ // Put the remaining scripts at the top.
+ while(i < length) {
+ script = reorder[--length];
+ if(script == UScript.UNKNOWN) { // Must occur at most once.
+ throw new IllegalArgumentException(
+ "setReorderCodes(): duplicate UScript.UNKNOWN");
+ }
+ if(script == Collator.ReorderCodes.DEFAULT) {
+ throw new IllegalArgumentException(
+ "setReorderCodes(): UScript.DEFAULT together with other scripts");
+ }
+ int index = findScript(script);
+ if(index < 0) { continue; }
+ int head = scripts[index];
+ int firstByte = head >> 8;
+ int lastByte = head & 0xff;
+ if(table[firstByte] != 0) { // Duplicate or equivalent script.
+ throw new IllegalArgumentException(
+ "setReorderCodes(): duplicate or equivalent script " +
+ scriptCodeString(script));
+ }
+ // Fill this group's bytes downward from the top of the range.
+ do { table[lastByte--] = (byte)highByte--; } while(firstByte <= lastByte);
+ }
+ break;
+ }
+ if(script == Collator.ReorderCodes.DEFAULT) {
+ // The default code must be the only one in the list, and that is handled by the caller.
+ // Otherwise it must not be used.
+ throw new IllegalArgumentException(
+ "setReorderCodes(): UScript.DEFAULT together with other scripts");
+ }
+ int index = findScript(script);
+ if(index < 0) { continue; }
+ int head = scripts[index];
+ int firstByte = head >> 8;
+ int lastByte = head & 0xff;
+ if(table[firstByte] != 0) { // Duplicate or equivalent script.
+ throw new IllegalArgumentException(
+ "setReorderCodes(): duplicate or equivalent script " +
+ scriptCodeString(script));
+ }
+ do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
+ }
+
+ // Put all remaining scripts into the middle.
+ // Avoid table[0] which must remain 0.
+ for(int i = 1; i <= 0xff; ++i) {
+ if(table[i] == 0) { table[i] = (byte)lowByte++; }
+ }
+ assert(lowByte == highByte + 1);
+ }
+
+ // Formats a script/reorder code for exception messages.
+ private static String scriptCodeString(int script) {
+ // Do not use the script name here: We do not want to depend on that data.
+ return (script < Collator.ReorderCodes.FIRST) ?
+ Integer.toString(script) : "0x" + Integer.toHexString(script);
+ }
+
+ private static final int[] EMPTY_INT_ARRAY = new int[0];
+
+ /** @see jamoCE32s */
+ static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
+
+ /** Main lookup trie. */
+ Trie2_32 trie;
+ /**
+ * Array of CE32 values.
+ * At index 0 there must be CE32(U+0000)
+ * to support U+0000's special-tag for NUL-termination handling.
+ */
+ int[] ce32s;
+ /** Array of CE values for expansions and OFFSET_TAG. */
+ long[] ces;
+ /** Array of prefix and contraction-suffix matching data. */
+ String contexts;
+ /** Base collation data, or null if this data itself is a base. */
+ public CollationData base;
+ /**
+ * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
+ * They are normally simple CE32s, rarely expansions.
+ * For fast handling of HANGUL_TAG.
+ */
+ int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
+ public Normalizer2Impl nfcImpl;
+ /** The single-byte primary weight (xx000000) for numeric collation. */
+ long numericPrimary = 0x12000000;
+
+ /** 256 flags for which primary-weight lead bytes are compressible. */
+ public boolean[] compressibleBytes;
+ /**
+ * Set of code points that are unsafe for starting string comparison after an identical prefix,
+ * or in backwards CE iteration.
+ */
+ UnicodeSet unsafeBackwardSet;
+
+ /**
+ * Fast Latin table for common-Latin-text string comparisons.
+ * Data structure see class CollationFastLatin.
+ */
+ public char[] fastLatinTable;
+ /**
+ * Header portion of the fastLatinTable.
+ * In C++, these are one array, and the header is skipped for mapping characters.
+ * In Java, two arrays work better.
+ */
+ char[] fastLatinTableHeader;
+
+ /**
+ * Data for scripts and reordering groups.
+ * Uses include building a reordering permutation table and
+ * providing script boundaries to AlphabeticIndex.
+ *
+ * This data is a sorted list of primary-weight lead byte ranges (reordering groups),
+ * each with a list of pairs sorted in base collation order;
+ * each pair contains a script/reorder code and the lowest primary weight for that script.
+ *
+ * Data structure:
+ * - Each reordering group is encoded in n+2 16-bit integers.
+ * - First integer:
+ * Bits 15..8: First byte of the reordering group's range.
+ * Bits 7..0: Last byte of the reordering group's range.
+ * - Second integer:
+ * Length n of the list of script/reordering codes.
+ * - Each further integer is a script or reordering code.
+ */
+ char[] scripts;
+
+ /**
+ * Collation elements in the root collator.
+ * Used by the CollationRootElements class. The data structure is described there.
+ * null in a tailoring.
+ */
+ public long[] rootElements;
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2012-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
+*
+* C++ version created on: 2012apr01
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+
+import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.Normalizer2Impl;
+import com.ibm.icu.impl.Trie2;
+import com.ibm.icu.impl.Normalizer2Impl.Hangul;
+import com.ibm.icu.impl.Trie2Writable;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.CharsTrie;
+import com.ibm.icu.util.CharsTrieBuilder;
+import com.ibm.icu.util.StringTrieBuilder;
+
+/**
+ * Low-level CollationData builder.
+ * Takes (character, CE) pairs and builds them into runtime data structures.
+ * Supports characters with context prefixes and contraction suffixes.
+ */
+final class CollationDataBuilder { // not final in C++
+ /**
+ * Collation element modifier. Interface class for a modifier
+ * that changes a tailoring builder's temporary CEs to final CEs.
+ * Called for every non-special CE32 and every expansion CE.
+ * Both methods return Collation.NO_CE to indicate "no replacement".
+ */
+ interface CEModifier {
+ /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */
+ long modifyCE32(int ce32);
+ /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */
+ long modifyCE(long ce);
+ }
+
+ /**
+ * Creates an empty builder. The trie is created lazily;
+ * initForTailoring() must be called before adding mappings.
+ */
+ CollationDataBuilder() {
+ nfcImpl = Norm2AllModes.getNFCInstance().impl;
+ base = null;
+ baseSettings = null;
+ trie = null;
+ ce32s = new UVector32();
+ ce64s = new UVector64();
+ conditionalCE32s = new ArrayList<ConditionalCE32>();
+ modified = false;
+ fastLatinEnabled = false;
+ fastLatinBuilder = null;
+ collIter = null;
+ // Reserve the first CE32 for U+0000.
+ ce32s.addElement(0);
+ }
+
+ /**
+ * Initializes this builder for tailoring on top of base data b.
+ * Creates the writable trie with fallback-to-base defaults,
+ * pre-allocates the Latin-1 letters block, tags Hangul syllables,
+ * and copies the base's unsafe-backward set.
+ *
+ * @param b the base (root) collation data; must not be null
+ * @throws IllegalStateException if this builder was already initialized
+ * @throws IllegalArgumentException if b is null
+ */
+ void initForTailoring(CollationData b) {
+ if(trie != null) {
+ throw new IllegalStateException("attempt to reuse a CollationDataBuilder");
+ }
+ if(b == null) {
+ throw new IllegalArgumentException("null CollationData");
+ }
+ base = b;
+
+ // For a tailoring, the default is to fall back to the base.
+ trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32);
+
+ // Set the Latin-1 letters block so that it is allocated first in the data array,
+ // to try to improve locality of reference when sorting Latin-1 text.
+ // Do not use utrie2_setRange32() since that will not actually allocate blocks
+ // that are filled with the default value.
+ // ASCII (0..7F) is already preallocated anyway.
+ for(int c = 0xc0; c <= 0xff; ++c) {
+ trie.set(c, Collation.FALLBACK_CE32);
+ }
+
+ // Hangul syllables are not tailorable (except via tailoring Jamos).
+ // Always set the Hangul tag to help performance.
+ // Do this here, rather than in buildMappings(),
+ // so that we see the HANGUL_TAG in various assertions.
+ int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
+ trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true);
+
+ // Copy the set contents but don't copy/clone the set as a whole because
+ // that would copy the isFrozen state too.
+ unsafeBackwardSet.addAll(b.unsafeBackwardSet);
+ }
+
+ /** Returns true if the base data marks primary lead byte b as compressible. */
+ boolean isCompressibleLeadByte(int b) {
+ return base.isCompressibleLeadByte(b);
+ }
+
+ /** Returns true if the lead byte of primary weight p (low 32 bits) is compressible. */
+ boolean isCompressiblePrimary(long p) {
+ // (int)p keeps the low 32 bits; the unsigned shift extracts the lead byte.
+ int leadByte = (int)p >>> 24;
+ return isCompressibleLeadByte(leadByte);
+ }
+
+ /**
+ * @return true if this builder has mappings (e.g., add() has been called);
+ * reflects the {@code modified} flag set when mappings are stored
+ */
+ boolean hasMappings() { return modified; }
+
+ /**
+ * @return true if code point c has CEs (an assigned CE32) in this builder
+ */
+ boolean isAssigned(int c) {
+ int ce32 = trie.get(c);
+ return Collation.isAssignedCE32(ce32);
+ }
+
+ /**
+ * Adds a mapping from (prefix, s) to the given CEs:
+ * encodes the CEs as a CE32 (possibly via an expansion) and stores it.
+ */
+ void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
+ addCE32(prefix, s, encodeCEs(ces, cesLength));
+ }
+
+ /**
+ * Encodes the ces as either the returned ce32 by itself,
+ * or by storing an expansion, with the returned ce32 referring to that.
+ *
+ * <p>add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
+ *
+ * @throws IllegalArgumentException if cesLength is out of range
+ * @throws IllegalStateException if called after build()
+ */
+ int encodeCEs(long ces[], int cesLength) {
+ if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) {
+ throw new IllegalArgumentException("mapping to too many CEs");
+ }
+ if(!isMutable()) {
+ throw new IllegalStateException("attempt to add mappings after build()");
+ }
+ if(cesLength == 0) {
+ // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
+ // Do this here so that callers need not do it.
+ return encodeOneCEAsCE32(0);
+ } else if(cesLength == 1) {
+ return encodeOneCE(ces[0]);
+ } else if(cesLength == 2) {
+ // Try to encode two CEs as one CE32.
+ long ce0 = ces[0];
+ long ce1 = ces[1];
+ long p0 = ce0 >>> 32;
+ // Check that ce0 is primary+common-secondary and ce1 is common-tertiary-only,
+ // the shape a LATIN_EXPANSION_TAG CE32 can represent.
+ if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE &&
+ (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE &&
+ p0 != 0) {
+ // Latin mini expansion
+ return
+ (int)p0 |
+ (((int)ce0 & 0xff00) << 8) |
+ (((int)ce1 >> 16) & 0xff00) |
+ Collation.SPECIAL_CE32_LOW_BYTE |
+ Collation.LATIN_EXPANSION_TAG;
+ }
+ }
+ // Try to encode two or more CEs as CE32s.
+ int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH]; // TODO: instance field?
+ for(int i = 0;; ++i) {
+ if(i == cesLength) {
+ return encodeExpansion32(newCE32s, 0, cesLength);
+ }
+ int ce32 = encodeOneCEAsCE32(ces[i]);
+ // Any CE that does not fit in a CE32 forces a full 64-bit CE expansion.
+ if(ce32 == Collation.NO_CE32) { break; }
+ newCE32s[i] = ce32;
+ }
+ return encodeExpansion(ces, 0, cesLength);
+ }
+
+ /**
+ * Adds a mapping from prefix|s to the single (possibly special) ce32.
+ * s must not be empty. An empty prefix with a single-code-point s becomes
+ * a plain trie mapping; otherwise the mapping is inserted into the
+ * sorted ConditionalCE32 list for the first code point of s.
+ *
+ * @throws IllegalArgumentException if s is empty
+ * @throws IllegalStateException if called after build()
+ */
+ void addCE32(CharSequence prefix, CharSequence s, int ce32) {
+ if(s.length() == 0) {
+ throw new IllegalArgumentException("mapping from empty string");
+ }
+ if(!isMutable()) {
+ throw new IllegalStateException("attempt to add mappings after build()");
+ }
+ int c = Character.codePointAt(s, 0);
+ int cLength = Character.charCount(c);
+ int oldCE32 = trie.get(c);
+ // Context = non-empty prefix and/or contraction suffix beyond the first code point.
+ boolean hasContext = prefix.length() != 0|| s.length() > cLength;
+ if(oldCE32 == Collation.FALLBACK_CE32) {
+ // First tailoring for c.
+ // If c has contextual base mappings or if we add a contextual mapping,
+ // then copy the base mappings.
+ // Otherwise we just override the base mapping.
+ int baseCE32 = base.getFinalCE32(base.getCE32(c));
+ if(hasContext || Collation.ce32HasContext(baseCE32)) {
+ oldCE32 = copyFromBaseCE32(c, baseCE32, true);
+ trie.set(c, oldCE32);
+ }
+ }
+ if(!hasContext) {
+ // No prefix, no contraction.
+ if(!isBuilderContextCE32(oldCE32)) {
+ trie.set(c, ce32);
+ } else {
+ // Overwrite the default (no-context) mapping in the existing list head.
+ ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32);
+ cond.builtCE32 = Collation.NO_CE32;
+ cond.ce32 = ce32;
+ }
+ } else {
+ ConditionalCE32 cond;
+ if(!isBuilderContextCE32(oldCE32)) {
+ // Replace the simple oldCE32 with a builder context CE32
+ // pointing to a new ConditionalCE32 list head.
+ int index = addConditionalCE32("\0", oldCE32);
+ int contextCE32 = makeBuilderContextCE32(index);
+ trie.set(c, contextCE32);
+ contextChars.add(c);
+ cond = getConditionalCE32(index);
+ } else {
+ cond = getConditionalCE32ForCE32(oldCE32);
+ cond.builtCE32 = Collation.NO_CE32;
+ }
+ CharSequence suffix = s.subSequence(cLength, s.length());
+ // Context string encoding: one unit with the prefix length,
+ // then the prefix, then the contraction suffix (see ConditionalCE32).
+ String context = new StringBuilder().append((char)prefix.length()).
+ append(prefix).append(suffix).toString();
+ unsafeBackwardSet.addAll(suffix);
+ // Insert into the singly-linked list, keeping it sorted by context string.
+ for(;;) {
+ // invariant: context > cond.context
+ int next = cond.next;
+ if(next < 0) {
+ // Append a new ConditionalCE32 after cond.
+ int index = addConditionalCE32(context, ce32);
+ cond.next = index;
+ break;
+ }
+ ConditionalCE32 nextCond = getConditionalCE32(next);
+ int cmp = context.compareTo(nextCond.context);
+ if(cmp < 0) {
+ // Insert a new ConditionalCE32 between cond and nextCond.
+ int index = addConditionalCE32(context, ce32);
+ cond.next = index;
+ getConditionalCE32(index).next = next;
+ break;
+ } else if(cmp == 0) {
+ // Same context as before, overwrite its ce32.
+ nextCond.ce32 = ce32;
+ break;
+ }
+ cond = nextCond;
+ }
+ }
+ modified = true;
+ }
+
+ /**
+ * Copies all mappings from the src builder, with modifications.
+ * This builder here must not be built yet, and should be empty.
+ *
+ * @throws IllegalStateException if called after build()
+ */
+ void copyFrom(CollationDataBuilder src, CEModifier modifier) {
+ if(!isMutable()) {
+ throw new IllegalStateException("attempt to copyFrom() after build()");
+ }
+ CopyHelper helper = new CopyHelper(src, this, modifier);
+ Iterator<Trie2.Range> trieIterator = src.trie.iterator();
+ Trie2.Range range;
+ // Copy only regular code point ranges; stop at the first lead-surrogate range
+ // (presumably those index values are rebuilt separately -- see setLeadSurrogates()).
+ while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
+ enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper);
+ }
+ // Update the contextChars and the unsafeBackwardSet while copying,
+ // in case a character had conditional mappings in the source builder
+ // and they were removed later.
+ modified |= src.modified;
+ }
+
+ /**
+ * Copies the base mappings for each code point in set into this tailoring,
+ * so that runtime lookups for those characters need not fall back to the base.
+ */
+ void optimize(UnicodeSet set) {
+ if(set.isEmpty()) { return; }
+ UnicodeSetIterator iter = new UnicodeSetIterator(set);
+ while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
+ int c = iter.codepoint;
+ int ce32 = trie.get(c);
+ if(ce32 == Collation.FALLBACK_CE32) {
+ // Not tailored yet: materialize the base mapping (with context).
+ ce32 = base.getFinalCE32(base.getCE32(c));
+ ce32 = copyFromBaseCE32(c, ce32, true);
+ trie.set(c, ce32);
+ }
+ }
+ modified = true;
+ }
+
+ /**
+ * Removes the contextual (prefix/contraction) mappings for each code point
+ * in set, keeping only its default (no-context) mapping.
+ */
+ void suppressContractions(UnicodeSet set) {
+ if(set.isEmpty()) { return; }
+ UnicodeSetIterator iter = new UnicodeSetIterator(set);
+ while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
+ int c = iter.codepoint;
+ int ce32 = trie.get(c);
+ if(ce32 == Collation.FALLBACK_CE32) {
+ // Not tailored: copy the base mapping with its context stripped.
+ ce32 = base.getFinalCE32(base.getCE32(c));
+ if(Collation.ce32HasContext(ce32)) {
+ ce32 = copyFromBaseCE32(c, ce32, false /* without context */);
+ trie.set(c, ce32);
+ }
+ } else if(isBuilderContextCE32(ce32)) {
+ // Tailored with context: keep only the list head's default CE32.
+ ce32 = getConditionalCE32ForCE32(ce32).ce32;
+ // Simply abandon the list of ConditionalCE32.
+ // The caller will copy this builder in the end,
+ // eliminating unreachable data.
+ trie.set(c, ce32);
+ contextChars.remove(c);
+ }
+ }
+ modified = true;
+ }
+
+ /** Requests that build() also attempt to create a fast-Latin table. */
+ void enableFastLatin() { fastLatinEnabled = true; }
+ /**
+ * Builds the runtime CollationData from this builder's structures,
+ * inheriting numeric-primary, compressible-bytes and script data from the base.
+ */
+ void build(CollationData data) {
+ buildMappings(data);
+ if(base != null) {
+ data.numericPrimary = base.numericPrimary;
+ data.compressibleBytes = base.compressibleBytes;
+ data.scripts = base.scripts;
+ }
+ buildFastLatinTable(data);
+ }
+
+ /**
+ * Looks up CEs for s and appends them to the ces array.
+ * Does not handle normalization: s should be in FCD form.
+ *
+ * Does not write completely ignorable CEs.
+ * Does not write beyond Collation.MAX_EXPANSION_LENGTH.
+ *
+ * @return incremented cesLength
+ */
+ int getCEs(CharSequence s, long ces[], int cesLength) {
+ // No prefix context: start fetching CEs at offset 0.
+ return getCEs(s, 0, ces, cesLength);
+ }
+
+ /**
+ * Looks up CEs for s in the context of prefix, appending them to ces.
+ * Implemented by concatenating prefix+s and starting the fetch after the prefix.
+ *
+ * @return incremented cesLength
+ */
+ int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
+ int prefixLength = prefix.length();
+ if(prefixLength == 0) {
+ return getCEs(s, 0, ces, cesLength);
+ } else {
+ return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength);
+ }
+ }
+
+ /**
+ * Build-time context and CE32 for a code point.
+ * If a code point has contextual mappings, then the default (no-context) mapping
+ * and all conditional mappings are stored in a singly-linked list
+ * of ConditionalCE32, sorted by context strings.
+ *
+ * Context strings sort by prefix length, then by prefix, then by contraction suffix.
+ * Context strings must be unique and in ascending order.
+ */
+ private static final class ConditionalCE32 {
+ ConditionalCE32(String ct, int ce) {
+ context = ct;
+ ce32 = ce;
+ defaultCE32 = Collation.NO_CE32;
+ builtCE32 = Collation.NO_CE32;
+ next = -1;
+ }
+
+ // A length-1 context holds only the prefix-length unit, i.e. no real context.
+ boolean hasContext() { return context.length() > 1; }
+ // The first unit of the context string stores the prefix length.
+ int prefixLength() { return context.charAt(0); }
+
+ /**
+ * "\0" for the first entry for any code point, with its default CE32.
+ *
+ * Otherwise one unit with the length of the prefix string,
+ * then the prefix string, then the contraction suffix.
+ */
+ String context;
+ /**
+ * CE32 for the code point and its context.
+ * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
+ */
+ int ce32;
+ /**
+ * Default CE32 for all contexts with this same prefix.
+ * Initially NO_CE32. Set only while building runtime data structures,
+ * and only on one of the nodes of a sub-list with the same prefix.
+ */
+ int defaultCE32;
+ /**
+ * CE32 for the built contexts.
+ * When fetching CEs from the builder, the contexts are built into their runtime form
+ * so that the normal collation implementation can process them.
+ * The result is cached in the list head. It is reset when the contexts are modified.
+ */
+ int builtCE32;
+ /**
+ * Index of the next ConditionalCE32.
+ * Negative for the end of the list.
+ */
+ int next;
+ }
+
+ /**
+ * Resolves an OFFSET_TAG CE32 for code point c into a long-primary CE32,
+ * reading the offset data CE from the base (fromBase) or from this builder's ce64s.
+ */
+ protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) {
+ int i = Collation.indexFromCE32(ce32);
+ long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i);
+ long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
+ return Collation.makeLongPrimaryCE32(p);
+ }
+
+ /**
+ * Returns the index of ce in ce64s, appending it if not yet present.
+ * Linear search; acceptable at build time.
+ */
+ protected int addCE(long ce) {
+ int length = ce64s.size();
+ for(int i = 0; i < length; ++i) {
+ if(ce == ce64s.elementAti(i)) { return i; }
+ }
+ ce64s.addElement(ce);
+ return length;
+ }
+
+ /**
+ * Returns the index of ce32 in ce32s, appending it if not yet present.
+ * Linear search; acceptable at build time.
+ */
+ protected int addCE32(int ce32) {
+ int length = ce32s.size();
+ for(int i = 0; i < length; ++i) {
+ if(ce32 == ce32s.elementAti(i)) { return i; }
+ }
+ ce32s.addElement(ce32);
+ return length;
+ }
+
+ /**
+ * Appends a new ConditionalCE32 for the given context string and ce32,
+ * returning its index in conditionalCE32s.
+ *
+ * @throws IndexOutOfBoundsException if the index would exceed Collation.MAX_INDEX
+ */
+ protected int addConditionalCE32(String context, int ce32) {
+ assert(context.length() != 0);
+ int index = conditionalCE32s.size();
+ if(index > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many context-sensitive mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ ConditionalCE32 cond = new ConditionalCE32(context, ce32);
+ conditionalCE32s.add(cond);
+ return index;
+ }
+
+ /** Returns the ConditionalCE32 at the given list index. */
+ protected ConditionalCE32 getConditionalCE32(int index) {
+ return conditionalCE32s.get(index);
+ }
+ /** Returns the ConditionalCE32 list head referenced by a builder-context CE32. */
+ protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) {
+ return getConditionalCE32(Collation.indexFromCE32(ce32));
+ }
+
+ /** Makes a build-time-only CE32 that points into the ConditionalCE32 list. */
+ protected static int makeBuilderContextCE32(int index) {
+ return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index);
+ }
+ /** Tests whether ce32 is a build-time ConditionalCE32-list reference. */
+ protected static boolean isBuilderContextCE32(int ce32) {
+ return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG);
+ }
+
+ /**
+ * Encodes a single 64-bit CE as a CE32 if it fits one of the compact forms
+ * (normal, long-primary, long-secondary); returns Collation.NO_CE32 otherwise.
+ */
+ protected static int encodeOneCEAsCE32(long ce) {
+ long p = ce >>> 32;
+ int lower32 = (int)ce;
+ int t = lower32 & 0xffff;
+ assert((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s.
+ if((ce & 0xffff00ff00ffL) == 0) {
+ // normal form ppppsstt
+ return (int)p | (lower32 >>> 16) | (t >> 8);
+ } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) {
+ // long-primary form ppppppC1
+ return Collation.makeLongPrimaryCE32(p);
+ } else if(p == 0 && (t & 0xff) == 0) {
+ // long-secondary form ssssttC2
+ return Collation.makeLongSecondaryCE32(lower32);
+ }
+ return Collation.NO_CE32;
+ }
+
+ /**
+ * Encodes one CE as a CE32, storing it as a length-1 expansion
+ * when it does not fit a compact CE32 form.
+ *
+ * @throws IndexOutOfBoundsException if the expansion index would exceed MAX_INDEX
+ */
+ protected int encodeOneCE(long ce) {
+ // Try to encode one CE as one CE32.
+ int ce32 = encodeOneCEAsCE32(ce);
+ if(ce32 != Collation.NO_CE32) { return ce32; }
+ int index = addCE(ce);
+ if(index > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1);
+ }
+
+ /**
+ * Returns an EXPANSION_TAG CE32 for ces[start..start+length),
+ * reusing an identical subsequence already in ce64s when possible.
+ *
+ * @throws IndexOutOfBoundsException if the expansion index would exceed MAX_INDEX
+ */
+ protected int encodeExpansion(long ces[], int start, int length) {
+ // See if this sequence of CEs has already been stored.
+ long first = ces[start];
+ int ce64sMax = ce64s.size() - length;
+ for(int i = 0; i <= ce64sMax; ++i) {
+ if(first == ce64s.elementAti(i)) {
+ if(i > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ for(int j = 1;; ++j) {
+ if(j == length) {
+ return Collation.makeCE32FromTagIndexAndLength(
+ Collation.EXPANSION_TAG, i, length);
+ }
+ if(ce64s.elementAti(i + j) != ces[start + j]) { break; }
+ }
+ }
+ }
+ // Store the new sequence.
+ int i = ce64s.size();
+ if(i > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ for(int j = 0; j < length; ++j) {
+ ce64s.addElement(ces[start + j]);
+ }
+ return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length);
+ }
+
+ /**
+ * Returns an EXPANSION32_TAG CE32 for newCE32s[start..start+length),
+ * reusing an identical subsequence already in ce32s when possible.
+ *
+ * @throws IndexOutOfBoundsException if the expansion index would exceed MAX_INDEX
+ */
+ protected int encodeExpansion32(int newCE32s[], int start, int length) {
+ // See if this sequence of CE32s has already been stored.
+ int first = newCE32s[start];
+ int ce32sMax = ce32s.size() - length;
+ for(int i = 0; i <= ce32sMax; ++i) {
+ if(first == ce32s.elementAti(i)) {
+ if(i > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ for(int j = 1;; ++j) {
+ if(j == length) {
+ return Collation.makeCE32FromTagIndexAndLength(
+ Collation.EXPANSION32_TAG, i, length);
+ }
+ if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; }
+ }
+ }
+ }
+ // Store the new sequence.
+ int i = ce32s.size();
+ if(i > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ for(int j = 0; j < length; ++j) {
+ ce32s.addElement(newCE32s[start + j]);
+ }
+ return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length);
+ }
+
+ /**
+ * Copies a final base CE32 for code point c into this builder's storage,
+ * returning the equivalent tailoring CE32.
+ * If withContext, prefix/contraction data is flattened into a ConditionalCE32 list;
+ * otherwise only the no-context default mapping is copied.
+ *
+ * @throws UnsupportedOperationException for Hangul syllables (not tailorable)
+ */
+ protected int copyFromBaseCE32(int c, int ce32, boolean withContext) {
+ if(!Collation.isSpecialCE32(ce32)) { return ce32; }
+ switch(Collation.tagFromCE32(ce32)) {
+ case Collation.LONG_PRIMARY_TAG:
+ case Collation.LONG_SECONDARY_TAG:
+ case Collation.LATIN_EXPANSION_TAG:
+ // copy as is
+ break;
+ case Collation.EXPANSION32_TAG: {
+ int index = Collation.indexFromCE32(ce32);
+ int length = Collation.lengthFromCE32(ce32);
+ ce32 = encodeExpansion32(base.ce32s, index, length);
+ break;
+ }
+ case Collation.EXPANSION_TAG: {
+ int index = Collation.indexFromCE32(ce32);
+ int length = Collation.lengthFromCE32(ce32);
+ ce32 = encodeExpansion(base.ces, index, length);
+ break;
+ }
+ case Collation.PREFIX_TAG: {
+ // Flatten prefixes and nested suffixes (contractions)
+ // into a linear list of ConditionalCE32.
+ int trieIndex = Collation.indexFromCE32(ce32);
+ ce32 = base.getCE32FromContexts(trieIndex); // Default if no prefix match.
+ if(!withContext) {
+ return copyFromBaseCE32(c, ce32, false);
+ }
+ ConditionalCE32 head = new ConditionalCE32("", 0);
+ StringBuilder context = new StringBuilder("\0");
+ int index;
+ if(Collation.isContractionCE32(ce32)) {
+ index = copyContractionsFromBaseCE32(context, c, ce32, head);
+ } else {
+ ce32 = copyFromBaseCE32(c, ce32, true);
+ head.next = index = addConditionalCE32(context.toString(), ce32);
+ }
+ ConditionalCE32 cond = getConditionalCE32(index); // the last ConditionalCE32 so far
+ // Iterate over the base's prefix trie (stored after the 2-unit default CE32).
+ CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
+ while(prefixes.hasNext()) {
+ CharsTrie.Entry entry = prefixes.next();
+ context.setLength(0);
+ // Prefixes are stored reversed in the trie; rebuild the context string
+ // as length-unit + forward prefix.
+ context.append(entry.chars).reverse().insert(0, (char)entry.chars.length());
+ ce32 = entry.value;
+ if(Collation.isContractionCE32(ce32)) {
+ index = copyContractionsFromBaseCE32(context, c, ce32, cond);
+ } else {
+ ce32 = copyFromBaseCE32(c, ce32, true);
+ cond.next = index = addConditionalCE32(context.toString(), ce32);
+ }
+ cond = getConditionalCE32(index);
+ }
+ ce32 = makeBuilderContextCE32(head.next);
+ contextChars.add(c);
+ break;
+ }
+ case Collation.CONTRACTION_TAG: {
+ if(!withContext) {
+ int index = Collation.indexFromCE32(ce32);
+ ce32 = base.getCE32FromContexts(index); // Default if no suffix match.
+ return copyFromBaseCE32(c, ce32, false);
+ }
+ ConditionalCE32 head = new ConditionalCE32("", 0);
+ StringBuilder context = new StringBuilder("\0");
+ copyContractionsFromBaseCE32(context, c, ce32, head);
+ ce32 = makeBuilderContextCE32(head.next);
+ contextChars.add(c);
+ break;
+ }
+ case Collation.HANGUL_TAG:
+ throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables.");
+ case Collation.OFFSET_TAG:
+ ce32 = getCE32FromOffsetCE32(true, c, ce32);
+ break;
+ case Collation.IMPLICIT_TAG:
+ ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c));
+ break;
+ default:
+ throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " +
+ "requires ce32 == base.getFinalCE32(ce32)");
+ }
+ return ce32;
+ }
+
+ /**
+ * Copies base contractions to a list of ConditionalCE32.
+ * Sets cond.next to the index of the first new item
+ * and returns the index of the last new item.
+ */
+ protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32,
+ ConditionalCE32 cond) {
+ int trieIndex = Collation.indexFromCE32(ce32);
+ int index;
+ if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
+ // No match on the single code point.
+ // We are underneath a prefix, and the default mapping is just
+ // a fallback to the mappings for a shorter prefix.
+ assert(context.length() > 1);
+ index = -1;
+ } else {
+ ce32 = base.getCE32FromContexts(trieIndex); // Default if no suffix match.
+ assert(!Collation.isContractionCE32(ce32));
+ ce32 = copyFromBaseCE32(c, ce32, true);
+ cond.next = index = addConditionalCE32(context.toString(), ce32);
+ cond = getConditionalCE32(index);
+ }
+
+ int suffixStart = context.length();
+ // The suffix trie is stored after the 2-unit default CE32 (see addContextTrie()).
+ CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
+ while(suffixes.hasNext()) {
+ CharsTrie.Entry entry = suffixes.next();
+ context.append(entry.chars);
+ ce32 = copyFromBaseCE32(c, entry.value, true);
+ cond.next = index = addConditionalCE32(context.toString(), ce32);
+ // No need to update the unsafeBackwardSet because the tailoring set
+ // is already a copy of the base set.
+ cond = getConditionalCE32(index);
+ // Truncate back to prefix-only context for the next suffix.
+ context.setLength(suffixStart);
+ }
+ assert(index >= 0);
+ return index;
+ }
+
+ /**
+ * Helper for copyFrom(): copies trie ranges from a source builder into a
+ * destination builder, re-encoding each CE32 and applying the CEModifier
+ * to modifiable CEs along the way.
+ */
+ private static final class CopyHelper {
+ CopyHelper(CollationDataBuilder s, CollationDataBuilder d,
+ CollationDataBuilder.CEModifier m) {
+ src = s;
+ dest = d;
+ modifier = m;
+ }
+
+ // Copies one trie value for the code point range [start, end].
+ void copyRangeCE32(int start, int end, int ce32) {
+ ce32 = copyCE32(ce32);
+ dest.trie.setRange(start, end, ce32, true);
+ if(CollationDataBuilder.isBuilderContextCE32(ce32)) {
+ dest.contextChars.add(start, end);
+ }
+ }
+
+ /**
+ * Re-encodes one source CE32 into the destination builder's storage,
+ * applying the modifier where the CE(s) can be modified.
+ */
+ int copyCE32(int ce32) {
+ if(!Collation.isSpecialCE32(ce32)) {
+ // Simple CE32: let the modifier replace the whole CE if it wants to.
+ long ce = modifier.modifyCE32(ce32);
+ if(ce != Collation.NO_CE) {
+ ce32 = dest.encodeOneCE(ce);
+ }
+ } else {
+ int tag = Collation.tagFromCE32(ce32);
+ if(tag == Collation.EXPANSION32_TAG) {
+ int[] srcCE32s = src.ce32s.getBuffer();
+ int srcIndex = Collation.indexFromCE32(ce32);
+ int length = Collation.lengthFromCE32(ce32);
+ // Inspect the source CE32s. Just copy them if none are modified.
+ // Otherwise copy to modifiedCEs, with modifications.
+ boolean isModified = false;
+ for(int i = 0; i < length; ++i) {
+ ce32 = srcCE32s[srcIndex + i];
+ long ce;
+ if(Collation.isSpecialCE32(ce32) ||
+ (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) {
+ if(isModified) {
+ modifiedCEs[i] = Collation.ceFromCE32(ce32);
+ }
+ } else {
+ if(!isModified) {
+ // First modification: back-fill the unmodified CEs so far.
+ for(int j = 0; j < i; ++j) {
+ modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]);
+ }
+ isModified = true;
+ }
+ modifiedCEs[i] = ce;
+ }
+ }
+ if(isModified) {
+ ce32 = dest.encodeCEs(modifiedCEs, length);
+ } else {
+ ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length);
+ }
+ } else if(tag == Collation.EXPANSION_TAG) {
+ long[] srcCEs = src.ce64s.getBuffer();
+ int srcIndex = Collation.indexFromCE32(ce32);
+ int length = Collation.lengthFromCE32(ce32);
+ // Inspect the source CEs. Just copy them if none are modified.
+ // Otherwise copy to modifiedCEs, with modifications.
+ boolean isModified = false;
+ for(int i = 0; i < length; ++i) {
+ long srcCE = srcCEs[srcIndex + i];
+ long ce = modifier.modifyCE(srcCE);
+ if(ce == Collation.NO_CE) {
+ if(isModified) {
+ modifiedCEs[i] = srcCE;
+ }
+ } else {
+ if(!isModified) {
+ // First modification: back-fill the unmodified CEs so far.
+ for(int j = 0; j < i; ++j) {
+ modifiedCEs[j] = srcCEs[srcIndex + j];
+ }
+ isModified = true;
+ }
+ modifiedCEs[i] = ce;
+ }
+ }
+ if(isModified) {
+ ce32 = dest.encodeCEs(modifiedCEs, length);
+ } else {
+ ce32 = dest.encodeExpansion(srcCEs, srcIndex, length);
+ }
+ } else if(tag == Collation.BUILDER_DATA_TAG) {
+ // Copy the list of ConditionalCE32.
+ ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32);
+ assert(!cond.hasContext());
+ int destIndex = dest.addConditionalCE32(
+ cond.context, copyCE32(cond.ce32));
+ ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex);
+ while(cond.next >= 0) {
+ cond = src.getConditionalCE32(cond.next);
+ // Re-fetch the previous dest node: copyCE32() below may grow the list.
+ ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex);
+ destIndex = dest.addConditionalCE32(
+ cond.context, copyCE32(cond.ce32));
+ int suffixStart = cond.prefixLength() + 1;
+ dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart));
+ prevDestCond.next = destIndex;
+ }
+ } else {
+ // Just copy long CEs and Latin mini expansions (and other expected values) as is,
+ // assuming that the modifier would not modify them.
+ assert(tag == Collation.LONG_PRIMARY_TAG ||
+ tag == Collation.LONG_SECONDARY_TAG ||
+ tag == Collation.LATIN_EXPANSION_TAG ||
+ tag == Collation.HANGUL_TAG);
+ }
+ }
+ return ce32;
+ }
+
+ CollationDataBuilder src;
+ CollationDataBuilder dest;
+ CollationDataBuilder.CEModifier modifier;
+ // Scratch buffer for expansions that contain at least one modified CE.
+ long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH];
+ }
+
+ /** Range callback for copyFrom(): skips unassigned/fallback ranges, copies the rest. */
+ private static void
+ enumRangeForCopy(int start, int end, int value, CopyHelper helper) {
+ if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) {
+ helper.copyRangeCE32(start, end, value);
+ }
+ }
+
+ /**
+ * Fills jamoCE32s with one CE32 per conjoining Jamo (L, V, T),
+ * resolving offset CE32s and copying context/expansion data from the base
+ * when any Jamo is tailored.
+ *
+ * @return true if any Jamo has an assigned (tailored) CE32, or if there is no base
+ */
+ protected boolean getJamoCE32s(int jamoCE32s[]) {
+ boolean anyJamoAssigned = base == null; // always set jamoCE32s in the base data
+ boolean needToCopyFromBase = false;
+ for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types.
+ int jamo = jamoCpFromIndex(j);
+ boolean fromBase = false;
+ int ce32 = trie.get(jamo);
+ anyJamoAssigned |= Collation.isAssignedCE32(ce32);
+ // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
+ // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
+ if(ce32 == Collation.FALLBACK_CE32) {
+ fromBase = true;
+ ce32 = base.getCE32(jamo);
+ }
+ if(Collation.isSpecialCE32(ce32)) {
+ switch(Collation.tagFromCE32(ce32)) {
+ case Collation.LONG_PRIMARY_TAG:
+ case Collation.LONG_SECONDARY_TAG:
+ case Collation.LATIN_EXPANSION_TAG:
+ // Copy the ce32 as-is.
+ break;
+ case Collation.EXPANSION32_TAG:
+ case Collation.EXPANSION_TAG:
+ case Collation.PREFIX_TAG:
+ case Collation.CONTRACTION_TAG:
+ if(fromBase) {
+ // Defer copying until we know if anyJamoAssigned.
+ ce32 = Collation.FALLBACK_CE32;
+ needToCopyFromBase = true;
+ }
+ break;
+ case Collation.IMPLICIT_TAG:
+ // An unassigned Jamo should only occur in tests with incomplete bases.
+ assert(fromBase);
+ ce32 = Collation.FALLBACK_CE32;
+ needToCopyFromBase = true;
+ break;
+ case Collation.OFFSET_TAG:
+ ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
+ break;
+ case Collation.FALLBACK_TAG:
+ case Collation.RESERVED_TAG_3:
+ case Collation.BUILDER_DATA_TAG:
+ case Collation.DIGIT_TAG:
+ case Collation.U0000_TAG:
+ case Collation.HANGUL_TAG:
+ case Collation.LEAD_SURROGATE_TAG:
+ throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32));
+ }
+ }
+ jamoCE32s[j] = ce32;
+ }
+ if(anyJamoAssigned && needToCopyFromBase) {
+ // Second pass: now copy the deferred base context/expansion data.
+ for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {
+ if(jamoCE32s[j] == Collation.FALLBACK_CE32) {
+ int jamo = jamoCpFromIndex(j);
+ jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo),
+ /*withContext=*/ true);
+ }
+ }
+ }
+ return anyJamoAssigned;
+ }
+
+ /**
+ * Wraps each tailored decimal-digit mapping in a DIGIT_TAG CE32 that also
+ * records the digit value, for use by numeric (CODAN) collation.
+ */
+ protected void setDigitTags() {
+ UnicodeSet digits = new UnicodeSet("[:Nd:]");
+ UnicodeSetIterator iter = new UnicodeSetIterator(digits);
+ while(iter.next()) {
+ assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
+ int c = iter.codepoint;
+ int ce32 = trie.get(c);
+ if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) {
+ // Store the real CE32 in ce32s and point to it from the digit CE32.
+ int index = addCE32(ce32);
+ if(index > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ ce32 = Collation.makeCE32FromTagIndexAndLength(
+ Collation.DIGIT_TAG, index, UCharacter.digit(c)); // u_charDigitValue(c)
+ trie.set(c, ce32);
+ }
+ }
+ }
+
+ /**
+ * Sets a LEAD_SURROGATE_TAG CE32 for each lead surrogate code unit,
+ * summarizing its 1024 supplementary code points as all-unassigned,
+ * all-fallback, or mixed.
+ */
+ protected void setLeadSurrogates() {
+ for(char lead = 0xd800; lead < 0xdc00; ++lead) {
+ int leadValue = -1;
+ // utrie2_enumForLeadSurrogate(trie, lead, null, , &value);
+ Iterator<Trie2.Range> trieIterator = trie.iteratorForLeadSurrogate(lead);
+ while(trieIterator.hasNext()) {
+ Trie2.Range range = trieIterator.next();
+ // The rest of this loop is equivalent to C++ enumRangeLeadValue().
+ int value = range.value;
+ if(value == Collation.UNASSIGNED_CE32) {
+ value = Collation.LEAD_ALL_UNASSIGNED;
+ } else if(value == Collation.FALLBACK_CE32) {
+ value = Collation.LEAD_ALL_FALLBACK;
+ } else {
+ // A real mapping: this lead surrogate has mixed contents.
+ leadValue = Collation.LEAD_MIXED;
+ break;
+ }
+ if(leadValue < 0) {
+ leadValue = value;
+ } else if(leadValue != value) {
+ leadValue = Collation.LEAD_MIXED;
+ break;
+ }
+ }
+ trie.setForLeadSurrogateCodeUnit(lead,
+ Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue);
+ }
+ }
+
+ /**
+ * Builds the runtime mapping structures (trie, ce32s, ces, contexts,
+ * Jamo table, unsafe-backward set) into data.
+ *
+ * @throws IllegalStateException if called after a previous build()
+ */
+ protected void buildMappings(CollationData data) {
+ if(!isMutable()) {
+ throw new IllegalStateException("attempt to build() after build()");
+ }
+
+ buildContexts();
+
+ int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
+ int jamoIndex = -1;
+ if(getJamoCE32s(jamoCE32s)) {
+ jamoIndex = ce32s.size();
+ for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
+ ce32s.addElement(jamoCE32s[i]);
+ }
+ // Small optimization: Use a bit in the Hangul ce32
+ // to indicate that none of the Jamo CE32s are isSpecialCE32()
+ // (as it should be in the root collator).
+ // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
+ // In order to still have good trie compression and keep this code simple,
+ // we only set this flag if a whole block of 588 Hangul syllables starting with
+ // a common leading consonant (Jamo L) has this property.
+ boolean isAnyJamoVTSpecial = false;
+ for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
+ if(Collation.isSpecialCE32(jamoCE32s[i])) {
+ isAnyJamoVTSpecial = true;
+ break;
+ }
+ }
+ int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
+ int c = Hangul.HANGUL_BASE;
+ for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) { // iterate over the Jamo L
+ int ce32 = hangulCE32;
+ if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) {
+ ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO;
+ }
+ int limit = c + Hangul.JAMO_VT_COUNT;
+ trie.setRange(c, limit - 1, ce32, true);
+ c = limit;
+ }
+ } else {
+ // Copy the Hangul CE32s from the base in blocks per Jamo L,
+ // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
+ for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) {
+ int ce32 = base.getCE32(c);
+ assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG));
+ int limit = c + Hangul.JAMO_VT_COUNT;
+ trie.setRange(c, limit - 1, ce32, true);
+ c = limit;
+ }
+ }
+
+ setDigitTags();
+ setLeadSurrogates();
+
+ // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
+ ce32s.setElementAt(trie.get(0), 0);
+ trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0));
+
+ data.trie = trie.toTrie2_32();
+
+ // Mark each lead surrogate as "unsafe"
+ // if any of its 1024 associated supplementary code points is "unsafe".
+ int c = 0x10000;
+ for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
+ if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
+ unsafeBackwardSet.add(lead);
+ }
+ }
+ unsafeBackwardSet.freeze();
+
+ data.ce32s = ce32s.getBuffer();
+ data.ces = ce64s.getBuffer();
+ data.contexts = contexts.toString();
+
+ data.base = base;
+ if(jamoIndex >= 0) {
+ data.jamoCE32s = jamoCE32s; // C++: data.ce32s + jamoIndex
+ } else {
+ data.jamoCE32s = base.jamoCE32s;
+ }
+ data.unsafeBackwardSet = unsafeBackwardSet;
+ }
+
+ /**
+ * Discards all built context data and invalidates the cached builtCE32
+ * of every ConditionalCE32 list head, so contexts get rebuilt on demand.
+ */
+ protected void clearContexts() {
+ contexts.setLength(0);
+ UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
+ while(iter.next()) {
+ assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
+ int ce32 = trie.get(iter.codepoint);
+ assert(isBuilderContextCE32(ce32));
+ getConditionalCE32ForCE32(ce32).builtCE32 = Collation.NO_CE32;
+ }
+ }
+
+ /**
+ * Builds the runtime context data for every code point in contextChars,
+ * replacing its builder-context CE32 with a PREFIX/CONTRACTION CE32.
+ */
+ protected void buildContexts() {
+ // Ignore abandoned lists and the cached builtCE32,
+ // and build all contexts from scratch.
+ contexts.setLength(0);
+ UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
+ while(iter.next()) {
+ assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
+ int c = iter.codepoint;
+ int ce32 = trie.get(c);
+ if(!isBuilderContextCE32(ce32)) {
+ throw new AssertionError("Impossible: No context data for c in contextChars.");
+ }
+ ConditionalCE32 cond = getConditionalCE32ForCE32(ce32);
+ ce32 = buildContext(cond);
+ trie.set(c, ce32);
+ }
+ }
+
+ /**
+ * Builds the runtime form (prefix trie and per-prefix contraction tries)
+ * for one code point's ConditionalCE32 list and returns its CE32.
+ */
+ protected int buildContext(ConditionalCE32 head) {
+ // The list head must have no context.
+ assert(!head.hasContext());
+ // The list head must be followed by one or more nodes that all do have context.
+ assert(head.next >= 0);
+ CharsTrieBuilder prefixBuilder = new CharsTrieBuilder();
+ CharsTrieBuilder contractionBuilder = new CharsTrieBuilder();
+ // Outer loop: one iteration per distinct prefix.
+ for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) {
+ // After the list head, the prefix or suffix can be empty, but not both.
+ assert(cond == head || cond.hasContext());
+ int prefixLength = cond.prefixLength();
+ StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1);
+ String prefixString = prefix.toString();
+ // Collect all contraction suffixes for one prefix.
+ ConditionalCE32 firstCond = cond;
+ ConditionalCE32 lastCond = cond;
+ while(cond.next >= 0 &&
+ (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)) {
+ lastCond = cond;
+ }
+ int ce32;
+ int suffixStart = prefixLength + 1; // == prefix.length()
+ if(lastCond.context.length() == suffixStart) {
+ // One prefix without contraction suffix.
+ assert(firstCond == lastCond);
+ ce32 = lastCond.ce32;
+ cond = lastCond;
+ } else {
+ // Build the contractions trie.
+ contractionBuilder.clear();
+ // Entry for an empty suffix, to be stored before the trie.
+ int emptySuffixCE32 = Collation.NO_CE32; // Will always be set to a real value.
+ int flags = 0;
+ if(firstCond.context.length() == suffixStart) {
+ // There is a mapping for the prefix and the single character c. (p|c)
+ // If no other suffix matches, then we return this value.
+ emptySuffixCE32 = firstCond.ce32;
+ cond = getConditionalCE32(firstCond.next);
+ } else {
+ // There is no mapping for the prefix and just the single character.
+ // (There is no p|c, only p|cd, p|ce etc.)
+ flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH;
+ // When the prefix matches but none of the prefix-specific suffixes,
+ // then we fall back to the mappings with the next-longest prefix,
+ // and ultimately to mappings with no prefix.
+ // Each fallback might be another set of contractions.
+ // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
+ // then in text "pch" we find the ch contraction.
+ for(cond = head;; cond = getConditionalCE32(cond.next)) {
+ int length = cond.prefixLength();
+ if(length == prefixLength) { break; }
+ if(cond.defaultCE32 != Collation.NO_CE32 &&
+ (length==0 || prefixString.regionMatches(
+ prefix.length() - length, cond.context, 1, length)
+ /* C++: prefix.endsWith(cond.context, 1, length) */)) {
+ emptySuffixCE32 = cond.defaultCE32;
+ }
+ }
+ cond = firstCond;
+ }
+ // Optimization: Set a flag when
+ // the first character of every contraction suffix has lccc!=0.
+ // Short-circuits contraction matching when a normal letter follows.
+ flags |= Collation.CONTRACT_NEXT_CCC;
+ // Add all of the non-empty suffixes into the contraction trie.
+ for(;;) {
+ String suffix = cond.context.substring(suffixStart);
+ int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0));
+ if(fcd16 <= 0xff) {
+ // Some suffix starts with lccc==0: cannot use the NEXT_CCC shortcut.
+ flags &= ~Collation.CONTRACT_NEXT_CCC;
+ }
+ fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length()));
+ if(fcd16 > 0xff) {
+ // The last suffix character has lccc!=0, allowing for discontiguous contractions.
+ flags |= Collation.CONTRACT_TRAILING_CCC;
+ }
+ contractionBuilder.add(suffix, cond.ce32);
+ if(cond == lastCond) { break; }
+ cond = getConditionalCE32(cond.next);
+ }
+ int index = addContextTrie(emptySuffixCE32, contractionBuilder);
+ if(index > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many context-sensitive mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags;
+ }
+ assert(cond == lastCond);
+ firstCond.defaultCE32 = ce32;
+ if(prefixLength == 0) {
+ if(cond.next < 0) {
+ // No non-empty prefixes, only contractions.
+ return ce32;
+ }
+ } else {
+ prefix.delete(0, 1); // Remove the length unit.
+ prefix.reverse();
+ prefixBuilder.add(prefix, ce32);
+ if(cond.next < 0) { break; }
+ }
+ }
+ assert(head.defaultCE32 != Collation.NO_CE32);
+ int index = addContextTrie(head.defaultCE32, prefixBuilder);
+ if(index > Collation.MAX_INDEX) {
+ throw new IndexOutOfBoundsException("too many context-sensitive mappings");
+ // BufferOverflowException is a better fit
+ // but cannot be constructed with a message string.
+ }
+ return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index);
+ }
+
+ protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) {
+ StringBuilder context = new StringBuilder();
+ context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32);
+ context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL));
+ int index = contexts.indexOf(context.toString());
+ if(index < 0) {
+ index = contexts.length();
+ contexts.append(context);
+ }
+ return index;
+ }
+
+ protected void buildFastLatinTable(CollationData data) {
+ if(!fastLatinEnabled) { return; }
+
+ fastLatinBuilder = new CollationFastLatinBuilder();
+ if(fastLatinBuilder.forData(data)) {
+ char[] header = fastLatinBuilder.getHeader();
+ char[] table = fastLatinBuilder.getTable();
+ if(base != null &&
+ Arrays.equals(header, base.fastLatinTableHeader) &&
+ Arrays.equals(table, base.fastLatinTable)) {
+ // Same fast Latin table as in the base, use that one instead.
+ fastLatinBuilder = null;
+ header = base.fastLatinTableHeader;
+ table = base.fastLatinTable;
+ }
+ data.fastLatinTableHeader = header;
+ data.fastLatinTable = table;
+ } else {
+ fastLatinBuilder = null;
+ }
+ }
+
+ protected int getCEs(CharSequence s, int start, long ces[], int cesLength) {
+ if(collIter == null) {
+ collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl));
+ if(collIter == null) { return 0; }
+ }
+ return collIter.fetchCEs(s, start, ces, cesLength);
+ }
+
+ protected static int jamoCpFromIndex(int i) {
+ // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27
+ if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; }
+ i -= Hangul.JAMO_L_COUNT;
+ if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; }
+ i -= Hangul.JAMO_V_COUNT;
+ // i < 27
+ return Hangul.JAMO_T_BASE + 1 + i;
+ }
+
    /**
     * Build-time collation element and character iterator.
     * Uses the runtime CollationIterator for fetching CEs for a string
     * but reads from the builder's unfinished data structures.
     * In particular, this class reads from the unfinished trie
     * and has to avoid CollationIterator.nextCE() and redirect other
     * calls to data.getCE32() and data.getCE32FromSupplementary().
     *
     * We do this so that we need not implement the collation algorithm
     * again for the builder and make it behave exactly like the runtime code.
     * That would be more difficult to test and maintain than this indirection.
     *
     * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
     * so the data accesses from those code paths need not be modified.
     *
     * This class iterates directly over whole code points
     * so that the CollationIterator does not need the finished trie
     * for handling the LEAD_SURROGATE_TAG.
     */
    private static final class DataBuilderCollationIterator extends CollationIterator {
        DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) {
            super(newData, /*numeric=*/ false);
            builder = b;
            builderData = newData;
            builderData.base = builder.base;
            // Set all of the jamoCE32s[] to indirection CE32s.
            for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
                int jamo = CollationDataBuilder.jamoCpFromIndex(j);
                // Each Jamo CE32 redirects back into the builder (BUILDER_DATA_TAG)
                // and carries the Jamo code point itself in the index field.
                jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) |
                        CollationDataBuilder.IS_BUILDER_JAMO_CE32;
            }
            builderData.jamoCE32s = jamoCE32s;
        }

        /**
         * Appends the CEs for str[start..] to ces[] and returns the total CE count.
         * Only the first Collation.MAX_EXPANSION_LENGTH CEs are stored;
         * the returned count keeps growing past that so the caller can detect overflow.
         */
        int fetchCEs(CharSequence str, int start, long ces[], int cesLength) {
            // Set the pointers each time, in case they changed due to reallocation.
            builderData.ce32s = builder.ce32s.getBuffer();
            builderData.ces = builder.ce64s.getBuffer();
            builderData.contexts = builder.contexts.toString();
            // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32().
            reset();
            s = str;
            pos = start;
            while(pos < s.length()) {
                // No need to keep all CEs in the iterator buffer.
                clearCEs();
                int c = Character.codePointAt(s, pos);
                pos += Character.charCount(c);
                int ce32 = builder.trie.get(c);
                CollationData d;
                if(ce32 == Collation.FALLBACK_CE32) {
                    // c is not tailored by this builder; read from the base data.
                    d = builder.base;
                    ce32 = builder.base.getCE32(c);
                } else {
                    d = builderData;
                }
                appendCEsFromCE32(d, c, ce32, /*forward=*/ true);
                for(int i = 0; i < getCEsLength(); ++i) {
                    long ce = getCE(i);
                    if(ce != 0) {
                        if(cesLength < Collation.MAX_EXPANSION_LENGTH) {
                            ces[cesLength] = ce;
                        }
                        // Count every CE even once the output array is full.
                        ++cesLength;
                    }
                }
            }
            return cesLength;
        }

        @Override
        public void resetToOffset(int newOffset) {
            reset();
            pos = newOffset;
        }

        @Override
        public int getOffset() {
            return pos;
        }

        @Override
        public int nextCodePoint() {
            if(pos == s.length()) {
                return Collation.SENTINEL_CP;
            }
            int c = Character.codePointAt(s, pos);
            pos += Character.charCount(c);
            return c;
        }

        @Override
        public int previousCodePoint() {
            if(pos == 0) {
                return Collation.SENTINEL_CP;
            }
            int c = Character.codePointBefore(s, pos);
            pos -= Character.charCount(c);
            return c;
        }

        @Override
        protected void forwardNumCodePoints(int num) {
            pos = Character.offsetByCodePoints(s, pos, num);
        }

        @Override
        protected void backwardNumCodePoints(int num) {
            pos = Character.offsetByCodePoints(s, pos, -num);
        }

        @Override
        protected int getDataCE32(int c) {
            // Read from the builder's unfinished trie rather than finished data.
            return builder.trie.get(c);
        }

        @Override
        protected int getCE32FromBuilderData(int ce32) {
            assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG));
            if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) {
                // Jamo indirection: the CE32 index field is the Jamo code point.
                int jamo = Collation.indexFromCE32(ce32);
                return builder.trie.get(jamo);
            } else {
                ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32);
                if(cond.builtCE32 == Collation.NO_CE32) {
                    // Build the context-sensitive mappings into their runtime form and cache the result.
                    try {
                        cond.builtCE32 = builder.buildContext(cond);
                    } catch(IndexOutOfBoundsException e) {
                        // The contexts string overflowed its index range:
                        // discard all cached contexts and rebuild this one.
                        builder.clearContexts();
                        cond.builtCE32 = builder.buildContext(cond);
                    }
                    builderData.contexts = builder.contexts.toString();
                }
                return cond.builtCE32;
            }
        }

        protected final CollationDataBuilder builder;
        protected final CollationData builderData;
        protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
        protected CharSequence s;   // text currently being iterated
        protected int pos;          // index within s, in char (UTF-16 unit) terms
    }
+
+ protected final boolean isMutable() {
+ // C++ tests !(trie == NULL || utrie2_isFrozen(trie))
+ // but Java Trie2Writable does not have an observable isFrozen() state.
+ return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen();
+ }
+
    /**
     * Flag bit OR-ed into BUILDER_DATA_TAG CE32s whose index field holds a
     * Jamo code point (see DataBuilderCollationIterator.getCE32FromBuilderData()).
     * @see Collation.BUILDER_DATA_TAG
     */
    private static final int IS_BUILDER_JAMO_CE32 = 0x100;

    protected Normalizer2Impl nfcImpl;
    // Base (root) collation data; null when building the root itself.
    protected CollationData base;
    protected CollationSettings baseSettings;
    // Mutable code point -> CE32 mappings under construction.
    protected Trie2Writable trie;
    protected UVector32 ce32s;
    protected UVector64 ce64s;
    protected ArrayList<ConditionalCE32> conditionalCE32s;  // vector of ConditionalCE32
    // Characters that have context (prefixes or contraction suffixes).
    protected UnicodeSet contextChars = new UnicodeSet();
    // Serialized UCharsTrie structures for finalized contexts.
    protected StringBuilder contexts = new StringBuilder();
    protected UnicodeSet unsafeBackwardSet = new UnicodeSet();
    // True once any mapping has been added/changed by this builder.
    protected boolean modified;

    protected boolean fastLatinEnabled;
    protected CollationFastLatinBuilder fastLatinBuilder;

    // Lazily created by getCEs(); reads the unfinished builder data.
    protected DataBuilderCollationIterator collIter;
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationDataReader.java, ported from collationdatareader.h/.cpp
+*
+* C++ version created on: 2013feb07
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import com.ibm.icu.impl.ICUBinary;
+import com.ibm.icu.impl.Trie2_32;
+import com.ibm.icu.impl.USerializedSet;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.UnicodeSet;
+
/**
 * Collation binary data reader.
 * Reads the ICU "UCol" binary format sequentially from an InputStream:
 * first the int indexes[], then each data part in ascending byte-offset order.
 */
final class CollationDataReader /* all static */ {
    // The following constants are also copied into source/common/ucol_swp.cpp.
    // Keep them in sync!
    /**
     * Number of int indexes.
     *
     * Can be 2 if there are only options.
     * Can be 7 or 8 if there are only options and a script reordering.
     * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
     */
    static final int IX_INDEXES_LENGTH = 0;
    /**
     * Bits 31..24: numericPrimary, for numeric collation
     *      23..16: fast Latin format version (0 = no fast Latin table)
     *      15.. 0: options bit set
     */
    static final int IX_OPTIONS = 1;
    static final int IX_RESERVED2 = 2;
    static final int IX_RESERVED3 = 3;

    /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
    static final int IX_JAMO_CE32S_START = 4;

    // Byte offsets from the start of the data, after the generic header.
    // The indexes[] are at byte offset 0, other data follows.
    // Each data item is aligned properly.
    // The data items should be in descending order of unit size,
    // to minimize the need for padding.
    // Each item's byte length is given by the difference between its offset and
    // the next index/offset value.
    /** Byte offset to int reorderCodes[]. */
    static final int IX_REORDER_CODES_OFFSET = 5;
    /**
     * Byte offset to uint8_t reorderTable[].
     * Empty table if <256 bytes (padding only).
     * Otherwise 256 bytes or more (with padding).
     */
    static final int IX_REORDER_TABLE_OFFSET = 6;
    /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
    static final int IX_TRIE_OFFSET = 7;

    static final int IX_RESERVED8_OFFSET = 8;
    /** Byte offset to long ces[]. */
    static final int IX_CES_OFFSET = 9;
    static final int IX_RESERVED10_OFFSET = 10;
    /** Byte offset to int ce32s[]. */
    static final int IX_CE32S_OFFSET = 11;

    /** Byte offset to uint32_t rootElements[]. */
    static final int IX_ROOT_ELEMENTS_OFFSET = 12;
    /** Byte offset to UChar *contexts[]. */
    static final int IX_CONTEXTS_OFFSET = 13;
    /** Byte offset to char [] with serialized unsafeBackwardSet. */
    static final int IX_UNSAFE_BWD_OFFSET = 14;
    /** Byte offset to char fastLatinTable[]. */
    static final int IX_FAST_LATIN_TABLE_OFFSET = 15;

    /** Byte offset to char scripts[]. */
    static final int IX_SCRIPTS_OFFSET = 16;
    /**
     * Byte offset to boolean compressibleBytes[].
     * Empty table if <256 bytes (padding only).
     * Otherwise 256 bytes or more (with padding).
     */
    static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
    static final int IX_RESERVED18_OFFSET = 18;
    static final int IX_TOTAL_SIZE = 19;

    /**
     * Reads one collation binary from inBytes into the tailoring object.
     *
     * @param base      the base (root) tailoring, or null when loading the root itself
     * @param inBytes   stream positioned at the start of the ICU data header
     * @param tailoring output object, assumed to be in initial (empty) state
     * @throws IOException if the stream cannot be read
     * @throws RuntimeException if the data is malformed or inconsistent with base
     */
    static void read(CollationTailoring base, InputStream inBytes,
                     CollationTailoring tailoring) throws IOException {
        BufferedInputStream bis = new BufferedInputStream(inBytes);
        tailoring.version = ICUBinary.readHeaderAndDataVersion(bis, DATA_FORMAT, IS_ACCEPTABLE);
        if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
            throw new RuntimeException("Tailoring UCA version differs from base data UCA version");
        }

        DataInputStream ds = new DataInputStream(bis);
        int indexesLength = ds.readInt();  // inIndexes[IX_INDEXES_LENGTH]
        if(indexesLength < 2) {
            throw new RuntimeException("not enough indexes");
        }
        // One extra slot so that inIndexes[index + 1] is always valid below.
        int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
        inIndexes[0] = indexesLength;
        for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
            inIndexes[i] = ds.readInt();
        }
        // Missing trailing indexes read as -1 (part absent).
        for(int i = indexesLength; i < inIndexes.length; ++i) {
            inIndexes[i] = -1;
        }
        if(indexesLength > inIndexes.length) {
            // Skip unknown extra indexes from a newer format version.
            ds.skipBytes((indexesLength - inIndexes.length) * 4);
        }

        // Assume that the tailoring data is in initial state,
        // with null pointers and 0 lengths.

        // Set pointers to non-empty data parts.
        // Do this in order of their byte offsets. (Should help porting to Java.)

        int index;  // one of the indexes[] slots
        int offset;  // byte offset for the index part
        int length;  // number of bytes in the index part

        CollationData baseData = base == null ? null : base.data;
        int[] reorderCodes;
        index = IX_REORDER_CODES_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 4) {
            if(baseData == null) {
                // We assume for collation settings that
                // the base data does not have a reordering.
                throw new RuntimeException("Collation base data must not reorder scripts");
            }
            reorderCodes = new int[length / 4];
            for(int i = 0; i < length / 4; ++i) {
                reorderCodes[i] = ds.readInt();
            }
            length &= 3;  // remaining padding bytes
        } else {
            reorderCodes = new int[0];
        }
        ds.skipBytes(length);

        // There should be a reorder table only if there are reorder codes.
        // However, when there are reorder codes the reorder table may be omitted to reduce
        // the data size.
        byte[] reorderTable = null;
        index = IX_REORDER_TABLE_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 256) {
            if(reorderCodes.length == 0) {
                throw new RuntimeException("Reordering table without reordering codes");
            }
            reorderTable = new byte[256];
            ds.readFully(reorderTable);
            length -= 256;
        } else {
            // If we have reorder codes, then build the reorderTable at the end,
            // when the CollationData is otherwise complete.
        }
        ds.skipBytes(length);

        if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
            throw new RuntimeException("Tailoring numeric primary weight differs from base data");
        }
        CollationData data = null;  // Remains null if there are no mappings.

        index = IX_TRIE_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 8) {
            tailoring.ensureOwnedData();
            data = tailoring.ownedData;
            data.base = baseData;
            data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
            data.trie = tailoring.trie = Trie2_32.createFromSerialized(ds);
            int trieLength = data.trie.getSerializedLength();
            if(trieLength > length) {
                throw new RuntimeException("Not enough bytes for the mappings trie");  // No mappings.
            }
            length -= trieLength;
        } else if(baseData != null) {
            // Use the base data. Only the settings are tailored.
            tailoring.data = baseData;
        } else {
            throw new RuntimeException("Missing collation data mappings");  // No mappings.
        }
        ds.skipBytes(length);

        index = IX_RESERVED8_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        ds.skipBytes(length);

        index = IX_CES_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 8) {
            if(data == null) {
                throw new RuntimeException("Tailored ces without tailored trie");
            }
            data.ces = new long[length / 8];
            for(int i = 0; i < length / 8; ++i) {
                data.ces[i] = ds.readLong();
            }
            length &= 7;  // remaining padding bytes
        }
        ds.skipBytes(length);

        index = IX_RESERVED10_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        ds.skipBytes(length);

        index = IX_CE32S_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 4) {
            if(data == null) {
                throw new RuntimeException("Tailored ce32s without tailored trie");
            }
            data.ce32s = new int[length / 4];
            for(int i = 0; i < length / 4; ++i) {
                data.ce32s[i] = ds.readInt();
            }
            length &= 3;
        }
        ds.skipBytes(length);

        int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
        if(jamoCE32sStart >= 0) {
            if(data == null || data.ce32s == null) {
                throw new RuntimeException("JamoCE32sStart index into non-existent ce32s[]");
            }
            data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
            System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            // Inherit the Jamo CE32s from the base data.
            data.jamoCE32s = baseData.jamoCE32s;
        } else {
            throw new RuntimeException("Missing Jamo CE32s for Hangul processing");
        }

        index = IX_ROOT_ELEMENTS_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 4) {
            int rootElementsLength = length / 4;
            if(data == null) {
                throw new RuntimeException("Root elements but no mappings");
            }
            if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
                throw new RuntimeException("Root elements array too short");
            }
            data.rootElements = new long[rootElementsLength];
            for(int i = 0; i < rootElementsLength; ++i) {
                data.rootElements[i] = ds.readInt() & 0xffffffffL;  // unsigned int -> long
            }
            long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
            if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
                throw new RuntimeException("Common sec/ter weights in base data differ from the hardcoded value");
            }
            long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
            if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
                // [fixed last secondary common byte] is too low,
                // and secondary weights would collide with compressed common secondaries.
                throw new RuntimeException("[fixed last secondary common byte] is too low");
            }
            length &= 3;
        }
        ds.skipBytes(length);

        index = IX_CONTEXTS_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 2) {
            if(data == null) {
                throw new RuntimeException("Tailored contexts without tailored trie");
            }
            StringBuilder sb = new StringBuilder(length / 2);
            for(int i = 0; i < length / 2; ++i) {
                sb.append(ds.readChar());
            }
            data.contexts = sb.toString();
            length &= 1;
        }
        ds.skipBytes(length);

        index = IX_UNSAFE_BWD_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 2) {
            if(data == null) {
                throw new RuntimeException("Unsafe-backward-set but no mappings");
            }
            if(baseData == null) {
                // Create the unsafe-backward set for the root collator.
                // Include all non-zero combining marks and trail surrogates.
                // We do this at load time, rather than at build time,
                // to simplify Unicode version bootstrapping:
                // The root data builder only needs the new FractionalUCA.txt data,
                // but it need not be built with a version of ICU already updated to
                // the corresponding new Unicode Character Database.
                //
                // The following is an optimized version of
                // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
                // It is faster and requires fewer code dependencies.
                tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
                data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
            } else {
                // Clone the root collator's set contents.
                tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
            }
            // Add the ranges from the data file to the unsafe-backward set.
            USerializedSet sset = new USerializedSet();
            char[] unsafeData = new char[length / 2];
            for(int i = 0; i < length / 2; ++i) {
                unsafeData[i] = ds.readChar();
            }
            length &= 1;
            sset.getSet(unsafeData, 0);
            int count = sset.countRanges();
            int[] range = new int[2];
            for(int i = 0; i < count; ++i) {
                sset.getRange(i, range);
                tailoring.unsafeBackwardSet.add(range[0], range[1]);
            }
            // Mark each lead surrogate as "unsafe"
            // if any of its 1024 associated supplementary code points is "unsafe".
            int c = 0x10000;
            for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
                if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
                    tailoring.unsafeBackwardSet.add(lead);
                }
            }
            tailoring.unsafeBackwardSet.freeze();
            data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            // No tailoring-specific data: Alias the root collator's set.
            data.unsafeBackwardSet = baseData.unsafeBackwardSet;
        } else {
            throw new RuntimeException("Missing unsafe-backward-set");
        }
        ds.skipBytes(length);

        // If the fast Latin format version is different,
        // or the version is set to 0 for "no fast Latin table",
        // then just always use the normal string comparison path.
        index = IX_FAST_LATIN_TABLE_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(data != null) {
            data.fastLatinTable = null;
            data.fastLatinTableHeader = null;
            if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
                if(length >= 2) {
                    // The first char carries the table version (high byte)
                    // and the header length (low byte).
                    char header0 = ds.readChar();
                    int headerLength = header0 & 0xff;
                    data.fastLatinTableHeader = new char[headerLength];
                    data.fastLatinTableHeader[0] = header0;
                    for(int i = 1; i < headerLength; ++i) {
                        data.fastLatinTableHeader[i] = ds.readChar();
                    }
                    int tableLength = length / 2 - headerLength;
                    data.fastLatinTable = new char[tableLength];
                    for(int i = 0; i < tableLength; ++i) {
                        data.fastLatinTable[i] = ds.readChar();
                    }
                    length &= 1;
                    if((header0 >> 8) != CollationFastLatin.VERSION) {
                        throw new RuntimeException("Fast-Latin table version differs from version in data header");
                    }
                } else if(baseData != null) {
                    data.fastLatinTable = baseData.fastLatinTable;
                    data.fastLatinTableHeader = baseData.fastLatinTableHeader;
                }
            }
        }
        ds.skipBytes(length);

        index = IX_SCRIPTS_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 2) {
            if(data == null) {
                throw new RuntimeException("Script order data but no mappings");
            }
            data.scripts = new char[length / 2];
            for(int i = 0; i < length / 2; ++i) {
                data.scripts[i] = ds.readChar();
            }
            length &= 1;
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            data.scripts = baseData.scripts;
        }
        ds.skipBytes(length);

        index = IX_COMPRESSIBLE_BYTES_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 256) {
            if(data == null) {
                throw new RuntimeException("Data for compressible primary lead bytes but no mappings");
            }
            data.compressibleBytes = new boolean[256];
            for(int i = 0; i < 256; ++i) {
                data.compressibleBytes[i] = ds.readBoolean();
            }
            length -= 256;
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            data.compressibleBytes = baseData.compressibleBytes;
        } else {
            throw new RuntimeException("Missing data for compressible primary lead bytes");
        }
        ds.skipBytes(length);

        index = IX_RESERVED18_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        ds.skipBytes(length);

        ds.close();

        CollationSettings ts = tailoring.settings.readOnly();
        int options = inIndexes[IX_OPTIONS] & 0xffff;
        char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
        int fastLatinOptions = CollationFastLatin.getOptions(
                tailoring.data, ts, fastLatinPrimaries);
        // If the loaded settings match the current (shared) settings,
        // keep the shared object and avoid a copy-on-write.
        if(options == ts.options && ts.variableTop != 0 &&
                Arrays.equals(reorderCodes, ts.reorderCodes) &&
                fastLatinOptions == ts.fastLatinOptions &&
                (fastLatinOptions < 0 ||
                    Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
            return;
        }

        CollationSettings settings = tailoring.settings.copyOnWrite();
        settings.options = options;
        // Set variableTop from options and scripts data.
        settings.variableTop = tailoring.data.getLastPrimaryForGroup(
                Collator.ReorderCodes.FIRST + settings.getMaxVariable());
        if(settings.variableTop == 0) {
            throw new RuntimeException("The maxVariable could not be mapped to a variableTop");
        }

        if(reorderCodes.length == 0 || reorderTable != null) {
            settings.setReordering(reorderCodes, reorderTable);
        } else {
            // The table was omitted from the data; rebuild it from the codes.
            byte[] table = new byte[256];
            baseData.makeReorderTable(reorderCodes, table);
            settings.setReordering(reorderCodes, table);
        }

        settings.fastLatinOptions = CollationFastLatin.getOptions(
                tailoring.data, settings,
                settings.fastLatinPrimaries);
    }

    /** Accepts only binary data format version 4. */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        // @Override when we switch to Java 6
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 4;
        }
    }
    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    private static final byte DATA_FORMAT[] = { 0x55, 0x43, 0x6f, 0x6c };  // "UCol"

    private CollationDataReader() {}  // no constructor
}
+
+/*
+ * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
+ * See ICU4C source/common/collationdatareader.h.
+ */
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2012-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationFCD.java, ported from collationfcd.h/.cpp
+*
+* C++ version created on: 2012aug18
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.text.UTF16;
+
+/**
+ * Data and functions for the FCD check fast path.
+ *
+ * The fast path looks at a pair of 16-bit code units and checks
+ * whether there is an FCD boundary between them;
+ * there is if the first unit has a trailing ccc=0 (!hasTccc(first))
+ * or the second unit has a leading ccc=0 (!hasLccc(second)),
+ * or both.
+ * When the fast path finds a possible non-boundary,
+ * then the FCD check slow path looks at the actual sequence of FCD values.
+ *
+ * This is a pure optimization.
+ * The fast path must at least find all possible non-boundaries.
+ * If the fast path is too pessimistic, it costs performance.
+ *
+ * For a pair of BMP characters, the fast path tests are precise (1 bit per character).
+ *
+ * For a supplementary code point, the two units are its lead and trail surrogates.
+ * We set hasTccc(lead)=true if any of its 1024 associated supplementary code points
+ * has lccc!=0 or tccc!=0.
+ * We set hasLccc(trail)=true for all trail surrogates.
+ * As a result, we leave the fast path if the lead surrogate might start a
+ * supplementary code point that is not FCD-inert.
+ * (So the fast path need not detect that there is a surrogate pair,
+ * nor look ahead to the next full code point.)
+ *
+ * hasLccc(lead)=true if any of its 1024 associated supplementary code points
+ * has lccc!=0, for fast boundary checking between BMP & supplementary.
+ *
+ * hasTccc(trail)=false:
+ * It should only be tested for unpaired trail surrogates which are FCD-inert.
+ */
+public final class CollationFCD {
+ public static boolean hasLccc(int c) {
+ assert c <= 0xffff;
+ // c can be negative, e.g., Collation.SENTINEL_CP from UCharIterator;
+ // that is handled in the first test.
+ int i;
+ return
+ // U+0300 is the first character with lccc!=0.
+ c >= 0x300 &&
+ (i = lcccIndex[c >> 5]) != 0 &&
+ (lcccBits[i] & (1 << (c & 0x1f))) != 0;
+ }
+
+ public static boolean hasTccc(int c) {
+ assert c <= 0xffff;
+ // c can be negative, e.g., Collation.SENTINEL_CP from UCharIterator;
+ // that is handled in the first test.
+ int i;
+ return
+ // U+00C0 is the first character with tccc!=0.
+ c >= 0xc0 &&
+ (i = tcccIndex[c >> 5]) != 0 &&
+ (tcccBits[i] & (1 << (c & 0x1f))) != 0;
+ }
+
+ static boolean mayHaveLccc(int c) {
+ // Handles all of Unicode 0..10FFFF.
+ // c can be negative, e.g., Collation.SENTINEL_CP.
+ // U+0300 is the first character with lccc!=0.
+ if(c < 0x300) { return false; }
+ if(c > 0xffff) { c = UTF16.getLeadSurrogate(c); }
+ int i;
+ return
+ (i = lcccIndex[c >> 5]) != 0 &&
+ (lcccBits[i] & (1 << (c & 0x1f))) != 0;
+ }
+
+ /**
+ * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
+ * must be decomposed before reaching the core collation code,
+ * or else some sequences including them, even ones passing the FCD check,
+ * do not yield canonically equivalent results.
+ *
+ * This is a fast and imprecise test.
+ *
+ * @param c a code point
+ * @return true if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters
+ */
+ static boolean maybeTibetanCompositeVowel(int c) {
+ return (c & 0x1fff01) == 0xf01;
+ }
+
+ /**
+ * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
+ * must be decomposed before reaching the core collation code,
+ * or else some sequences including them, even ones passing the FCD check,
+ * do not yield canonically equivalent results.
+ *
+ * They have distinct lccc/tccc combinations: 129/130 or 129/132.
+ *
+ * @param fcd16 the FCD value (lccc/tccc combination) of a code point
+ * @return true if fcd16 is from U+0F73, U+0F75 or U+0F81
+ */
+ static boolean isFCD16OfTibetanCompositeVowel(int fcd16) {
+ return fcd16 == 0x8182 || fcd16 == 0x8184;
+ }
+
+ // CollationFCD(); // No instantiation.
+
+ // TODO: machine-generate by: icu/tools/unicode/c/genuca/genuca.cpp
+
+ private static final byte[] lcccIndex={
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,1,1,2,3,0,0,0,0,
+0,0,0,0,4,0,0,0,0,0,0,0,5,6,7,0,
+8,0,9,0xa,0,0,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0x10,
+0x11,0x12,0x13,0,0,0,0,0x14,0,0x15,0x16,0,0,0x15,0x17,0,
+0,0x15,0x17,0,0,0x15,0x17,0,0,0x15,0x17,0,0,0,0x17,0,
+0,0,0x18,0,0,0x15,0x17,0,0,0,0x17,0,0,0,0x19,0,
+0,0x1a,0x1b,0,0,0x1c,0x1b,0,0x1c,0x1d,0,0x1e,0x1f,0,0x20,0,
+0,0x21,0,0,0x17,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0x22,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0x23,0x23,0,0,0,0,0x24,0,
+0,0,0,0,0,0x25,0,0,0,0x13,0,0,0,0,0,0,
+0x26,0,0,0x27,0,0,0,0,0,0x23,0x28,0x10,0,0x29,0,0x2a,
+0,0x2b,0,0,0,0,0x2c,0x2d,0,0,0,0,0,0,1,0x2e,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0x2f,0x30,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0x31,0,0,0,0x32,0,0,0,1,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0x33,0,0,0x34,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0x35,0x32,0,0,0x36,0,0,0,0,0,0,0,0,
+0x20,0,0,0,0,0,0x28,0x37,0,0x38,0x39,0,0,0x39,0x3a,0,
+0,0,0,0,0,0x3b,0x3c,0x3d,0,0,0,0,0,0,0,0x17,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0x3e,0x23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0x3f,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0x40,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
    // NOTE(review): generated two-stage lookup index (values index into tcccBits
    // below); block granularity is not evident from this chunk — do not hand-edit.
    private static final byte[] tcccIndex={
0,0,0,0,0,0,2,3,4,5,6,7,0,8,9,0xa,
0xb,0xc,0,0,0,0,0,0,1,1,0xd,0xe,0xf,0x10,0x11,0,
0x12,0x13,0x14,0x15,0x16,0,0x17,0x18,0,0,0,0,0x19,0x1a,0x1b,0,
0x1c,0x1d,0x1e,0x1f,0,0,0x20,0x21,0x22,0x23,0x24,0,0,0,0,0x25,
0x26,0x27,0x28,0,0,0,0,0x29,0,0x2a,0x2b,0,0,0x2c,0x2d,0,
0,0x2e,0x2f,0,0,0x2c,0x30,0,0,0x2c,0x31,0,0,0,0x30,0,
0,0,0x32,0,0,0x2c,0x30,0,0,0,0x30,0,0,0,0x33,0,
0,0x34,0x35,0,0,0x36,0x35,0,0x36,0x37,0,0x38,0x39,0,0x3a,0,
0,0x3b,0,0,0x30,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0x3c,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0x3d,0x3d,0,0,0,0,0x3e,0,
0,0,0,0,0,0x3f,0,0,0,0x28,0,0,0,0,0,0,
0x40,0,0,0x41,0,0,0,0,0,0x3d,0x42,0x25,0,0x43,0,0x44,
0,0x45,0,0,0,0,0x46,0x47,0,0,0,0,0,0,1,0x48,
1,1,1,1,0x49,1,1,0x4a,0x4b,1,0x4c,0x4d,1,0x4e,0x4f,0x50,
0,0,0,0,0,0,0x51,0x52,0,0x53,0,0,0x54,0x55,0x56,0,
0x57,0x58,0x59,0x5a,0x5b,0x5c,0,0x5d,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0x2c,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0x5e,0,0,0,0x5f,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0x60,0x61,0x62,0x63,0x61,0x62,0x64,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0x65,0x5f,0,0,0x66,0,0,0,0,0,0,0,0,
0x3a,0,0,0,0,0,0x42,0x67,0,0x68,0x69,0,0,0x69,0x6a,0,
0,0,0,0,0,0x6b,0x6c,0x6d,0,0,0,0,0,0,0,0x30,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0x6e,0x3d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0x3c,0x6f,0x70,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0x71,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    };
+
    // NOTE(review): generated bit-set data referenced via the corresponding index
    // table (each int is a 32-bit mask word) — do not hand-edit.
    private static final int[] lcccBits={
0,0xffffffff,0xffff7fff,0xffff,0xf8,0xfffe0000,0xbfffffff,0xb6,0x7ff0000,0xfffff800,0x10000,0x9fc00000,0x3d9f,0x20000,0xffff0000,0x7ff,
0xff800,0xfbc00000,0x3eef,0xe000000,0x7ffffff0,0x10000000,0x1e2000,0x2000,0x602000,0x400,0x7000000,0xf00,0x3000000,0x2a00000,0x3c3e0000,0xdf,
0x40,0x6800000,0xe0000000,0x100000,0x20040000,0x200,0x1800000,0x9fe00001,0x10,0xc00,0xc0040,0x800000,0xfff70000,0x1021fd,0xf000007f,0x1fff0000,
0x1ffe2,0x38000,0x80000000,0xfc00,0x6000000,0x3ff08000,0x30000,0x3ffff,0x3800,0x80000,1,0xc19d0000,2,0x400000,0x35,0x40000000,
0x7f
    };
    // NOTE(review): generated bit-set data addressed through tcccIndex above
    // (each int is a 32-bit mask word) — do not hand-edit.
    private static final int[] tcccBits={
0,0xffffffff,0x3e7effbf,0xbe7effbf,0xfffcffff,0x7ef1ff3f,0xfff3f1f8,0x7fffff3f,0x18003,0xdfffe000,0xff31ffcf,0xcfffffff,0xfffc0,0xffff7fff,0xffff,0x1d760,
0x1fc00,0x187c00,0x200708b,0x2000000,0x708b0000,0xc00000,0xf8,0xfccf0006,0x33ffcfc,0xfffe0000,0xbfffffff,0xb6,0x7ff0000,0x7c,0xfffff800,0x10000,
0x9fc80005,0x3d9f,0x20000,0xffff0000,0x7ff,0xff800,0xfbc00000,0x3eef,0xe000000,0x7ffffff0,0x10120200,0xff1e2000,0x10000000,0xb0002000,0x10480000,0x4e002000,
0x2000,0x30002000,0x602100,0x24000400,0x7000000,0xf00,0x3000000,0x2a00000,0x3d7e0000,0xdf,0x40,0x6800000,0xe0000000,0x100000,0x20040000,0x200,
0x1800000,0x9fe00001,0x10,0xc00,0xc0040,0x800000,0xfff70000,0x1021fd,0xf000007f,0xbffffff,0x3ffffff,0x3f3fffff,0xaaff3f3f,0x3fffffff,0x1fdfffff,0xefcfffde,
0x1fdc7fff,0x1fff0000,0x1ffe2,0x800,0xc000000,0x4000,0xe000,0x1210,0x50,0x292,0x333e005,0x333,0xf000,0x3c0f,0x38000,0x80000000,
0xfc00,0x55555000,0x36db02a5,0x46100000,0x47900000,0x3ff08000,0x30000,0x3ffff,0x3800,0x80000,1,0xc19d0000,2,0x400000,0x35,0x5f7ffc00,
0x7fdb,0x7f
    };
+
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationFastLatin.java, ported from collationfastlatin.h/.cpp
+*
+* C++ version created on: 2013aug09
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.Collator;
+
+public final class CollationFastLatin /* all static */ {
    /**
     * Fast Latin format version (one byte 1..FF).
     * Must be incremented for any runtime-incompatible changes,
     * in particular, for changes to any of the following constants.
     *
     * When the major version number of the main data format changes,
     * we can reset this fast Latin version to 1.
     */
    public static final int VERSION = 1;

    // Supported fast-path character ranges: U+0000..LATIN_MAX ...
    public static final int LATIN_MAX = 0x17f;
    public static final int LATIN_LIMIT = LATIN_MAX + 1;

    static final int LATIN_MAX_UTF8_LEAD = 0xc5;  // UTF-8 lead byte of LATIN_MAX

    // ... plus general punctuation U+2000..U+203F.
    static final int PUNCT_START = 0x2000;
    static final int PUNCT_LIMIT = 0x2040;

    // excludes U+FFFE & U+FFFF
    static final int NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START);

    // Note on the supported weight ranges:
    // Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that
    // the CEs for characters in the above ranges, excluding expansions with length >2,
    // excluding contractions of >2 characters, and other restrictions
    // (see the builder's getCEsFromCE32()),
    // use at most about 150 primary weights,
    // where about 94 primary weights are possibly-variable (space/punct/symbol/currency),
    // at most 4 secondary before-common weights,
    // at most 4 secondary after-common weights,
    // at most 16 secondary high weights (in secondary CEs), and
    // at most 4 tertiary after-common weights.
    // The following ranges are designed to support slightly more weights than that.
    // (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.)

    // Digits may use long primaries (preserving more short ones)
    // or short primaries (faster) without changing this data structure.
    // (If we supported numeric collation, then digits would have to have long primaries
    // so that special handling does not affect the fast path.)

    // Bit layout of a 16-bit mini collation element:
    static final int SHORT_PRIMARY_MASK = 0xfc00;  // bits 15..10
    static final int INDEX_MASK = 0x3ff;  // bits 9..0 for expansions & contractions
    static final int SECONDARY_MASK = 0x3e0;  // bits 9..5
    static final int CASE_MASK = 0x18;  // bits 4..3
    static final int LONG_PRIMARY_MASK = 0xfff8;  // bits 15..3
    static final int TERTIARY_MASK = 7;  // bits 2..0
    static final int CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK;

    // Doubled masks for processing a pair of mini CEs packed into one int.
    static final int TWO_SHORT_PRIMARIES_MASK =
            (SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK;  // 0xfc00fc00
    static final int TWO_LONG_PRIMARIES_MASK =
            (LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK;  // 0xfff8fff8
    static final int TWO_SECONDARIES_MASK =
            (SECONDARY_MASK << 16) | SECONDARY_MASK;  // 0x3e003e0
    static final int TWO_CASES_MASK =
            (CASE_MASK << 16) | CASE_MASK;  // 0x180018
    static final int TWO_TERTIARIES_MASK =
            (TERTIARY_MASK << 16) | TERTIARY_MASK;  // 0x70007

    /**
     * Contraction with one fast Latin character.
     * Use INDEX_MASK to find the start of the contraction list after the fixed table.
     * The first entry contains the default mapping.
     * Otherwise use CONTR_CHAR_MASK for the contraction character index
     * (in ascending order).
     * Use CONTR_LENGTH_SHIFT for the length of the entry
     * (1=BAIL_OUT, 2=one CE, 3=two CEs).
     *
     * Also, U+0000 maps to a contraction entry, so that the fast path need not
     * check for NUL termination.
     * It usually maps to a contraction list with only the completely ignorable default value.
     */
    static final int CONTRACTION = 0x400;
    /**
     * An expansion encodes two CEs.
     * Use INDEX_MASK to find the pair of CEs after the fixed table.
     *
     * The higher a mini CE value, the easier it is to process.
     * For expansions and higher, no context needs to be considered.
     */
    static final int EXPANSION = 0x800;
    /**
     * Encodes one CE with a long/low mini primary (there are 128).
     * All potentially-variable primaries must be in this range,
     * to make the short-primary path as fast as possible.
     */
    static final int MIN_LONG = 0xc00;
    static final int LONG_INC = 8;
    static final int MAX_LONG = 0xff8;
    /**
     * Encodes one CE with a short/high primary (there are 60),
     * plus a secondary CE if the secondary weight is high.
     * Fast handling: At least all letter primaries should be in this range.
     */
    static final int MIN_SHORT = 0x1000;
    static final int SHORT_INC = 0x400;
    /** The highest primary weight is reserved for U+FFFF. */
    static final int MAX_SHORT = SHORT_PRIMARY_MASK;

    // Secondary weight ranges within SECONDARY_MASK.
    static final int MIN_SEC_BEFORE = 0;  // must add SEC_OFFSET
    static final int SEC_INC = 0x20;
    static final int MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC;  // 5 before common
    static final int COMMON_SEC = MAX_SEC_BEFORE + SEC_INC;
    static final int MIN_SEC_AFTER = COMMON_SEC + SEC_INC;
    static final int MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC;  // 6 after common
    static final int MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC;  // 20 high secondaries
    static final int MAX_SEC_HIGH = SECONDARY_MASK;

    /**
     * Lookup: Add this offset to secondary weights, except for completely ignorable CEs.
     * Must be greater than any special value, e.g., MERGE_WEIGHT.
     * The exact value is not relevant for the format version.
     */
    static final int SEC_OFFSET = SEC_INC;
    static final int COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET;

    static final int TWO_SEC_OFFSETS =
            (SEC_OFFSET << 16) | SEC_OFFSET;  // 0x200020
    static final int TWO_COMMON_SEC_PLUS_OFFSET =
            (COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET;

    static final int LOWER_CASE = 8;  // case bits include this offset
    static final int TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE;  // 0x80008

    static final int COMMON_TER = 0;  // must add TER_OFFSET
    static final int MAX_TER_AFTER = 7;  // 7 after common

    /**
     * Lookup: Add this offset to tertiary weights, except for completely ignorable CEs.
     * Must be greater than any special value, e.g., MERGE_WEIGHT.
     * Must be greater than case bits as well, so that with combined case+tertiary weights
     * plus the offset the tertiary bits does not spill over into the case bits.
     * The exact value is not relevant for the format version.
     */
    static final int TER_OFFSET = SEC_OFFSET;
    static final int COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET;

    static final int TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET;
    static final int TWO_COMMON_TER_PLUS_OFFSET =
            (COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET;

    // Special mini CE values, all below MIN_LONG and below the weight offsets.
    static final int MERGE_WEIGHT = 3;  // returned by lookup() for U+FFFE
    static final int EOS = 2;  // end of string
    static final int BAIL_OUT = 1;

    /**
     * Contraction result first word bits 8..0 contain the
     * second contraction character, as a char index 0..NUM_FAST_CHARS-1.
     * Each contraction list is terminated with a word containing CONTR_CHAR_MASK.
     */
    static final int CONTR_CHAR_MASK = 0x1ff;
    /**
     * Contraction result first word bits 10..9 contain the result length:
     * 1=bail out, 2=one mini CE, 3=two mini CEs
     */
    static final int CONTR_LENGTH_SHIFT = 9;

    /**
     * Comparison return value when the regular comparison must be used.
     * The exact value is not relevant for the format version.
     */
    public static final int BAIL_OUT_RESULT = -2;
+
+ static int getCharIndex(char c) {
+ if(c <= LATIN_MAX) {
+ return c;
+ } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
+ return c - (PUNCT_START - LATIN_LIMIT);
+ } else {
+ // Not a fast Latin character.
+ // Note: U+FFFE & U+FFFF are forbidden in tailorings
+ // and thus do not occur in any contractions.
+ return -1;
+ }
+ }
+
    /**
     * Computes the options value for the compare functions
     * and writes the precomputed primary weights.
     * Returns -1 if the Latin fastpath is not supported for the data and settings.
     * The capacity must be LATIN_LIMIT.
     */
    public static int getOptions(CollationData data, CollationSettings settings,
            char[] primaries) {
        char[] header = data.fastLatinTableHeader;
        if(header == null) { return -1; }  // no fast Latin table for this tailoring
        // header[0]: high byte = format version, low byte = number of header entries.
        assert((header[0] >> 8) == VERSION);
        assert(primaries.length == LATIN_LIMIT);
        if(primaries.length != LATIN_LIMIT) { return -1; }

        int miniVarTop;
        if((settings.options & CollationSettings.ALTERNATE_MASK) == 0) {
            // No mini primaries are variable, set a variableTop just below the
            // lowest long mini primary.
            miniVarTop = MIN_LONG - 1;
        } else {
            // Find the header entry matching the settings' variableTop
            // (compared via the top bits of the real variableTop primary).
            int v1 = (int)(settings.variableTop >> 24);
            int headerLength = header[0] & 0xff;
            int i = headerLength - 1;
            if(i <= 0 || v1 > (header[i] & 0x7f)) {
                return -1;  // variableTop >= digits, should not occur
            }
            while(i > 1 && v1 <= (header[i - 1] & 0x7f)) { --i; }
            // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
            // Shift right to make it comparable with long mini primaries in bits 15..3.
            miniVarTop = (header[i] & 0xff80) >> 4;
        }

        byte[] reorderTable = settings.reorderTable;
        if(reorderTable != null) {
            // Verify that script reordering does not permute any group up to Latin;
            // otherwise the precomputed mini primaries would compare in the wrong order.
            char[] scripts = data.scripts;
            int length = data.scripts.length;
            int prevLastByte = 0;
            for(int i = 0; i < length;) {
                // reordered last byte of the group
                // NOTE(review): assumes scripts[] entries are
                // [group info (low byte = lead byte), script count, script codes...]
                // — confirm against CollationData.scripts.
                int lastByte = reorderTable[scripts[i] & 0xff] & 0xff;
                if(lastByte < prevLastByte) {
                    // The permutation affects the groups up to Latin.
                    return -1;
                }
                if(scripts[i + 2] == UScript.LATIN) { break; }
                i = i + 2 + scripts[i + 1];
                prevLastByte = lastByte;
            }
        }

        char[] table = data.fastLatinTable;  // skip the header
        // Precompute the primary-only mini CE for each fast Latin character.
        for(int c = 0; c < LATIN_LIMIT; ++c) {
            int p = table[c];
            if(p >= MIN_SHORT) {
                p &= SHORT_PRIMARY_MASK;
            } else if(p > miniVarTop) {
                p &= LONG_PRIMARY_MASK;
            } else {
                p = 0;  // variable or not a plain mini CE; resolved in the compare loop
            }
            primaries[c] = (char)p;
        }
        if((settings.options & CollationSettings.NUMERIC) != 0) {
            // Bail out for digits.
            for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
        }

        // Shift the miniVarTop above other options.
        return (miniVarTop << 16) | settings.options;
    }
+
    /**
     * Compares left & right from startIndex using the fast Latin mini-CE table,
     * level by level (primary, secondary, case, tertiary, quaternary) up to the
     * strength selected in options.
     *
     * @param table the fast Latin table (without header), from CollationData
     * @param primaries precomputed primary mini CEs, written by getOptions()
     * @param options (miniVarTop << 16) | settings.options, from getOptions()
     * @return Collation.LESS/EQUAL/GREATER, or BAIL_OUT_RESULT if either string
     *         contains characters or mappings not supported by the fast path
     */
    public static int compareUTF16(char[] table, char[] primaries, int options,
            CharSequence left, CharSequence right, int startIndex) {
        // This is a modified copy of CollationCompare.compareUpToQuaternary(),
        // optimized for common Latin text.
        // Keep them in sync!

        int variableTop = options >> 16;  // see getOptions()
        options &= 0xffff;  // needed for CollationSettings.getStrength() to work

        // Check for supported characters, fetch mini CEs, and compare primaries.
        int leftIndex = startIndex, rightIndex = startIndex;
        /**
         * Single mini CE or a pair.
         * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
         * If there is only one, then it is in the lower bits, and the upper bits are 0.
         */
        int leftPair = 0, rightPair = 0;
        for(;;) {
            // We fetch CEs until we get a non-ignorable primary or reach the end.
            while(leftPair == 0) {
                if(leftIndex == left.length()) {
                    leftPair = EOS;
                    break;
                }
                int c = left.charAt(leftIndex++);
                if(c <= LATIN_MAX) {
                    leftPair = primaries[c];
                    if(leftPair != 0) { break; }  // common case: precomputed primary
                    if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) {
                        return BAIL_OUT_RESULT;
                    }
                    leftPair = table[c];
                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
                    leftPair = table[c - PUNCT_START + LATIN_LIMIT];
                } else {
                    leftPair = lookup(table, c);
                }
                if(leftPair >= MIN_SHORT) {
                    leftPair &= SHORT_PRIMARY_MASK;
                    break;
                } else if(leftPair > variableTop) {
                    leftPair &= LONG_PRIMARY_MASK;
                    break;
                } else {
                    // Expansion, contraction, variable, or special mini CE.
                    long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
                    if(pairAndInc < 0) {
                        // Negative result: the contraction consumed the next character.
                        ++leftIndex;
                        pairAndInc = ~pairAndInc;
                    }
                    leftPair = (int)pairAndInc;
                    if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
                    leftPair = getPrimaries(variableTop, leftPair);
                }
            }

            while(rightPair == 0) {
                if(rightIndex == right.length()) {
                    rightPair = EOS;
                    break;
                }
                int c = right.charAt(rightIndex++);
                if(c <= LATIN_MAX) {
                    rightPair = primaries[c];
                    if(rightPair != 0) { break; }
                    if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) {
                        return BAIL_OUT_RESULT;
                    }
                    rightPair = table[c];
                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
                    rightPair = table[c - PUNCT_START + LATIN_LIMIT];
                } else {
                    rightPair = lookup(table, c);
                }
                if(rightPair >= MIN_SHORT) {
                    rightPair &= SHORT_PRIMARY_MASK;
                    break;
                } else if(rightPair > variableTop) {
                    rightPair &= LONG_PRIMARY_MASK;
                    break;
                } else {
                    long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
                    if(pairAndInc < 0) {
                        ++rightIndex;
                        pairAndInc = ~pairAndInc;
                    }
                    rightPair = (int)pairAndInc;
                    if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
                    rightPair = getPrimaries(variableTop, rightPair);
                }
            }

            if(leftPair == rightPair) {
                if(leftPair == EOS) { break; }
                leftPair = rightPair = 0;
                continue;
            }
            int leftPrimary = leftPair & 0xffff;
            int rightPrimary = rightPair & 0xffff;
            if(leftPrimary != rightPrimary) {
                // Return the primary difference.
                return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
            }
            if(leftPair == EOS) { break; }
            leftPair >>>= 16;
            rightPair >>>= 16;
        }
        // In the following, we need to re-fetch each character because we did not buffer the CEs,
        // but we know that the string is well-formed and
        // only contains supported characters and mappings.

        // We might skip the secondary level but continue with the case level
        // which is turned on separately.
        if(CollationSettings.getStrength(options) >= Collator.SECONDARY) {
            // Secondary level.
            leftIndex = rightIndex = startIndex;
            leftPair = rightPair = 0;
            for(;;) {
                while(leftPair == 0) {
                    if(leftIndex == left.length()) {
                        leftPair = EOS;
                        break;
                    }
                    int c = left.charAt(leftIndex++);
                    if(c <= LATIN_MAX) {
                        leftPair = table[c];
                    } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
                        leftPair = table[c - PUNCT_START + LATIN_LIMIT];
                    } else {
                        leftPair = lookup(table, c);
                    }
                    if(leftPair >= MIN_SHORT) {
                        leftPair = getSecondariesFromOneShortCE(leftPair);
                        break;
                    } else if(leftPair > variableTop) {
                        leftPair = COMMON_SEC_PLUS_OFFSET;
                        break;
                    } else {
                        long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
                        if(pairAndInc < 0) {
                            ++leftIndex;
                            pairAndInc = ~pairAndInc;
                        }
                        leftPair = getSecondaries(variableTop, (int)pairAndInc);
                    }
                }

                while(rightPair == 0) {
                    if(rightIndex == right.length()) {
                        rightPair = EOS;
                        break;
                    }
                    int c = right.charAt(rightIndex++);
                    if(c <= LATIN_MAX) {
                        rightPair = table[c];
                    } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
                        rightPair = table[c - PUNCT_START + LATIN_LIMIT];
                    } else {
                        rightPair = lookup(table, c);
                    }
                    if(rightPair >= MIN_SHORT) {
                        rightPair = getSecondariesFromOneShortCE(rightPair);
                        break;
                    } else if(rightPair > variableTop) {
                        rightPair = COMMON_SEC_PLUS_OFFSET;
                        break;
                    } else {
                        long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
                        if(pairAndInc < 0) {
                            ++rightIndex;
                            pairAndInc = ~pairAndInc;
                        }
                        rightPair = getSecondaries(variableTop, (int)pairAndInc);
                    }
                }

                if(leftPair == rightPair) {
                    if(leftPair == EOS) { break; }
                    leftPair = rightPair = 0;
                    continue;
                }
                int leftSecondary = leftPair & 0xffff;
                int rightSecondary = rightPair & 0xffff;
                if(leftSecondary != rightSecondary) {
                    if((options & CollationSettings.BACKWARD_SECONDARY) != 0) {
                        // Full support for backwards secondary requires backwards contraction matching
                        // and moving backwards between merge separators.
                        return BAIL_OUT_RESULT;
                    }
                    return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
                }
                if(leftPair == EOS) { break; }
                leftPair >>>= 16;
                rightPair >>>= 16;
            }
        }

        if((options & CollationSettings.CASE_LEVEL) != 0) {
            // Case level (separately switchable).
            boolean strengthIsPrimary = CollationSettings.getStrength(options) == Collator.PRIMARY;
            leftIndex = rightIndex = startIndex;
            leftPair = rightPair = 0;
            for(;;) {
                while(leftPair == 0) {
                    if(leftIndex == left.length()) {
                        leftPair = EOS;
                        break;
                    }
                    int c = left.charAt(leftIndex++);
                    leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
                    if(leftPair < MIN_LONG) {
                        long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
                        if(pairAndInc < 0) {
                            ++leftIndex;
                            pairAndInc = ~pairAndInc;
                        }
                        leftPair = (int)pairAndInc;
                    }
                    leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
                }

                while(rightPair == 0) {
                    if(rightIndex == right.length()) {
                        rightPair = EOS;
                        break;
                    }
                    int c = right.charAt(rightIndex++);
                    rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
                    if(rightPair < MIN_LONG) {
                        long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
                        if(pairAndInc < 0) {
                            ++rightIndex;
                            pairAndInc = ~pairAndInc;
                        }
                        rightPair = (int)pairAndInc;
                    }
                    rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
                }

                if(leftPair == rightPair) {
                    if(leftPair == EOS) { break; }
                    leftPair = rightPair = 0;
                    continue;
                }
                int leftCase = leftPair & 0xffff;
                int rightCase = rightPair & 0xffff;
                if(leftCase != rightCase) {
                    if((options & CollationSettings.UPPER_FIRST) == 0) {
                        return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
                    } else {
                        // upperFirst: invert the case comparison.
                        return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
                    }
                }
                if(leftPair == EOS) { break; }
                leftPair >>>= 16;
                rightPair >>>= 16;
            }
        }
        if(CollationSettings.getStrength(options) <= Collator.SECONDARY) { return Collation.EQUAL; }

        // Tertiary level.
        // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
        boolean withCaseBits = CollationSettings.isTertiaryWithCaseBits(options);

        leftIndex = rightIndex = startIndex;
        leftPair = rightPair = 0;
        for(;;) {
            while(leftPair == 0) {
                if(leftIndex == left.length()) {
                    leftPair = EOS;
                    break;
                }
                int c = left.charAt(leftIndex++);
                leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
                if(leftPair < MIN_LONG) {
                    long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
                    if(pairAndInc < 0) {
                        ++leftIndex;
                        pairAndInc = ~pairAndInc;
                    }
                    leftPair = (int)pairAndInc;
                }
                leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
            }

            while(rightPair == 0) {
                if(rightIndex == right.length()) {
                    rightPair = EOS;
                    break;
                }
                int c = right.charAt(rightIndex++);
                rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
                if(rightPair < MIN_LONG) {
                    long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
                    if(pairAndInc < 0) {
                        ++rightIndex;
                        pairAndInc = ~pairAndInc;
                    }
                    rightPair = (int)pairAndInc;
                }
                rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
            }

            if(leftPair == rightPair) {
                if(leftPair == EOS) { break; }
                leftPair = rightPair = 0;
                continue;
            }
            int leftTertiary = leftPair & 0xffff;
            int rightTertiary = rightPair & 0xffff;
            if(leftTertiary != rightTertiary) {
                if(CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
                    // Pass through EOS and MERGE_WEIGHT
                    // and keep real tertiary weights larger than the MERGE_WEIGHT.
                    // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
                    if(leftTertiary > MERGE_WEIGHT) {
                        leftTertiary ^= CASE_MASK;
                    }
                    if(rightTertiary > MERGE_WEIGHT) {
                        rightTertiary ^= CASE_MASK;
                    }
                }
                return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
            }
            if(leftPair == EOS) { break; }
            leftPair >>>= 16;
            rightPair >>>= 16;
        }
        if(CollationSettings.getStrength(options) <= Collator.TERTIARY) { return Collation.EQUAL; }

        // Quaternary level.
        leftIndex = rightIndex = startIndex;
        leftPair = rightPair = 0;
        for(;;) {
            while(leftPair == 0) {
                if(leftIndex == left.length()) {
                    leftPair = EOS;
                    break;
                }
                int c = left.charAt(leftIndex++);
                leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
                if(leftPair < MIN_LONG) {
                    long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
                    if(pairAndInc < 0) {
                        ++leftIndex;
                        pairAndInc = ~pairAndInc;
                    }
                    leftPair = (int)pairAndInc;
                }
                leftPair = getQuaternaries(variableTop, leftPair);
            }

            while(rightPair == 0) {
                if(rightIndex == right.length()) {
                    rightPair = EOS;
                    break;
                }
                int c = right.charAt(rightIndex++);
                rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
                if(rightPair < MIN_LONG) {
                    long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
                    if(pairAndInc < 0) {
                        ++rightIndex;
                        pairAndInc = ~pairAndInc;
                    }
                    rightPair = (int)pairAndInc;
                }
                rightPair = getQuaternaries(variableTop, rightPair);
            }

            if(leftPair == rightPair) {
                if(leftPair == EOS) { break; }
                leftPair = rightPair = 0;
                continue;
            }
            int leftQuaternary = leftPair & 0xffff;
            int rightQuaternary = rightPair & 0xffff;
            if(leftQuaternary != rightQuaternary) {
                return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
            }
            if(leftPair == EOS) { break; }
            leftPair >>>= 16;
            rightPair >>>= 16;
        }
        return Collation.EQUAL;
    }
+
+ private static int lookup(char[] table, int c) {
+ assert(c > LATIN_MAX);
+ if(PUNCT_START <= c && c < PUNCT_LIMIT) {
+ return table[c - PUNCT_START + LATIN_LIMIT];
+ } else if(c == 0xfffe) {
+ return MERGE_WEIGHT;
+ } else if(c == 0xffff) {
+ return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
+ } else {
+ return BAIL_OUT;
+ }
+ }
+
    /**
     * Resolves a table value into one or two 16-bit mini CEs:
     * passes simple/special mini CEs through, reads the CE pair for an expansion,
     * and matches the contraction suffix list for a contraction.
     *
     * Java returns a negative result (use the '~' operator) if sIndex is to be incremented.
     * C++ modifies sIndex.
     */
    private static long nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex) {
        if(ce >= MIN_LONG || ce < CONTRACTION) {
            return ce;  // simple or special mini CE
        } else if(ce >= EXPANSION) {
            // Expansion: the two CEs are stored after the fixed table;
            // pack them as (second << 16) | first.
            int index = NUM_FAST_CHARS + (ce & INDEX_MASK);
            return ((long)table[index + 1] << 16) | table[index];
        } else /* ce >= CONTRACTION */ {
            // Contraction list: Default mapping followed by
            // 0 or more single-character contraction suffix mappings.
            int index = NUM_FAST_CHARS + (ce & INDEX_MASK);
            boolean inc = false;  // true if the next char is consumed.
            if(sIndex != s16.length()) {
                // Read the next character.
                int c2;
                int nextIndex = sIndex;
                c2 = s16.charAt(nextIndex++);
                if(c2 > LATIN_MAX) {
                    if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
                        c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
                    } else if(c2 == 0xfffe || c2 == 0xffff) {
                        c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
                    } else {
                        return BAIL_OUT;
                    }
                }
                // Look for the next character in the contraction suffix list,
                // which is in ascending order of single suffix characters.
                int i = index;
                int head = table[i];  // first skip the default mapping
                int x;
                do {
                    i += head >> CONTR_LENGTH_SHIFT;
                    head = table[i];
                    x = head & CONTR_CHAR_MASK;
                } while(x < c2);  // terminates: the list ends with CONTR_CHAR_MASK >= any c2
                if(x == c2) {
                    index = i;
                    inc = true;
                }
            }
            // Return the CE or CEs for the default or contraction mapping.
            int length = table[index] >> CONTR_LENGTH_SHIFT;
            if(length == 1) {
                return BAIL_OUT;
            }
            ce = table[index + 1];
            long result;
            if(length == 2) {
                result = ce;
            } else {
                result = ((long)table[index + 2] << 16) | ce;
            }
            return inc ? ~result : result;
        }
    }
+
+ private static int getPrimaries(int variableTop, int pair) {
+ int ce = pair & 0xffff;
+ if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; }
+ if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; }
+ if(ce >= MIN_LONG) { return 0; } // variable
+ return pair; // special mini CE
+ }
+
+    /**
+     * Returns the secondary weight(s) of one short mini CE.
+     * A high secondary weight means the value really encodes two CEs
+     * (a primary CE plus a secondary CE), so the result then carries
+     * two 16-bit secondary values.
+     */
+    private static int getSecondariesFromOneShortCE(int ce) {
+        int sec = ce & SECONDARY_MASK;
+        if(sec >= MIN_SEC_HIGH) {
+            return ((sec + SEC_OFFSET) << 16) | COMMON_SEC_PLUS_OFFSET;
+        }
+        return sec + SEC_OFFSET;
+    }
+
+    /**
+     * Extracts the secondary weights from a pair of mini CEs,
+     * with SEC_OFFSET added so that results compare correctly.
+     * Returns 0 for variable CEs; special mini CEs pass through unchanged.
+     */
+    private static int getSecondaries(int variableTop, int pair) {
+        if(pair <= 0xffff) {
+            // one mini CE
+            if(pair >= MIN_SHORT) {
+                // A short CE with a high secondary expands into
+                // a primary CE plus a secondary CE.
+                pair = getSecondariesFromOneShortCE(pair);
+            } else if(pair > variableTop) {
+                pair = COMMON_SEC_PLUS_OFFSET;
+            } else if(pair >= MIN_LONG) {
+                pair = 0; // variable
+            }
+            // else special mini CE
+        } else {
+            // Two mini CEs: classify both by the first one (low 16 bits);
+            // they are encoded so that one test applies to both.
+            int ce = pair & 0xffff;
+            if(ce >= MIN_SHORT) {
+                pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
+            } else if(ce > variableTop) {
+                pair = TWO_COMMON_SEC_PLUS_OFFSET;
+            } else {
+                assert(ce >= MIN_LONG);
+                pair = 0; // variable
+            }
+        }
+        return pair;
+    }
+
+    /**
+     * Extracts the case-level weights from a pair of mini CEs.
+     * Primary+caseLevel: Ignore case level weights of primary ignorables.
+     * Otherwise: Ignore case level weights of secondary ignorables.
+     * For details see the comments in the CollationCompare class.
+     * Tertiary CEs (secondary ignorables) are not supported in fast Latin.
+     */
+    private static int getCases(int variableTop, boolean strengthIsPrimary, int pair) {
+        if(pair <= 0xffff) {
+            // one mini CE
+            if(pair >= MIN_SHORT) {
+                // A high secondary weight means we really have two CEs,
+                // a primary CE and a secondary CE.
+                int ce = pair;
+                pair &= CASE_MASK; // explicit weight of primary CE
+                if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
+                    pair |= LOWER_CASE << 16; // implied weight of secondary CE
+                }
+            } else if(pair > variableTop) {
+                pair = LOWER_CASE;
+            } else if(pair >= MIN_LONG) {
+                pair = 0; // variable
+            }
+            // else special mini CE
+        } else {
+            // two mini CEs, same primary groups, neither expands like above
+            int ce = pair & 0xffff;
+            if(ce >= MIN_SHORT) {
+                if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
+                    // Second CE is primary-ignorable: drop its case weight too.
+                    pair &= CASE_MASK;
+                } else {
+                    pair &= TWO_CASES_MASK;
+                }
+            } else if(ce > variableTop) {
+                pair = TWO_LOWER_CASES;
+            } else {
+                assert(ce >= MIN_LONG);
+                pair = 0; // variable
+            }
+        }
+        return pair;
+    }
+
+    /**
+     * Extracts the tertiary weights (optionally together with case bits)
+     * from a pair of mini CEs, with TER_OFFSET added so that results
+     * compare correctly. Returns 0 for variable CEs; special mini CEs
+     * pass through unchanged.
+     */
+    private static int getTertiaries(int variableTop, boolean withCaseBits, int pair) {
+        if(pair <= 0xffff) {
+            // one mini CE
+            if(pair >= MIN_SHORT) {
+                // A high secondary weight means we really have two CEs,
+                // a primary CE and a secondary CE.
+                int ce = pair;
+                if(withCaseBits) {
+                    pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
+                    if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
+                        pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
+                    }
+                } else {
+                    pair = (pair & TERTIARY_MASK) + TER_OFFSET;
+                    if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
+                        pair |= COMMON_TER_PLUS_OFFSET << 16;
+                    }
+                }
+            } else if(pair > variableTop) {
+                pair = (pair & TERTIARY_MASK) + TER_OFFSET;
+                if(withCaseBits) {
+                    pair |= LOWER_CASE;
+                }
+            } else if(pair >= MIN_LONG) {
+                pair = 0; // variable
+            }
+            // else special mini CE
+        } else {
+            // two mini CEs, same primary groups, neither expands like above
+            int ce = pair & 0xffff;
+            if(ce >= MIN_SHORT) {
+                if(withCaseBits) {
+                    pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
+                } else {
+                    pair &= TWO_TERTIARIES_MASK;
+                }
+                pair += TWO_TER_OFFSETS;
+            } else if(ce > variableTop) {
+                pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
+                if(withCaseBits) {
+                    pair |= TWO_LOWER_CASES;
+                }
+            } else {
+                assert(ce >= MIN_LONG);
+                pair = 0; // variable
+            }
+        }
+        return pair;
+    }
+
+    /**
+     * Extracts quaternary-level weights from a pair of mini CEs:
+     * the primary weight of a variable CE,
+     * or the maximum primary weight for a non-variable,
+     * not-completely-ignorable CE.
+     */
+    private static int getQuaternaries(int variableTop, int pair) {
+        if(pair <= 0xffff) {
+            // one mini CE
+            if(pair >= MIN_SHORT) {
+                // A high secondary weight means we really have two CEs,
+                // a primary CE and a secondary CE.
+                if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
+                    pair = TWO_SHORT_PRIMARIES_MASK;
+                } else {
+                    pair = SHORT_PRIMARY_MASK;
+                }
+            } else if(pair > variableTop) {
+                pair = SHORT_PRIMARY_MASK;
+            } else if(pair >= MIN_LONG) {
+                pair &= LONG_PRIMARY_MASK; // variable
+            }
+            // else special mini CE
+        } else {
+            // two mini CEs, same primary groups, neither expands like above
+            int ce = pair & 0xffff;
+            if(ce > variableTop) {
+                pair = TWO_SHORT_PRIMARIES_MASK;
+            } else {
+                assert(ce >= MIN_LONG);
+                pair &= TWO_LONG_PRIMARIES_MASK; // variable
+            }
+        }
+        return pair;
+    }
+
+    // Static-methods-only class: prevent instantiation.
+    private CollationFastLatin() {} // no constructor
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp
+*
+* C++ version created on: 2013aug09
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.util.CharsTrie;
+
+final class CollationFastLatinBuilder {
+ // #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2
+
+ /**
+ * Compare two signed long values as if they were unsigned.
+ */
+ private static final int compareInt64AsUnsigned(long a, long b) {
+ a += 0x8000000000000000L;
+ b += 0x8000000000000000L;
+ if(a < b) {
+ return -1;
+ } else if(a > b) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+    /**
+     * Binary search in the first {@code limit} elements of a list of CEs
+     * that is sorted in unsigned 64-bit order
+     * (like Java Collections.binarySearch(List, T, Comparator)).
+     *
+     * @return the index&gt;=0 where the value was found,
+     *         or the index&lt;0 for inserting the value at ~index in sorted order
+     */
+    private static final int binarySearch(long[] list, int limit, long ce) {
+        if (limit == 0) { return ~0; }
+        int start = 0;
+        for (;;) {
+            // Unsigned shift avoids int overflow of (start + limit) for large ranges.
+            int i = (start + limit) >>> 1;
+            int cmp = compareInt64AsUnsigned(ce, list[i]);
+            if (cmp == 0) {
+                return i;
+            } else if (cmp < 0) {
+                if (i == start) {
+                    return ~start; // insert ce before i
+                }
+                limit = i;
+            } else {
+                if (i == start) {
+                    return ~(start + 1); // insert ce after i
+                }
+                start = i;
+            }
+        }
+    }
+
+    /** Creates an empty builder; the real work happens in forData(). */
+    CollationFastLatinBuilder() {
+        ce0 = 0;
+        ce1 = 0;
+        contractionCEs = new UVector64();
+        uniqueCEs = new UVector64();
+        miniCEs = null; // allocated in encodeUniqueCEs()
+        firstDigitPrimary = 0;
+        firstLatinPrimary = 0;
+        lastLatinPrimary = 0;
+        firstShortPrimary = 0;
+        shortPrimaryOverflow = false;
+        headerLength = 0;
+    }
+
+    /**
+     * Builds the fast-Latin tables for the given collation data.
+     * On success the results are available via getHeader() and getTable().
+     *
+     * @return true if the data could be encoded (no short-primary overflow)
+     * @throws IllegalStateException if this builder was already used
+     */
+    boolean forData(CollationData data) {
+        if(result.length() != 0) { // This builder is not reusable.
+            throw new IllegalStateException("attempt to reuse a CollationFastLatinBuilder");
+        }
+        if(!loadGroups(data)) { return false; }
+
+        // Fast handling of digits.
+        firstShortPrimary = firstDigitPrimary;
+        getCEs(data);
+        encodeUniqueCEs();
+        if(shortPrimaryOverflow) {
+            // Give digits long mini primaries,
+            // so that there are more short primaries for letters.
+            firstShortPrimary = firstLatinPrimary;
+            resetCEs();
+            getCEs(data);
+            encodeUniqueCEs();
+        }
+        // Note: If we still have a short-primary overflow but not a long-primary overflow,
+        // then we could calculate how many more long primaries would fit,
+        // and set the firstShortPrimary to that many after the current firstShortPrimary,
+        // and try again.
+        // However, this might only benefit the en_US_POSIX tailoring,
+        // and it is simpler to suppress building fast Latin data for it in genrb,
+        // or by returning false here if shortPrimaryOverflow.
+
+        boolean ok = !shortPrimaryOverflow;
+        if(ok) {
+            encodeCharCEs();
+            encodeContractions();
+        }
+        contractionCEs.removeAllElements(); // might reduce heap memory usage
+        uniqueCEs.removeAllElements();
+        return ok;
+    }
+
+    // C++ returns one combined array with the contents of the result buffer.
+    // Java returns two arrays (header & table) because we cannot use pointer arithmetic,
+    // and we do not want to index into the table with an offset.
+    /** Returns a copy of the header portion of the result buffer. */
+    char[] getHeader() {
+        return result.substring(0, headerLength).toCharArray();
+    }
+
+    /** Returns a copy of the table portion (everything after the header). */
+    char[] getTable() {
+        return result.substring(headerLength).toCharArray();
+    }
+
+    /**
+     * Reads the reordering groups up to and including Latin, and writes the
+     * header: slot 0 holds version &amp; headerLength, followed by one entry
+     * per below-digits group with that group's last primary byte.
+     *
+     * @return false if a below-digits group's last primary byte does not fit
+     *         into 7 bits, in which case no fast Latin table is built
+     */
+    private boolean loadGroups(CollationData data) {
+        // Placeholder for slot 0. Note: append(0) appends the decimal string "0"
+        // (one char), not U+0000; it is overwritten via setCharAt(0, ...) below.
+        result.append(0); // reserved for version & headerLength
+        // The first few reordering groups should be special groups
+        // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
+        for(int i = 0;;) {
+            if(i >= data.scripts.length) {
+                throw new AssertionError("no Latn script");
+            }
+            int head = data.scripts[i];
+            int lastByte = head & 0xff; // last primary byte in the group
+            int group = data.scripts[i + 2];
+            if(group == Collator.ReorderCodes.DIGIT) {
+                firstDigitPrimary = (long)(head & 0xff00) << 16;
+                headerLength = result.length();
+                int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
+                result.setCharAt(0, (char)r0);
+            } else if(group == UScript.LATIN) {
+                if(firstDigitPrimary == 0) {
+                    throw new AssertionError("no digit group");
+                }
+                firstLatinPrimary = (long)(head & 0xff00) << 16;
+                lastLatinPrimary = ((long)lastByte << 24) | 0xffffff;
+                break;
+            } else if(firstDigitPrimary == 0) {
+                // a group below digits
+                if(lastByte > 0x7f) {
+                    // We only use 7 bits for the last byte of a below-digits group.
+                    // This does not warrant an errorCode, but we do not build a fast Latin table.
+                    return false;
+                }
+                result.append((char)lastByte);
+            }
+            i = i + 2 + data.scripts[i + 1]; // advance to the next script entry
+        }
+        return true;
+    }
+
+    /**
+     * Checks that two primaries will be encoded compatibly:
+     * both (or neither) as short mini primaries, both (or neither)
+     * potentially-variable, and for long mini primaries both in the
+     * same below-digits reordering group.
+     */
+    private boolean inSameGroup(long p, long q) {
+        // Both or neither need to be encoded as short primaries,
+        // so that we can test only one and use the same bit mask.
+        if(p >= firstShortPrimary) {
+            return q >= firstShortPrimary;
+        } else if(q >= firstShortPrimary) {
+            return false;
+        }
+        // Both or neither must be potentially-variable,
+        // so that we can test only one and determine if both are variable.
+        if(p >= firstDigitPrimary) {
+            return q >= firstDigitPrimary;
+        } else if(q >= firstDigitPrimary) {
+            return false;
+        }
+        // Both will be encoded with long mini primaries.
+        // They must be in the same special reordering group,
+        // so that we can test only one and determine if both are variable.
+        p >>= 24; // first primary byte
+        q >>= 24;
+        assert(p != 0 && q != 0);
+        assert(p <= result.charAt(headerLength - 1)); // the loop will terminate
+        for(int i = 1;; ++i) {
+            long lastByte = result.charAt(i); // last primary byte of group i
+            if(p <= lastByte) {
+                return q <= lastByte;
+            } else if(q <= lastByte) {
+                return false;
+            }
+        }
+    }
+
+    /**
+     * Discards the CE data collected by getCEs()/encodeUniqueCEs() so that
+     * they can run again with a different firstShortPrimary.
+     */
+    private void resetCEs() {
+        contractionCEs.removeAllElements();
+        uniqueCEs.removeAllElements();
+        shortPrimaryOverflow = false;
+        result.setLength(headerLength); // keep the header, drop any table data
+    }
+
+    /**
+     * Collects the CEs for all fast-Latin characters (the Latin range and the
+     * general-punctuation range) into charCEs, plus contraction data into
+     * contractionCEs.
+     */
+    private void getCEs(CollationData data) {
+        int i = 0; // charCEs index; runs parallel to c across both ranges
+        for(char c = 0;; ++i, ++c) {
+            if(c == CollationFastLatin.LATIN_LIMIT) {
+                c = CollationFastLatin.PUNCT_START; // jump to the punctuation range
+            } else if(c == CollationFastLatin.PUNCT_LIMIT) {
+                break;
+            }
+            CollationData d;
+            int ce32 = data.getCE32(c);
+            if(ce32 == Collation.FALLBACK_CE32) {
+                d = data.base; // the tailoring falls back to the base data for c
+                ce32 = d.getCE32(c);
+            } else {
+                d = data;
+            }
+            if(getCEsFromCE32(d, c, ce32)) {
+                charCEs[i][0] = ce0;
+                charCEs[i][1] = ce1;
+                addUniqueCE(ce0);
+                addUniqueCE(ce1);
+            } else {
+                // bail out for c
+                charCEs[i][0] = ce0 = Collation.NO_CE;
+                charCEs[i][1] = ce1 = 0;
+            }
+            if(c == 0 && !isContractionCharCE(ce0)) {
+                // Always map U+0000 to a contraction.
+                // Write a contraction list with only a default value if there is no real contraction.
+                assert(contractionCEs.isEmpty());
+                addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
+                // NOTE(review): assumes Collation.NO_CE_PRIMARY is declared as a long
+                // so that << 32 is a 64-bit shift; confirm its declared type.
+                charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
+                charCEs[0][1] = 0;
+            }
+        }
+        // Terminate the last contraction list.
+        contractionCEs.addElement(CollationFastLatin.CONTR_CHAR_MASK);
+    }
+
+    /**
+     * Fetches the (up to two) CEs for one mapping from the given data,
+     * storing them in the ce0 &amp; ce1 fields, and checks that they can be
+     * encoded as fast-Latin mini CEs.
+     *
+     * @param c the character, or a negative sentinel when called for a
+     *        contraction default/suffix mapping
+     * @return true if the CEs are encodable
+     */
+    private boolean getCEsFromCE32(CollationData data, int c, int ce32) {
+        ce32 = data.getFinalCE32(ce32);
+        ce1 = 0;
+        if(Collation.isSimpleOrLongCE32(ce32)) {
+            ce0 = Collation.ceFromCE32(ce32);
+        } else {
+            switch(Collation.tagFromCE32(ce32)) {
+            case Collation.LATIN_EXPANSION_TAG:
+                ce0 = Collation.latinCE0FromCE32(ce32);
+                ce1 = Collation.latinCE1FromCE32(ce32);
+                break;
+            case Collation.EXPANSION32_TAG: {
+                int index = Collation.indexFromCE32(ce32);
+                int length = Collation.lengthFromCE32(ce32);
+                if(length <= 2) {
+                    ce0 = Collation.ceFromCE32(data.ce32s[index]);
+                    if(length == 2) {
+                        ce1 = Collation.ceFromCE32(data.ce32s[index + 1]);
+                    }
+                    break;
+                } else {
+                    return false; // only expansions of at most two CEs are supported
+                }
+            }
+            case Collation.EXPANSION_TAG: {
+                int index = Collation.indexFromCE32(ce32);
+                int length = Collation.lengthFromCE32(ce32);
+                if(length <= 2) {
+                    ce0 = data.ces[index];
+                    if(length == 2) {
+                        ce1 = data.ces[index + 1];
+                    }
+                    break;
+                } else {
+                    return false;
+                }
+            }
+            // Note: We could support PREFIX_TAG (assert c>=0)
+            // by recursing on its default CE32 and checking that none of the prefixes starts
+            // with a fast Latin character.
+            // However, currently (2013) there are only the L-before-middle-dot
+            // prefix mappings in the Latin range, and those would be rejected anyway.
+            case Collation.CONTRACTION_TAG:
+                assert(c >= 0);
+                return getCEsFromContractionCE32(data, ce32);
+            case Collation.OFFSET_TAG:
+                assert(c >= 0);
+                ce0 = data.getCEFromOffsetCE32(c, ce32);
+                break;
+            default:
+                return false;
+            }
+        }
+        // A mapping can be completely ignorable.
+        if(ce0 == 0) { return ce1 == 0; }
+        // We do not support an ignorable ce0 unless it is completely ignorable.
+        long p0 = ce0 >>> 32;
+        if(p0 == 0) { return false; }
+        // We only support primaries up to the Latin script.
+        if(p0 > lastLatinPrimary) { return false; }
+        // We support non-common secondary and case weights only together with short primaries.
+        int lower32_0 = (int)ce0;
+        if(p0 < firstShortPrimary) {
+            int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK;
+            if(sc0 != Collation.COMMON_SECONDARY_CE) { return false; }
+        }
+        // No below-common tertiary weights.
+        if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
+        if(ce1 != 0) {
+            // Both primaries must be in the same group,
+            // or both must get short mini primaries,
+            // or a short-primary CE is followed by a secondary CE.
+            // This is so that we can test the first primary and use the same mask for both,
+            // and determine for both whether they are variable.
+            long p1 = ce1 >>> 32;
+            if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return false; }
+            int lower32_1 = (int)ce1;
+            // No tertiary CEs.
+            if((lower32_1 >>> 16) == 0) { return false; }
+            // We support non-common secondary and case weights
+            // only for secondary CEs or together with short primaries.
+            if(p1 != 0 && p1 < firstShortPrimary) {
+                int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK;
+                if(sc1 != Collation.COMMON_SECONDARY_CE) { return false; }
+            }
+            // No below-common tertiary weights.
+            // Fix: check the SECOND CE's lower 32 bits here (lower32_1).
+            // The original tested lower32_0 again, which was already validated
+            // above, leaving ce1's tertiary weight unchecked.
+            if((lower32_1 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
+        }
+        // No quaternary weights.
+        if(((ce0 | ce1) & Collation.QUATERNARY_MASK) != 0) { return false; }
+        return true;
+    }
+
+    /**
+     * Collects the CEs for a contraction: the default (no-suffix) mapping
+     * followed by one entry per encodable single-character suffix.
+     * Sets ce0 to a marker value whose low bits index into contractionCEs.
+     *
+     * @return true (a contraction character always gets a contraction list)
+     */
+    private boolean getCEsFromContractionCE32(CollationData data, int ce32) {
+        int trieIndex = Collation.indexFromCE32(ce32);
+        ce32 = data.getCE32FromContexts(trieIndex); // Default if no suffix match.
+        // Since the original ce32 is not a prefix mapping,
+        // the default ce32 must not be another contraction.
+        assert(!Collation.isContractionCE32(ce32));
+        int contractionIndex = contractionCEs.size();
+        if(getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
+            addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
+        } else {
+            // Bail out for c-without-contraction.
+            addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NO_CE, 0);
+        }
+        // Handle an encodable contraction unless the next contraction is too long
+        // and starts with the same character.
+        int prevX = -1;
+        boolean addContraction = false;
+        CharsTrie.Iterator suffixes = CharsTrie.iterator(data.contexts, trieIndex + 2, 0);
+        while(suffixes.hasNext()) {
+            CharsTrie.Entry entry = suffixes.next();
+            CharSequence suffix = entry.chars;
+            int x = CollationFastLatin.getCharIndex(suffix.charAt(0));
+            if(x < 0) { continue; } // ignore anything but fast Latin text
+            if(x == prevX) {
+                if(addContraction) {
+                    // Bail out for all contractions starting with this character.
+                    addContractionEntry(x, Collation.NO_CE, 0);
+                    addContraction = false;
+                }
+                continue;
+            }
+            if(addContraction) {
+                // Flush the previous character's pending entry.
+                addContractionEntry(prevX, ce0, ce1);
+            }
+            ce32 = entry.value;
+            if(suffix.length() == 1 && getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
+                addContraction = true;
+            } else {
+                addContractionEntry(x, Collation.NO_CE, 0);
+                addContraction = false;
+            }
+            prevX = x;
+        }
+        if(addContraction) {
+            addContractionEntry(prevX, ce0, ce1);
+        }
+        // Note: There might not be any fast Latin contractions, but
+        // we need to enter contraction handling anyway so that we can bail out
+        // when there is a non-fast-Latin character following.
+        // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
+        // following umlaut and bail out, rather than return the difference of Y vs. u.
+        // NOTE(review): assumes Collation.NO_CE_PRIMARY is declared as a long
+        // so that << 32 is a 64-bit shift; confirm its declared type.
+        ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;
+        ce1 = 0;
+        return true;
+    }
+
+    /**
+     * Appends one (suffix char, ce0, ce1) triple to the contraction list
+     * and registers both CEs as unique CEs.
+     */
+    private void addContractionEntry(int x, long cce0, long cce1) {
+        contractionCEs.addElement(x);
+        contractionCEs.addElement(cce0);
+        addUniqueCE(cce0);
+        contractionCEs.addElement(cce1);
+        addUniqueCE(cce1);
+    }
+
+    /**
+     * Inserts ce (with its case bits blanked out) into the sorted uniqueCEs
+     * list, unless it is ignorable, a bail-out marker, or already present.
+     */
+    private void addUniqueCE(long ce) {
+        if(ce == 0) { return; }
+        if((ce >>> 32) == Collation.NO_CE_PRIMARY) { return; }
+        long key = ce & ~(long)Collation.CASE_MASK; // blank out case bits
+        int pos = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), key);
+        if(pos < 0) {
+            uniqueCEs.insertElementAt(key, ~pos);
+        }
+    }
+
+    /** Looks up the mini CE previously assigned to the given CE. */
+    private int getMiniCE(long ce) {
+        long key = ce & ~(long)Collation.CASE_MASK; // uniqueCEs stores no case bits
+        int pos = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), key);
+        assert(pos >= 0);
+        return miniCEs[pos];
+    }
+
+    /**
+     * Assigns a 16-bit mini CE to each unique CE, walking them in sorted
+     * order, and records the last long mini primary per reordering group in
+     * the header entries. Sets shortPrimaryOverflow (and BAIL_OUT mini CEs)
+     * when the short mini primaries run out.
+     */
+    private void encodeUniqueCEs() {
+        miniCEs = new char[uniqueCEs.size()];
+        int group = 1; // header index of the first below-digits group entry
+        long lastGroupByte = result.charAt(group);
+        // The lowest unique CE must be at least a secondary CE.
+        assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0);
+        long prevPrimary = 0;
+        int prevSecondary = 0;
+        int pri = 0;
+        int sec = 0;
+        int ter = CollationFastLatin.COMMON_TER;
+        for(int i = 0; i < uniqueCEs.size(); ++i) {
+            long ce = uniqueCEs.elementAti(i);
+            // Note: At least one of the p/s/t weights changes from one unique CE to the next.
+            // (uniqueCEs does not store case bits.)
+            long p = ce >>> 32;
+            if(p != prevPrimary) {
+                int p1 = (int)(p >> 24);
+                while(p1 > lastGroupByte) {
+                    assert(pri <= CollationFastLatin.MAX_LONG);
+                    // Add the last "long primary" in or before the group
+                    // into the upper 9 bits of the group entry.
+                    result.setCharAt(group, (char)((pri << 4) | lastGroupByte));
+                    if(++group < headerLength) { // group is 1-based
+                        lastGroupByte = result.charAt(group);
+                    } else {
+                        lastGroupByte = 0xff; // past the last group: no more boundaries
+                        break;
+                    }
+                }
+                if(p < firstShortPrimary) {
+                    // Assign the next long mini primary.
+                    if(pri == 0) {
+                        pri = CollationFastLatin.MIN_LONG;
+                    } else if(pri < CollationFastLatin.MAX_LONG) {
+                        pri += CollationFastLatin.LONG_INC;
+                    } else {
+                        /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
+                        printf("long-primary overflow for %08x\n", p);
+                        #endif */
+                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
+                        continue;
+                    }
+                } else {
+                    // Assign the next short mini primary.
+                    if(pri < CollationFastLatin.MIN_SHORT) {
+                        pri = CollationFastLatin.MIN_SHORT;
+                    } else if(pri < (CollationFastLatin.MAX_SHORT - CollationFastLatin.SHORT_INC)) {
+                        // Reserve the highest primary weight for U+FFFF.
+                        pri += CollationFastLatin.SHORT_INC;
+                    } else {
+                        /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
+                        printf("short-primary overflow for %08x\n", p);
+                        #endif */
+                        shortPrimaryOverflow = true;
+                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
+                        continue;
+                    }
+                }
+                prevPrimary = p;
+                prevSecondary = Collation.COMMON_WEIGHT16;
+                sec = CollationFastLatin.COMMON_SEC;
+                ter = CollationFastLatin.COMMON_TER;
+            }
+            int lower32 = (int)ce;
+            int s = lower32 >>> 16;
+            if(s != prevSecondary) {
+                if(pri == 0) {
+                    // Secondary CE (no primary): assign a "high" mini secondary.
+                    if(sec == 0) {
+                        sec = CollationFastLatin.MIN_SEC_HIGH;
+                    } else if(sec < CollationFastLatin.MAX_SEC_HIGH) {
+                        sec += CollationFastLatin.SEC_INC;
+                    } else {
+                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
+                        continue;
+                    }
+                    prevSecondary = s;
+                    ter = CollationFastLatin.COMMON_TER;
+                } else if(s < Collation.COMMON_WEIGHT16) {
+                    // Below-common secondary weight.
+                    if(sec == CollationFastLatin.COMMON_SEC) {
+                        sec = CollationFastLatin.MIN_SEC_BEFORE;
+                    } else if(sec < CollationFastLatin.MAX_SEC_BEFORE) {
+                        sec += CollationFastLatin.SEC_INC;
+                    } else {
+                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
+                        continue;
+                    }
+                } else if(s == Collation.COMMON_WEIGHT16) {
+                    sec = CollationFastLatin.COMMON_SEC;
+                } else {
+                    // Above-common secondary weight.
+                    if(sec < CollationFastLatin.MIN_SEC_AFTER) {
+                        sec = CollationFastLatin.MIN_SEC_AFTER;
+                    } else if(sec < CollationFastLatin.MAX_SEC_AFTER) {
+                        sec += CollationFastLatin.SEC_INC;
+                    } else {
+                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
+                        continue;
+                    }
+                }
+                prevSecondary = s;
+                ter = CollationFastLatin.COMMON_TER;
+            }
+            assert((lower32 & Collation.CASE_MASK) == 0); // blanked out in uniqueCEs
+            int t = lower32 & Collation.ONLY_TERTIARY_MASK;
+            if(t > Collation.COMMON_WEIGHT16) {
+                if(ter < CollationFastLatin.MAX_TER_AFTER) {
+                    ++ter;
+                } else {
+                    miniCEs[i] = CollationFastLatin.BAIL_OUT;
+                    continue;
+                }
+            }
+            if(CollationFastLatin.MIN_LONG <= pri && pri <= CollationFastLatin.MAX_LONG) {
+                // Long mini primaries have no room for an explicit secondary weight.
+                assert(sec == CollationFastLatin.COMMON_SEC);
+                miniCEs[i] = (char)(pri | ter);
+            } else {
+                miniCEs[i] = (char)(pri | sec | ter);
+            }
+        }
+        /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
+        printf("last mini primary: %04x\n", pri);
+        #endif */
+        /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
+        for(int i = 0; i < uniqueCEs.size(); ++i) {
+            long ce = uniqueCEs.elementAti(i);
+            printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
+        }
+        #endif */
+    }
+
+    /**
+     * Writes the per-character table of mini CEs (one unit per fast-Latin
+     * character), appending two-unit expansions after the table.
+     * Contraction characters are handled later by encodeContractions().
+     */
+    private void encodeCharCEs() {
+        int miniCEsStart = result.length();
+        for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
+            // Note: append(0) appends the decimal string "0" (one char '0'),
+            // not U+0000. Only the length matters here: every slot is
+            // overwritten below or in encodeContractions().
+            result.append(0); // initialize to completely ignorable
+        }
+        int indexBase = result.length();
+        for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
+            long ce = charCEs[i][0];
+            if(isContractionCharCE(ce)) { continue; } // defer contraction
+            int miniCE = encodeTwoCEs(ce, charCEs[i][1]);
+            if((miniCE >>> 16) > 0) { // if ((unsigned)miniCE > 0xffff)
+                // Did not fit into one unit: encode as a two-unit expansion.
+                // Note: There is a chance that this new expansion is the same as a previous one,
+                // and if so, then we could reuse the other expansion.
+                // However, that seems unlikely.
+                int expansionIndex = result.length() - indexBase;
+                if(expansionIndex > CollationFastLatin.INDEX_MASK) {
+                    miniCE = CollationFastLatin.BAIL_OUT;
+                } else {
+                    result.append((char)(miniCE >> 16)).append((char)miniCE);
+                    miniCE = CollationFastLatin.EXPANSION | expansionIndex;
+                }
+            }
+            result.setCharAt(miniCEsStart + i, (char)miniCE);
+        }
+    }
+
+    /**
+     * Writes the contraction lists and patches each contraction character's
+     * table entry to point at its list (or to BAIL_OUT on index overflow).
+     */
+    private void encodeContractions() {
+        // We encode all contraction lists so that the first word of a list
+        // terminates the previous list, and we only need one additional terminator at the end.
+        int indexBase = headerLength + CollationFastLatin.NUM_FAST_CHARS;
+        int firstContractionIndex = result.length();
+        for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
+            long ce = charCEs[i][0];
+            if(!isContractionCharCE(ce)) { continue; }
+            int contractionIndex = result.length() - indexBase;
+            if(contractionIndex > CollationFastLatin.INDEX_MASK) {
+                result.setCharAt(headerLength + i, (char) CollationFastLatin.BAIL_OUT);
+                continue;
+            }
+            boolean firstTriple = true;
+            // The low bits of ce (below CONTRACTION_FLAG) index into contractionCEs.
+            for(int index = (int)ce & 0x7fffffff;; index += 3) {
+                long x = contractionCEs.elementAti(index);
+                if(x == CollationFastLatin.CONTR_CHAR_MASK && !firstTriple) { break; }
+                long cce0 = contractionCEs.elementAti(index + 1);
+                long cce1 = contractionCEs.elementAti(index + 2);
+                int miniCE = encodeTwoCEs(cce0, cce1);
+                if(miniCE == CollationFastLatin.BAIL_OUT) {
+                    // length 1: suffix char only, no CEs -> bail out at runtime
+                    result.append((char)(x | (1 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
+                } else if(miniCE <= 0xffff) {
+                    // length 2: suffix char + one mini CE
+                    result.append((char)(x | (2 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
+                    result.append((char)miniCE);
+                } else {
+                    // length 3: suffix char + a two-unit expansion
+                    result.append((char)(x | (3 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
+                    result.append((char)(miniCE >> 16)).append((char)miniCE);
+                }
+                firstTriple = false;
+            }
+            // Note: There is a chance that this new contraction list is the same as a previous one,
+            // and if so, then we could truncate the result and reuse the other list.
+            // However, that seems unlikely.
+            result.setCharAt(headerLength + i,
+                    (char)(CollationFastLatin.CONTRACTION | contractionIndex));
+        }
+        if(result.length() > firstContractionIndex) {
+            // Terminate the last contraction list.
+            result.append((char)CollationFastLatin.CONTR_CHAR_MASK);
+        }
+        /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
+        printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
+        puts("   header & below-digit groups map");
+        int i = 0;
+        for(; i < headerLength; ++i) {
+            printf(" %04x", result[i]);
+        }
+        printf("\n   char mini CEs");
+        assert(CollationFastLatin.NUM_FAST_CHARS % 16 == 0);
+        for(; i < indexBase; i += 16) {
+            int c = i - headerLength;
+            if(c >= CollationFastLatin.LATIN_LIMIT) {
+                c = CollationFastLatin.PUNCT_START + c - CollationFastLatin.LATIN_LIMIT;
+            }
+            printf("\n %04x:", c);
+            for(int j = 0; j < 16; ++j) {
+                printf(" %04x", result[i + j]);
+            }
+        }
+        printf("\n   expansions & contractions");
+        for(; i < result.length(); ++i) {
+            if((i - indexBase) % 16 == 0) { puts(""); }
+            printf(" %04x", result[i]);
+        }
+        puts("");
+        #endif */
+    }
+
+    /**
+     * Combines up to two CEs into mini-CE form.
+     *
+     * @return 0 if completely ignorable, BAIL_OUT if not encodable,
+     *         a single mini CE in the low 16 bits when the pair fits into one,
+     *         otherwise (first mini CE) &lt;&lt; 16 | (second mini CE)
+     */
+    private int encodeTwoCEs(long first, long second) {
+        if(first == 0) {
+            return 0; // completely ignorable
+        }
+        if(first == Collation.NO_CE) {
+            return CollationFastLatin.BAIL_OUT;
+        }
+        assert((first >>> 32) != Collation.NO_CE_PRIMARY);
+
+        int miniCE = getMiniCE(first);
+        if(miniCE == CollationFastLatin.BAIL_OUT) { return miniCE; }
+        if(miniCE >= CollationFastLatin.MIN_SHORT) {
+            // Extract & copy the case bits.
+            // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
+            int c = (((int)first & Collation.CASE_MASK) >> (14 - 3));
+            // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
+            c += CollationFastLatin.LOWER_CASE;
+            miniCE |= c;
+        }
+        if(second == 0) { return miniCE; }
+
+        int miniCE1 = getMiniCE(second);
+        if(miniCE1 == CollationFastLatin.BAIL_OUT) { return miniCE1; }
+
+        int case1 = (int)second & Collation.CASE_MASK;
+        if(miniCE >= CollationFastLatin.MIN_SHORT &&
+                (miniCE & CollationFastLatin.SECONDARY_MASK) == CollationFastLatin.COMMON_SEC) {
+            // Try to combine the two mini CEs into one.
+            int sec1 = miniCE1 & CollationFastLatin.SECONDARY_MASK;
+            int ter1 = miniCE1 & CollationFastLatin.TERTIARY_MASK;
+            if(sec1 >= CollationFastLatin.MIN_SEC_HIGH && case1 == 0 &&
+                    ter1 == CollationFastLatin.COMMON_TER) {
+                // sec1>=sec_high implies pri1==0.
+                return (miniCE & ~CollationFastLatin.SECONDARY_MASK) | sec1;
+            }
+        }
+
+        if(miniCE1 <= CollationFastLatin.SECONDARY_MASK || CollationFastLatin.MIN_SHORT <= miniCE1) {
+            // Secondary CE, or a CE with a short primary, copy the case bits.
+            case1 = (case1 >> (14 - 3)) + CollationFastLatin.LOWER_CASE;
+            miniCE1 |= case1;
+        }
+        return (miniCE << 16) | miniCE1;
+    }
+
+    /**
+     * True if ce is the special marker value whose low bits index into the
+     * contraction list (NO_CE itself also carries NO_CE_PRIMARY, so it is
+     * excluded explicitly).
+     */
+    private static boolean isContractionCharCE(long ce) {
+        if(ce == Collation.NO_CE) { return false; }
+        return (ce >>> 32) == Collation.NO_CE_PRIMARY;
+    }
+
+    /** Flag in a charCEs[i][0] value: the low bits hold a contractionCEs index. */
+    private static final long CONTRACTION_FLAG = 0x80000000L;
+
+    // temporary "buffer": the current mapping's CEs, set by getCEsFromCE32()
+    private long ce0, ce1;
+
+    // Up to two CEs per fast-Latin character; see getCEs().
+    private long[][] charCEs = new long[CollationFastLatin.NUM_FAST_CHARS][2];
+
+    // Triples of (suffix char, ce0, ce1) per contraction list; see addContractionEntry().
+    private UVector64 contractionCEs;
+    // Sorted list of unique, case-blanked CEs; parallel to miniCEs.
+    private UVector64 uniqueCEs;
+
+    /** One 16-bit mini CE per unique CE. */
+    private char[] miniCEs;
+
+    // These are constant for a given list of CollationData.scripts.
+    private long firstDigitPrimary;
+    private long firstLatinPrimary;
+    private long lastLatinPrimary;
+    // This determines the first normal primary weight which is mapped to
+    // a short mini primary. It must be >=firstDigitPrimary.
+    private long firstShortPrimary;
+
+    // Set by encodeUniqueCEs() when the short mini primaries run out.
+    private boolean shortPrimaryOverflow;
+
+    // Header followed by the table; exposed via getHeader()/getTable().
+    private StringBuilder result = new StringBuilder();
+    private int headerLength;
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2010-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationIterator.java, ported from collationiterator.h/.cpp
+*
+* C++ version created on: 2010oct27
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.impl.Normalizer2Impl.Hangul;
+import com.ibm.icu.impl.Trie2_32;
+import com.ibm.icu.util.BytesTrie;
+import com.ibm.icu.util.CharsTrie;
+
+/**
+ * Collation element iterator and abstract character iterator.
+ *
+ * When a method returns a code point value, it must be in 0..10FFFF,
+ * except it can be negative as a sentinel value.
+ */
+public abstract class CollationIterator {
+    /** Growable buffer of 64-bit collation elements. */
+    private static final class CEBuffer {
+        /** Large enough for CEs of most short strings. */
+        private static final int INITIAL_CAPACITY = 40;
+
+        CEBuffer() {}
+
+        // Appends ce, growing the buffer if necessary.
+        void append(long ce) {
+            if(length >= INITIAL_CAPACITY) {
+                ensureAppendCapacity(1);
+            }
+            buffer[length++] = ce;
+        }
+
+        // Appends ce without a capacity check; caller must have ensured capacity.
+        void appendUnsafe(long ce) {
+            buffer[length++] = ce;
+        }
+
+        // Grows the buffer (quadrupling while below 1000 units, doubling after)
+        // until appCap more elements fit.
+        void ensureAppendCapacity(int appCap) {
+            int capacity = buffer.length;
+            if((length + appCap) <= capacity) { return; }
+            do {
+                if(capacity < 1000) {
+                    capacity *= 4;
+                } else {
+                    capacity *= 2;
+                }
+            } while(capacity < (length + appCap));
+            long[] newBuffer = new long[capacity];
+            System.arraycopy(buffer, 0, newBuffer, 0, length);
+            buffer = newBuffer;
+        }
+
+        void incLength() {
+            // Use INITIAL_CAPACITY for a very simple fastpath.
+            // (Rather than buffer.getCapacity().)
+            if(length >= INITIAL_CAPACITY) {
+                ensureAppendCapacity(1);
+            }
+            ++length;
+        }
+
+        // Stores ce at index i; does not grow the buffer or adjust length.
+        long set(int i, long ce) {
+            return buffer[i] = ce;
+        }
+        long get(int i) { return buffer[i]; }
+
+        // Direct access to the backing array; valid entries are 0..length-1.
+        long[] getCEs() { return buffer; }
+
+        int length = 0;
+
+        private long[] buffer = new long[INITIAL_CAPACITY];
+    }
+
+    // State of combining marks skipped in discontiguous contraction.
+    // We create a state object on first use and keep it around deactivated between uses.
+    private static final class SkippedState {
+        // Born active but empty.
+        SkippedState() {}
+
+        // Deactivates this state: discards previously skipped marks.
+        void clear() {
+            oldBuffer.setLength(0);
+            pos = 0;
+            // The newBuffer is reset by setFirstSkipped().
+        }
+
+        boolean isEmpty() { return oldBuffer.length() == 0; }
+
+        boolean hasNext() { return pos < oldBuffer.length(); }
+
+        // Returns the next skipped code point. Requires hasNext().
+        int next() {
+            int c = oldBuffer.codePointAt(pos);
+            pos += Character.charCount(c);
+            return c;
+        }
+
+        // Accounts for one more input code point read beyond the end of the marks buffer.
+        void incBeyond() {
+            assert(!hasNext());
+            ++pos;
+        }
+
+        // Goes backward through the skipped-marks buffer.
+        // Returns the number of code points read beyond the skipped marks
+        // that need to be backtracked through normal input.
+        int backwardNumCodePoints(int n) {
+            int length = oldBuffer.length();
+            int beyond = pos - length;
+            if(beyond > 0) {
+                if(beyond >= n) {
+                    // Not back far enough to re-enter the oldBuffer.
+                    pos -= n;
+                    return n;
+                } else {
+                    // Back out all beyond-oldBuffer code points and re-enter the buffer.
+                    pos = oldBuffer.offsetByCodePoints(length, beyond - n);
+                    return beyond;
+                }
+            } else {
+                // Go backwards from inside the oldBuffer.
+                pos = oldBuffer.offsetByCodePoints(pos, -n);
+                return 0;
+            }
+        }
+
+        // Starts a fresh list of newly skipped marks with c as its first element.
+        void setFirstSkipped(int c) {
+            skipLengthAtMatch = 0;
+            newBuffer.setLength(0);
+            newBuffer.appendCodePoint(c);
+        }
+
+        // Adds c to the list of newly skipped marks.
+        void skip(int c) {
+            newBuffer.appendCodePoint(c);
+        }
+
+        // Remembers how much of newBuffer belongs to the current match.
+        void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
+
+        // Replaces the characters we consumed with the newly skipped ones.
+        void replaceMatch() {
+            // Note: UnicodeString.replace() pins pos to at most length().
+            int oldLength = oldBuffer.length();
+            if(pos > oldLength) { pos = oldLength; }
+            oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch);
+            pos = 0;
+        }
+
+        void saveTrieState(CharsTrie trie) { trie.saveState(state); }
+        void resetToTrieState(CharsTrie trie) { trie.resetToState(state); }
+
+        // Combining marks skipped in previous discontiguous-contraction matching.
+        // After that discontiguous contraction was completed, we start reading them from here.
+        private final StringBuilder oldBuffer = new StringBuilder();
+        // Combining marks newly skipped in current discontiguous-contraction matching.
+        // These might have been read from the normal text or from the oldBuffer.
+        private final StringBuilder newBuffer = new StringBuilder();
+        // Reading index in oldBuffer,
+        // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
+        private int pos;
+        // newBuffer.length() at the time of the last matching character.
+        // When a partial match fails, we back out skipped and partial-matching input characters.
+        private int skipLengthAtMatch;
+        // We save the trie state before we attempt to match a character,
+        // so that we can skip it and try the next one.
+        private CharsTrie.State state = new CharsTrie.State();
+    };
+
+ /**
+ * Partially constructs the iterator.
+ * In Java, we cache partially constructed iterators
+ * and finish their setup when starting to work on text
+ * (via reset(boolean) and the setText(numeric, ...) methods of subclasses).
+ * This avoids memory allocations for iterators that remain unused.
+ *
+ * <p>In C++, there is only one constructor, and iterators are
+ * stack-allocated as needed.
+ */
+ public CollationIterator(CollationData d) {
+ trie = d.trie;
+ data = d;
+ numCpFwd = -1;
+ isNumeric = false;
+ ceBuffer = null;
+ }
+
+ public CollationIterator(CollationData d, boolean numeric) {
+ trie = d.trie;
+ data = d;
+ numCpFwd = -1;
+ isNumeric = numeric;
+ ceBuffer = new CEBuffer();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ // Subclasses: Call this method and then add more specific checks.
+ // Compare the iterator state but not the collation data (trie & data fields):
+ // Assume that the caller compares the data.
+ // Ignore skipped since that should be unused between calls to nextCE().
+ // (It only stays around to avoid another memory allocation.)
+ if(!this.getClass().equals(other.getClass())) { return false; }
+ CollationIterator o = (CollationIterator)other;
+ if(!(ceBuffer.length == o.ceBuffer.length &&
+ cesIndex == o.cesIndex &&
+ numCpFwd == o.numCpFwd &&
+ isNumeric == o.isNumeric)) {
+ return false;
+ }
+ for(int i = 0; i < ceBuffer.length; ++i) {
+ if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; }
+ }
+ return true;
+ }
+
    /**
     * Resets the iterator state and sets the position to the specified offset.
     * Subclasses must implement, and must call the parent class method,
     * or CollationIterator.reset().
     */
    public abstract void resetToOffset(int newOffset);

    /** Returns the offset into the text corresponding to the current iteration position. */
    public abstract int getOffset();
+
    /**
     * Returns the next collation element.
     * Fast paths for simple and long-primary CE32s are inlined here;
     * all other specials are delegated to nextCEFromCE32().
     */
    public final long nextCE() {
        if(cesIndex < ceBuffer.length) {
            // Return the next buffered CE.
            return ceBuffer.get(cesIndex++);
        }
        assert cesIndex == ceBuffer.length;
        // Reserve a slot optimistically; nextCEFromCE32() undoes this if it appends itself.
        ceBuffer.incLength();
        long cAndCE32 = handleNextCE32();
        int c = (int)(cAndCE32 >> 32);
        int ce32 = (int)cAndCE32;
        int t = ce32 & 0xff;
        if(t < Collation.SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
            // Normal CE from the main data.
            // Forced-inline of ceFromSimpleCE32(ce32).
            return ceBuffer.set(cesIndex++,
                    ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
        }
        CollationData d;
        // The compiler should be able to optimize the previous and the following
        // comparisons of t with the same constant.
        if(t == Collation.SPECIAL_CE32_LOW_BYTE) {
            if(c < 0) {
                // End of text: deliver the terminating NO_CE.
                return ceBuffer.set(cesIndex++, Collation.NO_CE);
            }
            d = data.base;
            ce32 = d.getCE32(c);
            t = ce32 & 0xff;
            if(t < Collation.SPECIAL_CE32_LOW_BYTE) {
                // Normal CE from the base data.
                return ceBuffer.set(cesIndex++,
                        ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
            }
        } else {
            d = data;
        }
        if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) {
            // Forced-inline of ceFromLongPrimaryCE32(ce32).
            return ceBuffer.set(cesIndex++,
                    ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE);
        }
        return nextCEFromCE32(d, c, ce32);
    }
+
+ /**
+ * Fetches all CEs.
+ * @return getCEsLength()
+ */
+ public final int fetchCEs() {
+ while(nextCE() != Collation.NO_CE) {
+ // No need to loop for each expansion CE.
+ cesIndex = ceBuffer.length;
+ }
+ return ceBuffer.length;
+ }
+
+ /**
+ * Overwrites the current CE (the last one returned by nextCE()).
+ */
+ final void setCurrentCE(long ce) {
+ assert cesIndex > 0;
+ ceBuffer.set(cesIndex - 1, ce);
+ }
+
+ /**
+ * Returns the previous collation element.
+ */
+ public final long previousCE(UVector32 offsets) {
+ if(ceBuffer.length > 0) {
+ // Return the previous buffered CE.
+ return ceBuffer.get(--ceBuffer.length);
+ }
+ offsets.removeAllElements();
+ int limitOffset = getOffset();
+ int c = previousCodePoint();
+ if(c < 0) { return Collation.NO_CE; }
+ if(data.isUnsafeBackward(c, isNumeric)) {
+ return previousCEUnsafe(c, offsets);
+ }
+ // Simple, safe-backwards iteration:
+ // Get a CE going backwards, handle prefixes but no contractions.
+ int ce32 = data.getCE32(c);
+ CollationData d;
+ if(ce32 == Collation.FALLBACK_CE32) {
+ d = data.base;
+ ce32 = d.getCE32(c);
+ } else {
+ d = data;
+ }
+ if(Collation.isSimpleOrLongCE32(ce32)) {
+ return Collation.ceFromCE32(ce32);
+ }
+ appendCEsFromCE32(d, c, ce32, false);
+ if(ceBuffer.length > 1) {
+ offsets.addElement(getOffset());
+ // For an expansion, the offset of each non-initial CE is the limit offset,
+ // consistent with forward iteration.
+ while(offsets.size() <= ceBuffer.length) {
+ offsets.addElement(limitOffset);
+ };
+ }
+ return ceBuffer.get(--ceBuffer.length);
+ }
+
    /** Returns the number of CEs in the buffer. */
    public final int getCEsLength() {
        return ceBuffer.length;
    }

    /** Returns the i-th buffered CE. */
    public final long getCE(int i) {
        return ceBuffer.get(i);
    }

    /** Returns the buffered CEs as an array (see CEBuffer.getCEs()). */
    public final long[] getCEs() {
        return ceBuffer.getCEs();
    }

    /** Discards all buffered CEs and resets the read index. */
    final void clearCEs() {
        cesIndex = ceBuffer.length = 0;
    }

    /** Clears the CE buffer if every buffered CE has already been read. */
    public final void clearCEsIfNoneRemaining() {
        if(cesIndex == ceBuffer.length) { clearCEs(); }
    }
+
    /**
     * Returns the next code point (with post-increment),
     * or a negative value at the end of the text.
     * Public for identical-level comparison and for testing.
     */
    public abstract int nextCodePoint();

    /**
     * Returns the previous code point (with pre-decrement),
     * or a negative value at the start of the text.
     * Public for identical-level comparison and for testing.
     */
    public abstract int previousCodePoint();
+
+ protected final void reset() {
+ cesIndex = ceBuffer.length = 0;
+ if(skipped != null) { skipped.clear(); }
+ }
+ /**
+ * Resets the state as well as the numeric setting,
+ * and completes the initialization.
+ * Only exists in Java where we reset cached CollationIterator instances
+ * rather than stack-allocating temporary ones.
+ * (See also the constructor comments.)
+ */
+ protected final void reset(boolean numeric) {
+ if(ceBuffer == null) {
+ ceBuffer = new CEBuffer();
+ }
+ reset();
+ isNumeric = numeric;
+ }
+
    /**
     * Returns the next code point and its local CE32 value.
     * Returns Collation.FALLBACK_CE32 at the end of the text (c&lt;0)
     * or when c's CE32 value is to be looked up in the base data (fallback).
     *
     * The code point is used for fallbacks, context and implicit weights.
     * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
     *
     * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0.
     */
    protected long handleNextCE32() {
        int c = nextCodePoint();
        if(c < 0) { return NO_CP_AND_CE32; }
        return makeCodePointAndCE32Pair(c, data.getCE32(c));
    }
    // Packs c into the upper 32 bits and the (unsigned) ce32 into the lower 32 bits.
    protected long makeCodePointAndCE32Pair(int c, int ce32) {
        return ((long)c << 32) | (ce32 & 0xffffffffL);
    }
    // End-of-text sentinel pair: c=-1 combined with FALLBACK_CE32.
    protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL);
+
    /**
     * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
     * Returns the trail surrogate in that case and advances past it,
     * if a trail surrogate follows the lead surrogate.
     * Otherwise returns any other code unit and does not advance.
     * This base implementation returns 0 (no trail surrogate); subclasses override.
     */
    protected char handleGetTrailSurrogate() {
        return 0;
    }

    /**
     * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
     * (Not needed in Java.)
     */
    /*protected boolean foundNULTerminator() {
        return false;
    }*/

    /**
     * @return false if surrogate code points U+D800..U+DFFF
     *         map to their own implicit primary weights (for UTF-16),
     *         or true if they map to CE(U+FFFD) (for UTF-8)
     */
    protected boolean forbidSurrogateCodePoints() {
        return false;
    }
+
    /** Advances the text position by num code points (toward the end). */
    protected abstract void forwardNumCodePoints(int num);

    /** Moves the text position back by num code points (toward the start). */
    protected abstract void backwardNumCodePoints(int num);

    /**
     * Returns the CE32 from the data trie.
     * Normally the same as data.getCE32(), but overridden in the builder.
     * Call this only when the faster data.getCE32() cannot be used.
     */
    protected int getDataCE32(int c) {
        return data.getCE32(c);
    }

    /**
     * Resolves a BUILDER_DATA_TAG CE32.
     * Only meaningful when overridden (by the builder); unreachable here.
     */
    protected int getCE32FromBuilderData(int ce32) {
        throw new RuntimeException("internal program error: should be unreachable");
    }
+
    /**
     * Appends the CE(s) for code point c with (possibly special) ce32 to the ceBuffer.
     * Resolves special CE32 tags iteratively (expansions, contractions, prefixes,
     * Hangul, digits, surrogates, offsets, implicits); the final non-special CE32
     * falls out of the loop and is appended as a simple CE.
     */
    protected final void appendCEsFromCE32(CollationData d, int c, int ce32,
            boolean forward) {
        while(Collation.isSpecialCE32(ce32)) {
            switch(Collation.tagFromCE32(ce32)) {
            case Collation.FALLBACK_TAG:
            case Collation.RESERVED_TAG_3:
                throw new RuntimeException("internal program error: should be unreachable");
            case Collation.LONG_PRIMARY_TAG:
                ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32));
                return;
            case Collation.LONG_SECONDARY_TAG:
                ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32));
                return;
            case Collation.LATIN_EXPANSION_TAG:
                // Two CEs encoded directly in the ce32.
                ceBuffer.ensureAppendCapacity(2);
                ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32));
                ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32));
                ceBuffer.length += 2;
                return;
            case Collation.EXPANSION32_TAG: {
                int index = Collation.indexFromCE32(ce32);
                int length = Collation.lengthFromCE32(ce32);
                ceBuffer.ensureAppendCapacity(length);
                do {
                    ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++]));
                } while(--length > 0);
                return;
            }
            case Collation.EXPANSION_TAG: {
                int index = Collation.indexFromCE32(ce32);
                int length = Collation.lengthFromCE32(ce32);
                ceBuffer.ensureAppendCapacity(length);
                do {
                    ceBuffer.appendUnsafe(d.ces[index++]);
                } while(--length > 0);
                return;
            }
            case Collation.BUILDER_DATA_TAG:
                ce32 = getCE32FromBuilderData(ce32);
                if(ce32 == Collation.FALLBACK_CE32) {
                    d = data.base;
                    ce32 = d.getCE32(c);
                }
                break;
            case Collation.PREFIX_TAG:
                // Temporarily step back over c so that prefix matching starts before it.
                if(forward) { backwardNumCodePoints(1); }
                ce32 = getCE32FromPrefix(d, ce32);
                if(forward) { forwardNumCodePoints(1); }
                break;
            case Collation.CONTRACTION_TAG: {
                int index = Collation.indexFromCE32(ce32);
                int defaultCE32 = d.getCE32FromContexts(index);  // Default if no suffix match.
                if(!forward) {
                    // Backward contractions are handled by previousCEUnsafe().
                    // c has contractions but they were not found.
                    ce32 = defaultCE32;
                    break;
                }
                int nextCp;
                if(skipped == null && numCpFwd < 0) {
                    // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
                    // avoiding the function call and the nextSkippedCodePoint() overhead.
                    nextCp = nextCodePoint();
                    if(nextCp < 0) {
                        // No more text.
                        ce32 = defaultCE32;
                        break;
                    } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
                            !CollationFCD.mayHaveLccc(nextCp)) {
                        // All contraction suffixes start with characters with lccc!=0
                        // but the next code point has lccc==0.
                        backwardNumCodePoints(1);
                        ce32 = defaultCE32;
                        break;
                    }
                } else {
                    nextCp = nextSkippedCodePoint();
                    if(nextCp < 0) {
                        // No more text.
                        ce32 = defaultCE32;
                        break;
                    } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
                            !CollationFCD.mayHaveLccc(nextCp)) {
                        // All contraction suffixes start with characters with lccc!=0
                        // but the next code point has lccc==0.
                        backwardNumSkipped(1);
                        ce32 = defaultCE32;
                        break;
                    }
                }
                ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp);
                if(ce32 == Collation.NO_CE32) {
                    // CEs from a discontiguous contraction plus the skipped combining marks
                    // have been appended already.
                    return;
                }
                break;
            }
            case Collation.DIGIT_TAG:
                if(isNumeric) {
                    appendNumericCEs(ce32, forward);
                    return;
                } else {
                    // Fetch the non-numeric-collation CE32 and continue.
                    ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
                    break;
                }
            case Collation.U0000_TAG:
                assert(c == 0);
                // NUL-terminated input not supported in Java.
                // Fetch the normal ce32 for U+0000 and continue.
                ce32 = d.ce32s[0];
                break;
            case Collation.HANGUL_TAG: {
                int[] jamoCE32s = d.jamoCE32s;
                // Decompose the Hangul syllable into L/V/T Jamo indexes.
                c -= Hangul.HANGUL_BASE;
                int t = c % Hangul.JAMO_T_COUNT;
                c /= Hangul.JAMO_T_COUNT;
                int v = c % Hangul.JAMO_V_COUNT;
                c /= Hangul.JAMO_V_COUNT;
                if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) {
                    // None of the Jamo CE32s are isSpecialCE32().
                    // Avoid recursive function calls and per-Jamo tests.
                    ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3);
                    ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c]));
                    ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v]));
                    ceBuffer.length += 2;
                    if(t != 0) {
                        ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t]));
                    }
                    return;
                } else {
                    // We should not need to compute each Jamo code point.
                    // In particular, there should be no offset or implicit ce32.
                    appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward);
                    appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward);
                    if(t == 0) { return; }
                    // offset 39 = 19 + 21 - 1:
                    // 19 = JAMO_L_COUNT
                    // 21 = JAMO_T_COUNT
                    // -1 = omit t==0
                    ce32 = jamoCE32s[39 + t];
                    c = Collation.SENTINEL_CP;
                    break;
                }
            }
            case Collation.LEAD_SURROGATE_TAG: {
                assert(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
                assert(isLeadSurrogate(c));
                char trail;
                if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) {
                    c = Character.toCodePoint((char)c, trail);
                    ce32 &= Collation.LEAD_TYPE_MASK;
                    if(ce32 == Collation.LEAD_ALL_UNASSIGNED) {
                        ce32 = Collation.UNASSIGNED_CE32;  // unassigned-implicit
                    } else if(ce32 == Collation.LEAD_ALL_FALLBACK ||
                            (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) {
                        // fall back to the base data
                        d = d.base;
                        ce32 = d.getCE32FromSupplementary(c);
                    }
                } else {
                    // c is an unpaired surrogate.
                    ce32 = Collation.UNASSIGNED_CE32;
                }
                break;
            }
            case Collation.OFFSET_TAG:
                assert(c >= 0);
                ceBuffer.append(d.getCEFromOffsetCE32(c, ce32));
                return;
            case Collation.IMPLICIT_TAG:
                assert(c >= 0);
                if(isSurrogate(c) && forbidSurrogateCodePoints()) {
                    ce32 = Collation.FFFD_CE32;
                    break;
                } else {
                    ceBuffer.append(Collation.unassignedCEFromCodePoint(c));
                    return;
                }
            }
        }
        ceBuffer.append(Collation.ceFromSimpleCE32(ce32));
    }
+
+ // TODO: Propose widening the UTF16 method.
+ private static final boolean isSurrogate(int c) {
+ return (c & 0xfffff800) == 0xd800;
+ }
+
+ // TODO: Propose widening the UTF16 method.
+ protected static final boolean isLeadSurrogate(int c) {
+ return (c & 0xfffffc00) == 0xd800;
+ }
+
+ // TODO: Propose widening the UTF16 method.
+ protected static final boolean isTrailSurrogate(int c) {
+ return (c & 0xfffffc00) == 0xdc00;
+ }
+
    // Main lookup trie of the data object.
    protected final Trie2_32 trie;
    // Tailoring (or root) data; lookups fall back to data.base where needed.
    protected final CollationData data;
+
    /**
     * Appends the CEs for a special ce32 and returns the first newly appended CE.
     * Undoes the optimistic ceBuffer.incLength() performed by nextCE().
     */
    private final long nextCEFromCE32(CollationData d, int c, int ce32) {
        --ceBuffer.length;  // Undo ceBuffer.incLength().
        appendCEsFromCE32(d, c, ce32, true);
        return ceBuffer.get(cesIndex++);
    }
+
    /**
     * Matches the longest prefix (looking backward through the text)
     * and returns its CE32, or the no-prefix default.
     * Restores the text position before returning.
     */
    private final int getCE32FromPrefix(CollationData d, int ce32) {
        int index = Collation.indexFromCE32(ce32);
        ce32 = d.getCE32FromContexts(index);  // Default if no prefix match.
        index += 2;
        // Number of code points read before the original code point.
        int lookBehind = 0;
        CharsTrie prefixes = new CharsTrie(d.contexts, index);
        for(;;) {
            int c = previousCodePoint();
            if(c < 0) { break; }
            ++lookBehind;
            BytesTrie.Result match = prefixes.nextForCodePoint(c);
            if(match.hasValue()) {
                // Remember the longest match so far; keep looking for a longer one.
                ce32 = prefixes.getValue();
            }
            if(!match.hasNext()) { break; }
        }
        // Return to the original position.
        forwardNumCodePoints(lookBehind);
        return ce32;
    }
+
    /**
     * Returns the next code point, reading first from the skipped-marks buffer
     * (if active) and then from the normal text; honors the numCpFwd limit.
     */
    private final int nextSkippedCodePoint() {
        if(skipped != null && skipped.hasNext()) { return skipped.next(); }
        if(numCpFwd == 0) { return Collation.SENTINEL_CP; }
        int c = nextCodePoint();
        if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); }
        if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
        return c;
    }
+
    /**
     * Moves back n code points, going through the skipped-marks buffer first
     * (if active) and then through the normal text; restores the numCpFwd budget
     * for the code points backed out of the normal text.
     */
    private final void backwardNumSkipped(int n) {
        if(skipped != null && !skipped.isEmpty()) {
            // n becomes the number of code points to back out of the normal text.
            n = skipped.backwardNumCodePoints(n);
        }
        backwardNumCodePoints(n);
        if(numCpFwd >= 0) { numCpFwd += n; }
    }
+
    /**
     * Matches the longest contraction suffix starting with c.
     * Returns the CE32 for the longest match (or the no-suffix default ce32),
     * or Collation.NO_CE32 when a discontiguous contraction has already
     * appended its CEs to the ceBuffer.
     */
    private final int nextCE32FromContraction(
            CollationData d, int contractionCE32,
            CharSequence trieChars, int trieOffset, int ce32, int c) {
        // c: next code point after the original one

        // Number of code points read beyond the original code point.
        // Needed for discontiguous contraction matching.
        int lookAhead = 1;
        // Number of code points read since the last match (initially only c).
        int sinceMatch = 1;
        // Normally we only need a contiguous match,
        // and therefore need not remember the suffixes state from before a mismatch for retrying.
        // If we are already processing skipped combining marks, then we do track the state.
        CharsTrie suffixes = new CharsTrie(trieChars, trieOffset);
        if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
        BytesTrie.Result match = suffixes.firstForCodePoint(c);
        for(;;) {
            int nextCp;
            if(match.hasValue()) {
                ce32 = suffixes.getValue();
                if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) {
                    return ce32;
                }
                if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
                sinceMatch = 1;
            } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) {
                // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text.
                // Back up if necessary, and try a discontiguous contraction.
                if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 &&
                        // Discontiguous contraction matching extends an existing match.
                        // If there is no match yet, then there is nothing to do.
                        ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
                            sinceMatch < lookAhead)) {
                    // The last character of at least one suffix has lccc!=0,
                    // allowing for discontiguous contractions.
                    // UCA S2.1.1 only processes non-starters immediately following
                    // "a match in the table" (sinceMatch=1).
                    if(sinceMatch > 1) {
                        // Return to the state after the last match.
                        // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
                        backwardNumSkipped(sinceMatch);
                        c = nextSkippedCodePoint();
                        lookAhead -= sinceMatch - 1;
                        sinceMatch = 1;
                    }
                    if(d.getFCD16(c) > 0xff) {
                        return nextCE32FromDiscontiguousContraction(
                            d, suffixes, ce32, lookAhead, c);
                    }
                }
                break;
            } else {
                // Continue after partial match (BytesTrie.Result.NO_VALUE) for c.
                // It does not have a result value, therefore it is not itself "a match in the table".
                // If a partially-matched c has ccc!=0 then
                // it might be skipped in discontiguous contraction.
                c = nextCp;
                ++sinceMatch;
            }
            ++lookAhead;
            match = suffixes.nextForCodePoint(c);
        }
        // Return to the state after the last match, then deliver that match's ce32.
        backwardNumSkipped(sinceMatch);
        return ce32;
    }
+
    /**
     * Extends an existing contraction match by skipping blocked/non-matching
     * combining marks, per UCA S2.1.1-S2.1.3.
     * May append CEs (the contraction's plus those of the skipped marks) directly
     * and return Collation.NO_CE32; otherwise returns the (possibly extended) ce32.
     */
    private final int nextCE32FromDiscontiguousContraction(
            CollationData d, CharsTrie suffixes, int ce32,
            int lookAhead, int c) {
        // UCA section 3.3.2 Contractions:
        // Contractions that end with non-starter characters
        // are known as discontiguous contractions.
        // ... discontiguous contractions must be detected in input text
        // whenever the final sequence of non-starter characters could be rearranged
        // so as to make a contiguous matching sequence that is canonically equivalent.

        // UCA: http://www.unicode.org/reports/tr10/#S2.1
        // S2.1 Find the longest initial substring S at each point that has a match in the table.
        // S2.1.1 If there are any non-starters following S, process each non-starter C.
        // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
        // Note: A non-starter in a string is called blocked
        // if there is another non-starter of the same canonical combining class or zero
        // between it and the last character of canonical combining class 0.
        // S2.1.3 If there is a match, replace S by S + C, and remove C.

        // First: Is a discontiguous contraction even possible?
        int fcd16 = d.getFCD16(c);
        assert(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
        int nextCp = nextSkippedCodePoint();
        if(nextCp < 0) {
            // No further text.
            backwardNumSkipped(1);
            return ce32;
        }
        ++lookAhead;
        int prevCC = fcd16 & 0xff;
        fcd16 = d.getFCD16(nextCp);
        if(fcd16 <= 0xff) {
            // The next code point after c is a starter (S2.1.1 "process each non-starter").
            backwardNumSkipped(2);
            return ce32;
        }

        // We have read and matched (lookAhead-2) code points,
        // read non-matching c and peeked ahead at nextCp.
        // Return to the state before the mismatch and continue matching with nextCp.
        if(skipped == null || skipped.isEmpty()) {
            if(skipped == null) {
                skipped = new SkippedState();
            }
            suffixes.reset();
            if(lookAhead > 2) {
                // Replay the partial match so far.
                backwardNumCodePoints(lookAhead);
                suffixes.firstForCodePoint(nextCodePoint());
                for(int i = 3; i < lookAhead; ++i) {
                    suffixes.nextForCodePoint(nextCodePoint());
                }
                // Skip c (which did not match) and nextCp (which we will try now).
                forwardNumCodePoints(2);
            }
            skipped.saveTrieState(suffixes);
        } else {
            // Reset to the trie state before the failed match of c.
            skipped.resetToTrieState(suffixes);
        }

        skipped.setFirstSkipped(c);
        // Number of code points read since the last match (at this point: c and nextCp).
        int sinceMatch = 2;
        c = nextCp;
        for(;;) {
            BytesTrie.Result match;
            // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
            if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) {
                // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
                // Keep prevCC unchanged.
                ce32 = suffixes.getValue();
                sinceMatch = 0;
                skipped.recordMatch();
                if(!match.hasNext()) { break; }
                skipped.saveTrieState(suffixes);
            } else {
                // No match for "S + C", skip C.
                skipped.skip(c);
                skipped.resetToTrieState(suffixes);
                prevCC = fcd16 & 0xff;
            }
            if((c = nextSkippedCodePoint()) < 0) { break; }
            ++sinceMatch;
            fcd16 = d.getFCD16(c);
            if(fcd16 <= 0xff) {
                // The next code point after c is a starter (S2.1.1 "process each non-starter").
                break;
            }
        }
        backwardNumSkipped(sinceMatch);
        boolean isTopDiscontiguous = skipped.isEmpty();
        skipped.replaceMatch();
        if(isTopDiscontiguous && !skipped.isEmpty()) {
            // We did get a match after skipping one or more combining marks,
            // and we are not in a recursive discontiguous contraction.
            // Append CEs from the contraction ce32
            // and then from the combining marks that we skipped before the match.
            c = Collation.SENTINEL_CP;
            for(;;) {
                appendCEsFromCE32(d, c, ce32, true);
                // Fetch CE32s for skipped combining marks from the normal data, with fallback,
                // rather than from the CollationData where we found the contraction.
                if(!skipped.hasNext()) { break; }
                c = skipped.next();
                ce32 = getDataCE32(c);
                if(ce32 == Collation.FALLBACK_CE32) {
                    d = data.base;
                    ce32 = d.getCE32(c);
                } else {
                    d = data;
                }
                // Note: A nested discontiguous-contraction match
                // replaces consumed combining marks with newly skipped ones
                // and resets the reading position to the beginning.
            }
            skipped.clear();
            ce32 = Collation.NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
        }
        return ce32;
    }
+
+ /**
+ * Returns the previous CE when data.isUnsafeBackward(c, isNumeric).
+ */
+ private final long previousCEUnsafe(int c, UVector32 offsets) {
+ // We just move through the input counting safe and unsafe code points
+ // without collecting the unsafe-backward substring into a buffer and
+ // switching to it.
+ // This is to keep the logic simple. Otherwise we would have to handle
+ // prefix matching going before the backward buffer, switching
+ // to iteration and back, etc.
+ // In the most important case of iterating over a normal string,
+ // reading from the string itself is already maximally fast.
+ // The only drawback there is that after getting the CEs we always
+ // skip backward to the safe character rather than switching out
+ // of a backwardBuffer.
+ // But this should not be the common case for previousCE(),
+ // and correctness and maintainability are more important than
+ // complex optimizations.
+ // Find the first safe character before c.
+ int numBackward = 1;
+ while((c = previousCodePoint()) >= 0) {
+ ++numBackward;
+ if(!data.isUnsafeBackward(c, isNumeric)) {
+ break;
+ }
+ }
+ // Set the forward iteration limit.
+ // Note: This counts code points.
+ // We cannot enforce a limit in the middle of a surrogate pair or similar.
+ numCpFwd = numBackward;
+ // Reset the forward iterator.
+ cesIndex = 0;
+ assert(ceBuffer.length == 0);
+ // Go forward and collect the CEs.
+ int offset = getOffset();
+ while(numCpFwd > 0) {
+ // nextCE() normally reads one code point.
+ // Contraction matching and digit specials read more and check numCpFwd.
+ --numCpFwd;
+ // Append one or more CEs to the ceBuffer.
+ nextCE();
+ assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE);
+ // No need to loop for getting each expansion CE from nextCE().
+ cesIndex = ceBuffer.length;
+ // However, we need to write an offset for each CE.
+ // This is for CollationElementIterator.getOffset() to return
+ // intermediate offsets from the unsafe-backwards segment.
+ assert(offsets.size() < ceBuffer.length);
+ offsets.addElement(offset);
+ // For an expansion, the offset of each non-initial CE is the limit offset,
+ // consistent with forward iteration.
+ offset = getOffset();
+ while(offsets.size() < ceBuffer.length) {
+ offsets.addElement(offset);
+ };
+ }
+ assert(offsets.size() == ceBuffer.length);
+ // End offset corresponding to just after the unsafe-backwards segment.
+ offsets.addElement(offset);
+ // Reset the forward iteration limit
+ // and move backward to before the segment for which we fetched CEs.
+ numCpFwd = -1;
+ backwardNumCodePoints(numBackward);
+ // Use the collected CEs and return the last one.
+ cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented.
+ return ceBuffer.get(--ceBuffer.length);
+ }
+
    /**
     * Turns a string of digits (bytes 0..9)
     * into a sequence of CEs that will sort in numeric order.
     *
     * Starts from this ce32's digit value and consumes the following/preceding digits.
     * The digits string must not be empty and must not have leading zeros.
     */
    private final void appendNumericCEs(int ce32, boolean forward) {
        // Collect digits.
        // Note: The digits StringBuilder stores digit *values* 0..9, not ASCII '0'..'9'.
        // TODO: Use some kind of a byte buffer? We only store values 0..9.
        StringBuilder digits = new StringBuilder();
        if(forward) {
            for(;;) {
                char digit = Collation.digitFromCE32(ce32);
                digits.append(digit);
                if(numCpFwd == 0) { break; }
                int c = nextCodePoint();
                if(c < 0) { break; }
                ce32 = data.getCE32(c);
                if(ce32 == Collation.FALLBACK_CE32) {
                    ce32 = data.base.getCE32(c);
                }
                if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
                    // Not a digit: put it back and stop collecting.
                    backwardNumCodePoints(1);
                    break;
                }
                if(numCpFwd > 0) { --numCpFwd; }
            }
        } else {
            for(;;) {
                char digit = Collation.digitFromCE32(ce32);
                digits.append(digit);
                int c = previousCodePoint();
                if(c < 0) { break; }
                ce32 = data.getCE32(c);
                if(ce32 == Collation.FALLBACK_CE32) {
                    ce32 = data.base.getCE32(c);
                }
                if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
                    // Not a digit: put it back and stop collecting.
                    forwardNumCodePoints(1);
                    break;
                }
            }
            // Reverse the digit string.
            digits.reverse();
        }
        int pos = 0;
        do {
            // Skip leading zeros.
            while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; }
            // Write a sequence of CEs for at most 254 digits at a time.
            int segmentLength = digits.length() - pos;
            if(segmentLength > 254) { segmentLength = 254; }
            appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength));
            pos += segmentLength;
        } while(pos < digits.length());
    }
+
    /**
     * Turns 1..254 digits into a sequence of CEs.
     * Called by appendNumericCEs() for each segment of at most 254 digits.
     * The digits CharSequence contains digit values 0..9 (not ASCII).
     */
    private final void appendNumericSegmentCEs(CharSequence digits) {
        int length = digits.length();
        assert(1 <= length && length <= 254);
        assert(length == 1 || digits.charAt(0) != 0);
        long numericPrimary = data.numericPrimary;
        // Note: We use primary byte values 2..255: digits are not compressible.
        if(length <= 7) {
            // Very dense encoding for small numbers.
            int value = digits.charAt(0);
            for(int i = 1; i < length; ++i) {
                value = value * 10 + digits.charAt(i);
            }
            // Primary weight second byte values:
            //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
            //     40 byte values  76..115 for medium numbers in three-byte primary weights.
            //     16 byte values 116..131 for large numbers in four-byte primary weights.
            //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
            int firstByte = 2;
            int numBytes = 74;
            if(value < numBytes) {
                // Two-byte primary for 0..73, good for day & month numbers etc.
                long primary = numericPrimary | ((firstByte + value) << 16);
                ceBuffer.append(Collation.makeCE(primary));
                return;
            }
            value -= numBytes;
            firstByte += numBytes;
            numBytes = 40;
            if(value < numBytes * 254) {
                // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
                long primary = numericPrimary |
                    ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
                ceBuffer.append(Collation.makeCE(primary));
                return;
            }
            value -= numBytes * 254;
            firstByte += numBytes;
            numBytes = 16;
            if(value < numBytes * 254 * 254) {
                // Four-byte primary for 10234..1042489=10234+16*254*254-1.
                long primary = numericPrimary | (2 + value % 254);
                value /= 254;
                primary |= (2 + value % 254) << 8;
                value /= 254;
                primary |= (firstByte + value % 254) << 16;
                ceBuffer.append(Collation.makeCE(primary));
                return;
            }
            // original value > 1042489
        }
        assert(length >= 7);

        // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
        // then we generate primary bytes with those pairs.
        // Omit trailing 00 pairs.
        // Decrement the value for the last pair.

        // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255.
        int numPairs = (length + 1) / 2;
        long primary = numericPrimary | ((132 - 4 + numPairs) << 16);
        // Find the length without trailing 00 pairs.
        while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) {
            length -= 2;
        }
        // Read the first pair.
        int pair;
        int pos;
        if((length & 1) != 0) {
            // Only "half a pair" if we have an odd number of digits.
            pair = digits.charAt(0);
            pos = 1;
        } else {
            pair = digits.charAt(0) * 10 + digits.charAt(1);
            pos = 2;
        }
        pair = 11 + 2 * pair;
        // Add the pairs of digits between pos and length.
        int shift = 8;
        while(pos < length) {
            if(shift == 0) {
                // Every three pairs/bytes we need to store a 4-byte-primary CE
                // and start with a new CE with the '0' primary lead byte.
                primary |= pair;
                ceBuffer.append(Collation.makeCE(primary));
                primary = numericPrimary;
                shift = 16;
            } else {
                primary |= pair << shift;
                shift -= 8;
            }
            pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1));
            pos += 2;
        }
        // The last pair is decremented so that numbers with trailing zeros sort correctly.
        primary |= (pair - 1) << shift;
        ceBuffer.append(Collation.makeCE(primary));
    }
+
+ private CEBuffer ceBuffer;
+ private int cesIndex;
+
+ private SkippedState skipped;
+
+ // Number of code points to read forward, or -1.
+ // Used as a forward iteration limit in previousCEUnsafe().
+ private int numCpFwd;
+ // Numeric collation (CollationSettings.NUMERIC).
+ private boolean isNumeric;
+}
--- /dev/null
+/*
+ *******************************************************************************
+ * Copyright (C) 2012-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ * CollationKeys.java, ported from collationkeys.h/.cpp
+ *
+ * C++ version created on: 2012sep02
+ * created by: Markus W. Scherer
+ */
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.text.Collator;
+
+public final class CollationKeys /* all methods are static */ {
+
+ // Java porting note: C++ SortKeyByteSink class extends a common class ByteSink,
+ // which is not available in Java. We don't need a super class created for implementing
+ // collation features.
+    public static abstract class SortKeyByteSink {
+        protected byte[] buffer_;
+        // protected int capacity_; == buffer_.length
+        // Total number of bytes appended so far, including bytes that did not fit
+        // into buffer_; compare with buffer_.length to detect overflow.
+        private int appended_ = 0;
+        // not used in Java -- private int ignore_ = 0;
+
+        public SortKeyByteSink(byte[] dest) {
+            buffer_ = dest;
+        }
+
+        /**
+         * Needed in Java for when we write to the buffer directly.
+         * In C++, the SortKeyByteSink is a subclass of ByteSink and lower-level code can write to that.
+         * TODO: Can we make Java SortKeyByteSink have-a ByteArrayWrapper and write through to it?
+         * Or maybe create interface ByteSink, have SortKeyByteSink implement it, and have BOCSU write to that??
+         */
+        public void setBufferAndAppended(byte[] dest, int app) {
+            buffer_ = dest;
+            appended_ = app;
+        }
+
+        /* not used in Java -- public void IgnoreBytes(int numIgnore) {
+            ignore_ = numIgnore;
+        } */
+
+        /**
+         * Appends the first n bytes of the array to the sink.
+         *
+         * @param bytes
+         *            the array of byte
+         * @param n
+         *            the length of bytes to be appended
+         */
+        public void Append(byte[] bytes, int n) {
+            if (n <= 0 || bytes == null) {
+                return;
+            }
+
+            /* not used in Java -- if (ignore_ > 0) {
+                int ignoreRest = ignore_ - n;
+                if (ignoreRest >= 0) {
+                    ignore_ = ignoreRest;
+                    return;
+                } else {
+                    start = ignore_;
+                    n = -ignoreRest;
+                    ignore_ = 0;
+                }
+            } */
+
+            // Count the bytes first; appended_ may exceed the capacity (overflow accounting).
+            int length = appended_;
+            appended_ += n;
+
+            int available = buffer_.length - length;
+            if (n <= available) {
+                System.arraycopy(bytes, 0, buffer_, length, n);
+            } else {
+                // Delegate to the subclass, which may grow the buffer or just count bytes.
+                AppendBeyondCapacity(bytes, 0, n, length);
+            }
+        }
+
+        /** Appends one byte; the low 8 bits of b are stored when there is room. */
+        public void Append(int b) {
+            /* not used in Java -- if (ignore_ > 0) {
+                --ignore_;
+            } else */ {
+                if (appended_ < buffer_.length || Resize(1, appended_)) {
+                    buffer_[appended_] = (byte) b;
+                }
+                // Counted even when the byte was not stored, so Overflowed() can report it.
+                ++appended_;
+            }
+        }
+
+        // Java porting note: This method is not used by collator implementation.
+        //
+        // virtual char *GetAppendBuffer(int min_capacity,
+        //         int desired_capacity_hint,
+        //         char *scratch, int scratch_capacity,
+        //         int *result_capacity);
+
+        public int NumberOfBytesAppended() {
+            return appended_;
+        }
+
+        public int GetRemainingCapacity() {
+            return /* not used in Java -- ignore_ + */ buffer_.length - appended_;
+        }
+
+        /** Returns true if more bytes were appended than the buffer can hold. */
+        public boolean Overflowed() {
+            return appended_ > buffer_.length;
+        }
+
+        /* not used in Java -- public boolean IsOk() {
+            return true;
+        } */
+
+        /**
+         * Called when an Append does not fit; the subclass decides whether to grow or drop.
+         *
+         * @param bytes
+         *            the array of byte
+         * @param start
+         *            the start index within the array to be appended
+         * @param n
+         *            the length of bytes to be appended
+         * @param length
+         *            the length of buffer required to store the entire data (i.e. already appended
+         *            bytes + bytes to be appended by this method)
+         */
+        protected abstract void AppendBeyondCapacity(byte[] bytes, int start, int n, int length);
+
+        protected abstract boolean Resize(int appendCapacity, int length);
+    }
+
+    // Hook for writeSortKeyUpToQuaternary(): lets a caller stop before any level
+    // beyond the primary is written (e.g. for partial sort keys).
+    public static class LevelCallback {
+        /**
+         * @param level
+         *            The next level about to be written to the ByteSink.
+         * @return true if the level is to be written (the base class implementation always returns
+         *         true)
+         */
+        boolean needToWrite(int level) {
+            return true;
+        }
+    }
+    // Default callback: writes every level.
+    public static final LevelCallback SIMPLE_LEVEL_FALLBACK = new LevelCallback();
+
+    // Growable byte buffer for one sort key level (secondary/case/tertiary/quaternary),
+    // accumulated while the primary level is written directly to the sink.
+    private static final class SortKeyLevel {
+        private static final int INITIAL_CAPACITY = 40;
+
+        byte[] buffer = new byte[INITIAL_CAPACITY];
+        int len = 0;
+        // not used in Java -- private static final boolean ok = true; // In C++ "ok" is reset when memory allocations fail.
+
+        SortKeyLevel() {
+        }
+
+        /* not used in Java -- boolean isOk() {
+            return ok;
+        } */
+
+        boolean isEmpty() {
+            return len == 0;
+        }
+
+        int length() {
+            return len;
+        }
+
+        // Java porting note: Java uses this instead of C++ operator [] overload
+        // uint8_t operator[](int index)
+        // NOTE(review): returns the raw signed byte while C++ returns uint8_t;
+        // callers comparing the result against constants must beware sign extension
+        // for stored values >= 0x80 -- verify against ICU4C collationkeys.cpp.
+        byte getAt(int index) {
+            return buffer[index];
+        }
+
+        byte[] data() {
+            return buffer;
+        }
+
+        // Appends the low 8 bits of b, growing the buffer as needed.
+        void appendByte(int b) {
+            if (len < buffer.length || ensureCapacity(1)) {
+                buffer[len++] = (byte) b;
+            }
+        }
+
+        // Appends a 16-bit weight, omitting a zero trailing byte.
+        void appendWeight16(int w) {
+            assert ((w & 0xffff) != 0);
+            byte b0 = (byte) (w >>> 8);
+            byte b1 = (byte) w;
+            int appendLength = (b1 == 0) ? 1 : 2;
+            if ((len + appendLength) <= buffer.length || ensureCapacity(appendLength)) {
+                buffer[len++] = b0;
+                if (b1 != 0) {
+                    buffer[len++] = b1;
+                }
+            }
+        }
+
+        // Appends a 32-bit weight, omitting trailing zero bytes.
+        void appendWeight32(long w) {
+            assert (w != 0);
+            byte[] bytes = new byte[] { (byte) (w >>> 24), (byte) (w >>> 16), (byte) (w >>> 8),
+                    (byte) w };
+            int appendLength = (bytes[1] == 0) ? 1 : (bytes[2] == 0) ? 2 : (bytes[3] == 0) ? 3 : 4;
+            if ((len + appendLength) <= buffer.length || ensureCapacity(appendLength)) {
+                buffer[len++] = bytes[0];
+                if (bytes[1] != 0) {
+                    buffer[len++] = bytes[1];
+                    if (bytes[2] != 0) {
+                        buffer[len++] = bytes[2];
+                        if (bytes[3] != 0) {
+                            buffer[len++] = bytes[3];
+                        }
+                    }
+                }
+            }
+        }
+
+        // Appends a 16-bit weight with its two bytes swapped; the whole level is
+        // re-reversed later for backwards (French) secondary ordering.
+        void appendReverseWeight16(int w) {
+            assert ((w & 0xffff) != 0);
+            byte b0 = (byte) (w >>> 8);
+            byte b1 = (byte) w;
+            int appendLength = (b1 == 0) ? 1 : 2;
+            if ((len + appendLength) <= buffer.length || ensureCapacity(appendLength)) {
+                if (b1 == 0) {
+                    buffer[len++] = b0;
+                } else {
+                    buffer[len] = b1;
+                    buffer[len + 1] = b0;
+                    len += 2;
+                }
+            }
+        }
+
+        // Appends all but the last byte to the sink. The last byte should be the 01 terminator.
+        void appendTo(SortKeyByteSink sink) {
+            assert (len > 0 && buffer[len - 1] == 1);
+            sink.Append(buffer, len - 1);
+        }
+
+        // Grows the buffer: at least double, at least len + 2 * appendCapacity,
+        // and at least 200 bytes. Always returns true in Java (allocation throws on failure).
+        private boolean ensureCapacity(int appendCapacity) {
+            /* not used in Java -- if (!ok) {
+                return false;
+            } */
+            int newCapacity = 2 * buffer.length;
+            int altCapacity = len + 2 * appendCapacity;
+            if (newCapacity < altCapacity) {
+                newCapacity = altCapacity;
+            }
+            if (newCapacity < 200) {
+                newCapacity = 200;
+            }
+            byte[] newbuf = new byte[newCapacity];
+            System.arraycopy(buffer, 0, newbuf, 0, len);
+            buffer = newbuf;
+
+            return true;
+        }
+    }
+
+    // Returns a fresh buffer for the given level if its bit is set in levels, else null
+    // (the level is then never touched because the corresponding flag checks guard all uses).
+    private static SortKeyLevel getSortKeyLevel(int levels, int level) {
+        return (levels & level) != 0 ? new SortKeyLevel() : null;
+    }
+
+    private CollationKeys() {
+    } // no instantiation: all members are static
+
+    // Secondary level: Compress up to 33 common weights as 05..25 or 25..45.
+    private static final int SEC_COMMON_LOW = Collation.COMMON_BYTE;
+    private static final int SEC_COMMON_MIDDLE = SEC_COMMON_LOW + 0x20;
+    static final int SEC_COMMON_HIGH = SEC_COMMON_LOW + 0x40; // read by CollationDataReader
+    private static final int SEC_COMMON_MAX_COUNT = 0x21;
+
+    // Case level, lowerFirst: Compress up to 7 common weights as 1..7 or 7..13.
+    private static final int CASE_LOWER_FIRST_COMMON_LOW = 1;
+    private static final int CASE_LOWER_FIRST_COMMON_MIDDLE = 7;
+    private static final int CASE_LOWER_FIRST_COMMON_HIGH = 13;
+    private static final int CASE_LOWER_FIRST_COMMON_MAX_COUNT = 7;
+
+    // Case level, upperFirst: Compress up to 13 common weights as 3..15.
+    private static final int CASE_UPPER_FIRST_COMMON_LOW = 3;
+    @SuppressWarnings("unused")
+    private static final int CASE_UPPER_FIRST_COMMON_HIGH = 15;
+    private static final int CASE_UPPER_FIRST_COMMON_MAX_COUNT = 13;
+
+    // Tertiary level only (no case): Compress up to 97 common weights as 05..65 or 65..C5.
+    private static final int TER_ONLY_COMMON_LOW = Collation.COMMON_BYTE;
+    private static final int TER_ONLY_COMMON_MIDDLE = TER_ONLY_COMMON_LOW + 0x60;
+    private static final int TER_ONLY_COMMON_HIGH = TER_ONLY_COMMON_LOW + 0xc0;
+    private static final int TER_ONLY_COMMON_MAX_COUNT = 0x61;
+
+    // Tertiary with case, lowerFirst: Compress up to 33 common weights as 05..25 or 25..45.
+    private static final int TER_LOWER_FIRST_COMMON_LOW = Collation.COMMON_BYTE;
+    private static final int TER_LOWER_FIRST_COMMON_MIDDLE = TER_LOWER_FIRST_COMMON_LOW + 0x20;
+    private static final int TER_LOWER_FIRST_COMMON_HIGH = TER_LOWER_FIRST_COMMON_LOW + 0x40;
+    private static final int TER_LOWER_FIRST_COMMON_MAX_COUNT = 0x21;
+
+    // Tertiary with case, upperFirst: Compress up to 33 common weights as 85..A5 or A5..C5.
+    private static final int TER_UPPER_FIRST_COMMON_LOW = Collation.COMMON_BYTE + 0x80;
+    private static final int TER_UPPER_FIRST_COMMON_MIDDLE = TER_UPPER_FIRST_COMMON_LOW + 0x20;
+    private static final int TER_UPPER_FIRST_COMMON_HIGH = TER_UPPER_FIRST_COMMON_LOW + 0x40;
+    private static final int TER_UPPER_FIRST_COMMON_MAX_COUNT = 0x21;
+
+    // Quaternary level: Compress up to 113 common weights as 1C..8C or 8C..FC.
+    private static final int QUAT_COMMON_LOW = 0x1c;
+    private static final int QUAT_COMMON_MIDDLE = QUAT_COMMON_LOW + 0x70;
+    private static final int QUAT_COMMON_HIGH = QUAT_COMMON_LOW + 0xE0;
+    private static final int QUAT_COMMON_MAX_COUNT = 0x71;
+    // Primary weights shifted to quaternary level must be encoded with
+    // a lead byte below the common-weight compression range.
+    private static final int QUAT_SHIFTED_LIMIT_BYTE = QUAT_COMMON_LOW - 1; // 0x1b
+
+    /**
+     * Map from collation strength (UColAttributeValue) to a mask of Collation.Level bits up to that
+     * strength, excluding the CASE_LEVEL which is independent of the strength, and excluding
+     * IDENTICAL_LEVEL which this function does not write.
+     */
+    private static final int levelMasks[] = new int[] {
+        2, // UCOL_PRIMARY -> PRIMARY_LEVEL
+        6, // UCOL_SECONDARY -> up to SECONDARY_LEVEL
+        0x16, // UCOL_TERTIARY -> up to TERTIARY_LEVEL
+        0x36, // UCOL_QUATERNARY -> up to QUATERNARY_LEVEL
+        // Unused strength slots between QUATERNARY and IDENTICAL.
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0,
+        0x36 // UCOL_IDENTICAL -> up to QUATERNARY_LEVEL
+    };
+
+    /**
+     * Writes the sort key bytes for minLevel up to the iterator data's strength. Optionally writes
+     * the case level. Stops writing levels when callback.needToWrite(level) returns false.
+     * Separates levels with the LEVEL_SEPARATOR_BYTE but does not write a TERMINATOR_BYTE.
+     *
+     * @param iter produces the CEs; consumed until the NO_CE sentinel
+     * @param compressibleBytes indexed by the un-reordered primary lead byte
+     * @param settings supplies strength, alternate handling, reordering and case options
+     * @param sink receives the primary level immediately and the other levels at the end
+     * @param minLevel lowest Collation.Level to write
+     * @param callback consulted before each level after the primary is written
+     * @param preflight true when the whole sort key length must be computed; false allows
+     *            an early return once the primary level has overflowed the sink
+     */
+    public static void writeSortKeyUpToQuaternary(CollationIterator iter, boolean[] compressibleBytes,
+            CollationSettings settings, SortKeyByteSink sink, int minLevel, LevelCallback callback,
+            boolean preflight) {
+
+        int options = settings.options;
+        // Set of levels to process and write.
+        int levels = levelMasks[CollationSettings.getStrength(options)];
+        if ((options & CollationSettings.CASE_LEVEL) != 0) {
+            levels |= Collation.CASE_LEVEL_FLAG;
+        }
+        // Minus the levels below minLevel.
+        levels &= ~((1 << minLevel) - 1);
+        if (levels == 0) {
+            return;
+        }
+
+        long variableTop;
+        if ((options & CollationSettings.ALTERNATE_MASK) == 0) {
+            variableTop = 0;
+        } else {
+            // +1 so that we can use "<" and primary ignorables test out early.
+            variableTop = settings.variableTop + 1;
+        }
+        byte[] reorderTable = settings.reorderTable;
+
+        int tertiaryMask = CollationSettings.getTertiaryMask(options);
+
+        // Scratch array for the 2nd..4th primary bytes, reused per CE.
+        byte[] p234 = new byte[3];
+        // Per-level buffers; null when the corresponding level bit is not set.
+        SortKeyLevel cases = getSortKeyLevel(levels, Collation.CASE_LEVEL_FLAG);
+        SortKeyLevel secondaries = getSortKeyLevel(levels, Collation.SECONDARY_LEVEL_FLAG);
+        SortKeyLevel tertiaries = getSortKeyLevel(levels, Collation.TERTIARY_LEVEL_FLAG);
+        SortKeyLevel quaternaries = getSortKeyLevel(levels, Collation.QUATERNARY_LEVEL_FLAG);
+
+        int compressedP1 = 0; // 0==no compression; otherwise reordered compressible lead byte
+        // Run-length counters for pending common weights on each level.
+        int commonCases = 0;
+        int commonSecondaries = 0;
+        int commonTertiaries = 0;
+        int commonQuaternaries = 0;
+
+        int prevSecondary = 0;
+        boolean anyMergeSeparators = false;
+
+        for (;;) {
+            // No need to keep all CEs in the buffer when we write a sort key.
+            iter.clearCEsIfNoneRemaining();
+            long ce = iter.nextCE();
+            long p = ce >>> 32;
+            if (p < variableTop && p > Collation.MERGE_SEPARATOR_PRIMARY) {
+                // Variable CE, shift it to quaternary level.
+                // Ignore all following primary ignorables, and shift further variable CEs.
+                if (commonQuaternaries != 0) {
+                    --commonQuaternaries;
+                    while (commonQuaternaries >= QUAT_COMMON_MAX_COUNT) {
+                        quaternaries.appendByte(QUAT_COMMON_MIDDLE);
+                        commonQuaternaries -= QUAT_COMMON_MAX_COUNT;
+                    }
+                    // Shifted primary weights are lower than the common weight.
+                    quaternaries.appendByte(QUAT_COMMON_LOW + commonQuaternaries);
+                    commonQuaternaries = 0;
+                }
+                do {
+                    if ((levels & Collation.QUATERNARY_LEVEL_FLAG) != 0) {
+                        int p1 = (int) p >>> 24;
+                        if (reorderTable != null) {
+                            p1 = reorderTable[p1] & 0xff;
+                        }
+                        if (p1 >= QUAT_SHIFTED_LIMIT_BYTE) {
+                            // Prevent shifted primary lead bytes from
+                            // overlapping with the common compression range.
+                            quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE);
+                        }
+                        quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff));
+                    }
+                    // Skip following primary ignorables.
+                    do {
+                        ce = iter.nextCE();
+                        p = ce >>> 32;
+                    } while (p == 0);
+                } while (p < variableTop && p > Collation.MERGE_SEPARATOR_PRIMARY);
+            }
+            // ce could be primary ignorable, or NO_CE, or the merge separator,
+            // or a regular primary CE, but it is not variable.
+            // If ce==NO_CE, then write nothing for the primary level but
+            // terminate compression on all levels and then exit the loop.
+            if (p > Collation.NO_CE_PRIMARY && (levels & Collation.PRIMARY_LEVEL_FLAG) != 0) {
+                int p1 = (int) p >>> 24;
+                if (reorderTable != null) {
+                    p1 = reorderTable[p1] & 0xff;
+                }
+                if (p1 != compressedP1) {
+                    if (compressedP1 != 0) {
+                        if (p1 < compressedP1) {
+                            // No primary compression terminator
+                            // at the end of the level or merged segment.
+                            if (p1 > Collation.MERGE_SEPARATOR_BYTE) {
+                                sink.Append(Collation.PRIMARY_COMPRESSION_LOW_BYTE);
+                            }
+                        } else {
+                            sink.Append(Collation.PRIMARY_COMPRESSION_HIGH_BYTE);
+                        }
+                    }
+                    sink.Append(p1);
+                    // Test the un-reordered lead byte for compressibility but
+                    // remember the reordered lead byte.
+                    if (compressibleBytes[(int) p >>> 24]) {
+                        compressedP1 = p1;
+                    } else {
+                        compressedP1 = 0;
+                    }
+                }
+                byte p2 = (byte) (p >>> 16);
+                if (p2 != 0) {
+                    // Append the 2nd..4th primary bytes, omitting trailing zeros.
+                    p234[0] = p2;
+                    p234[1] = (byte) (p >>> 8);
+                    p234[2] = (byte) p;
+                    sink.Append(p234, (p234[1] == 0) ? 1 : (p234[2] == 0) ? 2 : 3);
+                }
+                // Optimization for internalNextSortKeyPart():
+                // When the primary level overflows we can stop because we need not
+                // calculate (preflight) the whole sort key length.
+                if (!preflight && sink.Overflowed()) {
+                    // not used in Java -- if (!sink.IsOk()) {
+                    // Java porting note: U_MEMORY_ALLOCATION_ERROR is set here in
+                    // C implementation. IsOk() in Java always returns true, so this
+                    // is a dead code.
+                    return;
+                }
+            }
+
+            int lower32 = (int) ce;
+            if (lower32 == 0) {
+                continue;
+            } // completely ignorable, no secondary/case/tertiary/quaternary
+
+            if ((levels & Collation.SECONDARY_LEVEL_FLAG) != 0) {
+                int s = lower32 >>> 16; // 16 bits
+                if (s == 0) {
+                    // secondary ignorable
+                } else if (s == Collation.COMMON_WEIGHT16) {
+                    ++commonSecondaries;
+                } else if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) {
+                    // Forward secondary: flush any pending run of common weights first.
+                    if (commonSecondaries != 0) {
+                        --commonSecondaries;
+                        while (commonSecondaries >= SEC_COMMON_MAX_COUNT) {
+                            secondaries.appendByte(SEC_COMMON_MIDDLE);
+                            commonSecondaries -= SEC_COMMON_MAX_COUNT;
+                        }
+                        int b;
+                        if (s < Collation.COMMON_WEIGHT16) {
+                            b = SEC_COMMON_LOW + commonSecondaries;
+                        } else {
+                            b = SEC_COMMON_HIGH - commonSecondaries;
+                        }
+                        secondaries.appendByte(b);
+                        commonSecondaries = 0;
+                    }
+                    secondaries.appendWeight16(s);
+                } else {
+                    if (commonSecondaries != 0) {
+                        --commonSecondaries;
+                        // Append reverse weights. The level will be re-reversed later.
+                        int remainder = commonSecondaries % SEC_COMMON_MAX_COUNT;
+                        int b;
+                        if (prevSecondary < Collation.COMMON_WEIGHT16) {
+                            b = SEC_COMMON_LOW + remainder;
+                        } else {
+                            b = SEC_COMMON_HIGH - remainder;
+                        }
+                        secondaries.appendByte(b);
+                        commonSecondaries -= remainder;
+                        // commonSecondaries is now a multiple of SEC_COMMON_MAX_COUNT.
+                        while (commonSecondaries > 0) { // same as >= SEC_COMMON_MAX_COUNT
+                            secondaries.appendByte(SEC_COMMON_MIDDLE);
+                            commonSecondaries -= SEC_COMMON_MAX_COUNT;
+                        }
+                        // commonSecondaries == 0
+                    }
+                    // Reduce separators so that we can look for byte<=1 later.
+                    if (s <= Collation.MERGE_SEPARATOR_WEIGHT16) {
+                        if (s == Collation.MERGE_SEPARATOR_WEIGHT16) {
+                            anyMergeSeparators = true;
+                        }
+                        secondaries.appendByte((s >>> 8) - 1);
+                    } else {
+                        secondaries.appendReverseWeight16(s);
+                    }
+                    prevSecondary = s;
+                }
+            }
+
+            if ((levels & Collation.CASE_LEVEL_FLAG) != 0) {
+                if ((CollationSettings.getStrength(options) == Collator.PRIMARY) ? p == 0
+                        : (lower32 >>> 16) == 0) {
+                    // Primary+caseLevel: Ignore case level weights of primary ignorables.
+                    // Otherwise: Ignore case level weights of secondary ignorables.
+                    // For details see the comments in the CollationCompare class.
+                } else {
+                    int c = (lower32 >>> 8) & 0xff; // case bits & tertiary lead byte
+                    assert ((c & 0xc0) != 0xc0);
+                    if ((c & 0xc0) == 0 && c > Collation.MERGE_SEPARATOR_BYTE) {
+                        ++commonCases;
+                    } else {
+                        if ((options & CollationSettings.UPPER_FIRST) == 0) {
+                            // lowerFirst: Compress common weights to nibbles 1..7..13, mixed=14,
+                            // upper=15.
+                            if (commonCases != 0) {
+                                --commonCases;
+                                while (commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COUNT) {
+                                    cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE << 4);
+                                    commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT;
+                                }
+                                int b;
+                                if (c <= Collation.MERGE_SEPARATOR_BYTE) {
+                                    b = CASE_LOWER_FIRST_COMMON_LOW + commonCases;
+                                } else {
+                                    b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases;
+                                }
+                                cases.appendByte(b << 4);
+                                commonCases = 0;
+                            }
+                            if (c > Collation.MERGE_SEPARATOR_BYTE) {
+                                c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >>> 6)) << 4; // 14 or 15
+                            }
+                        } else {
+                            // upperFirst: Compress common weights to nibbles 3..15, mixed=2,
+                            // upper=1.
+                            // The compressed common case weights only go up from the "low" value
+                            // because with upperFirst the common weight is the highest one.
+                            if (commonCases != 0) {
+                                --commonCases;
+                                while (commonCases >= CASE_UPPER_FIRST_COMMON_MAX_COUNT) {
+                                    cases.appendByte(CASE_UPPER_FIRST_COMMON_LOW << 4);
+                                    commonCases -= CASE_UPPER_FIRST_COMMON_MAX_COUNT;
+                                }
+                                cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + commonCases) << 4);
+                                commonCases = 0;
+                            }
+                            if (c > Collation.MERGE_SEPARATOR_BYTE) {
+                                c = (CASE_UPPER_FIRST_COMMON_LOW - (c >>> 6)) << 4; // 2 or 1
+                            }
+                        }
+                        // c is a separator byte 01 or 02,
+                        // or a left-shifted nibble 0x10, 0x20, ... 0xf0.
+                        cases.appendByte(c);
+                    }
+                }
+            }
+
+            if ((levels & Collation.TERTIARY_LEVEL_FLAG) != 0) {
+                int t = lower32 & tertiaryMask;
+                assert ((lower32 & 0xc000) != 0xc000);
+                if (t == Collation.COMMON_WEIGHT16) {
+                    ++commonTertiaries;
+                } else if ((tertiaryMask & 0x8000) == 0) {
+                    // Tertiary weights without case bits.
+                    // Move lead bytes 06..3F to C6..FF for a large common-weight range.
+                    if (commonTertiaries != 0) {
+                        --commonTertiaries;
+                        while (commonTertiaries >= TER_ONLY_COMMON_MAX_COUNT) {
+                            tertiaries.appendByte(TER_ONLY_COMMON_MIDDLE);
+                            commonTertiaries -= TER_ONLY_COMMON_MAX_COUNT;
+                        }
+                        int b;
+                        if (t < Collation.COMMON_WEIGHT16) {
+                            b = TER_ONLY_COMMON_LOW + commonTertiaries;
+                        } else {
+                            b = TER_ONLY_COMMON_HIGH - commonTertiaries;
+                        }
+                        tertiaries.appendByte(b);
+                        commonTertiaries = 0;
+                    }
+                    if (t > Collation.COMMON_WEIGHT16) {
+                        t += 0xc000;
+                    }
+                    tertiaries.appendWeight16(t);
+                } else if ((options & CollationSettings.UPPER_FIRST) == 0) {
+                    // Tertiary weights with caseFirst=lowerFirst.
+                    // Move lead bytes 06..BF to 46..FF for the common-weight range.
+                    if (commonTertiaries != 0) {
+                        --commonTertiaries;
+                        while (commonTertiaries >= TER_LOWER_FIRST_COMMON_MAX_COUNT) {
+                            tertiaries.appendByte(TER_LOWER_FIRST_COMMON_MIDDLE);
+                            commonTertiaries -= TER_LOWER_FIRST_COMMON_MAX_COUNT;
+                        }
+                        int b;
+                        if (t < Collation.COMMON_WEIGHT16) {
+                            b = TER_LOWER_FIRST_COMMON_LOW + commonTertiaries;
+                        } else {
+                            b = TER_LOWER_FIRST_COMMON_HIGH - commonTertiaries;
+                        }
+                        tertiaries.appendByte(b);
+                        commonTertiaries = 0;
+                    }
+                    if (t > Collation.COMMON_WEIGHT16) {
+                        t += 0x4000;
+                    }
+                    tertiaries.appendWeight16(t);
+                } else {
+                    // Tertiary weights with caseFirst=upperFirst.
+                    // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
+                    // to keep tertiary CEs well-formed.
+                    // Their case+tertiary weights must be greater than those of
+                    // primary and secondary CEs.
+                    //
+                    // Separators 01..02 -> 01..02 (unchanged)
+                    // Lowercase 03..04 -> 83..84 (includes uncased)
+                    // Common weight 05 -> 85..C5 (common-weight compression range)
+                    // Lowercase 06..3F -> C6..FF
+                    // Mixed case 43..7F -> 43..7F
+                    // Uppercase 83..BF -> 03..3F
+                    // Tertiary CE 86..BF -> C6..FF
+                    if (t <= Collation.MERGE_SEPARATOR_WEIGHT16) {
+                        // Keep separators unchanged.
+                    } else if ((lower32 >>> 16) != 0) {
+                        // Invert case bits of primary & secondary CEs.
+                        t ^= 0xc000;
+                        if (t < (TER_UPPER_FIRST_COMMON_HIGH << 8)) {
+                            t -= 0x4000;
+                        }
+                    } else {
+                        // Keep uppercase bits of tertiary CEs.
+                        assert (0x8600 <= t && t <= 0xbfff);
+                        t += 0x4000;
+                    }
+                    if (commonTertiaries != 0) {
+                        --commonTertiaries;
+                        while (commonTertiaries >= TER_UPPER_FIRST_COMMON_MAX_COUNT) {
+                            tertiaries.appendByte(TER_UPPER_FIRST_COMMON_MIDDLE);
+                            commonTertiaries -= TER_UPPER_FIRST_COMMON_MAX_COUNT;
+                        }
+                        int b;
+                        if (t < (TER_UPPER_FIRST_COMMON_LOW << 8)) {
+                            b = TER_UPPER_FIRST_COMMON_LOW + commonTertiaries;
+                        } else {
+                            b = TER_UPPER_FIRST_COMMON_HIGH - commonTertiaries;
+                        }
+                        tertiaries.appendByte(b);
+                        commonTertiaries = 0;
+                    }
+                    tertiaries.appendWeight16(t);
+                }
+            }
+
+            if ((levels & Collation.QUATERNARY_LEVEL_FLAG) != 0) {
+                int q = lower32 & 0xffff;
+                if ((q & 0xc0) == 0 && q > Collation.MERGE_SEPARATOR_WEIGHT16) {
+                    ++commonQuaternaries;
+                } else if (q <= Collation.MERGE_SEPARATOR_WEIGHT16
+                        && (options & CollationSettings.ALTERNATE_MASK) == 0
+                        && (quaternaries.isEmpty() || quaternaries.getAt(quaternaries.length() - 1) == Collation.MERGE_SEPARATOR_BYTE)) {
+                    // If alternate=non-ignorable and there are only
+                    // common quaternary weights between two separators,
+                    // then we need not write anything between these separators.
+                    // The only weights greater than the merge separator and less than the common
+                    // weight
+                    // are shifted primary weights, which are not generated for
+                    // alternate=non-ignorable.
+                    // There are also exactly as many quaternary weights as tertiary weights,
+                    // so level length differences are handled already on tertiary level.
+                    // Any above-common quaternary weight will compare greater regardless.
+                    quaternaries.appendByte(q >>> 8);
+                } else {
+                    if (q <= Collation.MERGE_SEPARATOR_WEIGHT16) {
+                        q >>>= 8;
+                    } else {
+                        // Encode the two case/continuation bits as FC..FF.
+                        q = 0xfc + ((q >>> 6) & 3);
+                    }
+                    if (commonQuaternaries != 0) {
+                        --commonQuaternaries;
+                        while (commonQuaternaries >= QUAT_COMMON_MAX_COUNT) {
+                            quaternaries.appendByte(QUAT_COMMON_MIDDLE);
+                            commonQuaternaries -= QUAT_COMMON_MAX_COUNT;
+                        }
+                        int b;
+                        if (q < QUAT_COMMON_LOW) {
+                            b = QUAT_COMMON_LOW + commonQuaternaries;
+                        } else {
+                            b = QUAT_COMMON_HIGH - commonQuaternaries;
+                        }
+                        quaternaries.appendByte(b);
+                        commonQuaternaries = 0;
+                    }
+                    quaternaries.appendByte(q);
+                }
+            }
+
+            if ((lower32 >>> 24) == Collation.LEVEL_SEPARATOR_BYTE) {
+                break;
+            } // ce == NO_CE
+        }
+
+        // Append the beyond-primary levels.
+        // not used in Java -- boolean ok = true;
+        if ((levels & Collation.SECONDARY_LEVEL_FLAG) != 0) {
+            if (!callback.needToWrite(Collation.SECONDARY_LEVEL)) {
+                return;
+            }
+            // not used in Java -- ok &= secondaries.isOk();
+            sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
+            byte[] secs = secondaries.data();
+            int length = secondaries.length() - 1; // Ignore the trailing NO_CE.
+            if ((options & CollationSettings.BACKWARD_SECONDARY) != 0) {
+                // The backwards secondary level compares secondary weights backwards
+                // within segments separated by the merge separator (U+FFFE, weight 02).
+                // The separator weights 01 & 02 were reduced to 00 & 01 so that
+                // we do not accidentally separate at a _second_ weight byte of 02.
+                int start = 0;
+                for (;;) {
+                    // Find the merge separator or the NO_CE terminator.
+                    int limit;
+                    if (anyMergeSeparators) {
+                        limit = start;
+                        while (((int)secs[limit] & 0xff) > 1) {
+                            ++limit;
+                        }
+                    } else {
+                        limit = length;
+                    }
+                    // Reverse this segment.
+                    if (start < limit) {
+                        for (int i = start, j = limit - 1; i < j; i++, j--) {
+                            byte tmp = secs[i];
+                            secs[i] = secs[j];
+                            secs[j] = tmp;
+                        }
+                    }
+                    // Did we reach the end of the string?
+                    if (secs[limit] == 0) {
+                        break;
+                    }
+                    // Restore the merge separator.
+                    secs[limit] = 2;
+                    // Skip the merge separator and continue.
+                    start = limit + 1;
+                }
+            }
+            sink.Append(secs, length);
+        }
+
+        if ((levels & Collation.CASE_LEVEL_FLAG) != 0) {
+            if (!callback.needToWrite(Collation.CASE_LEVEL)) {
+                return;
+            }
+            // not used in Java -- ok &= cases.isOk();
+            sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
+            // Write pairs of nibbles as bytes, except separator bytes as themselves.
+            int length = cases.length() - 1; // Ignore the trailing NO_CE.
+            byte b = 0;
+            // NOTE(review): b and c are signed bytes while the C++ original uses uint8_t.
+            // Nibble values 0x80..0xF0 are negative in Java, so `c <= MERGE_SEPARATOR_BYTE`
+            // would match them and `c >>> 4` sign-extends before shifting -- verify this
+            // pairing against ICU4C collationkeys.cpp for case weights with the high bit set.
+            for (int i = 0; i < length; ++i) {
+                byte c = cases.getAt(i);
+                if (c <= Collation.MERGE_SEPARATOR_BYTE) {
+                    assert (c != 0);
+                    if (b != 0) {
+                        sink.Append(b);
+                        b = 0;
+                    }
+                    sink.Append(c);
+                } else {
+                    assert ((c & 0xf) == 0);
+                    if (b == 0) {
+                        b = c;
+                    } else {
+                        sink.Append(b | (c >>> 4));
+                        b = 0;
+                    }
+                }
+            }
+            if (b != 0) {
+                sink.Append(b);
+            }
+        }
+
+        if ((levels & Collation.TERTIARY_LEVEL_FLAG) != 0) {
+            if (!callback.needToWrite(Collation.TERTIARY_LEVEL)) {
+                return;
+            }
+            // not used in Java -- ok &= tertiaries.isOk();
+            sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
+            tertiaries.appendTo(sink);
+        }
+
+        if ((levels & Collation.QUATERNARY_LEVEL_FLAG) != 0) {
+            if (!callback.needToWrite(Collation.QUATERNARY_LEVEL)) {
+                return;
+            }
+            // not used in Java -- ok &= quaternaries.isOk();
+            sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
+            quaternaries.appendTo(sink);
+        }
+
+        // not used in Java -- if (!ok || !sink.IsOk()) {
+        // Java porting note: U_MEMORY_ALLOCATION_ERROR is set here in
+        // C implementation. IsOk() in Java always returns true, so this
+        // is a dead code.
+    }
+}
--- /dev/null
+/*
+*******************************************************************************
+*
+* Copyright (C) 1996-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+*
+* CollationLoader.java, ported from ucol_res.cpp
+*
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.MissingResourceException;
+
+import com.ibm.icu.impl.ICUResourceBundle;
+import com.ibm.icu.util.Output;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.UResourceBundle;
+
+/**
+ * Loads collation data: the root collation rules and per-locale collation
+ * tailorings from the ICU collation resource-bundle tree.
+ * Ported from ucol_res.cpp.
+ */
+public final class CollationLoader {
+
+    // Not instantiable; all methods are static.
+    private CollationLoader() {
+    }
+
+    // Cached root collation rules string; written once under the class lock,
+    // volatile so the fast path in loadRootRules() reads it safely.
+    private static volatile String rootRules = null;
+
+    // Lazily loads the root "UCARules" string using double-checked locking:
+    // the unsynchronized volatile read avoids locking after first initialization.
+    private static void loadRootRules() {
+        if (rootRules != null) {
+            return;
+        }
+        synchronized(CollationLoader.class) {
+            // Re-check under the lock: another thread may have loaded it first.
+            if (rootRules == null) {
+                UResourceBundle rootBundle = UResourceBundle.getBundleInstance(
+                        ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ROOT);
+                rootRules = rootBundle.getString("UCARules");
+            }
+        }
+    }
+
+    // C++: static void appendRootRules(UnicodeString &s)
+    /** Returns the root collation (UCA) rules string, loading it on first use. */
+    public static String getRootRules() {
+        loadRootRules();
+        return rootRules;
+    }
+
+    /**
+     * Returns the tailoring rules ("Sequence" string) for the given locale and
+     * collation type, using the locale fallback chain of the collation bundle tree.
+     * Throws MissingResourceException if the bundle or entry is absent.
+     */
+    static String loadRules(ULocale locale, CharSequence collationType) {
+        ICUResourceBundle bundle = (ICUResourceBundle) UResourceBundle.getBundleInstance(
+                ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
+        return bundle.getWithFallback("collations/" + collationType).getString("Sequence");
+    }
+
+ private static final UResourceBundle getWithFallback(UResourceBundle table, String entryName) {
+ try {
+ return ((ICUResourceBundle)table).getWithFallback(entryName);
+ } catch(MissingResourceException e) {
+ return null;
+ }
+ }
+
+ public static CollationTailoring loadTailoring(ULocale locale, Output<ULocale> outValidLocale) {
+
+ // Java porting note: ICU4J getWithFallback/getStringWithFallback currently does not
+ // work well when alias table is involved in a resource path, unless full path is specified.
+ // For now, collation resources does not contain such data, so the code below should work fine.
+
+ CollationTailoring root = CollationRoot.getRoot();
+ String localeName = locale.getName();
+ if (localeName.length() == 0 || localeName.equals("root")) {
+ outValidLocale.value = ULocale.ROOT;
+ return root;
+ }
+
+ UResourceBundle bundle = null;
+ try {
+ bundle = UResourceBundle.getBundleInstance(
+ ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
+ } catch (MissingResourceException e) {
+ outValidLocale.value = ULocale.ROOT;
+ return root;
+ }
+
+ ULocale validLocale = bundle.getULocale();
+ // Normalize the root locale. See
+ // http://bugs.icu-project.org/trac/ticket/10715
+ String validLocaleName = validLocale.getName();
+ if (validLocaleName.length() == 0 || validLocaleName.equals("root")) {
+ validLocale = ULocale.ROOT;
+ }
+ outValidLocale.value = validLocale;
+
+ // There are zero or more tailorings in the collations table.
+ UResourceBundle collations;
+ try {
+ collations = ((ICUResourceBundle)bundle).get("collations");
+ if (collations == null) {
+ return root;
+ }
+ } catch(MissingResourceException ignored) {
+ return root;
+ }
+
+ // Fetch the collation type from the locale ID and the default type from the data.
+ String type = locale.getKeywordValue("collation");
+ String defaultType = "standard";
+
+ try {
+ String defT = ((ICUResourceBundle)collations).getStringWithFallback("default");
+ if (defT != null) {
+ defaultType = defT;
+ }
+ } catch(MissingResourceException ignored) {
+ }
+
+ if (type == null || type.equals("default")) {
+ type = defaultType;
+ }
+
+ // Load the collations/type tailoring, with type fallback.
+
+ // Java porting note: typeFallback is used for setting U_USING_DEFAULT_WARNING in
+ // ICU4C, but not used by ICU4J
+
+ // boolean typeFallback = false;
+ UResourceBundle data = getWithFallback(collations, type);
+ if (data == null &&
+ type.length() > 6 && type.startsWith("search")) {
+ // fall back from something like "searchjl" to "search"
+ // typeFallback = true;
+ type = "search";
+ data = getWithFallback(collations, type);
+ }
+
+ if (data == null && !type.equals(defaultType)) {
+ // fall back to the default type
+ // typeFallback = true;
+ type = defaultType;
+ data = getWithFallback(collations, type);
+ }
+
+ if (data == null && !type.equals("standard")) {
+ // fall back to the "standard" type
+ // typeFallback = true;
+ type = "standard";
+ data = getWithFallback(collations, type);
+ }
+
+ if (data == null) {
+ return root;
+ }
+
+ // Is this the same as the root collator? If so, then use that instead.
+ ULocale actualLocale = data.getULocale();
+ // http://bugs.icu-project.org/trac/ticket/10715 ICUResourceBundle(root).getULocale() != ULocale.ROOT
+ // Therefore not just if (actualLocale.equals(ULocale.ROOT) && type.equals("standard")) {
+ String actualLocaleName = actualLocale.getName();
+ if (actualLocaleName.length() == 0 || actualLocaleName.equals("root")) {
+ actualLocale = ULocale.ROOT;
+ if (type.equals("standard")) {
+ return root;
+ }
+ }
+
+ CollationTailoring t = new CollationTailoring(root.settings);
+ t.actualLocale = actualLocale;
+
+ // deserialize
+ UResourceBundle binary = ((ICUResourceBundle)data).get("%%CollationBin");
+ byte[] inBytes = binary.getBinary(null);
+ ByteArrayInputStream inStream = new ByteArrayInputStream(inBytes);
+ try {
+ CollationDataReader.read(root, inStream, t);
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to load collation tailoring data for locale:"
+ + actualLocale + " type:" + type, e);
+ } // No need to close BAIS.
+
+ // Try to fetch the optional rules string.
+ try {
+ String s = ((ICUResourceBundle)data).getString("Sequence");
+ if (s != null) {
+ t.rules = s;
+ }
+ } catch(MissingResourceException ignored) {
+ }
+
+ // Set the collation types on the informational locales,
+ // except when they match the default types (for brevity and backwards compatibility).
+ // For the valid locale, suppress the default type.
+ if (!type.equals(defaultType)) {
+ outValidLocale.value = validLocale.setKeywordValue("collation", type);
+ }
+
+ // For the actual locale, suppress the default type *according to the actual locale*.
+ // For example, zh has default=pinyin and contains all of the Chinese tailorings.
+ // zh_Hant has default=stroke but has no other data.
+ // For the valid locale "zh_Hant" we need to suppress stroke.
+ // For the actual locale "zh" we need to suppress pinyin instead.
+ if (!actualLocale.equals(validLocale)) {
+ // Opening a bundle for the actual locale should always succeed.
+ UResourceBundle actualBundle = UResourceBundle.getBundleInstance(
+ ICUResourceBundle.ICU_COLLATION_BASE_NAME, actualLocale);
+ try {
+ String defT = ((ICUResourceBundle)actualBundle).getStringWithFallback("collations/default");
+ if (defT != null) {
+ defaultType = defT;
+ }
+ } catch(MissingResourceException ignored) {
+ }
+ }
+
+ if (!type.equals(defaultType)) {
+ t.actualLocale = t.actualLocale.setKeywordValue("collation", type);
+ }
+
+ // if (typeFallback) {
+ // ICU4C implementation sets U_USING_DEFAULT_WARNING here
+ // }
+
+ return t;
+ }
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2012-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationRoot.java, ported from collationroot.h/.cpp
+*
+* C++ version created on: 2012dec17
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.MissingResourceException;
+
+import com.ibm.icu.impl.ICUData;
+import com.ibm.icu.impl.ICUResourceBundle;
+
+/**
+ * Collation root provider.
+ * Loads the root collation data (coll/ucadata.icu) once, in a static
+ * initializer, and exposes the shared root tailoring/data/settings.
+ */
+public final class CollationRoot { // purely static
+ // Set exactly once by the static initializer below.
+ // Exactly one of the two fields is non-null after class initialization.
+ private static final CollationTailoring rootSingleton;
+ private static final RuntimeException exception;
+
+ /**
+ * Returns the root collation tailoring.
+ * If loading the root data failed at class-initialization time,
+ * the captured exception is re-thrown on every call.
+ */
+ public static final CollationTailoring getRoot() {
+ if(exception != null) {
+ throw exception;
+ }
+ return rootSingleton;
+ }
+ /** Returns the root collation data. */
+ public static final CollationData getData() {
+ CollationTailoring root = getRoot();
+ return root.data;
+ }
+ /** Returns the root collation settings, frozen/read-only. */
+ static final CollationSettings getSettings() {
+ CollationTailoring root = getRoot();
+ return root.settings.readOnly();
+ }
+
+ static { // Corresponds to C++ load() function.
+ CollationTailoring t = new CollationTailoring(null);
+ String path = ICUResourceBundle.ICU_BUNDLE + "/coll/ucadata.icu";
+ InputStream inBytes = ICUData.getRequiredStream(path);
+ RuntimeException e2 = null;
+ try {
+ CollationDataReader.read(null, inBytes, t);
+ } catch(IOException e) {
+ // Never throw from a static initializer: remember the failure
+ // and report it from getRoot() instead.
+ t = null;
+ e2 = new MissingResourceException(
+ "IOException while reading CLDR root data",
+ "CollationRoot", path);
+ } catch(RuntimeException e) {
+ t = null;
+ e2 = e;
+ }
+ // NOTE(review): inBytes is not explicitly closed here — presumably
+ // CollationDataReader.read() consumes/closes the stream; confirm.
+ rootSingleton = t;
+ exception = e2;
+ }
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationRootElements.java, ported from collationrootelements.h/.cpp
+*
+* C++ version created on: 2013mar01
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+/**
+ * Container and access methods for collation elements and weights
+ * that occur in the root collator.
+ * Needed for finding boundaries for building a tailoring.
+ *
+ * This class takes and returns 16-bit secondary and tertiary weights.
+ */
+public final class CollationRootElements {
+ /**
+ * @param rootElements the root elements table; the array is aliased,
+ * not copied, and must not be modified by the caller
+ */
+ public CollationRootElements(long[] rootElements) {
+ elements = rootElements;
+ }
+
+ /**
+ * Higher than any root primary.
+ */
+ public static final long PRIMARY_SENTINEL = 0xffffff00L;
+
+ /**
+ * Flag in a root element, set if the element contains secondary & tertiary weights,
+ * rather than a primary.
+ */
+ public static final int SEC_TER_DELTA_FLAG = 0x80;
+ /**
+ * Mask for getting the primary range step value from a primary-range-end element.
+ */
+ public static final int PRIMARY_STEP_MASK = 0x7f;
+
+ /**
+ * Index of the first CE with a non-zero tertiary weight.
+ * Same as the start of the compact root elements table.
+ */
+ public static final int IX_FIRST_TERTIARY_INDEX = 0;
+ /**
+ * Index of the first CE with a non-zero secondary weight.
+ */
+ static final int IX_FIRST_SECONDARY_INDEX = 1;
+ /**
+ * Index of the first CE with a non-zero primary weight.
+ */
+ static final int IX_FIRST_PRIMARY_INDEX = 2;
+ /**
+ * Must match Collation.COMMON_SEC_AND_TER_CE.
+ */
+ static final int IX_COMMON_SEC_AND_TER_CE = 3;
+ /**
+ * Secondary & tertiary boundaries.
+ * Bits 31..24: [fixed last secondary common byte 45]
+ * Bits 23..16: [fixed first ignorable secondary byte 80]
+ * Bits 15.. 8: reserved, 0
+ * Bits 7.. 0: [fixed first ignorable tertiary byte 3C]
+ */
+ static final int IX_SEC_TER_BOUNDARIES = 4;
+ /**
+ * The current number of indexes.
+ * Currently the same as elements[IX_FIRST_TERTIARY_INDEX].
+ */
+ static final int IX_COUNT = 5;
+
+ /**
+ * Returns the boundary between tertiary weights of primary/secondary CEs
+ * and those of tertiary CEs.
+ * This is the upper limit for tertiaries of primary/secondary CEs.
+ * This minus one is the lower limit for tertiaries of tertiary CEs.
+ */
+ public int getTertiaryBoundary() {
+ // Bits 7..0 of the boundaries word, shifted into the high byte of a weight16.
+ return ((int)elements[IX_SEC_TER_BOUNDARIES] << 8) & 0xff00;
+ }
+
+ /**
+ * Returns the first assigned tertiary CE.
+ */
+ long getFirstTertiaryCE() {
+ return elements[(int)elements[IX_FIRST_TERTIARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
+ }
+
+ /**
+ * Returns the last assigned tertiary CE.
+ */
+ long getLastTertiaryCE() {
+ return elements[(int)elements[IX_FIRST_SECONDARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
+ }
+
+ /**
+ * Returns the last common secondary weight.
+ * This is the lower limit for secondaries of primary CEs.
+ */
+ public int getLastCommonSecondary() {
+ // Bits 31..24 of the boundaries word, shifted into the high byte of a weight16.
+ return ((int)elements[IX_SEC_TER_BOUNDARIES] >> 16) & 0xff00;
+ }
+
+ /**
+ * Returns the boundary between secondary weights of primary CEs
+ * and those of secondary CEs.
+ * This is the upper limit for secondaries of primary CEs.
+ * This minus one is the lower limit for secondaries of secondary CEs.
+ */
+ public int getSecondaryBoundary() {
+ // Bits 23..16 of the boundaries word, shifted into the high byte of a weight16.
+ return ((int)elements[IX_SEC_TER_BOUNDARIES] >> 8) & 0xff00;
+ }
+
+ /**
+ * Returns the first assigned secondary CE.
+ */
+ long getFirstSecondaryCE() {
+ return elements[(int)elements[IX_FIRST_SECONDARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
+ }
+
+ /**
+ * Returns the last assigned secondary CE.
+ */
+ long getLastSecondaryCE() {
+ return elements[(int)elements[IX_FIRST_PRIMARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
+ }
+
+ /**
+ * Returns the first assigned primary weight.
+ */
+ long getFirstPrimary() {
+ return elements[(int)elements[IX_FIRST_PRIMARY_INDEX]]; // step=0: cannot be a range end
+ }
+
+ /**
+ * Returns the first assigned primary CE.
+ */
+ long getFirstPrimaryCE() {
+ return Collation.makeCE(getFirstPrimary());
+ }
+
+ /**
+ * Returns the last root CE with a primary weight before p.
+ * Intended only for reordering group boundaries.
+ */
+ long lastCEWithPrimaryBefore(long p) {
+ if(p == 0) { return 0; }
+ assert(p > elements[(int)elements[IX_FIRST_PRIMARY_INDEX]]);
+ int index = findP(p);
+ long q = elements[index];
+ long secTer;
+ if(p == (q & 0xffffff00L)) {
+ // p == elements[index] is a root primary. Find the CE before it.
+ // We must not be in a primary range.
+ assert((q & PRIMARY_STEP_MASK) == 0);
+ secTer = elements[index - 1];
+ if((secTer & SEC_TER_DELTA_FLAG) == 0) {
+ // Primary CE just before p.
+ p = secTer & 0xffffff00L;
+ secTer = Collation.COMMON_SEC_AND_TER_CE;
+ } else {
+ // secTer = last secondary & tertiary for the previous primary
+ index -= 2;
+ for(;;) {
+ // Scan backwards over the sec/ter deltas to their primary.
+ p = elements[index];
+ if((p & SEC_TER_DELTA_FLAG) == 0) {
+ p &= 0xffffff00L;
+ break;
+ }
+ --index;
+ }
+ }
+ } else {
+ // p > elements[index] which is the previous primary.
+ // Find the last secondary & tertiary weights for it.
+ p = q & 0xffffff00L;
+ secTer = Collation.COMMON_SEC_AND_TER_CE;
+ for(;;) {
+ q = elements[++index];
+ if((q & SEC_TER_DELTA_FLAG) == 0) {
+ // We must not be in a primary range.
+ assert((q & PRIMARY_STEP_MASK) == 0);
+ break;
+ }
+ secTer = q;
+ }
+ }
+ // Combine the primary and its last secondary/tertiary into one 64-bit CE.
+ return (p << 32) | (secTer & ~SEC_TER_DELTA_FLAG);
+ }
+
+ /**
+ * Returns the first root CE with a primary weight of at least p.
+ * Intended only for reordering group boundaries.
+ */
+ long firstCEWithPrimaryAtLeast(long p) {
+ if(p == 0) { return 0; }
+ int index = findP(p);
+ if(p != (elements[index] & 0xffffff00L)) {
+ for(;;) {
+ p = elements[++index];
+ if((p & SEC_TER_DELTA_FLAG) == 0) {
+ // First primary after p. We must not be in a primary range.
+ assert((p & PRIMARY_STEP_MASK) == 0);
+ break;
+ }
+ }
+ }
+ // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0.
+ return (p << 32) | Collation.COMMON_SEC_AND_TER_CE;
+ }
+
+ /**
+ * Returns the primary weight before p.
+ * p must be greater than the first root primary.
+ */
+ long getPrimaryBefore(long p, boolean isCompressible) {
+ int index = findPrimary(p);
+ int step;
+ long q = elements[index];
+ if(p == (q & 0xffffff00L)) {
+ // Found p itself. Return the previous primary.
+ // See if p is at the end of a previous range.
+ step = (int)q & PRIMARY_STEP_MASK;
+ if(step == 0) {
+ // p is not at the end of a range. Look for the previous primary.
+ do {
+ p = elements[--index];
+ } while((p & SEC_TER_DELTA_FLAG) != 0);
+ return p & 0xffffff00L;
+ }
+ } else {
+ // p is in a range, and not at the start.
+ long nextElement = elements[index + 1];
+ assert(isEndOfPrimaryRange(nextElement));
+ step = (int)nextElement & PRIMARY_STEP_MASK;
+ }
+ // Return the previous range primary.
+ if((p & 0xffff) == 0) {
+ return Collation.decTwoBytePrimaryByOneStep(p, isCompressible, step);
+ } else {
+ return Collation.decThreeBytePrimaryByOneStep(p, isCompressible, step);
+ }
+ }
+
+ /** Returns the secondary weight before [p, s]. */
+ int getSecondaryBefore(long p, int s) {
+ int index;
+ int previousSec, sec;
+ if(p == 0) {
+ index = (int)elements[IX_FIRST_SECONDARY_INDEX];
+ // Gap at the beginning of the secondary CE range.
+ previousSec = 0;
+ sec = (int)(elements[index] >> 16);
+ } else {
+ index = findPrimary(p) + 1;
+ previousSec = Collation.MERGE_SEPARATOR_WEIGHT16;
+ sec = Collation.COMMON_WEIGHT16;
+ }
+ assert(s >= sec);
+ // Linear scan of the sec/ter deltas until we reach s.
+ while(s > sec) {
+ previousSec = sec;
+ assert((elements[index] & SEC_TER_DELTA_FLAG) != 0);
+ sec = (int)(elements[index++] >> 16);
+ }
+ assert(sec == s);
+ return previousSec;
+ }
+
+ /** Returns the tertiary weight before [p, s, t]. */
+ int getTertiaryBefore(long p, int s, int t) {
+ assert((t & ~Collation.ONLY_TERTIARY_MASK) == 0);
+ int index;
+ int previousTer;
+ long secTer;
+ if(p == 0) {
+ if(s == 0) {
+ index = (int)elements[IX_FIRST_TERTIARY_INDEX];
+ // Gap at the beginning of the tertiary CE range.
+ previousTer = 0;
+ } else {
+ index = (int)elements[IX_FIRST_SECONDARY_INDEX];
+ previousTer = Collation.MERGE_SEPARATOR_WEIGHT16;
+ }
+ secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
+ } else {
+ index = findPrimary(p) + 1;
+ previousTer = Collation.MERGE_SEPARATOR_WEIGHT16;
+ secTer = Collation.COMMON_SEC_AND_TER_CE;
+ }
+ // Compare against the combined secondary-and-tertiary value.
+ long st = ((long)s << 16) | t;
+ while(st > secTer) {
+ if((int)(secTer >> 16) == s) { previousTer = (int)secTer; }
+ assert((elements[index] & SEC_TER_DELTA_FLAG) != 0);
+ secTer = elements[index++] & ~SEC_TER_DELTA_FLAG;
+ }
+ assert(secTer == st);
+ return previousTer & 0xffff;
+ }
+
+ /**
+ * Finds the index of the input primary.
+ * p must occur as a root primary, and must not be 0.
+ */
+ int findPrimary(long p) {
+ // Requirement: p must occur as a root primary.
+ assert((p & 0xff) == 0); // at most a 3-byte primary
+ int index = findP(p);
+ // If p is in a range, then we just assume that p is an actual primary in this range.
+ // (Too cumbersome/expensive to check.)
+ // Otherwise, it must be an exact match.
+ assert(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00L));
+ return index;
+ }
+
+ /**
+ * Returns the primary weight after p where index=findPrimary(p).
+ * p must be at least the first root primary.
+ */
+ long getPrimaryAfter(long p, int index, boolean isCompressible) {
+ assert(p == (elements[index] & 0xffffff00L) || isEndOfPrimaryRange(elements[index + 1]));
+ long q = elements[++index];
+ int step;
+ if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int)q & PRIMARY_STEP_MASK) != 0) {
+ // Return the next primary in this range.
+ if((p & 0xffff) == 0) {
+ return Collation.incTwoBytePrimaryByOffset(p, isCompressible, step);
+ } else {
+ return Collation.incThreeBytePrimaryByOffset(p, isCompressible, step);
+ }
+ } else {
+ // Return the next primary in the list.
+ while((q & SEC_TER_DELTA_FLAG) != 0) {
+ q = elements[++index];
+ }
+ assert((q & PRIMARY_STEP_MASK) == 0);
+ return q;
+ }
+ }
+ /**
+ * Returns the secondary weight after [p, s] where index=findPrimary(p)
+ * except use index=0 for p=0.
+ * Returns the gap limit if there is no secondary after s for this primary.
+ */
+ int getSecondaryAfter(int index, int s) {
+ int secLimit;
+ if(index == 0) {
+ // primary = 0
+ index = (int)elements[IX_FIRST_SECONDARY_INDEX];
+ // Gap at the end of the secondary CE range.
+ secLimit = 0x10000;
+ } else {
+ assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]);
+ ++index;
+ // Gap for secondaries of primary CEs.
+ secLimit = getSecondaryBoundary();
+ }
+ for(;;) {
+ long secTer = elements[index];
+ if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
+ int sec = (int)(secTer >> 16);
+ if(sec > s) { return sec; }
+ ++index;
+ }
+ }
+ /**
+ * Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
+ * except use index=0 for p=0.
+ * Returns the gap limit if there is no tertiary after t for this primary+secondary.
+ */
+ int getTertiaryAfter(int index, int s, int t) {
+ int terLimit;
+ if(index == 0) {
+ // primary = 0
+ if(s == 0) {
+ index = (int)elements[IX_FIRST_TERTIARY_INDEX];
+ // Gap at the end of the tertiary CE range.
+ terLimit = 0x4000;
+ } else {
+ index = (int)elements[IX_FIRST_SECONDARY_INDEX];
+ // Gap for tertiaries of primary/secondary CEs.
+ terLimit = getTertiaryBoundary();
+ }
+ } else {
+ assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]);
+ ++index;
+ terLimit = getTertiaryBoundary();
+ }
+ long st = (((long)s & 0xffffffffL) << 16) | t;
+ for(;;) {
+ long secTer = elements[index];
+ // No tertiary greater than t for this primary+secondary.
+ if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; }
+ secTer &= ~SEC_TER_DELTA_FLAG;
+ if(secTer > st) { return (int)secTer & 0xffff; }
+ ++index;
+ }
+ }
+
+ /**
+ * Finds the largest index i where elements[i]<=p.
+ * Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).
+ * Does not require that p is a root collator primary.
+ */
+ private int findP(long p) {
+ // p need not occur as a root primary.
+ // For example, it might be a reordering group boundary.
+ assert((p >> 24) != Collation.UNASSIGNED_IMPLICIT_BYTE);
+ // modified binary search
+ int start = (int)elements[IX_FIRST_PRIMARY_INDEX];
+ assert(p >= elements[start]);
+ int limit = elements.length - 1;
+ assert(elements[limit] >= PRIMARY_SENTINEL);
+ assert(p < elements[limit]);
+ while((start + 1) < limit) {
+ // Invariant: elements[start] and elements[limit] are primaries,
+ // and elements[start]<=p<=elements[limit].
+ // (start+limit)/2 cannot overflow: both are valid array indices.
+ int i = (start + limit) / 2;
+ long q = elements[i];
+ if((q & SEC_TER_DELTA_FLAG) != 0) {
+ // The midpoint is a sec/ter delta, not a primary:
+ // find a nearby primary so the invariant can be maintained.
+ // Find the next primary.
+ int j = i + 1;
+ for(;;) {
+ if(j == limit) { break; }
+ q = elements[j];
+ if((q & SEC_TER_DELTA_FLAG) == 0) {
+ i = j;
+ break;
+ }
+ ++j;
+ }
+ if((q & SEC_TER_DELTA_FLAG) != 0) {
+ // Find the preceding primary.
+ j = i - 1;
+ for(;;) {
+ if(j == start) { break; }
+ q = elements[j];
+ if((q & SEC_TER_DELTA_FLAG) == 0) {
+ i = j;
+ break;
+ }
+ --j;
+ }
+ if((q & SEC_TER_DELTA_FLAG) != 0) {
+ // No primary between start and limit.
+ break;
+ }
+ }
+ }
+ if(p < (q & 0xffffff00L)) { // Reset the "step" bits of a range end primary.
+ limit = i;
+ } else {
+ start = i;
+ }
+ }
+ return start;
+ }
+
+ private static boolean isEndOfPrimaryRange(long q) {
+ return (q & SEC_TER_DELTA_FLAG) == 0 && (q & PRIMARY_STEP_MASK) != 0;
+ }
+
+ /**
+ * Data structure: See ICU4C source/i18n/collationrootelements.h.
+ * Aliased from the constructor argument; never modified by this class.
+ */
+ // NOTE(review): could be declared final; it is never reassigned here.
+ private long[] elements;
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationRuleParser.java, ported from collationruleparser.h/.cpp
+*
+* C++ version created on: 2013apr10
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.text.ParseException;
+import java.util.ArrayList;
+
+import com.ibm.icu.impl.IllegalIcuArgumentException;
+import com.ibm.icu.impl.PatternProps;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.ULocale;
+
+public final class CollationRuleParser {
+ /** Special reset positions. */
+ // The ordinal order must be kept in sync with the "positions" strings table
+ // below: parseSpecialPosition() encodes a position as (POS_BASE + ordinal).
+ enum Position {
+ FIRST_TERTIARY_IGNORABLE,
+ LAST_TERTIARY_IGNORABLE,
+ FIRST_SECONDARY_IGNORABLE,
+ LAST_SECONDARY_IGNORABLE,
+ FIRST_PRIMARY_IGNORABLE,
+ LAST_PRIMARY_IGNORABLE,
+ FIRST_VARIABLE,
+ LAST_VARIABLE,
+ FIRST_REGULAR,
+ LAST_REGULAR,
+ FIRST_IMPLICIT,
+ LAST_IMPLICIT,
+ FIRST_TRAILING,
+ LAST_TRAILING
+ }
+ // Cached Position.values() array, to avoid re-allocating it per lookup.
+ static final Position[] POSITION_VALUES = Position.values();
+
+ /**
+ * First character of contractions that encode special reset positions.
+ * U+FFFE cannot be tailored via rule syntax.
+ *
+ * The second contraction character is POS_BASE + Position.
+ */
+ static final char POS_LEAD = 0xfffe;
+ /**
+ * Base for the second character of contractions that encode special reset positions.
+ * Braille characters U+28xx are printable and normalization-inert.
+ * @see POS_LEAD
+ */
+ static final char POS_BASE = 0x2800;
+
+ /**
+ * Receiver of parsed rule events: resets, relations, and optional hooks.
+ * Must be set via setSink() before parsing.
+ */
+ static abstract class Sink {
+ /**
+ * Adds a reset.
+ * strength=UCOL_IDENTICAL for &str.
+ * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
+ */
+ abstract void addReset(int strength, CharSequence str);
+ /**
+ * Adds a relation with strength and prefix | str / extension.
+ */
+ abstract void addRelation(int strength, CharSequence prefix,
+ CharSequence str, CharSequence extension);
+
+ // Optional hook; default implementation does nothing.
+ // NOTE(review): presumably invoked for the [suppressContractions set]
+ // rule setting — confirm against the setting parser.
+ void suppressContractions(UnicodeSet set) {}
+
+ // Optional hook; default implementation does nothing.
+ // NOTE(review): presumably invoked for the [optimize set] rule setting — confirm.
+ void optimize(UnicodeSet set) {}
+ }
+
+ /**
+ * Supplies the rules of another locale's collation type,
+ * for the [import localeID] rule syntax. Optional; see setImporter().
+ */
+ interface Importer {
+ String getRules(String localeID, String collationType);
+ }
+
+ /**
+ * Constructor.
+ * The Sink must be set before parsing.
+ * The Importer can be set, otherwise [import locale] syntax is not supported.
+ *
+ * @param base base collation data; aliased, used e.g. to resolve
+ * [maxVariable] group primaries
+ */
+ CollationRuleParser(CollationData base) {
+ baseData = base;
+ }
+
+ /**
+ * Sets the pointer to a Sink object.
+ * The pointer is aliased: Pointer copy without cloning or taking ownership.
+ * Must be called before parse().
+ */
+ void setSink(Sink sinkAlias) {
+ sink = sinkAlias;
+ }
+
+ /**
+ * Sets the pointer to an Importer object.
+ * The pointer is aliased: Pointer copy without cloning or taking ownership.
+ * Optional; without it, [import locale] syntax is not supported.
+ */
+ void setImporter(Importer importerAlias) {
+ importer = importerAlias;
+ }
+
+ /**
+ * Parses the rule string, feeding resets/relations to the Sink and
+ * writing setting changes (strength, alternate, etc.) into outSettings.
+ * @throws ParseException on a syntax or semantic error in ruleString
+ */
+ void parse(String ruleString, CollationSettings outSettings) throws ParseException {
+ settings = outSettings;
+ parse(ruleString);
+ }
+
+ private static final int UCOL_DEFAULT = -1;
+ private static final int UCOL_OFF = 0;
+ private static final int UCOL_ON = 1;
+
+ /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
+ private static final int STRENGTH_MASK = 0xf;
+ private static final int STARRED_FLAG = 0x10;
+ private static final int OFFSET_SHIFT = 8;
+
+ private static final String BEFORE = "[before";
+
+ // In C++, we parse into temporary UnicodeString objects named "raw" or "str".
+ // In Java, we reuse this StringBuilder.
+ private final StringBuilder rawBuilder = new StringBuilder();
+
+ /**
+ * Top-level parse loop: dispatches on the first character of each token —
+ * rule chains ('&'), settings ('['), comments ('#'), and the legacy
+ * '@'/'!' options — until the whole rule string is consumed.
+ */
+ private void parse(String ruleString) throws ParseException {
+ rules = ruleString;
+ ruleIndex = 0;
+
+ while(ruleIndex < rules.length()) {
+ char c = rules.charAt(ruleIndex);
+ if(PatternProps.isWhiteSpace(c)) {
+ ++ruleIndex;
+ continue;
+ }
+ switch(c) {
+ case 0x26: // '&'
+ parseRuleChain();
+ break;
+ case 0x5b: // '['
+ parseSetting();
+ break;
+ case 0x23: // '#' starts a comment, until the end of the line
+ ruleIndex = skipComment(ruleIndex + 1);
+ break;
+ case 0x40: // '@' is equivalent to [backwards 2]
+ settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
+ ++ruleIndex;
+ break;
+ case 0x21: // '!' used to turn on Thai/Lao character reversal
+ // Accept but ignore. The root collator has contractions
+ // that are equivalent to the character reversal, where appropriate.
+ ++ruleIndex;
+ break;
+ default:
+ setParseError("expected a reset or setting or comment");
+ break;
+ }
+ }
+ }
+
+ /**
+ * Parses one reset ("&...") followed by its sequence of relations.
+ * Enforces the reset-before strength constraints: the first relation must
+ * match the [before n] strength, later ones must not be stronger.
+ */
+ private void parseRuleChain() throws ParseException {
+ int resetStrength = parseResetAndPosition();
+ boolean isFirstRelation = true;
+ for(;;) {
+ int result = parseRelationOperator();
+ if(result < 0) {
+ // No relation operator here: either a comment, or the chain ends.
+ if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) {
+ // '#' starts a comment, until the end of the line
+ ruleIndex = skipComment(ruleIndex + 1);
+ continue;
+ }
+ if(isFirstRelation) {
+ setParseError("reset not followed by a relation");
+ }
+ return;
+ }
+ int strength = result & STRENGTH_MASK;
+ if(resetStrength < Collator.IDENTICAL) {
+ // reset-before rule chain
+ if(isFirstRelation) {
+ if(strength != resetStrength) {
+ setParseError("reset-before strength differs from its first relation");
+ return;
+ }
+ } else {
+ if(strength < resetStrength) {
+ setParseError("reset-before strength followed by a stronger relation");
+ return;
+ }
+ }
+ }
+ int i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
+ if((result & STARRED_FLAG) == 0) {
+ parseRelationStrings(strength, i);
+ } else {
+ parseStarredCharacters(strength, i);
+ }
+ isFirstRelation = false;
+ }
+ }
+
+ /**
+ * Parses "&str", "&[before n]str", or "&[special position]",
+ * and forwards the reset to the Sink.
+ * @return the reset strength: Collator.IDENTICAL for a plain reset, or
+ * PRIMARY/SECONDARY/TERTIARY for [before 1/2/3];
+ * UCOL_DEFAULT on error
+ */
+ private int parseResetAndPosition() throws ParseException {
+ int i = skipWhiteSpace(ruleIndex + 1);
+ int j;
+ char c;
+ int resetStrength;
+ if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) &&
+ (j = i + BEFORE.length()) < rules.length() &&
+ PatternProps.isWhiteSpace(rules.charAt(j)) &&
+ ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() &&
+ 0x31 <= (c = rules.charAt(j)) && c <= 0x33 &&
+ rules.charAt(j + 1) == 0x5d) {
+ // &[before n] with n=1 or 2 or 3
+ resetStrength = Collator.PRIMARY + (c - 0x31);
+ i = skipWhiteSpace(j + 2);
+ } else {
+ resetStrength = Collator.IDENTICAL;
+ }
+ if(i >= rules.length()) {
+ setParseError("reset without position");
+ return UCOL_DEFAULT;
+ }
+ // A '[' here starts a special reset position like [last regular].
+ if(rules.charAt(i) == 0x5b) { // '['
+ i = parseSpecialPosition(i, rawBuilder);
+ } else {
+ i = parseTailoringString(i, rawBuilder);
+ }
+ try {
+ sink.addReset(resetStrength, rawBuilder);
+ } catch(Exception e) {
+ setParseError("adding reset failed", e);
+ return UCOL_DEFAULT;
+ }
+ ruleIndex = i;
+ return resetStrength;
+ }
+
+ /**
+ * Parses a relation operator: one of "&lt;", "&lt;&lt;", "&lt;&lt;&lt;", "&lt;&lt;&lt;&lt;",
+ * ";" (same as &lt;&lt;), "," (same as &lt;&lt;&lt;), "=", each optionally followed by '*'.
+ * @return UCOL_DEFAULT (negative) if there is no relation operator at ruleIndex,
+ * otherwise ((number of chars consumed) &lt;&lt; OFFSET_SHIFT) | strength,
+ * with STARRED_FLAG set if '*' followed the operator
+ */
+ private int parseRelationOperator() {
+ ruleIndex = skipWhiteSpace(ruleIndex);
+ if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; }
+ int strength;
+ int i = ruleIndex;
+ char c = rules.charAt(i++);
+ switch(c) {
+ case 0x3c: // '<'
+ if(i < rules.length() && rules.charAt(i) == 0x3c) { // <<
+ ++i;
+ if(i < rules.length() && rules.charAt(i) == 0x3c) { // <<<
+ ++i;
+ if(i < rules.length() && rules.charAt(i) == 0x3c) { // <<<<
+ ++i;
+ strength = Collator.QUATERNARY;
+ } else {
+ strength = Collator.TERTIARY;
+ }
+ } else {
+ strength = Collator.SECONDARY;
+ }
+ } else {
+ strength = Collator.PRIMARY;
+ }
+ if(i < rules.length() && rules.charAt(i) == 0x2a) { // '*'
+ ++i;
+ strength |= STARRED_FLAG;
+ }
+ break;
+ case 0x3b: // ';' same as <<
+ strength = Collator.SECONDARY;
+ break;
+ case 0x2c: // ',' same as <<<
+ strength = Collator.TERTIARY;
+ break;
+ case 0x3d: // '='
+ strength = Collator.IDENTICAL;
+ if(i < rules.length() && rules.charAt(i) == 0x2a) { // '*'
+ ++i;
+ strength |= STARRED_FLAG;
+ }
+ break;
+ default:
+ return UCOL_DEFAULT;
+ }
+ return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
+ }
+
+ /**
+ * Parses the strings of one (non-starred) relation and forwards it to the Sink.
+ * @param strength relation strength from the operator
+ * @param i rule index just after the relation operator
+ */
+ private void parseRelationStrings(int strength, int i) throws ParseException {
+ // Parse
+ // prefix | str / extension
+ // where prefix and extension are optional.
+ String prefix = "";
+ CharSequence extension = "";
+ i = parseTailoringString(i, rawBuilder);
+ char next = (i < rules.length()) ? rules.charAt(i) : 0;
+ if(next == 0x7c) { // '|' separates the context prefix from the string.
+ prefix = rawBuilder.toString();
+ i = parseTailoringString(i + 1, rawBuilder);
+ next = (i < rules.length()) ? rules.charAt(i) : 0;
+ }
+ // str = rawBuilder (do not modify rawBuilder any more in this function)
+ if(next == 0x2f) { // '/' separates the string from the extension.
+ StringBuilder extBuilder = new StringBuilder();
+ i = parseTailoringString(i + 1, extBuilder);
+ extension = extBuilder;
+ }
+ if(prefix.length() != 0) {
+ // Contextual tailoring requires NFC boundaries at both string starts.
+ int prefix0 = prefix.codePointAt(0);
+ int c = rawBuilder.codePointAt(0);
+ if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
+ setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary");
+ return;
+ }
+ }
+ try {
+ sink.addRelation(strength, prefix, rawBuilder, extension);
+ } catch(Exception e) {
+ setParseError("adding relation failed", e);
+ return;
+ }
+ ruleIndex = i;
+ }
+
+ /**
+ * Parses a starred relation (e.g. "&lt;* abc" or "&lt;* a-z"): adds one relation
+ * at the given strength for every NFD-inert code point in the string,
+ * expanding '-' ranges code point by code point.
+ * @param strength relation strength from the operator
+ * @param i rule index just after the starred relation operator
+ */
+ private void parseStarredCharacters(int strength, int i) throws ParseException {
+ String empty = "";
+ i = parseString(skipWhiteSpace(i), rawBuilder);
+ if(rawBuilder.length() == 0) {
+ setParseError("missing starred-relation string");
+ return;
+ }
+ // prev = last code point added, or -1 when a range start is not available.
+ int prev = -1;
+ int j = 0;
+ for(;;) {
+ while(j < rawBuilder.length()) {
+ int c = rawBuilder.codePointAt(j);
+ if(!nfd.isInert(c)) {
+ setParseError("starred-relation string is not all NFD-inert");
+ return;
+ }
+ try {
+ sink.addRelation(strength, empty, UTF16.valueOf(c), empty);
+ } catch(Exception e) {
+ setParseError("adding relation failed", e);
+ return;
+ }
+ j += Character.charCount(c);
+ prev = c;
+ }
+ if(i >= rules.length() || rules.charAt(i) != 0x2d) { // '-'
+ break;
+ }
+ if(prev < 0) {
+ setParseError("range without start in starred-relation string");
+ return;
+ }
+ i = parseString(i + 1, rawBuilder);
+ if(rawBuilder.length() == 0) {
+ setParseError("range without end in starred-relation string");
+ return;
+ }
+ int c = rawBuilder.codePointAt(0);
+ if(c < prev) {
+ setParseError("range start greater than end in starred-relation string");
+ return;
+ }
+ // range prev-c
+ while(++prev <= c) {
+ if(!nfd.isInert(prev)) {
+ setParseError("starred-relation string range is not all NFD-inert");
+ return;
+ }
+ if(isSurrogate(prev)) {
+ setParseError("starred-relation string range contains a surrogate");
+ return;
+ }
+ if(0xfffd <= prev && prev <= 0xffff) {
+ setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF");
+ return;
+ }
+ try {
+ sink.addRelation(strength, empty, UTF16.valueOf(prev), empty);
+ } catch(Exception e) {
+ setParseError("adding relation failed", e);
+ return;
+ }
+ }
+ prev = -1;
+ // Continue with the rest of the just-parsed string after the range end.
+ j = Character.charCount(c);
+ }
+ ruleIndex = skipWhiteSpace(i);
+ }
+
+ /**
+ * Parses a non-empty tailoring string (after skipping leading white space) into raw.
+ * @return the rule index after the string and any trailing white space
+ */
+ private int parseTailoringString(int i, StringBuilder raw) throws ParseException {
+ i = parseString(skipWhiteSpace(i), raw);
+ if(raw.length() == 0) {
+ setParseError("missing relation string");
+ }
+ return skipWhiteSpace(i);
+ }
+
+ /**
+ * Parses one string token into raw, handling 'quoted literal text',
+ * '' (an escaped apostrophe), and backslash escapes.
+ * The string ends at unquoted white space or at any other syntax character.
+ * Rejects unpaired surrogates and U+FFFD..U+FFFF.
+ * @return the rule index after the string
+ */
+ private int parseString(int i, StringBuilder raw) throws ParseException {
+ raw.setLength(0);
+ while(i < rules.length()) {
+ char c = rules.charAt(i++);
+ if(isSyntaxChar(c)) {
+ if(c == 0x27) { // apostrophe
+ if(i < rules.length() && rules.charAt(i) == 0x27) {
+ // Double apostrophe, encodes a single one.
+ raw.append((char)0x27);
+ ++i;
+ continue;
+ }
+ // Quote literal text until the next single apostrophe.
+ for(;;) {
+ if(i == rules.length()) {
+ setParseError("quoted literal text missing terminating apostrophe");
+ return i;
+ }
+ c = rules.charAt(i++);
+ if(c == 0x27) {
+ if(i < rules.length() && rules.charAt(i) == 0x27) {
+ // Double apostrophe inside quoted literal text,
+ // still encodes a single apostrophe.
+ ++i;
+ } else {
+ break;
+ }
+ }
+ raw.append(c);
+ }
+ } else if(c == 0x5c) { // backslash
+ if(i == rules.length()) {
+ setParseError("backslash escape at the end of the rule string");
+ return i;
+ }
+ // Escape the whole code point, not just one char (surrogate pair safe).
+ int cp = rules.codePointAt(i);
+ raw.appendCodePoint(cp);
+ i += Character.charCount(cp);
+ } else {
+ // Any other syntax character terminates a string.
+ --i;
+ break;
+ }
+ } else if(PatternProps.isWhiteSpace(c)) {
+ // Unquoted white space terminates a string.
+ --i;
+ break;
+ } else {
+ raw.append(c);
+ }
+ }
+ // Validate the collected string: no lone surrogates, no U+FFFD..U+FFFF.
+ for(int j = 0; j < raw.length();) {
+ int c = raw.codePointAt(j);
+ if(isSurrogate(c)) {
+ setParseError("string contains an unpaired surrogate");
+ return i;
+ }
+ if(0xfffd <= c && c <= 0xffff) {
+ setParseError("string contains U+FFFD, U+FFFE or U+FFFF");
+ return i;
+ }
+ j += Character.charCount(c);
+ }
+ return i;
+ }
+
+ // TODO: Widen UTF16.isSurrogate(char16) to take an int.
+ // True for any surrogate code point U+D800..U+DFFF (the mask test
+ // matches exactly that 0x800-wide range).
+ private static final boolean isSurrogate(int c) {
+ return (c & 0xfffff800) == 0xd800;
+ }
+
+ // Names of the special reset positions.
+ // Must be kept in sync with the Position enum: positions[i] corresponds to
+ // POSITION_VALUES[i]; parseSpecialPosition() maps index i to (POS_BASE + i).
+ private static final String[] positions = {
+ "first tertiary ignorable",
+ "last tertiary ignorable",
+ "first secondary ignorable",
+ "last secondary ignorable",
+ "first primary ignorable",
+ "last primary ignorable",
+ "first variable",
+ "last variable",
+ "first regular",
+ "last regular",
+ "first implicit",
+ "last implicit",
+ "first trailing",
+ "last trailing"
+ };
+
+ /**
+ * Sets str to a contraction of U+FFFE and (U+2800 + Position).
+ * Also accepts the aliases [top] (= [last regular]) and
+ * [variable top] (= [last variable]).
+ * @return rule index after the special reset position
+ * @throws ParseException if the bracketed words are not a known position
+ */
+ private int parseSpecialPosition(int i, StringBuilder str) throws ParseException {
+ int j = readWords(i + 1, rawBuilder);
+ if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) { // words end with ]
+ ++j;
+ String raw = rawBuilder.toString();
+ str.setLength(0);
+ for(int pos = 0; pos < positions.length; ++pos) {
+ if(raw.equals(positions[pos])) {
+ str.append(POS_LEAD).append((char)(POS_BASE + pos));
+ return j;
+ }
+ }
+ if(raw.equals("top")) {
+ str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal()));
+ return j;
+ }
+ if(raw.equals("variable top")) {
+ str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal()));
+ return j;
+ }
+ }
+ setParseError("not a valid special reset position");
+ return i;
+ }
+
+ /**
+ * Parses one [setting] or [option] with ruleIndex pointing at the opening '['.
+ * On success, advances ruleIndex past the closing ']'.
+ * @throws ParseException if the setting is unknown or its value is invalid
+ */
+ private void parseSetting() throws ParseException {
+ int i = ruleIndex + 1;
+ int j = readWords(i, rawBuilder);
+ if(j <= i || rawBuilder.length() == 0) {
+ setParseError("expected a setting/option at '['");
+ }
+ // startsWith() etc. are available for String but not CharSequence/StringBuilder.
+ String raw = rawBuilder.toString();
+ if(rules.charAt(j) == 0x5d) { // words end with ]
+ ++j;
+ // [reorder] or [reorder code code ...]
+ if(raw.startsWith("reorder") &&
+ (raw.length() == 7 || raw.charAt(7) == 0x20)) {
+ parseReordering(raw);
+ ruleIndex = j;
+ return;
+ }
+ if(raw.equals("backwards 2")) {
+ settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
+ ruleIndex = j;
+ return;
+ }
+ // Split "name value" on the last space; v is empty if there is no value.
+ String v;
+ int valueIndex = raw.lastIndexOf(0x20);
+ if(valueIndex >= 0) {
+ v = raw.substring(valueIndex + 1);
+ raw = raw.substring(0, valueIndex);
+ } else {
+ v = "";
+ }
+ // [strength 1]..[strength 4], or [strength I] for identical level.
+ if(raw.equals("strength") && v.length() == 1) {
+ int value = UCOL_DEFAULT;
+ char c = v.charAt(0);
+ if(0x31 <= c && c <= 0x34) { // 1..4
+ value = Collator.PRIMARY + (c - 0x31);
+ } else if(c == 0x49) { // 'I'
+ value = Collator.IDENTICAL;
+ }
+ if(value != UCOL_DEFAULT) {
+ settings.setStrength(value);
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("alternate")) {
+ int value = UCOL_DEFAULT;
+ if(v.equals("non-ignorable")) {
+ value = 0; // UCOL_NON_IGNORABLE
+ } else if(v.equals("shifted")) {
+ value = 1; // UCOL_SHIFTED
+ }
+ if(value != UCOL_DEFAULT) {
+ settings.setAlternateHandlingShifted(value > 0);
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("maxVariable")) {
+ int value = UCOL_DEFAULT;
+ if(v.equals("space")) {
+ value = CollationSettings.MAX_VAR_SPACE;
+ } else if(v.equals("punct")) {
+ value = CollationSettings.MAX_VAR_PUNCT;
+ } else if(v.equals("symbol")) {
+ value = CollationSettings.MAX_VAR_SYMBOL;
+ } else if(v.equals("currency")) {
+ value = CollationSettings.MAX_VAR_CURRENCY;
+ }
+ if(value != UCOL_DEFAULT) {
+ settings.setMaxVariable(value, 0);
+ // variableTop tracks the last primary of the chosen reorder group.
+ settings.variableTop = baseData.getLastPrimaryForGroup(
+ Collator.ReorderCodes.FIRST + value);
+ assert(settings.variableTop != 0);
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("caseFirst")) {
+ int value = UCOL_DEFAULT;
+ if(v.equals("off")) {
+ value = UCOL_OFF;
+ } else if(v.equals("lower")) {
+ value = CollationSettings.CASE_FIRST; // UCOL_LOWER_FIRST
+ } else if(v.equals("upper")) {
+ value = CollationSettings.CASE_FIRST_AND_UPPER_MASK; // UCOL_UPPER_FIRST
+ }
+ if(value != UCOL_DEFAULT) {
+ settings.setCaseFirst(value);
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("caseLevel")) {
+ int value = getOnOffValue(v);
+ if(value != UCOL_DEFAULT) {
+ settings.setFlag(CollationSettings.CASE_LEVEL, value > 0);
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("normalization")) {
+ int value = getOnOffValue(v);
+ if(value != UCOL_DEFAULT) {
+ settings.setFlag(CollationSettings.CHECK_FCD, value > 0);
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("numericOrdering")) {
+ int value = getOnOffValue(v);
+ if(value != UCOL_DEFAULT) {
+ settings.setFlag(CollationSettings.NUMERIC, value > 0);
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("hiraganaQ")) {
+ int value = getOnOffValue(v);
+ if(value != UCOL_DEFAULT) {
+ if(value == UCOL_ON) {
+ setParseError("[hiraganaQ on] is not supported");
+ }
+ ruleIndex = j;
+ return;
+ }
+ } else if(raw.equals("import")) {
+ // BCP 47 language tag -> ICU locale ID
+ ULocale localeID;
+ try {
+ localeID = new ULocale.Builder().setLanguageTag(v).build();
+ } catch(Exception e) {
+ setParseError("expected language tag in [import langTag]", e);
+ return;
+ }
+ // localeID minus all keywords
+ String baseID = localeID.getBaseName();
+ // @collation=type, or length=0 if not specified
+ String collationType = localeID.getKeywordValue("collation");
+ if(importer == null) {
+ setParseError("[import langTag] is not supported");
+ } else {
+ String importedRules;
+ try {
+ importedRules =
+ importer.getRules(baseID,
+ collationType != null ? collationType : "standard");
+ } catch(Exception e) {
+ setParseError("[import langTag] failed", e);
+ return;
+ }
+ // Recursively parse the imported rules with this same parser,
+ // then restore our own rule string and index.
+ String outerRules = rules;
+ int outerRuleIndex = ruleIndex;
+ try {
+ parse(importedRules);
+ } catch(Exception e) {
+ ruleIndex = outerRuleIndex; // Restore the original index for error reporting.
+ setParseError("parsing imported rules failed", e);
+ }
+ rules = outerRules;
+ ruleIndex = j;
+ }
+ return;
+ }
+ } else if(rules.charAt(j) == 0x5b) { // words end with [
+ // [optimize set] / [suppressContractions set] take a UnicodeSet pattern.
+ UnicodeSet set = new UnicodeSet();
+ j = parseUnicodeSet(j, set);
+ if(raw.equals("optimize")) {
+ try {
+ sink.optimize(set);
+ } catch(Exception e) {
+ setParseError("[optimize set] failed", e);
+ }
+ ruleIndex = j;
+ return;
+ } else if(raw.equals("suppressContractions")) {
+ try {
+ sink.suppressContractions(set);
+ } catch(Exception e) {
+ setParseError("[suppressContractions set] failed", e);
+ }
+ ruleIndex = j;
+ return;
+ }
+ }
+ setParseError("not a valid setting/option");
+ }
+
+ /**
+ * Parses the codes of a [reorder aa bb cc] option.
+ * @param raw the option words starting with "reorder"; readWords() has already
+ * collapsed runs of white space into single ' ' separators
+ * @throws ParseException if a word is neither a script nor a reorder code
+ */
+ private void parseReordering(CharSequence raw) throws ParseException {
+ int i = 7; // after "reorder"
+ if(i == raw.length()) {
+ // empty [reorder] with no codes
+ settings.resetReordering();
+ return;
+ }
+ // Parse the codes in [reorder aa bb cc].
+ ArrayList<Integer> reorderCodes = new ArrayList<Integer>();
+ while(i < raw.length()) {
+ ++i; // skip the word-separating space
+ int limit = i;
+ while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; }
+ String word = raw.subSequence(i, limit).toString();
+ int code = getReorderCode(word);
+ if(code < 0) {
+ setParseError("unknown script or reorder code");
+ return;
+ }
+ reorderCodes.add(code);
+ i = limit;
+ }
+ int length = reorderCodes.size();
+ if(length == 1 && reorderCodes.get(0) == Collator.ReorderCodes.DEFAULT) {
+ // The root collator does not have a reordering, by definition.
+ settings.resetReordering();
+ return;
+ }
+ // Copy the codes into an int[] and build the primary-lead-byte permutation.
+ int[] codes = new int[reorderCodes.size()];
+ int j = 0;
+ for(Integer code : reorderCodes) { codes[j++] = code; }
+ byte[] table = new byte[256];
+ baseData.makeReorderTable(codes, table);
+ settings.setReordering(codes, table);
+ }
+
+ // Non-script reorder group names, in the order of Collator.ReorderCodes.FIRST + index.
+ private static final String[] gSpecialReorderCodes = {
+ "space", "punct", "symbol", "currency", "digit"
+ };
+
+ /**
+ * Gets a script or reorder code from its string representation.
+ * Special group names are matched first, then UScript names/aliases,
+ * then the literal word "default". Matching is case-insensitive.
+ * @return the script/reorder code, or
+ * -1==Collator.ReorderCodes.REORDER_CODE_DEFAULT, or
+ * -2 if not recognized
+ */
+ public static int getReorderCode(String word) {
+ for(int i = 0; i < gSpecialReorderCodes.length; ++i) {
+ if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) {
+ return Collator.ReorderCodes.FIRST + i;
+ }
+ }
+ try {
+ int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word);
+ if(script >= 0) {
+ return script;
+ }
+ } catch (IllegalIcuArgumentException e) {
+ // fall through: not a script name, try "default" below
+ }
+ if(word.equalsIgnoreCase("default")) {
+ return Collator.ReorderCodes.DEFAULT;
+ }
+ return -2;
+ }
+
+ /** Maps "on" to UCOL_ON and "off" to UCOL_OFF; any other string yields UCOL_DEFAULT. */
+ private static int getOnOffValue(String s) {
+ return s.equals("on") ? UCOL_ON
+ : s.equals("off") ? UCOL_OFF
+ : UCOL_DEFAULT;
+ }
+
+ /**
+ * Parses a UnicodeSet pattern for [optimize set] / [suppressContractions set].
+ * @param i rule index of the '[' that opens the set pattern
+ * @param set receives the parsed set
+ * @return rule index after the option-terminating ']'
+ * @throws ParseException if the pattern or the option syntax is invalid
+ */
+ private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
+ // Collect a UnicodeSet pattern between a balanced pair of [brackets].
+ int level = 0;
+ int j = i;
+ for(;;) {
+ if(j == rules.length()) {
+ setParseError("unbalanced UnicodeSet pattern brackets");
+ return j;
+ }
+ char c = rules.charAt(j++);
+ if(c == 0x5b) { // '['
+ ++level;
+ } else if(c == 0x5d) { // ']'
+ if(--level == 0) { break; }
+ }
+ }
+ try {
+ set.applyPattern(rules.substring(i, j));
+ } catch(Exception e) {
+ setParseError("not a valid UnicodeSet pattern: " + e.getMessage());
+ }
+ j = skipWhiteSpace(j);
+ // The set pattern's own ']' is not the option terminator; require one more.
+ if(j == rules.length() || rules.charAt(j) != 0x5d) {
+ setParseError("missing option-terminating ']' after UnicodeSet pattern");
+ return j;
+ }
+ return ++j;
+ }
+
+ /**
+ * Collects a sequence of words into raw, collapsing runs of white space
+ * into single ' ' separators and stopping before the next syntax character
+ * ('-' and '_' count as word characters).
+ * @param i rule index where scanning starts
+ * @param raw receives the words; cleared first
+ * @return rule index of the terminating syntax character,
+ * or 0 (failure sentinel) if the end of the rule string is reached
+ */
+ private int readWords(int i, StringBuilder raw) {
+ raw.setLength(0);
+ i = skipWhiteSpace(i);
+ for(;;) {
+ if(i >= rules.length()) { return 0; }
+ char c = rules.charAt(i);
+ if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
+ if(raw.length() == 0) { return i; }
+ int lastIndex = raw.length() - 1;
+ if(raw.charAt(lastIndex) == ' ') { // remove trailing space
+ raw.setLength(lastIndex);
+ }
+ return i;
+ }
+ if(PatternProps.isWhiteSpace(c)) {
+ raw.append(' ');
+ i = skipWhiteSpace(i + 1);
+ } else {
+ raw.append(c);
+ ++i;
+ }
+ }
+ }
+
+ /**
+ * Skips the rest of a comment line.
+ * @param i rule index just after the comment-introducing character
+ * @return rule index just past the next line terminator (or rules.length())
+ */
+ private int skipComment(int i) {
+ // skip to past the newline
+ while(i < rules.length()) {
+ char c = rules.charAt(i++);
+ // LF or FF or CR or NEL or LS or PS
+ if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
+ // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
+ // NLF (new line function) = CR or LF or CR+LF or NEL.
+ // No need to collect all of CR+LF because a following LF will be ignored anyway.
+ break;
+ }
+ }
+ return i;
+ }
+
+ /** Reports a parse error at the current ruleIndex by throwing a ParseException. */
+ private void setParseError(String reason) throws ParseException {
+ throw makeParseException(reason);
+ }
+
+ /** Same as setParseError(String), chaining the causal exception. */
+ private void setParseError(String reason, Exception e) throws ParseException {
+ ParseException newExc = makeParseException(reason + ": " + e.getMessage());
+ newExc.initCause(e);
+ throw newExc;
+ }
+
+ /** Builds a ParseException with rule-string context around ruleIndex. */
+ private ParseException makeParseException(String reason) {
+ return new ParseException(appendErrorContext(reason), ruleIndex);
+ }
+
+ // Maximum number of context chars quoted on either side of the error position.
+ private static final int U_PARSE_CONTEXT_LEN = 16;
+
+ // C++ setErrorContext()
+ /**
+ * Appends the rule index and up to U_PARSE_CONTEXT_LEN-1 characters of
+ * rule-string context on each side of ruleIndex (marked with '!') to reason.
+ * Context boundaries are adjusted so surrogate pairs are not split.
+ */
+ private String appendErrorContext(String reason) {
+ // Note: This relies on the calling code maintaining the ruleIndex
+ // at a position that is useful for debugging.
+ // For example, at the beginning of a reset or relation etc.
+ StringBuilder msg = new StringBuilder(reason);
+ msg.append(" at index ").append(ruleIndex);
+ // We are not counting line numbers.
+
+ msg.append(" near \"");
+ // before ruleIndex
+ int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
+ if(start < 0) {
+ start = 0;
+ } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) {
+ ++start;
+ }
+ msg.append(rules, start, ruleIndex);
+
+ msg.append('!');
+ // starting from ruleIndex
+ int length = rules.length() - ruleIndex;
+ if(length >= U_PARSE_CONTEXT_LEN) {
+ length = U_PARSE_CONTEXT_LEN - 1;
+ if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) {
+ --length;
+ }
+ }
+ msg.append(rules, ruleIndex, ruleIndex + length);
+ return msg.append('\"').toString();
+ }
+
+ /**
+ * Returns true if c is an ASCII punctuation or symbol character, i.e.
+ * ASCII [:P:] and [:S:]:
+ * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
+ */
+ private static boolean isSyntaxChar(int c) {
+ return 0x21 <= c && c <= 0x7e &&
+ (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
+ (0x5b <= c && c <= 0x60) || (0x7b <= c));
+ }
+
+ /** Returns the index of the first non-white-space character at or after i. */
+ private int skipWhiteSpace(int i) {
+ while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) {
+ ++i;
+ }
+ return i;
+ }
+
+ // Normalizer instances used while parsing rule strings.
+ private Normalizer2 nfd = Normalizer2.getNFDInstance();
+ private Normalizer2 nfc = Normalizer2.getNFCInstance();
+
+ // The rule string being parsed; temporarily replaced while parsing [import] rules.
+ private String rules;
+ // Base (root) collation data, e.g. for reorder tables and variable-top lookup.
+ private final CollationData baseData;
+ // Settings modified in place by in-rule options such as [strength 2].
+ private CollationSettings settings;
+
+ // Receives parsed tokens and option sets.
+ private Sink sink;
+ // Provides rules for [import langTag]; if null, [import] reports an error.
+ private Importer importer;
+
+ // Current parse position in rules; also the position used for error context.
+ private int ruleIndex;
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationSettings.java, ported from collationsettings.h/.cpp
+*
+* C++ version created on: 2013feb07
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.util.Arrays;
+
+import com.ibm.icu.text.Collator;
+
+/**
+ * Collation settings/options/attributes.
+ * These are the values that can be changed via API.
+ */
+public final class CollationSettings extends SharedObject {
+ /**
+ * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
+ */
+ public static final int CHECK_FCD = 1;
+ /**
+ * Options bit 1: Numeric collation.
+ * Also known as CODAN = COllate Digits As Numbers.
+ *
+ * Treat digit sequences as numbers with CE sequences in numeric order,
+ * rather than returning a normal CE for each digit.
+ */
+ public static final int NUMERIC = 2;
+ /**
+ * "Shifted" alternate handling, see ALTERNATE_MASK.
+ */
+ static final int SHIFTED = 4;
+ /**
+ * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
+ * Reserve values 8 and 0xc for shift-trimmed and blanked.
+ */
+ static final int ALTERNATE_MASK = 0xc;
+ /**
+ * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
+ */
+ static final int MAX_VARIABLE_SHIFT = 4;
+ /** maxVariable options bit mask before shifting. */
+ static final int MAX_VARIABLE_MASK = 0x70;
+ /** Options bit 7: Reserved/unused/0. */
+ /**
+ * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
+ */
+ static final int UPPER_FIRST = 0x100;
+ /**
+ * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
+ * unless case level is on (when they are *moved* into the separate case level).
+ * By default, the case bits are removed from the tertiary weight (ignored).
+ *
+ * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
+ * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
+ */
+ public static final int CASE_FIRST = 0x200;
+ /**
+ * Options bit mask for caseFirst and upperFirst, before shifting.
+ * Same value as caseFirst==upperFirst.
+ */
+ public static final int CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
+ /**
+ * Options bit 10: Insert the case level between the secondary and tertiary levels.
+ */
+ public static final int CASE_LEVEL = 0x400;
+ /**
+ * Options bit 11: Compare secondary weights backwards. ("French secondary")
+ */
+ public static final int BACKWARD_SECONDARY = 0x800;
+ /**
+ * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
+ * It is the top used bit field in the options. (No need to mask after shifting.)
+ */
+ static final int STRENGTH_SHIFT = 12;
+ /** Strength options bit mask before shifting. */
+ static final int STRENGTH_MASK = 0xf000;
+
+ /** maxVariable values */
+ static final int MAX_VAR_SPACE = 0;
+ static final int MAX_VAR_PUNCT = 1;
+ static final int MAX_VAR_SYMBOL = 2;
+ static final int MAX_VAR_CURRENCY = 3;
+
+ /** Creates settings with defaults; see the options field initializer below. */
+ CollationSettings() {}
+
+ /**
+ * Deep-copies only the mutable fastLatinPrimaries array;
+ * other arrays are shared because they are replaced, never modified.
+ */
+ @Override
+ public CollationSettings clone() {
+ CollationSettings newSettings = (CollationSettings)super.clone();
+ // Note: The reorderTable and reorderCodes need not be cloned
+ // because, in Java, they only get replaced but not modified.
+ newSettings.fastLatinPrimaries = fastLatinPrimaries.clone();
+ return newSettings;
+ }
+
+ /**
+ * Settings are equal when they have the same options, the same reorder codes,
+ * and — only when alternate handling is enabled — the same variableTop.
+ */
+ @Override
+ public boolean equals(Object other) {
+ // equals(null) must return false per the Object.equals() contract;
+ // previously this dereferenced 'other' and threw a NullPointerException.
+ if(other == null) { return false; }
+ if(!this.getClass().equals(other.getClass())) { return false; }
+ CollationSettings o = (CollationSettings)other;
+ if(options != o.options) { return false; }
+ // variableTop only matters when alternate handling is not non-ignorable.
+ if((options & ALTERNATE_MASK) != 0 && variableTop != o.variableTop) { return false; }
+ if(!Arrays.equals(reorderCodes, o.reorderCodes)) { return false; }
+ return true;
+ }
+
+ /** Consistent with equals(): mixes options, variableTop (if relevant), and reorder codes. */
+ @Override
+ public int hashCode() {
+ int h = options << 8;
+ if((options & ALTERNATE_MASK) != 0) { h ^= variableTop; }
+ h ^= reorderCodes.length;
+ for(int i = 0; i < reorderCodes.length; ++i) {
+ h ^= (reorderCodes[i] << i);
+ }
+ return h;
+ }
+
+ /** Turns off script reordering entirely. */
+ public void resetReordering() {
+ // When we turn off reordering, we want to set a null permutation
+ // rather than a no-op permutation.
+ reorderTable = null;
+ reorderCodes = EMPTY_INT_ARRAY;
+ }
+ // No aliasReordering() in Java. Use setReordering(). See comments near reorderCodes.
+ /**
+ * Installs a reordering. codes and table must be consistent:
+ * a non-empty codes array requires a non-null 256-byte permutation table.
+ */
+ public void setReordering(int[] codes, byte[] table) {
+ if(codes == null) {
+ codes = EMPTY_INT_ARRAY;
+ }
+ assert (codes.length == 0) == (table == null);
+ reorderTable = table;
+ reorderCodes = codes;
+ }
+
+ // In C++, we use enums for attributes and their values, with a special value for the default.
+ // Combined getter/setter methods handle many attributes.
+ // In Java, we have specific methods for getting, setting, and set-to-default,
+ // except that this class uses bits in its own bit set for simple values.
+
+ /**
+ * Sets the comparison strength.
+ * @param value one of Collator.PRIMARY..QUATERNARY or Collator.IDENTICAL
+ * @throws IllegalArgumentException for any other value
+ */
+ public void setStrength(int value) {
+ int noStrength = options & ~STRENGTH_MASK;
+ switch(value) {
+ case Collator.PRIMARY:
+ case Collator.SECONDARY:
+ case Collator.TERTIARY:
+ case Collator.QUATERNARY:
+ case Collator.IDENTICAL:
+ options = noStrength | (value << STRENGTH_SHIFT);
+ break;
+ default:
+ throw new IllegalArgumentException("illegal strength value " + value);
+ }
+ }
+
+ /** Copies the strength bits from defaultOptions. */
+ public void setStrengthDefault(int defaultOptions) {
+ int noStrength = options & ~STRENGTH_MASK;
+ options = noStrength | (defaultOptions & STRENGTH_MASK);
+ }
+
+ /** Extracts the strength from an options word. */
+ static int getStrength(int options) {
+ return options >> STRENGTH_SHIFT;
+ }
+
+ public int getStrength() {
+ return getStrength(options);
+ }
+
+ /** Sets the options bit for an on/off attribute. */
+ public void setFlag(int bit, boolean value) {
+ if(value) {
+ options |= bit;
+ } else {
+ options &= ~bit;
+ }
+ }
+
+ /** Copies the given bit from defaultOptions. */
+ public void setFlagDefault(int bit, int defaultOptions) {
+ options = (options & ~bit) | (defaultOptions & bit);
+ }
+
+ public boolean getFlag(int bit) {
+ return (options & bit) != 0;
+ }
+
+ /**
+ * Sets the caseFirst attribute bits.
+ * @param value 0 (off), CASE_FIRST (lower first), or CASE_FIRST_AND_UPPER_MASK (upper first)
+ */
+ public void setCaseFirst(int value) {
+ assert value == 0 || value == CASE_FIRST || value == CASE_FIRST_AND_UPPER_MASK;
+ int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK;
+ options = noCaseFirst | value;
+ }
+
+ /** Copies the caseFirst bits from defaultOptions. */
+ public void setCaseFirstDefault(int defaultOptions) {
+ int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK;
+ options = noCaseFirst | (defaultOptions & CASE_FIRST_AND_UPPER_MASK);
+ }
+
+ public int getCaseFirst() {
+ return options & CASE_FIRST_AND_UPPER_MASK;
+ }
+
+ /** Turns "shifted" alternate handling on or off (off == non-ignorable). */
+ public void setAlternateHandlingShifted(boolean value) {
+ int noAlternate = options & ~ALTERNATE_MASK;
+ if(value) {
+ options = noAlternate | SHIFTED;
+ } else {
+ options = noAlternate;
+ }
+ }
+
+ /** Copies the alternate-handling bits from defaultOptions. */
+ public void setAlternateHandlingDefault(int defaultOptions) {
+ int noAlternate = options & ~ALTERNATE_MASK;
+ options = noAlternate | (defaultOptions & ALTERNATE_MASK);
+ }
+
+ public boolean getAlternateHandling() {
+ return (options & ALTERNATE_MASK) != 0;
+ }
+
+ /**
+ * Sets the maxVariable group.
+ * @param value one of MAX_VAR_SPACE..MAX_VAR_CURRENCY, or -1 to copy from defaultOptions
+ * @throws IllegalArgumentException for any other value
+ */
+ public void setMaxVariable(int value, int defaultOptions) {
+ int noMax = options & ~MAX_VARIABLE_MASK;
+ switch(value) {
+ case MAX_VAR_SPACE:
+ case MAX_VAR_PUNCT:
+ case MAX_VAR_SYMBOL:
+ case MAX_VAR_CURRENCY:
+ options = noMax | (value << MAX_VARIABLE_SHIFT);
+ break;
+ case -1:
+ options = noMax | (defaultOptions & MAX_VARIABLE_MASK);
+ break;
+ default:
+ throw new IllegalArgumentException("illegal maxVariable value " + value);
+ }
+ }
+
+ public int getMaxVariable() {
+ return (options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT;
+ }
+
+ /**
+ * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
+ */
+ static boolean isTertiaryWithCaseBits(int options) {
+ return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
+ }
+ /** Returns the mask to apply to tertiary weights for the given options. */
+ static int getTertiaryMask(int options) {
+ // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
+ return isTertiaryWithCaseBits(options) ?
+ Collation.CASE_AND_TERTIARY_MASK : Collation.ONLY_TERTIARY_MASK;
+ }
+
+ static boolean sortsTertiaryUpperCaseFirst(int options) {
+ // On tertiary level, consider case bits and sort uppercase first
+ // if caseLevel is off and caseFirst==upperFirst.
+ return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
+ }
+
+ public boolean dontCheckFCD() {
+ return (options & CHECK_FCD) == 0;
+ }
+
+ boolean hasBackwardSecondary() {
+ return (options & BACKWARD_SECONDARY) != 0;
+ }
+
+ public boolean isNumeric() {
+ return (options & NUMERIC) != 0;
+ }
+
+ /** CHECK_FCD etc. Defaults: tertiary strength, maxVariable=punct. */
+ public int options = (Collator.TERTIARY << STRENGTH_SHIFT) | // DEFAULT_STRENGTH
+ (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT);
+ /** Variable-top primary weight. */
+ public long variableTop;
+ /** 256-byte table for reordering permutation of primary lead bytes; null if no reordering. */
+ public byte[] reorderTable;
+ /** Array of reorder codes; ignored if length == 0. */
+ public int[] reorderCodes = EMPTY_INT_ARRAY;
+ // Note: In C++, we keep a memory block around for the reorder codes and the permutation table,
+ // and modify them for new codes.
+ // In Java, we simply copy references and then never modify the array contents.
+ // The caller must abandon the arrays.
+ // Reorder codes from the public setter API must be cloned.
+ private static final int[] EMPTY_INT_ARRAY = new int[0];
+
+ /** Options for CollationFastLatin. Negative if disabled. */
+ public int fastLatinOptions = -1;
+ // fastLatinPrimaries.length must be equal to CollationFastLatin.LATIN_LIMIT,
+ // but we do not import CollationFastLatin to reduce circular dependencies.
+ public char[] fastLatinPrimaries = new char[0x180]; // mutable contents
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* CollationTailoring.java, ported from collationtailoring.h/.cpp
+*
+* C++ version created on: 2013mar12
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.util.Map;
+
+import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.Normalizer2Impl;
+import com.ibm.icu.impl.Trie2_32;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.VersionInfo;
+
+/**
+ * Collation tailoring data & settings.
+ * This is a container of values for a collation tailoring
+ * built from rules or deserialized from binary data.
+ *
+ * It is logically immutable: Do not modify its values.
+ * The fields are public for convenience.
+ */
+public final class CollationTailoring {
+ /**
+ * Creates a tailoring that starts from a clone of baseSettings
+ * (which must not carry a reordering), or from fresh default settings.
+ */
+ CollationTailoring(SharedObject.Reference<CollationSettings> baseSettings) {
+ if(baseSettings != null) {
+ assert(baseSettings.readOnly().reorderCodes.length == 0);
+ assert(baseSettings.readOnly().reorderTable == null);
+ settings = baseSettings.clone();
+ } else {
+ settings = new SharedObject.Reference<CollationSettings>(new CollationSettings());
+ }
+ }
+
+ /** Lazily creates ownedData (backed by NFC data) and points data at it. */
+ void ensureOwnedData() {
+ if(ownedData == null) {
+ Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl;
+ ownedData = new CollationData(nfcImpl);
+ }
+ data = ownedData;
+ }
+
+ /** Packs a UCA version u.v.w into the base-version format; see the version field comments. */
+ static VersionInfo makeBaseVersion(VersionInfo ucaVersion) {
+ return VersionInfo.getInstance(
+ VersionInfo.UCOL_BUILDER_VERSION.getMajor(),
+ (ucaVersion.getMajor() << 3) + ucaVersion.getMinor(),
+ ucaVersion.getMilli() << 6,
+ 0);
+ }
+ /** Combines the base version with a rules version r.s.t.q; see the version field comments. */
+ void setVersion(VersionInfo baseVersion, VersionInfo rulesVersion) {
+ version = VersionInfo.getInstance(
+ VersionInfo.UCOL_BUILDER_VERSION.getMajor(),
+ baseVersion.getMinor(),
+ (baseVersion.getMilli() & 0xc0) + ((rulesVersion.getMajor() + (rulesVersion.getMajor() >> 6)) & 0x3f),
+ (rulesVersion.getMinor() << 3) + (rulesVersion.getMinor() >> 5) + rulesVersion.getMilli() +
+ (rulesVersion.getMicro() << 4) + (rulesVersion.getMicro() >> 4));
+ }
+ /** Recovers the UCA version number (u.v) from the packed version field. */
+ int getUCAVersion() {
+ return (version.getMinor() << 4) | (version.getMilli() >> 6);
+ }
+
+ // data for sorting etc.
+ public CollationData data; // == base data or ownedData
+ public SharedObject.Reference<CollationSettings> settings; // reference-counted
+ public String rules = "";
+ // The locale is null (C++: bogus) when built from rules or constructed from a binary blob.
+ // It can then be set by the service registration code which is thread-safe.
+ public ULocale actualLocale = ULocale.ROOT;
+ // UCA version u.v.w & rules version r.s.t.q:
+ // version[0]: builder version (runtime version is mixed in at runtime)
+ // version[1]: bits 7..3=u, bits 2..0=v
+ // version[2]: bits 7..6=w, bits 5..0=r
+ // version[3]= (s<<3)+(s>>5)+t+(q<<4)+(q>>4) -- matches setVersion() above
+ public VersionInfo version = ZERO_VERSION;
+ private static final VersionInfo ZERO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
+
+ // owned objects
+ CollationData ownedData;
+ Trie2_32 trie;
+ UnicodeSet unsafeBackwardSet;
+ public Map<Integer, Integer> maxExpansions;
+
+ /*
+ * Not Cloneable: A CollationTailoring cannot be copied.
+ * It is immutable, and the data trie cannot be copied either.
+ */
+}
--- /dev/null
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* CollationWeights.java, ported from collationweights.h/.cpp
+*
+* C++ version created on: 2001mar08 as ucol_wgt.h
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.util.Arrays;
+
+/**
+ * Allocates n collation element weights between two exclusive limits.
+ * Used only internally by the collation tailoring builder.
+ */
+public final class CollationWeights {
+ public CollationWeights() {}
+
+ // The minBytes[]/maxBytes[] arrays are indexed by weight byte position 1..4
+ // (byte 1 is the lead byte); index 0 is unused.
+
+ /** Configures byte ranges for allocating primary weights (1..4 bytes). */
+ public void initForPrimary(boolean compressible) {
+ middleLength=1;
+ minBytes[1] = Collation.MERGE_SEPARATOR_BYTE + 1;
+ maxBytes[1] = Collation.TRAIL_WEIGHT_BYTE;
+ if(compressible) {
+ minBytes[2] = Collation.PRIMARY_COMPRESSION_LOW_BYTE + 1;
+ maxBytes[2] = Collation.PRIMARY_COMPRESSION_HIGH_BYTE - 1;
+ } else {
+ minBytes[2] = 2;
+ maxBytes[2] = 0xff;
+ }
+ minBytes[3] = 2;
+ maxBytes[3] = 0xff;
+ minBytes[4] = 2;
+ maxBytes[4] = 0xff;
+ }
+
+ /** Configures byte ranges for allocating secondary weights. */
+ public void initForSecondary() {
+ // We use only the lower 16 bits for secondary weights.
+ middleLength=3;
+ minBytes[1] = 0;
+ maxBytes[1] = 0;
+ minBytes[2] = 0;
+ maxBytes[2] = 0;
+ minBytes[3] = Collation.MERGE_SEPARATOR_BYTE + 1;
+ maxBytes[3] = 0xff;
+ minBytes[4] = 2;
+ maxBytes[4] = 0xff;
+ }
+
+ /** Configures byte ranges for allocating tertiary weights. */
+ public void initForTertiary() {
+ // We use only the lower 16 bits for tertiary weights.
+ middleLength=3;
+ minBytes[1] = 0;
+ maxBytes[1] = 0;
+ minBytes[2] = 0;
+ maxBytes[2] = 0;
+ // We use only 6 bits per byte.
+ // The other bits are used for case & quaternary weights.
+ minBytes[3] = Collation.MERGE_SEPARATOR_BYTE + 1;
+ maxBytes[3] = 0x3f;
+ minBytes[4] = 2;
+ maxBytes[4] = 0x3f;
+ }
+
+ /**
+ * Determine heuristically
+ * what ranges to use for a given number of weights between (excluding)
+ * two limits.
+ *
+ * @param lowerLimit A collation element weight; the ranges will be filled to cover
+ * weights greater than this one.
+ * @param upperLimit A collation element weight; the ranges will be filled to cover
+ * weights less than this one.
+ * @param n The number of collation element weights w necessary such that
+ * lowerLimit<w<upperLimit in lexical order.
+ * @return true if it is possible to fit n elements between the limits
+ */
+ public boolean allocWeights(long lowerLimit, long upperLimit, int n) {
+ // Call getWeightRanges() and then determine heuristically
+ // which ranges to use for a given number of weights between (excluding)
+ // two limits.
+ // puts("");
+
+ if(!getWeightRanges(lowerLimit, upperLimit)) {
+ // printf("error: unable to get Weight ranges\n");
+ return false;
+ }
+
+ /* try until we find suitably large ranges */
+ for(;;) {
+ /* get the smallest number of bytes in a range */
+ // ranges[] is maintained sorted with the shortest (by byte length) ranges first.
+ int minLength=ranges[0].length;
+
+ if(allocWeightsInShortRanges(n, minLength)) { break; }
+
+ if(minLength == 4) {
+ // 4-byte ranges cannot be lengthened any further.
+ // printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
+ // minLengthCount, n);
+ return false;
+ }
+
+ if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
+
+ /* no good match, lengthen all minLength ranges and iterate */
+ // printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
+ // NOTE(review): this loop assumes the run of minLength ranges is terminated by a
+ // range of a different length before the array end -- confirm rangeCount invariant.
+ for(int i=0; ranges[i].length==minLength; ++i) {
+ lengthenRange(ranges[i]);
+ }
+ }
+
+ /* puts("final ranges:");
+ for(int i=0; i<rangeCount; ++i) {
+ printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
+ i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
+ } */
+
+ // Prepare for iteration via nextWeight().
+ rangeIndex = 0;
+ if(rangeCount < ranges.length) {
+ ranges[rangeCount] = null; // force a crash when going out of bounds
+ }
+ return true;
+ }
+
+ /**
+ * Given a set of ranges calculated by allocWeights(),
+ * iterate through the weights.
+ * The ranges are modified to keep the current iteration state.
+ *
+ * @return The next weight in the ranges, or 0xffffffff if there is none left.
+ */
+ public long nextWeight() {
+ if(rangeIndex >= rangeCount) {
+ return 0xffffffffL;
+ } else {
+ /* get the next weight */
+ WeightRange range = ranges[rangeIndex];
+ long weight = range.start;
+ if(--range.count == 0) {
+ /* this range is finished */
+ ++rangeIndex;
+ } else {
+ /* increment the weight for the next value */
+ range.start = incWeight(weight, range.length);
+ assert(range.start <= range.end);
+ }
+
+ return weight;
+ }
+ }
+
+ /**
+ * A contiguous range of weights [start..end] of the same byte length,
+ * with count values remaining. Ranges order by ascending start weight.
+ * @internal
+ */
+ private static final class WeightRange implements Comparable<WeightRange> {
+ long start, end;
+ int length, count;
+
+ // Java 6: @Override
+ public int compareTo(WeightRange other) {
+ if(start < other.start) { return -1; }
+ if(start > other.start) { return 1; }
+ return 0;
+ }
+ }
+
+ /* helper functions for CE weights */
+ // A weight is a 32-bit value stored in a long; its 1..4 bytes are
+ // left-aligned (byte 1 is bits 31..24) and unused trailing bytes are 0.
+
+ /** Returns the number of significant bytes (1..4) in a weight. */
+ public static int lengthOfWeight(long weight) {
+ if((weight&0xffffff)==0) {
+ return 1;
+ } else if((weight&0xffff)==0) {
+ return 2;
+ } else if((weight&0xff)==0) {
+ return 3;
+ } else {
+ return 4;
+ }
+ }
+
+ /** Returns byte number 'length' (the last significant byte) of the weight. */
+ private static int getWeightTrail(long weight, int length) {
+ return (int)(weight>>(8*(4-length)))&0xff;
+ }
+
+ /** Replaces byte number 'length' of the weight with trail, clearing later bytes. */
+ private static long setWeightTrail(long weight, int length, int trail) {
+ length=8*(4-length);
+ return (weight&(0xffffff00L<<length))|((long)trail<<length);
+ }
+
+ private static int getWeightByte(long weight, int idx) {
+ return getWeightTrail(weight, idx); /* same calculation */
+ }
+
+ /** Replaces byte idx (1..4) of the weight with b, keeping all other bytes. */
+ private static long setWeightByte(long weight, int idx, int b) {
+ long mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
+
+ idx*=8;
+ if(idx<32) {
+ mask=0xffffffffL>>idx;
+ } else {
+ // Do not use int>>32 because on some platforms that does not shift at all
+ // while we need it to become 0.
+ // PowerPC: 0xffffffff>>32 = 0 (wanted)
+ // x86: 0xffffffff>>32 = 0xffffffff (not wanted)
+ //
+ // ANSI C99 6.5.7 Bitwise shift operators:
+ // "If the value of the right operand is negative
+ // or is greater than or equal to the width of the promoted left operand,
+ // the behavior is undefined."
+ mask=0;
+ }
+ idx=32-idx;
+ mask|=0xffffff00L<<idx;
+ return (weight&mask)|((long)b<<idx);
+ }
+
+ /** Keeps only the first 'length' bytes of the weight; zeroes the rest. */
+ private static long truncateWeight(long weight, int length) {
+ return weight&(0xffffffffL<<(8*(4-length)));
+ }
+
+ /** Increments byte number 'length' of the weight by one (no carry handling). */
+ private static long incWeightTrail(long weight, int length) {
+ return weight+(1L<<(8*(4-length)));
+ }
+
+ /** Decrements byte number 'length' of the weight by one (no borrow handling). */
+ private static long decWeightTrail(long weight, int length) {
+ return weight-(1L<<(8*(4-length)));
+ }
+
+ /** @return number of usable byte values for byte idx */
+ private int countBytes(int idx) {
+ return maxBytes[idx] - minBytes[idx] + 1;
+ }
+
+ /**
+ * Increments a weight of the given byte length to the next usable value,
+ * carrying into earlier bytes when a byte exceeds its maxBytes limit.
+ */
+ private long incWeight(long weight, int length) {
+ for(;;) {
+ int b=getWeightByte(weight, length);
+ if(b<maxBytes[length]) {
+ return setWeightByte(weight, length, b+1);
+ } else {
+ // Roll over, set this byte to the minimum and increment the previous one.
+ weight=setWeightByte(weight, length, minBytes[length]);
+ --length;
+ assert(length > 0);
+ }
+ }
+ }
+
+ /**
+ * Advances a weight of the given byte length by offset usable values,
+ * distributing the carry across bytes according to each byte's usable count.
+ */
+ private long incWeightByOffset(long weight, int length, int offset) {
+ for(;;) {
+ offset += getWeightByte(weight, length);
+ if(offset <= maxBytes[length]) {
+ return setWeightByte(weight, length, offset);
+ } else {
+ // Split the offset between this byte and the previous one.
+ offset -= minBytes[length];
+ weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
+ offset /= countBytes(length);
+ --length;
+ assert(length > 0);
+ }
+ }
+ }
+
+ /**
+ * Extends a range by one byte: appends the full usable byte range,
+ * multiplying its count accordingly.
+ */
+ private void lengthenRange(WeightRange range) {
+ int length=range.length+1;
+ range.start=setWeightTrail(range.start, length, minBytes[length]);
+ range.end=setWeightTrail(range.end, length, maxBytes[length]);
+ range.count*=countBytes(length);
+ range.length=length;
+ }
+
+ /**
+ * Takes two CE weights and calculates the
+ * possible ranges of weights between the two limits, excluding them.
+ * For weights with up to 4 bytes there are up to 2*4-1=7 ranges.
+ *
+ * The resulting ranges are stored in this.ranges[0..rangeCount-1],
+ * shortest weights first.
+ *
+ * @param lowerLimit weight below the gap (exclusive); must not be 0
+ * @param upperLimit weight above the gap (exclusive); must not be 0
+ * @return true if at least one non-empty allocation range was found
+ */
+ private boolean getWeightRanges(long lowerLimit, long upperLimit) {
+ assert(lowerLimit != 0);
+ assert(upperLimit != 0);
+
+ /* get the lengths of the limits */
+ int lowerLength=lengthOfWeight(lowerLimit);
+ int upperLength=lengthOfWeight(upperLimit);
+
+ // printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
+ // printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
+ assert(lowerLength>=middleLength);
+ // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
+
+ if(lowerLimit>=upperLimit) {
+ // printf("error: no space between lower & upper limits\n");
+ return false;
+ }
+
+ /* check that neither is a prefix of the other */
+ if(lowerLength<upperLength) {
+ if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
+ // printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
+ return false;
+ }
+ }
+ /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
+
+ WeightRange[] lower = new WeightRange[5]; /* [0] and [1] are not used - this simplifies indexing */
+ WeightRange middle = new WeightRange();
+ WeightRange[] upper = new WeightRange[5];
+
+ /*
+ * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
+ * range minimum length
+ * lower[4] 4
+ * lower[3] 3
+ * lower[2] 2
+ * middle 1
+ * upper[2] 2
+ * upper[3] 3
+ * upper[4] 4
+ *
+ * We are now going to calculate up to 7 ranges.
+ * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
+ */
+ // Walk up from the lower limit: at each length > middleLength, the weights
+ // whose trail byte is above the limit's trail byte form a candidate range.
+ long weight=lowerLimit;
+ for(int length=lowerLength; length>middleLength; --length) {
+ int trail=getWeightTrail(weight, length);
+ if(trail<maxBytes[length]) {
+ lower[length] = new WeightRange();
+ lower[length].start=incWeightTrail(weight, length);
+ lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
+ lower[length].length=length;
+ lower[length].count=maxBytes[length]-trail;
+ }
+ weight=truncateWeight(weight, length-1);
+ }
+ if(weight<0xff000000L) {
+ middle.start=incWeightTrail(weight, middleLength);
+ } else {
+ // Prevent overflow for primary lead byte FF
+ // which would yield a middle range starting at 0.
+ middle.start=0xffffffffL; // no middle range
+ }
+
+ // Walk down from the upper limit: at each length > middleLength, the weights
+ // whose trail byte is below the limit's trail byte form a candidate range.
+ weight=upperLimit;
+ for(int length=upperLength; length>middleLength; --length) {
+ int trail=getWeightTrail(weight, length);
+ if(trail>minBytes[length]) {
+ upper[length] = new WeightRange();
+ upper[length].start=setWeightTrail(weight, length, minBytes[length]);
+ upper[length].end=decWeightTrail(weight, length);
+ upper[length].length=length;
+ upper[length].count=trail-minBytes[length];
+ }
+ weight=truncateWeight(weight, length-1);
+ }
+ middle.end=decWeightTrail(weight, middleLength);
+
+ /* set the middle range */
+ middle.length=middleLength;
+ if(middle.end>=middle.start) {
+ middle.count=(int)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
+ } else {
+ /* no middle range, eliminate overlaps */
+
+ /* reduce or remove the lower ranges that go beyond upperLimit */
+ for(int length=4; length>middleLength; --length) {
+ if(lower[length] != null && upper[length] != null &&
+ lower[length].count>0 && upper[length].count>0) {
+ long start=upper[length].start;
+ long end=lower[length].end;
+
+ if(end>=start || incWeight(end, length)==start) {
+ /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
+ start=lower[length].start;
+ end=lower[length].end=upper[length].end;
+ /*
+ * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
+ * it may result in a range with count>countBytes
+ */
+ lower[length].count=
+ getWeightTrail(end, length)-getWeightTrail(start, length)+1+
+ countBytes(length)*(getWeightByte(end, length-1)-getWeightByte(start, length-1));
+ upper[length].count=0;
+ // Shorter ranges are fully contained in the merged one; drop them.
+ while(--length>middleLength) {
+ if(lower[length] != null) {
+ lower[length].count = 0;
+ }
+ if(upper[length] != null) {
+ upper[length].count = 0;
+ }
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ /* print ranges
+ for(int length=4; length>=2; --length) {
+ if(lower[length].count>0) {
+ printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
+ }
+ }
+ if(middle.count>0) {
+ printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
+ }
+ for(int length=2; length<=4; ++length) {
+ if(upper[length].count>0) {
+ printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
+ }
+ } */
+
+ /* copy the ranges, shortest first, into the result array */
+ rangeCount=0;
+ if(middle.count>0) {
+ ranges[0] = middle;
+ rangeCount=1;
+ }
+ for(int length=middleLength+1; length<=4; ++length) {
+ /* copy upper first so that later the middle range is more likely the first one to use */
+ if(upper[length] != null && upper[length].count>0) {
+ ranges[rangeCount++]=upper[length];
+ }
+ if(lower[length] != null && lower[length].count>0) {
+ ranges[rangeCount++]=lower[length];
+ }
+ }
+ return rangeCount>0;
+ }
+
+ /**
+ * Tries to satisfy a request for n weights from the leading ranges whose
+ * weight length is minLength or minLength+1.
+ * On success, trims ranges[] to exactly the ranges used and sorts them.
+ *
+ * @return true if the first few short ranges hold at least n weights
+ */
+ private boolean allocWeightsInShortRanges(int n, int minLength) {
+ int i = 0;
+ while (i < rangeCount && ranges[i].length <= (minLength + 1)) {
+ if (n <= ranges[i].count) {
+ // ranges[0..i] together provide enough weights.
+ if (ranges[i].length > minLength) {
+ // Reduce the number of weights from the last minLength+1 range
+ // which might sort before some minLength ranges,
+ // so that we use all weights in the minLength ranges.
+ ranges[i].count = n;
+ }
+ rangeCount = i + 1;
+ // printf("take first %ld ranges\n", rangeCount);
+
+ if (rangeCount > 1) {
+ /* sort the ranges by weight values */
+ Arrays.sort(ranges, 0, rangeCount);
+ }
+ return true;
+ }
+ n -= ranges[i].count; // still >0
+ ++i;
+ }
+ return false;
+ }
+
+ /**
+ * Tries to satisfy a request for n weights from the minLength ranges by
+ * merging them into one span and splitting that span into a minLength part
+ * (count1 weights) and a lengthened minLength+1 part (count2 weights, each
+ * expanded into countBytes(minLength+1) longer weights).
+ *
+ * @return true if count * countBytes(minLength+1) >= n, i.e. lengthening suffices
+ */
+ private boolean allocWeightsInMinLengthRanges(int n, int minLength) {
+ // See if the minLength ranges have enough weights
+ // when we split one and lengthen the following ones.
+ int count = 0;
+ int minLengthRangeCount;
+ for(minLengthRangeCount = 0;
+ minLengthRangeCount < rangeCount &&
+ ranges[minLengthRangeCount].length == minLength;
+ ++minLengthRangeCount) {
+ count += ranges[minLengthRangeCount].count;
+ }
+
+ int nextCountBytes = countBytes(minLength + 1);
+ if(n > count * nextCountBytes) { return false; }
+
+ // Use the minLength ranges. Merge them, and then split again as necessary.
+ long start = ranges[0].start;
+ long end = ranges[0].end;
+ for(int i = 1; i < minLengthRangeCount; ++i) {
+ if(ranges[i].start < start) { start = ranges[i].start; }
+ if(ranges[i].end > end) { end = ranges[i].end; }
+ }
+
+ // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
+ // Goal:
+ // count1 + count2 * nextCountBytes = n
+ // count1 + count2 = count
+ // These turn into
+ // (count - count2) + count2 * nextCountBytes = n
+ // and then into the following count1 & count2 computations.
+ int count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened
+ int count1 = count - count2; // number of minLength weights
+ if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
+ // round up
+ ++count2;
+ --count1;
+ assert((count1 + count2 * nextCountBytes) >= n);
+ }
+
+ ranges[0].start = start;
+
+ if(count1 == 0) {
+ // Make one long range.
+ ranges[0].end = end;
+ ranges[0].count = count;
+ lengthenRange(ranges[0]);
+ rangeCount = 1;
+ } else {
+ // Split the range, lengthen the second part.
+ // printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
+ // splitRange, rangeCount, count1, count2);
+
+ // Next start = start + count1. First end = 1 before that.
+ ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
+ ranges[0].count = count1;
+
+ if(ranges[1] == null) {
+ ranges[1] = new WeightRange();
+ }
+ ranges[1].start = incWeight(ranges[0].end, minLength);
+ ranges[1].end = end;
+ ranges[1].length = minLength; // +1 when lengthened
+ ranges[1].count = count2; // *countBytes when lengthened
+ lengthenRange(ranges[1]);
+ rangeCount = 2;
+ }
+ return true;
+ }
+
+ // Shortest allowed weight length for the current allocation.
+ private int middleLength;
+ private int[] minBytes = new int[5]; // minimum value for weight byte 1..4; [0] unused
+ private int[] maxBytes = new int[5]; // maximum value for weight byte 1..4; [0] unused
+ private WeightRange[] ranges = new WeightRange[7]; // allocation ranges, shortest weights first
+ private int rangeIndex; // next range to take weights from
+ private int rangeCount; // number of valid entries in ranges[]
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* ContractionsAndExpansions.java, ported from collationsets.h/.cpp
+*
+* C++ version created on: 2013feb09
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.util.Iterator;
+
+import com.ibm.icu.impl.Trie2;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.CharsTrie;
+import com.ibm.icu.util.CharsTrie.Entry;
+
+/**
+ * Walks collation data and reports contraction strings, expansion strings/code points,
+ * and (optionally) their CEs to the given UnicodeSets and CESink.
+ * Ported from ICU4C collationsets.h/.cpp.
+ */
+public final class ContractionsAndExpansions {
+ // C++: The following fields are @internal, only public for access by callback.
+ private CollationData data;
+ private UnicodeSet contractions;
+ private UnicodeSet expansions;
+ private CESink sink;
+ private boolean addPrefixes;
+ private int checkTailored = 0; // -1: collected tailored +1: exclude tailored
+ private UnicodeSet tailored = new UnicodeSet();
+ private UnicodeSet ranges;
+ private StringBuilder unreversedPrefix = new StringBuilder();
+ private String suffix;
+ private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
+
+ /** Receives the CEs of enumerated mappings. */
+ public static interface CESink {
+ void handleCE(long ce);
+ void handleExpansion(long ces[], int start, int length);
+ }
+
+ /**
+ * @param con set that receives contraction strings, or null
+ * @param exp set that receives expansion strings/code points, or null
+ * @param s receives CEs, or null
+ * @param prefixes whether prefix (pre-context) mappings are enumerated too
+ */
+ public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) {
+ contractions = con;
+ expansions = exp;
+ sink = s;
+ addPrefixes = prefixes;
+ }
+
+ public void forData(CollationData d) {
+ // Add all from the data, can be tailoring or base.
+ if (d.base != null) {
+ checkTailored = -1;
+ }
+ data = d;
+ Iterator<Trie2.Range> trieIterator = data.trie.iterator();
+ Trie2.Range range;
+ while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
+ enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
+ }
+ if (d.base == null) {
+ return;
+ }
+ // Add all from the base data but only for un-tailored code points.
+ tailored.freeze();
+ checkTailored = 1;
+ data = d.base;
+ trieIterator = data.trie.iterator();
+ while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
+ enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
+ }
+ }
+
+ private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
+ if (cne.checkTailored == 0) {
+ // There is no tailoring.
+ // No need to collect nor check the tailored set.
+ } else if (cne.checkTailored < 0) {
+ // Collect the set of code points with mappings in the tailoring data.
+ if (ce32 == Collation.FALLBACK_CE32) {
+ return; // fallback to base, not tailored
+ } else {
+ cne.tailored.add(start, end);
+ }
+ // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
+ } else if (start == end) {
+ if (cne.tailored.contains(start)) {
+ return;
+ }
+ } else if (cne.tailored.containsSome(start, end)) {
+ if (cne.ranges == null) {
+ cne.ranges = new UnicodeSet();
+ }
+ cne.ranges.set(start, end).removeAll(cne.tailored);
+ int count = cne.ranges.getRangeCount();
+ for (int i = 0; i < count; ++i) {
+ cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
+ }
+ // Fix: the untailored sub-ranges have been handled above; without this
+ // return, the full range (including tailored code points) would be
+ // handled again below, defeating the exclusion and double-processing
+ // the untailored code points. Matches ICU4C collationsets.cpp.
+ return;
+ }
+ cne.handleCE32(start, end, ce32);
+ }
+
+ public void forCodePoint(CollationData d, int c) {
+ int ce32 = d.getCE32(c);
+ if (ce32 == Collation.FALLBACK_CE32) {
+ d = d.base;
+ ce32 = d.getCE32(c);
+ }
+ data = d;
+ handleCE32(c, c, ce32);
+ }
+
+ private void handleCE32(int start, int end, int ce32) {
+ for (;;) {
+ if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) {
+ // !isSpecialCE32()
+ if (sink != null) {
+ sink.handleCE(Collation.ceFromSimpleCE32(ce32));
+ }
+ return;
+ }
+ switch (Collation.tagFromCE32(ce32)) {
+ case Collation.FALLBACK_TAG:
+ return;
+ case Collation.RESERVED_TAG_3:
+ case Collation.BUILDER_DATA_TAG:
+ case Collation.LEAD_SURROGATE_TAG:
+ // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
+ throw new AssertionError(
+ String.format("Unexpected CE32 tag type %d for ce32=0x%08x",
+ Collation.tagFromCE32(ce32), ce32));
+ case Collation.LONG_PRIMARY_TAG:
+ if (sink != null) {
+ sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32));
+ }
+ return;
+ case Collation.LONG_SECONDARY_TAG:
+ if (sink != null) {
+ sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32));
+ }
+ return;
+ case Collation.LATIN_EXPANSION_TAG:
+ if (sink != null) {
+ ces[0] = Collation.latinCE0FromCE32(ce32);
+ ces[1] = Collation.latinCE1FromCE32(ce32);
+ sink.handleExpansion(ces, 0, 2);
+ }
+ // Optimization: If we have a prefix,
+ // then the relevant strings have been added already.
+ if (unreversedPrefix.length() == 0) {
+ addExpansions(start, end);
+ }
+ return;
+ case Collation.EXPANSION32_TAG:
+ if (sink != null) {
+ int idx = Collation.indexFromCE32(ce32);
+ int length = Collation.lengthFromCE32(ce32);
+ for (int i = 0; i < length; ++i) {
+ ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]);
+ }
+ sink.handleExpansion(ces, 0, length);
+ }
+ // Optimization: If we have a prefix,
+ // then the relevant strings have been added already.
+ if (unreversedPrefix.length() == 0) {
+ addExpansions(start, end);
+ }
+ return;
+ case Collation.EXPANSION_TAG:
+ if (sink != null) {
+ int idx = Collation.indexFromCE32(ce32);
+ int length = Collation.lengthFromCE32(ce32);
+ sink.handleExpansion(data.ces, idx, length);
+ }
+ // Optimization: If we have a prefix,
+ // then the relevant strings have been added already.
+ if (unreversedPrefix.length() == 0) {
+ addExpansions(start, end);
+ }
+ return;
+ case Collation.PREFIX_TAG:
+ handlePrefixes(start, end, ce32);
+ return;
+ case Collation.CONTRACTION_TAG:
+ handleContractions(start, end, ce32);
+ return;
+ case Collation.DIGIT_TAG:
+ // Fetch the non-numeric-collation CE32 and continue.
+ ce32 = data.ce32s[Collation.indexFromCE32(ce32)];
+ break;
+ case Collation.U0000_TAG:
+ assert (start == 0 && end == 0);
+ // Fetch the normal ce32 for U+0000 and continue.
+ ce32 = data.ce32s[0];
+ break;
+ case Collation.HANGUL_TAG:
+ if (sink != null) {
+ // TODO: This should be optimized,
+ // especially if [start..end] is the complete Hangul range. (assert that)
+ UTF16CollationIterator iter = new UTF16CollationIterator(data);
+ StringBuilder hangul = new StringBuilder(1);
+ for (int c = start; c <= end; ++c) {
+ hangul.setLength(0);
+ hangul.appendCodePoint(c);
+ iter.setText(false, hangul, 0);
+ int length = iter.fetchCEs();
+ // Ignore the terminating non-CE.
+ assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE);
+ sink.handleExpansion(iter.getCEs(), 0, length - 1);
+ }
+ }
+ // Optimization: If we have a prefix,
+ // then the relevant strings have been added already.
+ if (unreversedPrefix.length() == 0) {
+ addExpansions(start, end);
+ }
+ return;
+ case Collation.OFFSET_TAG:
+ // Currently no need to send offset CEs to the sink.
+ return;
+ case Collation.IMPLICIT_TAG:
+ // Currently no need to send implicit CEs to the sink.
+ return;
+ }
+ }
+ }
+
+ private void handlePrefixes(int start, int end, int ce32) {
+ int index = Collation.indexFromCE32(ce32);
+ ce32 = data.getCE32FromContexts(index); // Default if no prefix match.
+ handleCE32(start, end, ce32);
+ if (!addPrefixes) {
+ return;
+ }
+ CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator();
+ while (prefixes.hasNext()) {
+ Entry e = prefixes.next();
+ setPrefix(e.chars);
+ // Prefix/pre-context mappings are special kinds of contractions
+ // that always yield expansions.
+ addStrings(start, end, contractions);
+ addStrings(start, end, expansions);
+ handleCE32(start, end, e.value);
+ }
+ resetPrefix();
+ }
+
+ void handleContractions(int start, int end, int ce32) {
+ int index = Collation.indexFromCE32(ce32);
+ if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
+ // No match on the single code point.
+ // We are underneath a prefix, and the default mapping is just
+ // a fallback to the mappings for a shorter prefix.
+ assert (unreversedPrefix.length() != 0);
+ } else {
+ ce32 = data.getCE32FromContexts(index); // Default if no suffix match.
+ assert (!Collation.isContractionCE32(ce32));
+ handleCE32(start, end, ce32);
+ }
+ CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator();
+ while (suffixes.hasNext()) {
+ Entry e = suffixes.next();
+ suffix = e.chars.toString();
+ addStrings(start, end, contractions);
+ if (unreversedPrefix.length() != 0) {
+ addStrings(start, end, expansions);
+ }
+ handleCE32(start, end, e.value);
+ }
+ suffix = null;
+ }
+
+ void addExpansions(int start, int end) {
+ if (unreversedPrefix.length() == 0 && suffix == null) {
+ if (expansions != null) {
+ expansions.add(start, end);
+ }
+ } else {
+ addStrings(start, end, expansions);
+ }
+ }
+
+ // Adds prefix+codePoint+suffix strings for every code point in [start..end].
+ void addStrings(int start, int end, UnicodeSet set) {
+ if (set == null) {
+ return;
+ }
+ StringBuilder s = new StringBuilder(unreversedPrefix);
+ do {
+ s.appendCodePoint(start);
+ if (suffix != null) {
+ s.append(suffix);
+ }
+ set.add(s);
+ s.setLength(unreversedPrefix.length());
+ } while (++start <= end);
+ }
+
+ // Prefixes are reversed in the data structure.
+ private void setPrefix(CharSequence pfx) {
+ unreversedPrefix.setLength(0);
+ unreversedPrefix.append(pfx).reverse();
+ }
+
+ private void resetPrefix() {
+ unreversedPrefix.setLength(0);
+ }
+}
\ No newline at end of file
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2012-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
+*
+* C++ version created on: 2012sep23 (from utf16collationiterator.h)
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.impl.Normalizer2Impl;
+import com.ibm.icu.text.UCharacterIterator;
+
+/**
+ * Incrementally checks the input text for FCD and normalizes where necessary.
+ */
+public final class FCDIterCollationIterator extends IterCollationIterator {
+ /**
+ * @param data collation data (also provides the NFC implementation)
+ * @param numeric whether numeric collation is on
+ * @param ui the source text iterator
+ * @param startIndex initial text index, recorded as the FCD segment start
+ */
+ public FCDIterCollationIterator(CollationData data, boolean numeric,
+ UCharacterIterator ui, int startIndex) {
+ super(data, numeric, ui);
+ state = State.ITER_CHECK_FWD;
+ start = startIndex;
+ nfcImpl = data.nfcImpl;
+ }
+
+ @Override
+ public void resetToOffset(int newOffset) {
+ super.resetToOffset(newOffset);
+ start = newOffset;
+ state = State.ITER_CHECK_FWD;
+ }
+
+ @Override
+ public int getOffset() {
+ if(state.compareTo(State.ITER_CHECK_BWD) <= 0) {
+ // Checking directly on the text iterator.
+ return iter.getIndex();
+ } else if(state == State.ITER_IN_FCD_SEGMENT) {
+ return pos;
+ } else if(pos == 0) {
+ // Inside a normalized buffer: report the segment boundary.
+ return start;
+ } else {
+ return limit;
+ }
+ }
+
+ @Override
+ public int nextCodePoint() {
+ int c;
+ for(;;) {
+ if(state == State.ITER_CHECK_FWD) {
+ c = iter.next();
+ if(c < 0) {
+ return c;
+ }
+ if(CollationFCD.hasTccc(c)) {
+ // Possible FCD discontinuity; re-check the segment around c.
+ if(CollationFCD.maybeTibetanCompositeVowel(c) ||
+ CollationFCD.hasLccc(iter.current())) {
+ iter.previous();
+ if(!nextSegment()) {
+ return Collation.SENTINEL_CP;
+ }
+ continue;
+ }
+ }
+ if(isLeadSurrogate(c)) {
+ // Combine a surrogate pair into one code point.
+ int trail = iter.next();
+ if(isTrailSurrogate(trail)) {
+ return Character.toCodePoint((char)c, (char)trail);
+ } else if(trail >= 0) {
+ iter.previous();
+ }
+ }
+ return c;
+ } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
+ c = iter.nextCodePoint();
+ pos += Character.charCount(c);
+ assert(c >= 0);
+ return c;
+ } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
+ pos != normalized.length()) {
+ // Reading from the normalized buffer.
+ c = normalized.codePointAt(pos);
+ pos += Character.charCount(c);
+ return c;
+ } else {
+ switchToForward();
+ }
+ }
+ }
+
+ @Override
+ public int previousCodePoint() {
+ int c;
+ for(;;) {
+ if(state == State.ITER_CHECK_BWD) {
+ c = iter.previous();
+ if(c < 0) {
+ start = pos = 0;
+ state = State.ITER_IN_FCD_SEGMENT;
+ return Collation.SENTINEL_CP;
+ }
+ if(CollationFCD.hasLccc(c)) {
+ // Possible FCD discontinuity; re-check the segment around c.
+ int prev = Collation.SENTINEL_CP;
+ if(CollationFCD.maybeTibetanCompositeVowel(c) ||
+ CollationFCD.hasTccc(prev = iter.previous())) {
+ iter.next();
+ if(prev >= 0) {
+ iter.next();
+ }
+ if(!previousSegment()) {
+ return Collation.SENTINEL_CP;
+ }
+ continue;
+ }
+ // hasLccc(trail)=true for all trail surrogates
+ if(isTrailSurrogate(c)) {
+ if(prev < 0) {
+ prev = iter.previous();
+ }
+ if(isLeadSurrogate(prev)) {
+ return Character.toCodePoint((char)prev, (char)c);
+ }
+ }
+ if(prev >= 0) {
+ iter.next();
+ }
+ }
+ return c;
+ } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
+ c = iter.previousCodePoint();
+ pos -= Character.charCount(c);
+ assert(c >= 0);
+ return c;
+ } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
+ // Reading backward from the normalized buffer.
+ c = normalized.codePointBefore(pos);
+ pos -= Character.charCount(c);
+ return c;
+ } else {
+ switchToBackward();
+ }
+ }
+ }
+
+ @Override
+ protected long handleNextCE32() {
+ int c;
+ for(;;) {
+ if(state == State.ITER_CHECK_FWD) {
+ c = iter.next();
+ if(c < 0) {
+ return NO_CP_AND_CE32;
+ }
+ if(CollationFCD.hasTccc(c)) {
+ if(CollationFCD.maybeTibetanCompositeVowel(c) ||
+ CollationFCD.hasLccc(iter.current())) {
+ iter.previous();
+ if(!nextSegment()) {
+ // NOTE(review): returns the bare CE32 without the code point
+ // encoded in the upper half; nextSegment() in this port always
+ // returns true, so this path appears unreachable — confirm.
+ c = Collation.SENTINEL_CP;
+ return Collation.FALLBACK_CE32;
+ }
+ continue;
+ }
+ }
+ break;
+ } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
+ c = iter.next();
+ ++pos;
+ assert(c >= 0);
+ break;
+ } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
+ pos != normalized.length()) {
+ c = normalized.charAt(pos++);
+ break;
+ } else {
+ switchToForward();
+ }
+ }
+ return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
+ }
+
+ @Override
+ protected char handleGetTrailSurrogate() {
+ // Fetches the trail surrogate that follows a lead surrogate already consumed
+ // by handleNextCE32(), from whichever source is currently active.
+ if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
+ int trail = iter.next();
+ if(isTrailSurrogate(trail)) {
+ if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; }
+ } else if(trail >= 0) {
+ iter.previous();
+ }
+ return (char)trail;
+ } else {
+ assert(pos < normalized.length());
+ char trail;
+ if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; }
+ return trail;
+ }
+ }
+
+ @Override
+ protected void forwardNumCodePoints(int num) {
+ // Specify the class to avoid a virtual-function indirection.
+ // In Java, we would declare this class final.
+ while(num > 0 && nextCodePoint() >= 0) {
+ --num;
+ }
+ }
+
+ @Override
+ protected void backwardNumCodePoints(int num) {
+ // Specify the class to avoid a virtual-function indirection.
+ // In Java, we would declare this class final.
+ while(num > 0 && previousCodePoint() >= 0) {
+ --num;
+ }
+ }
+
+ /**
+ * Switches to forward checking if possible.
+ */
+ private void switchToForward() {
+ assert(state == State.ITER_CHECK_BWD ||
+ (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
+ (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length()));
+ if(state == State.ITER_CHECK_BWD) {
+ // Turn around from backward checking.
+ start = pos = iter.getIndex();
+ if(pos == limit) {
+ state = State.ITER_CHECK_FWD; // Check forward.
+ } else { // pos < limit
+ state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
+ }
+ } else {
+ // Reached the end of the FCD segment.
+ if(state == State.ITER_IN_FCD_SEGMENT) {
+ // The input text segment is FCD, extend it forward.
+ } else {
+ // The input text segment needed to be normalized.
+ // Switch to checking forward from it.
+ if(state == State.IN_NORM_ITER_AT_START) {
+ iter.moveIndex(limit - start);
+ }
+ start = limit;
+ }
+ state = State.ITER_CHECK_FWD;
+ }
+ }
+
+ /**
+ * Extends the FCD text segment forward or normalizes around pos.
+ * @return true if success
+ */
+ private boolean nextSegment() {
+ assert(state == State.ITER_CHECK_FWD);
+ // The input text [start..(iter index)[ passes the FCD check.
+ pos = iter.getIndex();
+ // Collect the characters being checked, in case they need to be normalized.
+ if(s == null) {
+ s = new StringBuilder();
+ } else {
+ s.setLength(0);
+ }
+ int prevCC = 0;
+ for(;;) {
+ // Fetch the next character and its fcd16 value.
+ int c = iter.nextCodePoint();
+ if(c < 0) { break; }
+ int fcd16 = nfcImpl.getFCD16(c);
+ int leadCC = fcd16 >> 8;
+ if(leadCC == 0 && s.length() != 0) {
+ // FCD boundary before this character.
+ iter.previousCodePoint();
+ break;
+ }
+ s.appendCodePoint(c);
+ if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
+ // Fails FCD check. Find the next FCD boundary and normalize.
+ for(;;) {
+ c = iter.nextCodePoint();
+ if(c < 0) { break; }
+ if(nfcImpl.getFCD16(c) <= 0xff) {
+ iter.previousCodePoint();
+ break;
+ }
+ s.appendCodePoint(c);
+ }
+ normalize(s);
+ start = pos;
+ limit = pos + s.length();
+ state = State.IN_NORM_ITER_AT_LIMIT;
+ pos = 0;
+ return true;
+ }
+ prevCC = fcd16 & 0xff;
+ if(prevCC == 0) {
+ // FCD boundary after the last character.
+ break;
+ }
+ }
+ limit = pos + s.length();
+ assert(pos != limit);
+ iter.moveIndex(-s.length());
+ state = State.ITER_IN_FCD_SEGMENT;
+ return true;
+ }
+
+ /**
+ * Switches to backward checking.
+ */
+ private void switchToBackward() {
+ assert(state == State.ITER_CHECK_FWD ||
+ (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
+ (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
+ if(state == State.ITER_CHECK_FWD) {
+ // Turn around from forward checking.
+ limit = pos = iter.getIndex();
+ if(pos == start) {
+ state = State.ITER_CHECK_BWD; // Check backward.
+ } else { // pos > start
+ state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
+ }
+ } else {
+ // Reached the start of the FCD segment.
+ if(state == State.ITER_IN_FCD_SEGMENT) {
+ // The input text segment is FCD, extend it backward.
+ } else {
+ // The input text segment needed to be normalized.
+ // Switch to checking backward from it.
+ if(state == State.IN_NORM_ITER_AT_LIMIT) {
+ iter.moveIndex(start - limit);
+ }
+ limit = start;
+ }
+ state = State.ITER_CHECK_BWD;
+ }
+ }
+
+ /**
+ * Extends the FCD text segment backward or normalizes around pos.
+ * @return true if success
+ */
+ private boolean previousSegment() {
+ assert(state == State.ITER_CHECK_BWD);
+ // The input text [(iter index)..limit[ passes the FCD check.
+ pos = iter.getIndex();
+ // Collect the characters being checked, in case they need to be normalized.
+ if(s == null) {
+ s = new StringBuilder();
+ } else {
+ s.setLength(0);
+ }
+ int nextCC = 0;
+ for(;;) {
+ // Fetch the previous character and its fcd16 value.
+ int c = iter.previousCodePoint();
+ if(c < 0) { break; }
+ int fcd16 = nfcImpl.getFCD16(c);
+ int trailCC = fcd16 & 0xff;
+ if(trailCC == 0 && s.length() != 0) {
+ // FCD boundary after this character.
+ iter.nextCodePoint();
+ break;
+ }
+ s.appendCodePoint(c);
+ if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
+ CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
+ // Fails FCD check. Find the previous FCD boundary and normalize.
+ while(fcd16 > 0xff) {
+ c = iter.previousCodePoint();
+ if(c < 0) { break; }
+ fcd16 = nfcImpl.getFCD16(c);
+ if(fcd16 == 0) {
+ iter.nextCodePoint();
+ break;
+ }
+ s.appendCodePoint(c);
+ }
+ // s was collected backward; restore text order before normalizing.
+ s.reverse();
+ normalize(s);
+ limit = pos;
+ start = pos - s.length();
+ state = State.IN_NORM_ITER_AT_START;
+ pos = normalized.length();
+ return true;
+ }
+ nextCC = fcd16 >> 8;
+ if(nextCC == 0) {
+ // FCD boundary before the following character.
+ break;
+ }
+ }
+ start = pos - s.length();
+ assert(pos != start);
+ iter.moveIndex(s.length());
+ state = State.ITER_IN_FCD_SEGMENT;
+ return true;
+ }
+
+ private void normalize(CharSequence s) {
+ if(normalized == null) {
+ normalized = new StringBuilder();
+ }
+ // NFD without argument checking.
+ nfcImpl.decompose(s, normalized);
+ }
+
+ private enum State {
+ /**
+ * The input text [start..(iter index)[ passes the FCD check.
+ * Moving forward checks incrementally.
+ * pos & limit are undefined.
+ */
+ ITER_CHECK_FWD,
+ /**
+ * The input text [(iter index)..limit[ passes the FCD check.
+ * Moving backward checks incrementally.
+ * start & pos are undefined.
+ */
+ ITER_CHECK_BWD,
+ /**
+ * The input text [start..limit[ passes the FCD check.
+ * pos tracks the current text index.
+ */
+ ITER_IN_FCD_SEGMENT,
+ /**
+ * The input text [start..limit[ failed the FCD check and was normalized.
+ * pos tracks the current index in the normalized string.
+ * The text iterator is at the limit index.
+ */
+ IN_NORM_ITER_AT_LIMIT,
+ /**
+ * The input text [start..limit[ failed the FCD check and was normalized.
+ * pos tracks the current index in the normalized string.
+ * The text iterator is at the start index.
+ */
+ IN_NORM_ITER_AT_START
+ }
+
+ // Current mode of the FCD check state machine; see State docs above.
+ private State state;
+
+ // Boundaries of the current FCD segment / normalized buffer (meaning per State).
+ private int start;
+ private int pos;
+ private int limit;
+
+ private final Normalizer2Impl nfcImpl;
+ private StringBuilder s; // scratch buffer for characters under FCD check
+ private StringBuilder normalized; // NFD form of a failing segment
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2010-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp
+*
+* C++ version created on: 2010oct27
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.impl.Normalizer2Impl;
+
+/**
+ * Incrementally checks the input text for FCD and normalizes where necessary.
+ */
+public final class FCDUTF16CollationIterator extends UTF16CollationIterator {
+ /**
+ * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}.
+ */
+ public FCDUTF16CollationIterator(CollationData d) {
+ super(d);
+ // Cache the NFC implementation used for incremental FCD checks.
+ nfcImpl = d.nfcImpl;
+ }
+
+ // Full constructor: remembers the raw (un-normalized) text and starts in
+ // forward-checking mode (checkDir=1).
+ public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) {
+ super(data, numeric, s, p);
+ rawSeq = s;
+ segmentStart = p;
+ rawLimit = s.length();
+ nfcImpl = data.nfcImpl;
+ checkDir = 1;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ // Skip the UTF16CollationIterator and call its parent.
+ // NOTE(review): in Java, casting `this` does not change dynamic dispatch,
+ // so ((CollationIterator)this).equals(other) invokes the most-derived
+ // override — i.e. this very method — which looks like infinite recursion.
+ // The C++ original calls CollationIterator::operator== non-virtually;
+ // confirm the intended Java equivalent and fix if needed.
+ if(!((CollationIterator)this).equals(other)) { return false; }
+ FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other;
+ // Compare the iterator state but not the text: Assume that the caller does that.
+ if(checkDir != o.checkDir) { return false; }
+ if(checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) { return false; }
+ if(checkDir != 0 || seq == rawSeq) {
+ return (pos - rawStart) == (o.pos - /*o.*/ rawStart);
+ } else {
+ return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) &&
+ (pos - start) == (o.pos - o.start);
+ }
+ }
+
+ @Override
+ public void resetToOffset(int newOffset) {
+ reset();
+ // Go back to iterating the raw text and re-check FCD from the new offset.
+ seq = rawSeq;
+ start = segmentStart = pos = rawStart + newOffset;
+ limit = rawLimit;
+ checkDir = 1;
+ }
+
+ @Override
+ public int getOffset() {
+ // While checking, or while iterating the raw text, pos is a raw-text index.
+ if(checkDir != 0 || seq == rawSeq) {
+ return pos - rawStart;
+ }
+ // Inside a normalized buffer: report the nearest segment boundary.
+ return (pos == start) ? (segmentStart - rawStart) : (segmentLimit - rawStart);
+ }
+
+ @Override
+ public void setText(boolean numeric, CharSequence s, int p) {
+ super.setText(numeric, s, p);
+ // Track the raw text separately from the (possibly normalized) seq,
+ // and restart forward FCD checking.
+ rawSeq = s;
+ segmentStart = p;
+ rawLimit = limit = s.length();
+ checkDir = 1;
+ }
+
+ @Override
+ public int nextCodePoint() {
+ char c;
+ for(;;) {
+ if(checkDir > 0) {
+ if(pos == limit) {
+ return Collation.SENTINEL_CP;
+ }
+ c = seq.charAt(pos++);
+ if(CollationFCD.hasTccc(c)) {
+ // Possible FCD discontinuity: re-check/normalize this segment,
+ // then re-read the character (seq/pos may now point into the
+ // normalized buffer).
+ if(CollationFCD.maybeTibetanCompositeVowel(c) ||
+ (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
+ --pos;
+ nextSegment();
+ c = seq.charAt(pos++);
+ }
+ }
+ break;
+ } else if(checkDir == 0 && pos != limit) {
+ // Inside an already-checked segment or normalized buffer.
+ c = seq.charAt(pos++);
+ break;
+ } else {
+ switchToForward();
+ }
+ }
+ // Combine a surrogate pair into a single code point.
+ char trail;
+ if(Character.isHighSurrogate(c) && pos != limit &&
+ Character.isLowSurrogate(trail = seq.charAt(pos))) {
+ ++pos;
+ return Character.toCodePoint(c, trail);
+ } else {
+ return c;
+ }
+ }
+
+ @Override
+ public int previousCodePoint() {
+ char c;
+ for(;;) {
+ if(checkDir < 0) {
+ if(pos == start) {
+ return Collation.SENTINEL_CP;
+ }
+ c = seq.charAt(--pos);
+ if(CollationFCD.hasLccc(c)) {
+ if(CollationFCD.maybeTibetanCompositeVowel(c) ||
+ (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1)))) {
+ ++pos;
+ previousSegment();
+ c = seq.charAt(--pos);
+ }
+ }
+ break;
+ } else if(checkDir == 0 && pos != start) {
+ c = seq.charAt(--pos);
+ break;
+ } else {
+ switchToBackward();
+ }
+ }
+ char lead;
+ if(Character.isLowSurrogate(c) && pos != start &&
+ Character.isHighSurrogate(lead = seq.charAt(pos - 1))) {
+ --pos;
+ return Character.toCodePoint(lead, c);
+ } else {
+ return c;
+ }
+ }
+
+ @Override
+ protected long handleNextCE32() {
+ char c;
+ for(;;) {
+ if(checkDir > 0) {
+ if(pos == limit) {
+ return NO_CP_AND_CE32;
+ }
+ c = seq.charAt(pos++);
+ if(CollationFCD.hasTccc(c)) {
+ if(CollationFCD.maybeTibetanCompositeVowel(c) ||
+ (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
+ --pos;
+ nextSegment();
+ c = seq.charAt(pos++);
+ }
+ }
+ break;
+ } else if(checkDir == 0 && pos != limit) {
+ c = seq.charAt(pos++);
+ break;
+ } else {
+ switchToForward();
+ }
+ }
+ return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c));
+ }
+
+ /* boolean foundNULTerminator(); */
+
+ @Override
+ protected void forwardNumCodePoints(int num) {
+ // Specify the class to avoid a virtual-function indirection.
+ // In Java, we would declare this class final.
+ while(num > 0 && nextCodePoint() >= 0) {
+ --num;
+ }
+ }
+
+ @Override
+ protected void backwardNumCodePoints(int num) {
+ // Specify the class to avoid a virtual-function indirection.
+ // In Java, we would declare this class final.
+ while(num > 0 && previousCodePoint() >= 0) {
+ --num;
+ }
+ }
+
+ /**
+ * Switches to forward checking if possible.
+ * To be called when checkDir < 0 || (checkDir == 0 && pos == limit).
+ * Returns with checkDir > 0 || (checkDir == 0 && pos != limit).
+ */
+ private void switchToForward() {
+ assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit));
+ if(checkDir < 0) {
+ // Turn around from backward checking.
+ start = segmentStart = pos;
+ if(pos == segmentLimit) {
+ limit = rawLimit;
+ checkDir = 1; // Check forward.
+ } else { // pos < segmentLimit
+ checkDir = 0; // Stay in FCD segment.
+ }
+ } else {
+ // Reached the end of the FCD segment.
+ if(seq == rawSeq) {
+ // The input text segment is FCD, extend it forward.
+ } else {
+ // The input text segment needed to be normalized.
+ // Switch to checking forward from it.
+ seq = rawSeq;
+ pos = start = segmentStart = segmentLimit;
+ // Note: If this segment is at the end of the input text,
+ // then it might help to return false to indicate that, so that
+ // we do not have to re-check and normalize when we turn around and go backwards.
+ // However, that would complicate the call sites for an optimization of an unusual case.
+ }
+ limit = rawLimit;
+ checkDir = 1;
+ }
+ }
+
+ /**
+ * Extend the FCD text segment forward or normalize around pos.
+ * To be called when checkDir > 0 && pos != limit.
+ * Returns with checkDir == 0 and pos != limit.
+ */
+ private void nextSegment() {
+ assert(checkDir > 0 && seq == rawSeq && pos != limit);
+ // The input text [segmentStart..pos[ passes the FCD check.
+ int p = pos;
+ int prevCC = 0;
+ for(;;) {
+ // Fetch the next character's fcd16 value.
+ int q = p;
+ int c = Character.codePointAt(seq, p);
+ p += Character.charCount(c);
+ int fcd16 = nfcImpl.getFCD16(c);
+ int leadCC = fcd16 >> 8;
+ if(leadCC == 0 && q != pos) {
+ // FCD boundary before the [q, p[ character.
+ limit = segmentLimit = q;
+ break;
+ }
+ if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
+ // Fails FCD check. Find the next FCD boundary and normalize.
+ do {
+ q = p;
+ if(p == rawLimit) { break; }
+ c = Character.codePointAt(seq, p);
+ p += Character.charCount(c);
+ } while(nfcImpl.getFCD16(c) > 0xff);
+ normalize(pos, q);
+ pos = start;
+ break;
+ }
+ prevCC = fcd16 & 0xff;
+ if(p == rawLimit || prevCC == 0) {
+ // FCD boundary after the last character.
+ limit = segmentLimit = p;
+ break;
+ }
+ }
+ assert(pos != limit);
+ checkDir = 0;
+ }
+
+ /**
+ * Switches to backward checking.
+ * To be called when checkDir > 0 || (checkDir == 0 && pos == start).
+ * Returns with checkDir < 0 || (checkDir == 0 && pos != start).
+ */
+ private void switchToBackward() {
+ assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start));
+ if(checkDir > 0) {
+ // Turn around from forward checking.
+ limit = segmentLimit = pos;
+ if(pos == segmentStart) {
+ start = rawStart;
+ checkDir = -1; // Check backward.
+ } else { // pos > segmentStart
+ checkDir = 0; // Stay in FCD segment.
+ }
+ } else {
+ // Reached the start of the FCD segment.
+ if(seq == rawSeq) {
+ // The input text segment is FCD, extend it backward.
+ } else {
+ // The input text segment needed to be normalized.
+ // Switch to checking backward from it.
+ seq = rawSeq;
+ pos = limit = segmentLimit = segmentStart;
+ }
+ start = rawStart;
+ checkDir = -1;
+ }
+ }
+
+ /**
+ * Extend the FCD text segment backward or normalize around pos.
+ * To be called when checkDir < 0 && pos != start.
+ * Returns with checkDir == 0 and pos != start.
+ */
+ private void previousSegment() {
+ assert(checkDir < 0 && seq == rawSeq && pos != start);
+ // The input text [pos..segmentLimit[ passes the FCD check.
+ int p = pos;
+ int nextCC = 0;
+ for(;;) {
+ // Fetch the previous character's fcd16 value.
+ int q = p;
+ int c = Character.codePointBefore(seq, p);
+ p -= Character.charCount(c);
+ int fcd16 = nfcImpl.getFCD16(c);
+ int trailCC = fcd16 & 0xff;
+ if(trailCC == 0 && q != pos) {
+ // FCD boundary after the [p, q[ character.
+ start = segmentStart = q;
+ break;
+ }
+ if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
+ CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
+ // Fails FCD check. Find the previous FCD boundary and normalize.
+ do {
+ q = p;
+ if(fcd16 <= 0xff || p == rawStart) { break; }
+ c = Character.codePointBefore(seq, p);
+ p -= Character.charCount(c);
+ } while((fcd16 = nfcImpl.getFCD16(c)) != 0);
+ normalize(q, pos);
+ pos = limit;
+ break;
+ }
+ nextCC = fcd16 >> 8;
+ if(p == rawStart || nextCC == 0) {
+ // FCD boundary before the following character.
+ start = segmentStart = p;
+ break;
+ }
+ }
+ assert(pos != start);
+ checkDir = 0;
+ }
+
+ private void normalize(int from, int to) {
+ if(normalized == null) {
+ normalized = new StringBuilder();
+ }
+ // NFD without argument checking.
+ nfcImpl.decompose(rawSeq, from, to, normalized, to - from);
+ // Switch collation processing into the FCD buffer
+ // with the result of normalizing [segmentStart, segmentLimit[.
+ segmentStart = from;
+ segmentLimit = to;
+ seq = normalized;
+ start = 0;
+ limit = start + normalized.length();
+ }
+
+ // Text pointers: The input text is rawSeq[rawStart, rawLimit[.
+ // (In C++, these are const UChar * pointers.
+ // In Java, we use CharSequence rawSeq and the parent class' seq
+ // together with int indexes.)
+ //
+ // checkDir > 0:
+ //
+ // The input text rawSeq[segmentStart..pos[ passes the FCD check.
+ // Moving forward checks incrementally.
+ // segmentLimit is undefined. seq == rawSeq. limit == rawLimit.
+ //
+ // checkDir < 0:
+ // The input text rawSeq[pos..segmentLimit[ passes the FCD check.
+ // Moving backward checks incrementally.
+ // segmentStart is undefined. seq == rawSeq. start == rawStart.
+ //
+ // checkDir == 0:
+ //
+ // The input text rawSeq[segmentStart..segmentLimit[ is being processed.
+ // These pointers are at FCD boundaries.
+ // Either this text segment already passes the FCD check
+ // and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit,
+ // or the current segment had to be normalized so that
+ // rawSeq[segmentStart..segmentLimit[ turned into the normalized string,
+ // corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length().
+ private CharSequence rawSeq;
+ private static final int rawStart = 0;
+ private int segmentStart;
+ private int segmentLimit;
+ private int rawLimit;
+
+ private final Normalizer2Impl nfcImpl;
+ private StringBuilder normalized;
+ // Direction of incremental FCD check. See comments before rawStart.
+ private int checkDir;
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2012-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* IterCollationIterator.java, ported from uitercollationiterator.h/.cpp
+*
+* C++ version created on: 2012sep23 (from utf16collationiterator.h)
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import com.ibm.icu.text.UCharacterIterator;
+
+/**
+ * UCharIterator-based collation element and character iterator.
+ * Handles normalized text, with length or NUL-terminated.
+ * Unnormalized text is handled by a subclass.
+ */
+public class IterCollationIterator extends CollationIterator {
+ public IterCollationIterator(CollationData d, boolean numeric, UCharacterIterator ui) {
+ super(d, numeric);
+ iter = ui;
+ }
+
+ @Override
+ public void resetToOffset(int newOffset) {
+ reset();
+ iter.setIndex(newOffset);
+ }
+
+ @Override
+ public int getOffset() {
+ return iter.getIndex();
+ }
+
+ @Override
+ public int nextCodePoint() {
+ return iter.nextCodePoint();
+ }
+
+ @Override
+ public int previousCodePoint() {
+ return iter.previousCodePoint();
+ }
+
+ @Override
+ protected long handleNextCE32() {
+ int c = iter.next();
+ if(c < 0) {
+ return NO_CP_AND_CE32;
+ }
+ return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
+ }
+
+ @Override
+ protected char handleGetTrailSurrogate() {
+ int trail = iter.next();
+ if(!isTrailSurrogate(trail) && trail >= 0) { iter.previous(); }
+ return (char)trail;
+ }
+
+ @Override
+ protected void forwardNumCodePoints(int num) {
+ iter.moveCodePointIndex(num);
+ }
+
+ @Override
+ protected void backwardNumCodePoints(int num) {
+ iter.moveCodePointIndex(-num);
+ }
+
+ protected UCharacterIterator iter;
+}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* SharedObject.java, ported from sharedobject.h/.cpp
+*
+* C++ version created on: 2013dec19
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Base class for shared, reference-counted, auto-deleted objects.
+ * Java subclasses are mutable and must implement clone().
+ *
+ * <p>In C++, the SharedObject base class is used for both memory and ownership management.
+ * In Java, memory management (deletion after last reference is gone)
+ * is up to the garbage collector,
+ * but the reference counter is still used to see whether the referent is the sole owner.
+ *
+ * <p>Usage:
+ * <pre>
+ * class S extends SharedObject {
+ * public clone() { ... }
+ * }
+ *
+ * // Either use the nested class Reference (which costs an extra allocation),
+ * // or duplicate its code in the class that uses S
+ * // (which duplicates code and is more error-prone).
+ * class U {
+ * // For read-only access, use s.readOnly().
+ * // For writable access, use S ownedS = s.copyOnWrite();
+ * private SharedObject.Reference<S> s;
+ * // Returns a writable version of s.
+ * // If there is exactly one owner, then s itself is returned.
+ * // If there are multiple owners, then s is replaced with a clone,
+ * // and that is returned.
+ * private S getOwnedS() {
+ * return s.copyOnWrite();
+ * }
+ * public U clone() {
+ * ...
+ * c.s = s.clone();
+ * ...
+ * }
+ * }
+ *
+ * class V {
+ * // For read-only access, use s directly.
+ * // For writable access, use S ownedS = getOwnedS();
+ * private S s;
+ * // Returns a writable version of s.
+ * // If there is exactly one owner, then s itself is returned.
+ * // If there are multiple owners, then s is replaced with a clone,
+ * // and that is returned.
+ * private S getOwnedS() {
+ * if(s.getRefCount() > 1) {
+ * S ownedS = s.clone();
+ * s.removeRef();
+ * s = ownedS;
+ * ownedS.addRef();
+ * }
+ * return s;
+ * }
+ * public V clone() {
+ * ...
+ * s.addRef();
+ * ...
+ * }
+ * protected void finalize() {
+ * ...
+ * if(s != null) {
+ * s.removeRef();
+ * s = null;
+ * }
+ * ...
+ * }
+ * }
+ * </pre>
+ *
+ * Either use only Java memory management, or use addRef()/removeRef().
+ * Sharing requires reference-counting.
+ *
+ * TODO: Consider making this more widely available inside ICU,
+ * or else adopting a different model.
+ */
public class SharedObject implements Cloneable {
    /**
     * Similar to a smart pointer, basically a port of the static methods of C++ SharedObject.
     */
    public static final class Reference<T extends SharedObject> implements Cloneable {
        private T ref;

        /** Takes ownership of one reference to r (if r is not null). */
        public Reference(T r) {
            ref = r;
            if(r != null) {
                r.addRef();
            }
        }

        @SuppressWarnings("unchecked")
        @Override
        public Reference<T> clone() {
            Reference<T> copy;
            try {
                copy = (Reference<T>)super.clone();
            } catch (CloneNotSupportedException e) {
                // Unreachable: this class implements Cloneable.
                throw new RuntimeException(e);
            }
            // The clone shares the referent, which therefore gains one owner.
            if(ref != null) {
                ref.addRef();
            }
            return copy;
        }

        /** Returns the referent for read-only use; does not change its reference count. */
        public T readOnly() { return ref; }

        /**
         * Returns a writable version of the reference.
         * If there is exactly one owner, then the reference itself is returned.
         * If there are multiple owners, then the reference is replaced with a clone,
         * and that is returned.
         */
        public T copyOnWrite() {
            T referent = ref;
            if(referent.getRefCount() <= 1) { return referent; }
            @SuppressWarnings("unchecked")
            T cloned = (T)referent.clone();
            referent.removeRef();
            ref = cloned;
            cloned.addRef();
            return cloned;
        }

        /** Releases this Reference's count on the referent and forgets it. */
        public void clear() {
            if(ref != null) {
                ref.removeRef();
                ref = null;
            }
        }

        @Override
        protected void finalize() throws Throwable {
            super.finalize();
            clear();
        }
    }

    /** Initializes refCount to 0. */
    public SharedObject() {}

    /** Clones this object; the clone starts with its own refCount of 0. */
    @Override
    public SharedObject clone() {
        SharedObject copy;
        try {
            copy = (SharedObject)super.clone();
        } catch (CloneNotSupportedException e) {
            // Unreachable: this class implements Cloneable.
            throw new RuntimeException(e);
        }
        // The copy is not yet shared: give it a fresh counter instead of sharing ours.
        copy.refCount = new AtomicInteger();
        return copy;
    }

    /**
     * Increments the number of references to this object. Thread-safe.
     */
    public final void addRef() { refCount.incrementAndGet(); }

    /**
     * Decrements the number of references to this object,
     * and auto-deletes "this" if the number becomes 0. Thread-safe.
     * (In Java, actual deletion is left to the garbage collector.)
     */
    public final void removeRef() {
        refCount.decrementAndGet();
    }

    /**
     * Returns the reference counter. Uses a memory barrier.
     */
    public final int getRefCount() { return refCount.get(); }

    /** No-op in Java: deletion is up to the garbage collector. */
    public final void deleteIfZeroRefCount() {
    }

    private AtomicInteger refCount = new AtomicInteger();
}
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* TailoredSet.java, ported from collationsets.h/.cpp
+*
+* C++ version created on: 2013feb09
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+import java.util.Iterator;
+
+import com.ibm.icu.impl.Normalizer2Impl.Hangul;
+import com.ibm.icu.impl.Trie2;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.CharsTrie;
+import com.ibm.icu.util.CharsTrie.Entry;
+
+/**
+ * Finds the set of characters and strings that sort differently in the tailoring
+ * from the base data.
+ *
+ * Every mapping in the tailoring needs to be compared to the base,
+ * because some mappings are copied for optimization, and
+ * all contractions for a character are copied if any contractions for that character
+ * are added, modified or removed.
+ *
+ * It might be simpler to re-parse the rule string, but:
+ * - That would require duplicating some of the from-rules builder code.
+ * - That would make the runtime code depend on the builder.
+ * - That would only work if we have the rule string, and we allow users to
+ * omit the rule string from data files.
+ */
public final class TailoredSet {

    private CollationData data;
    private CollationData baseData;
    // Output: collects characters and strings that sort differently from the base.
    private UnicodeSet tailored;
    // Current prefix context, un-reversed (the data stores prefixes reversed).
    private StringBuilder unreversedPrefix = new StringBuilder();
    // Current contraction suffix context, or null when there is none.
    private String suffix;

    public TailoredSet(UnicodeSet t) {
        tailored = t;
    }

    /** Compares every mapping in d against its base data and collects the differences. */
    public void forData(CollationData d) {
        data = d;
        baseData = d.base;
        assert (baseData != null);
        // utrie2_enum(data->trie, NULL, enumTailoredRange, this);
        Iterator<Trie2.Range> trieIterator = data.trie.iterator();
        Trie2.Range range;
        // Stop at the first lead-surrogate pseudo-range; those are not code points.
        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
            enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this);
        }
    }

    // Per-range callback: skips ranges that fall back to the base data.
    private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) {
        if (ce32 == Collation.FALLBACK_CE32) {
            return; // fallback to base, not tailored
        }
        ts.handleCE32(start, end, ce32);
    }

    // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
    // Also, ICU4C requires handleCE32() to be public because it is used by the callback
    // function (enumTailoredRange()). This is not necessary for Java implementation.
    private void handleCE32(int start, int end, int ce32) {
        assert (ce32 != Collation.FALLBACK_CE32);
        if (Collation.isSpecialCE32(ce32)) {
            ce32 = data.getIndirectCE32(ce32);
            if (ce32 == Collation.FALLBACK_CE32) {
                return;
            }
        }
        // Compare each code point of the range individually against the base.
        do {
            int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start));
            // Do not just continue if ce32 == baseCE32 because
            // contractions and expansions in different data objects
            // normally differ even if they have the same data offsets.
            if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) {
                // fastpath
                if (ce32 != baseCE32) {
                    tailored.add(start);
                }
            } else {
                compare(start, ce32, baseCE32);
            }
        } while (++start <= end);
    }

    /**
     * Deep comparison of code point c's mapping (in the current prefix/suffix
     * context) against the corresponding base mapping.
     */
    private void compare(int c, int ce32, int baseCE32) {
        // Prefix contexts: compare in parallel when both sides have prefixes,
        // otherwise everything with a prefix on one side only is a difference.
        if (Collation.isPrefixCE32(ce32)) {
            int dataIndex = Collation.indexFromCE32(ce32);
            ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
            if (Collation.isPrefixCE32(baseCE32)) {
                int baseIndex = Collation.indexFromCE32(baseCE32);
                baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
                comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
            } else {
                addPrefixes(data, c, data.contexts, dataIndex + 2);
            }
        } else if (Collation.isPrefixCE32(baseCE32)) {
            int baseIndex = Collation.indexFromCE32(baseCE32);
            baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
            addPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
        }

        // Contraction (suffix) contexts, handled analogously to prefixes.
        if (Collation.isContractionCE32(ce32)) {
            int dataIndex = Collation.indexFromCE32(ce32);
            if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
                // No match on the single code point itself.
                ce32 = Collation.NO_CE32;
            } else {
                ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
            }
            if (Collation.isContractionCE32(baseCE32)) {
                int baseIndex = Collation.indexFromCE32(baseCE32);
                if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
                    baseCE32 = Collation.NO_CE32;
                } else {
                    baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
                }
                compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
            } else {
                addContractions(c, data.contexts, dataIndex + 2);
            }
        } else if (Collation.isContractionCE32(baseCE32)) {
            int baseIndex = Collation.indexFromCE32(baseCE32);
            baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
            addContractions(c, baseData.contexts, baseIndex + 2);
        }

        int tag;
        if (Collation.isSpecialCE32(ce32)) {
            tag = Collation.tagFromCE32(ce32);
            assert (tag != Collation.PREFIX_TAG);
            assert (tag != Collation.CONTRACTION_TAG);
            // Currently, the tailoring data builder does not write offset tags.
            // They might be useful for saving space,
            // but they would complicate the builder,
            // and in tailorings we assume that performance of tailored characters is more important.
            assert (tag != Collation.OFFSET_TAG);
        } else {
            tag = -1;
        }
        int baseTag;
        if (Collation.isSpecialCE32(baseCE32)) {
            baseTag = Collation.tagFromCE32(baseCE32);
            assert (baseTag != Collation.PREFIX_TAG);
            assert (baseTag != Collation.CONTRACTION_TAG);
        } else {
            baseTag = -1;
        }

        // Non-contextual mappings, expansions, etc.
        if (baseTag == Collation.OFFSET_TAG) {
            // We might be comparing a tailoring CE which is a copy of
            // a base offset-tag CE, via the [optimize [set]] syntax
            // or when a single-character mapping was copied for tailored contractions.
            // Offset tags always result in long-primary CEs,
            // with common secondary/tertiary weights.
            if (!Collation.isLongPrimaryCE32(ce32)) {
                add(c);
                return;
            }
            long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)];
            long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
            if (Collation.primaryFromLongPrimaryCE32(ce32) != p) {
                add(c);
                return;
            }
        }

        if (tag != baseTag) {
            add(c);
            return;
        }

        if (tag == Collation.EXPANSION32_TAG) {
            // Compare the CE32 expansion lists element by element.
            int length = Collation.lengthFromCE32(ce32);
            int baseLength = Collation.lengthFromCE32(baseCE32);

            if (length != baseLength) {
                add(c);
                return;
            }

            int idx0 = Collation.indexFromCE32(ce32);
            int idx1 = Collation.indexFromCE32(baseCE32);

            for (int i = 0; i < length; ++i) {
                if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) {
                    add(c);
                    break;
                }
            }
        } else if (tag == Collation.EXPANSION_TAG) {
            // Compare the 64-bit CE expansion lists element by element.
            int length = Collation.lengthFromCE32(ce32);
            int baseLength = Collation.lengthFromCE32(baseCE32);

            if (length != baseLength) {
                add(c);
                return;
            }

            int idx0 = Collation.indexFromCE32(ce32);
            int idx1 = Collation.indexFromCE32(baseCE32);

            for (int i = 0; i < length; ++i) {
                if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) {
                    add(c);
                    break;
                }
            }
        } else if (tag == Collation.HANGUL_TAG) {
            // A Hangul syllable differs if any of its conjoining Jamo are tailored.
            StringBuilder jamos = new StringBuilder();
            int length = Hangul.decompose(c, jamos);
            if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1))
                    || (length == 3 && tailored.contains(jamos.charAt(2)))) {
                add(c);
            }
        } else if (ce32 != baseCE32) {
            add(c);
        }
    }

    /** Merge-compares the prefix tries of tailoring and base for code point c. */
    private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
        // Parallel iteration over prefixes of both tables.
        CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
        CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator();
        String tp = null; // Tailoring prefix.
        String bp = null; // Base prefix.
        // Use a string with a U+FFFF as the limit sentinel.
        // U+FFFF is untailorable and will not occur in prefixes.
        String none = "\uffff";
        Entry te = null, be = null;
        for (;;) {
            if (tp == null) {
                if (prefixes.hasNext()) {
                    te = prefixes.next();
                    tp = te.chars.toString();
                } else {
                    te = null;
                    tp = none;
                }
            }
            if (bp == null) {
                if (basePrefixes.hasNext()) {
                    be = basePrefixes.next();
                    bp = be.chars.toString();
                } else {
                    be = null;
                    bp = none;
                }
            }
            if (tp == none && bp == none) {
                break;
            }
            int cmp = tp.compareTo(bp);
            if (cmp < 0) {
                // tp occurs in the tailoring but not in the base.
                assert (te != null);
                addPrefix(data, tp, c, te.value);
                te = null;
                tp = null;
            } else if (cmp > 0) {
                // bp occurs in the base but not in the tailoring.
                assert (be != null);
                addPrefix(baseData, bp, c, be.value);
                be = null;
                bp = null;
            } else {
                // Same prefix on both sides: compare the mappings in this context.
                setPrefix(tp);
                assert (te != null && be != null);
                compare(c, te.value, be.value);
                resetPrefix();
                te = be = null;
                tp = bp = null;
            }
        }
    }

    /** Merge-compares the contraction-suffix tries of tailoring and base for code point c. */
    private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
        // Parallel iteration over suffixes of both tables.
        CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
        CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator();
        String ts = null; // Tailoring suffix.
        String bs = null; // Base suffix.
        // Use a string with two U+FFFF as the limit sentinel.
        // U+FFFF is untailorable and will not occur in contractions except maybe
        // as a single suffix character for a root-collator boundary contraction.
        String none = "\uffff\uffff";
        Entry te = null, be = null;
        for (;;) {
            if (ts == null) {
                if (suffixes.hasNext()) {
                    te = suffixes.next();
                    ts = te.chars.toString();
                } else {
                    te = null;
                    ts = none;
                }
            }
            if (bs == null) {
                if (baseSuffixes.hasNext()) {
                    be = baseSuffixes.next();
                    bs = be.chars.toString();
                } else {
                    be = null;
                    bs = none;
                }
            }
            if (ts == none && bs == none) {
                break;
            }
            int cmp = ts.compareTo(bs);
            if (cmp < 0) {
                // ts occurs in the tailoring but not in the base.
                addSuffix(c, ts);
                te = null;
                ts = null;
            } else if (cmp > 0) {
                // bs occurs in the base but not in the tailoring.
                addSuffix(c, bs);
                be = null;
                bs = null;
            } else {
                // Same suffix on both sides: compare the mappings in this context.
                suffix = ts;
                compare(c, te.value, be.value);
                suffix = null;
                te = be = null;
                ts = bs = null;
            }
        }
    }

    /** Adds every prefix+c string from one side's prefix trie to the tailored set. */
    private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) {
        CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
        while (prefixes.hasNext()) {
            Entry e = prefixes.next();
            addPrefix(d, e.chars, c, e.value);
        }
    }

    /** Adds pfx+c (and its contractions, if any) to the tailored set. */
    private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) {
        setPrefix(pfx);
        ce32 = d.getFinalCE32(ce32);
        if (Collation.isContractionCE32(ce32)) {
            int idx = Collation.indexFromCE32(ce32);
            addContractions(c, d.contexts, idx + 2);
        }
        // appendCodePoint() mutates unreversedPrefix, but resetPrefix() clears it below.
        tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c)));
        resetPrefix();
    }

    /** Adds every c+suffix string from one side's contraction trie to the tailored set. */
    private void addContractions(int c, CharSequence p, int pidx) {
        CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
        while (suffixes.hasNext()) {
            Entry e = suffixes.next();
            addSuffix(c, e.chars);
        }
    }

    /** Adds prefix+c+sfx (with the current prefix context) to the tailored set. */
    private void addSuffix(int c, CharSequence sfx) {
        tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx));
    }

    /** Adds c, wrapped in the current prefix/suffix context if any, to the tailored set. */
    private void add(int c) {
        if (unreversedPrefix.length() == 0 && suffix == null) {
            tailored.add(c);
        } else {
            StringBuilder s = new StringBuilder(unreversedPrefix);
            s.appendCodePoint(c);
            if (suffix != null) {
                s.append(suffix);
            }
            tailored.add(s);
        }
    }

    // Prefixes are reversed in the data structure.
    private void setPrefix(CharSequence pfx) {
        unreversedPrefix.setLength(0);
        unreversedPrefix.append(pfx).reverse();
    }

    private void resetPrefix() {
        unreversedPrefix.setLength(0);
    }
}
+
--- /dev/null
+/*
+*******************************************************************************
+* Copyright (C) 2010-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* UTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp
+*
+* C++ version created on: 2010oct27
+* created by: Markus W. Scherer
+*/
+
+package com.ibm.icu.impl.coll;
+
+/**
+ * UTF-16 collation element and character iterator.
+ * Handles normalized UTF-16 text, with length or NUL-terminated.
+ * Unnormalized text is handled by a subclass.
+ */
+public class UTF16CollationIterator extends CollationIterator {
+ /**
+ * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}.
+ */
+ public UTF16CollationIterator(CollationData d) {
+ super(d);
+ }
+
+ public UTF16CollationIterator(CollationData d, boolean numeric, CharSequence s, int p) {
+ super(d, numeric);
+ seq = s;
+ start = 0;
+ pos = p;
+ limit = s.length();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if(!super.equals(other)) { return false; }
+ UTF16CollationIterator o = (UTF16CollationIterator)other;
+ // Compare the iterator state but not the text: Assume that the caller does that.
+ return (pos - start) == (o.pos - o.start);
+ }
+
+ @Override
+ public void resetToOffset(int newOffset) {
+ reset();
+ pos = start + newOffset;
+ }
+
+ @Override
+ public int getOffset() {
+ return pos - start;
+ }
+
+ public void setText(boolean numeric, CharSequence s, int p) {
+ reset(numeric);
+ seq = s;
+ start = 0;
+ pos = p;
+ limit = s.length();
+ }
+
+ @Override
+ public int nextCodePoint() {
+ if(pos == limit) {
+ return Collation.SENTINEL_CP;
+ }
+ char c = seq.charAt(pos++);
+ char trail;
+ if(Character.isHighSurrogate(c) && pos != limit &&
+ Character.isLowSurrogate(trail = seq.charAt(pos))) {
+ ++pos;
+ return Character.toCodePoint(c, trail);
+ } else {
+ return c;
+ }
+ }
+
+ @Override
+ public int previousCodePoint() {
+ if(pos == start) {
+ return Collation.SENTINEL_CP;
+ }
+ char c = seq.charAt(--pos);
+ char lead;
+ if(Character.isLowSurrogate(c) && pos != start &&
+ Character.isHighSurrogate(lead = seq.charAt(pos - 1))) {
+ --pos;
+ return Character.toCodePoint(lead, c);
+ } else {
+ return c;
+ }
+ }
+
+ @Override
+ protected long handleNextCE32() {
+ if(pos == limit) {
+ return NO_CP_AND_CE32;
+ }
+ char c = seq.charAt(pos++);
+ return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c));
+ }
+
+ @Override
+ protected char handleGetTrailSurrogate() {
+ if(pos == limit) { return 0; }
+ char trail;
+ if(Character.isLowSurrogate(trail = seq.charAt(pos))) { ++pos; }
+ return trail;
+ }
+
+ /* boolean foundNULTerminator(); */
+
+ @Override
+ protected void forwardNumCodePoints(int num) {
+ while(num > 0 && pos != limit) {
+ char c = seq.charAt(pos++);
+ --num;
+ if(Character.isHighSurrogate(c) && pos != limit &&
+ Character.isLowSurrogate(seq.charAt(pos))) {
+ ++pos;
+ }
+ }
+ }
+
+ @Override
+ protected void backwardNumCodePoints(int num) {
+ while(num > 0 && pos != start) {
+ char c = seq.charAt(--pos);
+ --num;
+ if(Character.isLowSurrogate(c) && pos != start &&
+ Character.isHighSurrogate(seq.charAt(pos-1))) {
+ --pos;
+ }
+ }
+ }
+
+ protected CharSequence seq;
+ protected int start;
+ protected int pos;
+ protected int limit;
+}
--- /dev/null
+/*
+ *******************************************************************************
+ * Copyright (C) 2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ *
+ * created on: 2014feb10
+ * created by: Markus W. Scherer
+ */
+package com.ibm.icu.impl.coll;
+
+// TODO: There must be a Java class for a growable array of ints without auto-boxing to Integer?!
+// Keep the API parallel to the C++ version for ease of porting. Port methods only as needed.
+// If & when we start using something else, we might keep this as a thin wrapper for porting.
public final class UVector32 {
    public UVector32() {}
    public boolean isEmpty() { return count == 0; }
    public int size() { return count; }
    public int elementAti(int i) { return elements[i]; }
    public int[] getBuffer() { return elements; }
    /** Appends e, growing the backing array if necessary. */
    public void addElement(int e) {
        growIfFull();
        elements[count++] = e;
    }
    public void setElementAt(int elem, int index) { elements[index] = elem; }
    /** Inserts elem at index, shifting the tail [index, size()) up by one slot. */
    public void insertElementAt(int elem, int index) {
        growIfFull();
        System.arraycopy(elements, index, elements, index + 1, count - index);
        elements[index] = elem;
        ++count;
    }
    /** Logically clears the vector; the backing array is kept for reuse. */
    public void removeAllElements() {
        count = 0;
    }

    // Grows the backing array when it is full:
    // aggressively (4x) while small (<= 64K), then by doubling.
    private void growIfFull() {
        if(count >= elements.length) {
            int newCapacity = elements.length <= 0xffff ? 4 * elements.length : 2 * elements.length;
            int[] grown = new int[newCapacity];
            System.arraycopy(elements, 0, grown, 0, count);
            elements = grown;
        }
    }
    private int[] elements = new int[32];
    private int count = 0;
}
--- /dev/null
+/*
+ *******************************************************************************
+ * Copyright (C) 2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ *
+ * created on: 2014feb10
+ * created by: Markus W. Scherer
+ */
+package com.ibm.icu.impl.coll;
+
+// TODO: There must be a Java class for a growable array of longs without auto-boxing to Long?!
+// Keep the API parallel to the C++ version for ease of porting. Port methods only as needed.
+// If & when we start using something else, we might keep this as a thin wrapper for porting.
public final class UVector64 {
    public UVector64() {}
    public boolean isEmpty() { return count == 0; }
    public int size() { return count; }
    public long elementAti(int i) { return elements[i]; }
    public long[] getBuffer() { return elements; }
    /** Appends e, growing the backing array if necessary. */
    public void addElement(long e) {
        growIfFull();
        elements[count++] = e;
    }
    public void setElementAt(long elem, int index) { elements[index] = elem; }
    /** Inserts elem at index, shifting the tail [index, size()) up by one slot. */
    public void insertElementAt(long elem, int index) {
        growIfFull();
        System.arraycopy(elements, index, elements, index + 1, count - index);
        elements[index] = elem;
        ++count;
    }
    /** Logically clears the vector; the backing array is kept for reuse. */
    public void removeAllElements() {
        count = 0;
    }

    // Grows the backing array when it is full:
    // aggressively (4x) while small (<= 64K), then by doubling.
    private void growIfFull() {
        if(count >= elements.length) {
            int newCapacity = elements.length <= 0xffff ? 4 * elements.length : 2 * elements.length;
            long[] grown = new long[newCapacity];
            System.arraycopy(elements, 0, grown, 0, count);
            elements = grown;
        }
    }
    private long[] elements = new long[32];
    private int count = 0;
}
package com.ibm.icu.text;
import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
try {
- collatorPrimaryOnly = (RuleBasedCollator) (collatorOriginal.clone());
+ collatorPrimaryOnly = (RuleBasedCollator) (collatorOriginal.cloneAsThawed());
} catch (Exception e) {
// should never happen
throw new IllegalStateException("Collator cannot be cloned", e);
collatorPrimaryOnly.setStrength(Collator.PRIMARY);
collatorPrimaryOnly.freeze();
- firstCharsInScripts = new ArrayList<String>(HACK_FIRST_CHARS_IN_SCRIPTS);
+ firstCharsInScripts = getFirstCharactersInScripts();
Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
- if (collatorPrimaryOnly.compare("\u4E00", "\u1112") <= 0 &&
- collatorPrimaryOnly.compare("\u1100", "\u4E00") <= 0) {
- // The standard Korean tailoring sorts Hanja (Han characters)
- // as secondary differences from Hangul syllables.
- // This makes U+4E00 not useful as a Han-script boundary.
- // TODO: This becomes obsolete when the root collator gets
- // reliable script-first-primary mappings.
- int hanIndex = Collections.binarySearch(
- firstCharsInScripts, "\u4E00", collatorPrimaryOnly);
- if (hanIndex >= 0) {
- firstCharsInScripts.remove(hanIndex);
- }
- }
// Guard against a degenerate collator where
// some script boundary strings are primary ignorable.
for (;;) {
}
}
- if (locale != null) {
+ // Chinese index characters, which are specific to each of the several Chinese tailorings,
+ // take precedence over the single locale data exemplar set per language.
+ if (!addChineseIndexCharacters() && locale != null) {
addIndexExemplars(locale);
}
}
if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
// Ignore a primary-ignorable or non-alphabetic index character.
} else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
- // Ignore an index characters that will land in the overflow bucket.
+ // Ignore an index character that will land in the overflow bucket.
} else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
// Ignore a multi-code point index character that does not sort distinctly
// from the sequence of its separate characters.
* but if they aren't available, we have to synthesize them.
*/
private void addIndexExemplars(ULocale locale) {
- // Chinese index characters, which are specific to each of the several Chinese tailorings,
- // take precedence over the single locale data exemplar set per language.
- final String language = locale.getLanguage();
- if (language.equals("zh") || language.equals("ja") || language.equals("ko")) {
- // TODO: This should be done regardless of the language, but it's expensive.
- // We should add a Collator function (can be @internal)
- // to enumerate just the contractions that start with a given code point or string.
- if (addChineseIndexCharacters()) {
- return;
- }
- }
-
UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
if (exemplars != null) {
initialLabels.addAll(exemplars);
private boolean addChineseIndexCharacters() {
UnicodeSet contractions = new UnicodeSet();
try {
- collatorPrimaryOnly.getContractionsAndExpansions(contractions, null, false);
+ // Collect only the contractions that start with BASE (the Han index base).
+ collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
} catch (Exception e) {
return false;
}
- String firstHanBoundary = null;
- boolean hasPinyin = false;
+ // No contractions starting with BASE: nothing Chinese-specific to add here.
+ if (contractions.isEmpty()) { return false; }
+ initialLabels.addAll(contractions);
for (String s : contractions) {
- if (s.startsWith(BASE)) {
- initialLabels.add(s);
- if (firstHanBoundary == null ||
- collatorPrimaryOnly.compare(s, firstHanBoundary) < 0) {
- firstHanBoundary = s;
- }
- char c = s.charAt(s.length() - 1);
- if ('A' <= c && c <= 'Z') {
- hasPinyin = true;
- }
- }
- }
- if (hasPinyin) {
- initialLabels.add('A', 'Z');
- }
- if (firstHanBoundary != null) {
- // The hardcoded list of script boundaries includes U+4E00
- // which is tailored to not be the first primary
- // in all Chinese tailorings except "unihan".
- // Replace U+4E00 with the first boundary string from the tailoring.
- // TODO: This becomes obsolete when the root collator gets
- // reliable script-first-primary mappings.
- int hanIndex = Collections.binarySearch(
- firstCharsInScripts, "\u4E00", collatorPrimaryOnly);
- if (hanIndex >= 0) {
- firstCharsInScripts.set(hanIndex, firstHanBoundary);
+ assert(s.startsWith(BASE));
+ char c = s.charAt(s.length() - 1);
+ if (0x41 <= c && c <= 0x5A) { // A-Z
+ // There are Pinyin labels, add ASCII A-Z labels as well.
+ initialLabels.add(0x41, 0x5A); // A-Z
+ break;
}
- return true;
- } else {
- return false;
}
+ return true;
}
/**
List<String> indexCharacters = initLabels();
// Variables for hasMultiplePrimaryWeights().
- CollationElementIterator cei = collatorPrimaryOnly.getCollationElementIterator("");
- int variableTop;
+ long variableTop;
if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
- variableTop = CollationElementIterator.primaryOrder(collatorPrimaryOnly.getVariableTop());
+ variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
} else {
variableTop = 0;
}
}
// Check for multiple primary weights.
if (!current.startsWith(BASE) &&
- hasMultiplePrimaryWeights(cei, variableTop, current) &&
+ hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
!current.endsWith("\uffff")) {
// "Æ" or "Sch" etc.
for (int i = bucketList.size() - 2;; --i) {
break;
}
if (singleBucket.displayBucket == null &&
- !hasMultiplePrimaryWeights(cei, variableTop, singleBucket.lowerBoundary)) {
+ !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
// Add an invisible bucket that redirects strings greater than the expansion
// to the previous single-character bucket.
// For example, after ... Q R S Sch we add Sch\uFFFF->S
}
private static boolean hasMultiplePrimaryWeights(
- CollationElementIterator cei, int variableTop, String s) {
- cei.setText(s);
+ RuleBasedCollator coll, long variableTop, String s) {
+ long[] ces = coll.internalGetCEs(s);
boolean seenPrimary = false;
- for (;;) {
- int ce32 = cei.next();
- if (ce32 == CollationElementIterator.NULLORDER) {
- break;
- }
- int p = CollationElementIterator.primaryOrder(ce32);
- if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
- // not primary ignorable, and not a continuation CE
+ for (int i = 0; i < ces.length; ++i) {
+ long ce = ces[i];
+ long p = ce >>> 32;
+ if (p > variableTop) {
+ // not primary ignorable
if (seenPrimary) {
return true;
}
return false;
}
- /**
- * This list contains one character per script that has the
- * lowest primary weight for that script in the root collator.
- * This list will be copied and sorted to account for script reordering.
- *
- * <p>TODO: This is fragile. If the first character of a script is tailored
- * so that it does not map to the script's lowest primary weight any more,
- * then the buckets will be off.
- * There are hacks in the code to handle the known CJK tailorings of U+4E00.
- *
- * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
- */
- private static final List<String> HACK_FIRST_CHARS_IN_SCRIPTS =
- Arrays.asList(new String[] {
- "A", "\u03B1", "\u2C81", "\u0430", "\u2C30", "\u10D0", "\u0561", "\u05D0", "\uD802\uDD00", "\u0800", "\u0621",
- "\u0710", // Syriac
- "\u0840", // Mandaic
- "\u0780", "\u07CA", "\u2D30", "\u1200", "\u0950", "\u0985", "\u0A74", "\u0AD0", "\u0B05", "\u0BD0",
- "\u0C05", "\u0C85", "\u0D05", "\u0D85",
- "\uAAF2", // Meetei Mayek
- "\uA800", "\uA882", "\uD804\uDC83",
- UCharacter.toString(0x111C4), // Sharada
- UCharacter.toString(0x11680), // Takri
- "\u1B83", // Sundanese
- "\uD804\uDC05", // Brahmi (U+11005)
- "\uD802\uDE00", "\u0E01",
- "\u0EDE", // Lao
- "\uAA80", "\u0F40", "\u1C00", "\uA840", "\u1900", "\u1700", "\u1720", "\u1740", "\u1760",
- "\u1A00", // Buginese
- "\u1BC0", // Batak
- "\uA930", "\uA90A", "\u1000",
- UCharacter.toString(0x11103), // Chakma
- "\u1780", "\u1950", "\u1980", "\u1A20", "\uAA00", "\u1B05", "\uA984", "\u1880", "\u1C5A", "\u13A0", "\u1401", "\u1681", "\u16A0", "\uD803\uDC00", "\uA500", "\uA6A0", "\u1100",
- "\u3041", "\u30A1", "\u3105", "\uA000", "\uA4F8",
- UCharacter.toString(0x16F00), // Miao
- "\uD800\uDE80", "\uD800\uDEA0", "\uD802\uDD20", "\uD800\uDF00", "\uD800\uDF30", "\uD801\uDC28", "\uD801\uDC50", "\uD801\uDC80",
- UCharacter.toString(0x110D0), // Sora Sompeng
- "\uD800\uDC00", "\uD802\uDC00", "\uD802\uDE60", "\uD802\uDF00", "\uD802\uDC40",
- "\uD802\uDF40", "\uD802\uDF60", "\uD800\uDF80", "\uD800\uDFA0", "\uD808\uDC00", "\uD80C\uDC00",
- UCharacter.toString(0x109A0), // Meroitic Cursive
- UCharacter.toString(0x10980), // Meroitic Hieroglyphs
- "\u4E00",
- // TODO: The overflow bucket's lowerBoundary string should be the
- // first item after the last reordering group in the collator's script order.
- // This should normally be the first Unicode code point
- // that is unassigned (U+0378 in Unicode 6.3) and untailored.
- // However, at least up to ICU 51 the Hani reordering group includes
- // unassigned code points,
- // and there is no stable string for the start of the trailing-weights range.
- // The only known string that sorts "high" is U+FFFF.
- // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
- // and fix relevant test code.
- // Ideally, FractionalUCA.txt will have a "script first primary"
- // for unassigned code points.
- "\uFFFF"
- });
+ // Single-bit masks over UCharacter.getType() values; OR-able into
+ // general-category sets, tested via (1 << getType(c)) & mask.
+ // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
+ private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
+ private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
+ private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
+ private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
+ private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
+ private static final int GC_L_MASK =
+ GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
+ private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;
/**
* Return a list of the first character in each script. Only exposed for testing.
* @deprecated This API is ICU internal, only for testing.
*/
@Deprecated
- public static Collection<String> getFirstCharactersInScripts() {
- return HACK_FIRST_CHARS_IN_SCRIPTS;
+ public List<String> getFirstCharactersInScripts() {
+ List<String> dest = new ArrayList<String>(200);
+ // Fetch the script-first-primary contractions which are defined in the root collator.
+ // They all start with U+FDD1.
+ UnicodeSet set = new UnicodeSet();
+ collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
+ if (set.isEmpty()) {
+ throw new UnsupportedOperationException(
+ "AlphabeticIndex requires script-first-primary contractions");
+ }
+ for (String boundary : set) {
+ // Each boundary string is U+FDD1 followed by the script's sample character;
+ // classify that sample character (code point at index 1).
+ int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
+ if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
+ // Ignore boundaries for the special reordering groups.
+ // Take only those for "real scripts" (where the sample character is a Letter)
+ // and the one for unassigned implicit weights (Cn).
+ continue;
+ }
+ dest.add(boundary);
+ }
+ return dest;
}
}
/**
*******************************************************************************
-* Copyright (C) 1996-2014, International Business Machines Corporation and *
-* others. All Rights Reserved. *
-*******************************************************************************
-*
-*
+* Copyright (C) 1996-2014, International Business Machines Corporation and
+* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
-/***
- * import java.text.StringCharacterIterator;
- * import java.text.CharacterIterator;
- */
import java.text.CharacterIterator;
-import java.util.MissingResourceException;
+import java.util.HashMap;
+import java.util.Map;
import com.ibm.icu.impl.CharacterIteratorWrapper;
-import com.ibm.icu.impl.ICUDebug;
-import com.ibm.icu.impl.Norm2AllModes;
-import com.ibm.icu.impl.Normalizer2Impl;
-import com.ibm.icu.impl.StringUCharacterIterator;
-import com.ibm.icu.impl.UCharacterProperty;
-import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.impl.coll.Collation;
+import com.ibm.icu.impl.coll.CollationData;
+import com.ibm.icu.impl.coll.CollationIterator;
+import com.ibm.icu.impl.coll.ContractionsAndExpansions;
+import com.ibm.icu.impl.coll.FCDIterCollationIterator;
+import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
+import com.ibm.icu.impl.coll.IterCollationIterator;
+import com.ibm.icu.impl.coll.UTF16CollationIterator;
+import com.ibm.icu.impl.coll.UVector32;
/**
* <p><code>CollationElementIterator</code> is an iterator created by
* a RuleBasedCollator to walk through a string. The return result of
- * each iteration is a 32-bit collation element that defines the
+ * each iteration is a 32-bit collation element (CE) that defines the
* ordering priority of the next character or sequence of characters
* in the source string.</p>
*
- * <p>For illustration, consider the following in Spanish:
+ * <p>For illustration, consider the following in Slovak and in traditional Spanish collation:
* <blockquote>
* <pre>
- * "ca" -> the first collation element is collation_element('c') and second
- * collation element is collation_element('a').
- *
- * Since "ch" in Spanish sorts as one entity, the below example returns one
- * collation element for the two characters 'c' and 'h'
- *
- * "cha" -> the first collation element is collation_element('ch') and second
- * collation element is collation_element('a').
+ * "ca" -> the first collation element is CE('c') and the second
+ * collation element is CE('a').
+ * "cha" -> the first collation element is CE('ch') and the second
+ * collation element is CE('a').
* </pre>
* </blockquote>
- * And in German,
+ * And in German phonebook collation,
* <blockquote>
* <pre>
* Since the character 'æ' is a composed character of 'a' and 'e', the
* </p>
*
* <p>For collation ordering comparison, the collation element results
- * can not be compared simply by using basic arithmetric operators,
+ * can not be compared simply by using basic arithmetic operators,
* e.g. <, == or >, further processing has to be done. Details
* can be found in the ICU
- * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
- * user guide</a>. An example of using the CollationElementIterator
+ * <a href="http://userguide.icu-project.org/collation/architecture">
+ * User Guide</a>. An example of using the CollationElementIterator
* for collation ordering comparison is the class
- * <a href=StringSearch.html> com.ibm.icu.text.StringSearch</a>.</p>
+ * {@link com.ibm.icu.text.StringSearch}.</p>
*
* <p>To construct a CollationElementIterator object, users
* call the method getCollationElementIterator() on a
*/
public final class CollationElementIterator
{
-
-
- // public data members --------------------------------------------------
+ private CollationIterator iter_; // owned
+ private RuleBasedCollator rbc_; // aliased
+ /**
+ * The second 32-bit half of a 64-bit CE that next()/previous() split in two;
+ * 0 when no second half is pending.
+ */
+ private int otherHalf_;
+ /**
+ * <0: backwards; 0: just after reset() (previous() begins from end);
+ * 1: just after setOffset(); >1: forward
+ */
+ private byte dir_;
+ /**
+ * Stores offsets from expansions and from unsafe-backwards iteration,
+ * so that getOffset() returns intermediate offsets for the CEs
+ * that are consistent with forward iteration.
+ */
+ private UVector32 offsets_;
+
+ private String string_; // TODO: needed in Java? if so, then add a UCharacterIterator field too?
+
/**
* <p>This constant is returned by the iterator in the methods
* @see #previous */
public static final int IGNORABLE = 0;
- // public methods -------------------------------------------------------
+ /**
+ * Return the primary order of the specified collation element,
+ * i.e. the first 16 bits. This value is unsigned.
+ * @param ce the collation element
+ * @return the element's 16 bits primary order.
+ * @stable ICU 2.8
+ */
+ public final static int primaryOrder(int ce) {
+ return (ce >>> 16) & 0xffff;
+ }
+
+ /**
+ * Return the secondary order of the specified collation element,
+ * i.e. the 16th to 23rd bits, inclusive. This value is unsigned.
+ * @param ce the collation element
+ * @return the element's 8 bits secondary order
+ * @stable ICU 2.8
+ */
+ public final static int secondaryOrder(int ce) {
+ return (ce >>> 8) & 0xff;
+ }
+
+ /**
+ * Return the tertiary order of the specified collation element, i.e. the last
+ * 8 bits. This value is unsigned.
+ * @param ce the collation element
+ * @return the element's 8 bits tertiary order
+ * @stable ICU 2.8
+ */
+ public final static int tertiaryOrder(int ce) {
+ return ce & 0xff;
+ }
+
+
+ // Builds the first old-style 32-bit CE from a 64-bit CE: the high 16 bits of
+ // the primary, the high byte of the secondary, and the high byte of the tertiary.
+ private static final int getFirstHalf(long p, int lower32) {
+ return ((int)p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
+ }
+
+ // Builds the second (continuation) 32-bit CE: the low 16 bits of the primary,
+ // the low byte of the secondary, and the low 6 bits of the tertiary.
+ private static final int getSecondHalf(long p, int lower32) {
+ return ((int)p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
+ }
+
+ // true if any of the bits that getSecondHalf() extracts are set,
+ // i.e. the 64-bit CE must be split into two 32-bit CEs.
+ private static final boolean ceNeedsTwoParts(long ce) {
+ return (ce & 0xffff00ff003fL) != 0;
+ }
+
+ // Partial initialization shared by the public constructors;
+ // iter_ stays null until setText() is called.
+ private CollationElementIterator(RuleBasedCollator collator) {
+ iter_ = null;
+ rbc_ = collator;
+ otherHalf_ = 0;
+ dir_ = 0;
+ offsets_ = null;
+ }
+
+ /**
+ * <p>CollationElementIterator constructor. This takes a source
+ * string and a RuleBasedCollator. The iterator will walk through
+ * the source string based on the rules defined by the
+ * collator. If the source string is empty, NULLORDER will be
+ * returned on the first call to next().</p>
+ *
+ * @param source the source string.
+ * @param collator the RuleBasedCollator
+ * @stable ICU 2.8
+ */
+ CollationElementIterator(String source, RuleBasedCollator collator) {
+ this(collator);
+ setText(source);
+ }
+ // Note: The constructors should take settings & tailoring, not a collator,
+ // to avoid circular dependencies.
+ // However, for equals() we would need to be able to compare tailoring data for equality
+ // without making CollationData or CollationTailoring depend on TailoredSet.
+ // (See the implementation of RuleBasedCollator.equals().)
+ // That might require creating an intermediate class that would be used
+ // by both CollationElementIterator and RuleBasedCollator
+ // but only contain the part of RBC.equals() related to data and rules.
+
+ /**
+ * <p>CollationElementIterator constructor. This takes a source
+ * character iterator and a RuleBasedCollator. The iterator will
+ * walk through the source string based on the rules defined by
+ * the collator. If the source string is empty, NULLORDER will be
+ * returned on the first call to next().</p>
+ *
+ * @param source the source string iterator.
+ * @param collator the RuleBasedCollator
+ * @stable ICU 2.8
+ */
+ CollationElementIterator(CharacterIterator source, RuleBasedCollator collator) {
+ this(collator);
+ setText(source);
+ }
- // public getters -------------------------------------------------------
+ /**
+ * <p>CollationElementIterator constructor. This takes a source
+ * character iterator and a RuleBasedCollator. The iterator will
+ * walk through the source string based on the rules defined by
+ * the collator. If the source string is empty, NULLORDER will be
+ * returned on the first call to next().</p>
+ *
+ * @param source the source string iterator.
+ * @param collator the RuleBasedCollator
+ * @stable ICU 2.8
+ */
+ CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator) {
+ this(collator);
+ setText(source);
+ }
/**
* <p>Returns the character offset in the source string
* next() or previous().
* @stable ICU 2.8
*/
- public int getOffset()
- {
- if (m_bufferOffset_ != -1) {
- if (m_isForwards_) {
- return m_FCDLimit_;
- }
- return m_FCDStart_;
- }
- return m_source_.getIndex();
- }
-
-
- /**
- * <p> Returns the maximum length of any expansion sequence that ends with
- * the specified collation element. If there is no expansion with this
- * collation element as the last element, returns 1.
- * </p>
- * @param ce a collation element returned by previous() or next().
- * @return the maximum length of any expansion sequence ending
- * with the specified collation element.
- * @stable ICU 2.8
- */
- public int getMaxExpansion(int ce)
- {
- int start = 0;
- int limit = m_collator_.m_expansionEndCE_.length;
- long unsignedce = ce & 0xFFFFFFFFl;
- while (start < limit - 1) {
- int mid = start + ((limit - start) >> 1);
- long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
- if (unsignedce <= midce) {
- limit = mid;
+ public int getOffset() {
+ // While iterating backwards, return offsets recorded by previousCE()
+ // so that they stay consistent with forward iteration (see offsets_).
+ if (dir_ < 0 && offsets_ != null && !offsets_.isEmpty()) {
+ // CollationIterator.previousCE() decrements the CEs length
+ // while it pops CEs from its internal buffer.
+ int i = iter_.getCEsLength();
+ if (otherHalf_ != 0) {
+ // Return the trailing CE offset while we are in the middle of a 64-bit CE.
+ ++i;
}
- else {
- start = mid;
- }
- }
- int result = 1;
- if (m_collator_.m_expansionEndCE_[start] == ce) {
- result = m_collator_.m_expansionEndCEMaxSize_[start];
+ assert (i < offsets_.size());
+ return offsets_.elementAti(i);
}
- else if (limit < m_collator_.m_expansionEndCE_.length &&
- m_collator_.m_expansionEndCE_[limit] == ce) {
- result = m_collator_.m_expansionEndCEMaxSize_[limit];
- }
- else if ((ce & 0xFFFF) == 0x00C0) {
- result = 2;
- }
- return result;
- }
-
- // public other methods -------------------------------------------------
-
- /**
- * <p> Resets the cursor to the beginning of the string. The next
- * call to next() or previous() will return the first and last
- * collation element in the string, respectively.</p>
- *
- * <p>If the RuleBasedCollator used by this iterator has had its
- * attributes changed, calling reset() will reinitialize the
- * iterator to use the new attributes.</p>
- *
- * @stable ICU 2.8
- */
- public void reset()
- {
- m_source_.setToStart();
- updateInternalState();
- m_direction = 0; // initial state
+ return iter_.getOffset();
}
/**
* iteration has been reached.
* @stable ICU 2.8
*/
- public int next()
- {
- assert m_direction >= 0;
- m_direction = 1;
-
- m_isForwards_ = true;
- if (m_CEBufferSize_ > 0) {
- if (m_CEBufferOffset_ < m_CEBufferSize_) {
- // if there are expansions left in the buffer, we return it
- return m_CEBuffer_[m_CEBufferOffset_ ++];
- }
- m_CEBufferSize_ = 0;
- m_CEBufferOffset_ = 0;
- }
-
- int result = NULLORDER;
- char ch = 0;
- do {
- int ch_int = nextChar();
- if (ch_int == UCharacterIterator.DONE) {
- return NULLORDER;
- }
- ch = (char)ch_int;
- if (m_collator_.m_isHiragana4_) {
- /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
- * based on whether the previous codepoint was Hiragana or Katakana.
- */
- m_isCodePointHiragana_ = (m_isCodePointHiragana_ && (ch >= 0x3099 && ch <= 0x309C)) ||
- ((ch >= 0x3040 && ch <= 0x309e) && !(ch > 0x3094 && ch < 0x309d));
- }
-
- if (ch <= 0xFF) {
- // For latin-1 characters we never need to fall back to the UCA
- // table because all of the UCA data is replicated in the
- // latinOneMapping array.
- // Except: Special CEs can result in CE_NOT_FOUND_,
- // for example if the default entry for a prefix-special is "not found",
- // and we do need to fall back to the UCA in such a case.
- // TODO: It would be better if tailoring specials never resulted in "not found"
- // unless the corresponding UCA result is also "not found".
- // That would require a change in the ICU4J collator-from-rule builder.
- result = m_collator_.m_trie_.getLatin1LinearValue(ch);
- } else {
- result = m_collator_.m_trie_.getLeadValue(ch);
- }
- if (!RuleBasedCollator.isSpecial(result)) {
- return result;
- }
- if (result != CE_NOT_FOUND_) {
- result = nextSpecial(m_collator_, result, ch);
- }
- if (result == CE_NOT_FOUND_) {
- // couldn't find a good CE in the tailoring
- if (RuleBasedCollator.UCA_ != null) {
- result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
- if (RuleBasedCollator.isSpecial(result)) {
- // UCA also gives us a special CE
- result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
- }
- }
- if(result == CE_NOT_FOUND_) {
- // maybe there is no UCA, unlikely in Java, but ported for consistency
- result = nextImplicit(ch);
- }
- }
- } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
-
- return result;
+ public int next() {
+ if (dir_ > 1) {
+ // Continue forward iteration. Test this first.
+ if (otherHalf_ != 0) {
+ int oh = otherHalf_;
+ otherHalf_ = 0;
+ return oh;
+ }
+ } else if (dir_ == 1) {
+ // next() after setOffset()
+ dir_ = 2;
+ } else if (dir_ == 0) {
+ // The iter_ is already reset to the start of the text.
+ dir_ = 2;
+ } else /* dir_ < 0 */{
+ // illegal change of direction
+ throw new IllegalStateException("Illegal change of direction");
+ // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
+ }
+ // No need to keep all CEs in the buffer when we iterate.
+ iter_.clearCEsIfNoneRemaining();
+ long ce = iter_.nextCE();
+ if (ce == Collation.NO_CE) {
+ return NULLORDER;
+ }
+ // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
+ long p = ce >>> 32;
+ int lower32 = (int) ce;
+ int firstHalf = getFirstHalf(p, lower32);
+ int secondHalf = getSecondHalf(p, lower32);
+ if (secondHalf != 0) {
+ // Save the second half for the following call to next();
+ // 0xc0 in the low bits marks it as a continuation CE.
+ otherHalf_ = secondHalf | 0xc0; // continuation CE
+ }
+ return firstHalf;
}
/**
* the iteration has been reached.
* @stable ICU 2.8
*/
- public int previous()
- {
- assert m_direction <= 0;
- m_direction = -1;
-
- if (m_source_.getIndex() <= 0 && m_isForwards_) {
- // if iterator is new or reset, we can immediate perform backwards
- // iteration even when the offset is not right.
- m_source_.setToLimit();
- updateInternalState();
- }
- m_isForwards_ = false;
- if (m_CEBufferSize_ > 0) {
- if (m_CEBufferOffset_ > 0) {
- return m_CEBuffer_[-- m_CEBufferOffset_];
- }
- m_CEBufferSize_ = 0;
- m_CEBufferOffset_ = 0;
- }
-
- int result = NULLORDER;
- char ch = 0;
- do {
- int ch_int = previousChar();
- if (ch_int == UCharacterIterator.DONE) {
- return NULLORDER;
- }
- ch = (char)ch_int;
- if (m_collator_.m_isHiragana4_) {
- m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
- }
- if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
- result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
- }
- else {
- if (ch <= 0xFF) {
- result = m_collator_.m_trie_.getLatin1LinearValue(ch);
- }
- else {
- result = m_collator_.m_trie_.getLeadValue(ch);
- }
- if (RuleBasedCollator.isSpecial(result)) {
- result = previousSpecial(m_collator_, result, ch);
- }
- if (result == CE_NOT_FOUND_) {
- if (!isBackwardsStart()
- && m_collator_.isContractionEnd(ch)) {
- result = CE_CONTRACTION_;
- }
- else {
- if(RuleBasedCollator.UCA_ != null) {
- result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
- }
- }
-
- if (RuleBasedCollator.isSpecial(result)) {
- if(RuleBasedCollator.UCA_ != null) {
- result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
- }
- }
- }
- }
- } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
- if(result == CE_NOT_FOUND_) {
- result = previousImplicit(ch);
- }
- return result;
- }
-
- /**
- * Return the primary order of the specified collation element,
- * i.e. the first 16 bits. This value is unsigned.
- * @param ce the collation element
- * @return the element's 16 bits primary order.
- * @stable ICU 2.8
- */
- public final static int primaryOrder(int ce)
- {
- return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
- >>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
- }
- /**
- * Return the secondary order of the specified collation element,
- * i.e. the 16th to 23th bits, inclusive. This value is unsigned.
- * @param ce the collation element
- * @return the element's 8 bits secondary order
- * @stable ICU 2.8
- */
- public final static int secondaryOrder(int ce)
- {
- return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
- >> RuleBasedCollator.CE_SECONDARY_SHIFT_;
+ public int previous() {
+ if (dir_ < 0) {
+ // Continue backwards iteration. Test this first.
+ if (otherHalf_ != 0) {
+ int oh = otherHalf_;
+ otherHalf_ = 0;
+ return oh;
+ }
+ } else if (dir_ == 0) {
+ iter_.resetToOffset(string_.length());
+ dir_ = -1;
+ } else if (dir_ == 1) {
+ // previous() after setOffset()
+ dir_ = -1;
+ } else /* dir_ > 1 */{
+ // illegal change of direction
+ throw new IllegalStateException("Illegal change of direction");
+ // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
+ }
+ if (offsets_ == null) {
+ offsets_ = new UVector32();
+ }
+ // If we already have expansion CEs, then we also have offsets.
+ // Otherwise remember the trailing offset in case we need to
+ // write offsets for an artificial expansion.
+ int limitOffset = iter_.getCEsLength() == 0 ? iter_.getOffset() : 0;
+ long ce = iter_.previousCE(offsets_);
+ if (ce == Collation.NO_CE) {
+ return NULLORDER;
+ }
+ // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
+ long p = ce >>> 32;
+ int lower32 = (int) ce;
+ int firstHalf = getFirstHalf(p, lower32);
+ int secondHalf = getSecondHalf(p, lower32);
+ // Backward iteration returns the trailing (second) half first;
+ // the leading half is saved in otherHalf_ for the next call.
+ if (secondHalf != 0) {
+ if (offsets_.isEmpty()) {
+ // When we convert a single 64-bit CE into two 32-bit CEs,
+ // we need to make this artificial expansion behave like a normal expansion.
+ // See CollationIterator.previousCE().
+ offsets_.addElement(iter_.getOffset());
+ offsets_.addElement(limitOffset);
+ }
+ otherHalf_ = firstHalf;
+ return secondHalf | 0xc0; // continuation CE
+ }
+ return firstHalf;
}
/**
- * Return the tertiary order of the specified collation element, i.e. the last
- * 8 bits. This value is unsigned.
- * @param ce the collation element
- * @return the element's 8 bits tertiary order
+ * <p> Resets the cursor to the beginning of the string. The next
+ * call to next() or previous() will return the first and last
+ * collation element in the string, respectively.</p>
+ *
+ * <p>If the RuleBasedCollator used by this iterator has had its
+ * attributes changed, calling reset() will reinitialize the
+ * iterator to use the new attributes.</p>
+ *
* @stable ICU 2.8
*/
- public final static int tertiaryOrder(int ce)
- {
- return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
+ public void reset() {
+ // Rewind to the start of the text and discard any pending second CE half.
+ iter_ .resetToOffset(0);
+ otherHalf_ = 0;
+ dir_ = 0;
+ }
/**
* iteration. The user must ensure that the offset is not in the
* middle of a decomposable range.</p>
*
- * @param offset the character offset into the original source string to
+ * @param newOffset the character offset into the original source string to
* set. Note that this is not an offset into the corresponding
* sequence of collation elements.
* @stable ICU 2.8
*/
- public void setOffset(int offset)
- {
- m_direction = 0; // reset to initial state
-
- m_source_.setIndex(offset);
- int ch_int = m_source_.current();
- char ch = (char)ch_int;
- if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
- // if it is unsafe we need to check if it is part of a contraction
- // or a surrogate character
- if (UTF16.isTrailSurrogate(ch)) {
- // if it is a surrogate pair we move up one character
- char prevch = (char)m_source_.previous();
- if (!UTF16.isLeadSurrogate(prevch)) {
- m_source_.setIndex(offset); // go back to the same index
+ public void setOffset(int newOffset) {
+ if (0 < newOffset && newOffset < string_.length()) {
+ int offset = newOffset;
+ do {
+ char c = string_.charAt(offset);
+ if (!rbc_.isUnsafe(c) ||
+ (Character.isHighSurrogate(c) && !rbc_.isUnsafe(string_.codePointAt(offset)))) {
+ break;
}
- }
- else {
- // could be part of a contraction
- // backup to a safe point and iterate till we pass offset
- while (m_source_.getIndex() > 0) {
- if (!m_collator_.isUnsafe(ch)) {
- break;
+ // Back up to before this unsafe character.
+ --offset;
+ } while (offset > 0);
+ if (offset < newOffset) {
+ // We might have backed up more than necessary.
+ // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
+ // but for text "chu" setOffset(2) should remain at 2
+ // although we initially back up to offset 0.
+ // Find the last safe offset no greater than newOffset by iterating forward.
+ int lastSafeOffset = offset;
+ do {
+ iter_.resetToOffset(lastSafeOffset);
+ do {
+ iter_.nextCE();
+ } while ((offset = iter_.getOffset()) == lastSafeOffset);
+ if (offset <= newOffset) {
+ lastSafeOffset = offset;
}
- ch = (char)m_source_.previous();
- }
- updateInternalState();
- int prevoffset = 0;
- while (m_source_.getIndex() <= offset) {
- prevoffset = m_source_.getIndex();
- next();
- }
- m_source_.setIndex(prevoffset);
+ } while (offset < newOffset);
+ newOffset = lastSafeOffset;
}
}
- updateInternalState();
- // direction code to prevent next and previous from returning a
- // character if we are already at the ends
- offset = m_source_.getIndex();
- if (offset == 0/* m_source_.getBeginIndex() */) {
- // preventing previous() from returning characters from the end of
- // the string again if we are at the beginning
- m_isForwards_ = false;
- }
- else if (offset == m_source_.getLength()) {
- // preventing next() from returning characters from the start of
- // the string again if we are at the end
- m_isForwards_ = true;
- }
+ iter_.resetToOffset(newOffset);
+ otherHalf_ = 0;
+ dir_ = 1;
}
/**
* @param source the new source string for iteration.
* @stable ICU 2.8
*/
- public void setText(String source)
- {
- m_srcUtilIter_.setText(source);
- m_source_ = m_srcUtilIter_;
- updateInternalState();
-
- m_direction = 0; // reset to initial state
+ public void setText(String source) {
+ string_ = source; // TODO: do we need to remember the source string in a field?
+ CollationIterator newIter;
+ boolean numeric = rbc_.settings.readOnly().isNumeric();
+ if (rbc_.settings.readOnly().dontCheckFCD()) {
+ newIter = new UTF16CollationIterator(rbc_.data, numeric, string_, 0);
+ } else {
+ newIter = new FCDUTF16CollationIterator(rbc_.data, numeric, string_, 0);
+ }
+ iter_ = newIter;
+ otherHalf_ = 0;
+ dir_ = 0;
}
-
+
/**
* <p>Set a new source string iterator for iteration, and reset the
* offset to the beginning of the text.
* @param source the new source string iterator for iteration.
* @stable ICU 2.8
*/
- public void setText(UCharacterIterator source)
- {
- m_srcUtilIter_.setText(source.getText());
- m_source_ = m_srcUtilIter_;
- updateInternalState();
-
- m_direction = 0; // reset to initial state
+ public void setText(UCharacterIterator source) {
+ string_ = source.getText(); // TODO: do we need to remember the source string in a field?
+ // Note: In C++, we just setText(source.getText()).
+ // In Java, we actually operate on a character iterator.
+ // (The old code apparently did so only for a CharacterIterator;
+ // for a UCharacterIterator it also just used source.getText()).
+ // TODO: do we need to remember the cloned iterator in a field?
+ UCharacterIterator src;
+ try {
+ src = (UCharacterIterator) source.clone();
+ } catch (CloneNotSupportedException e) {
+ // Fall back to ICU 52 behavior of iterating over the text contents
+ // of the UCharacterIterator.
+ setText(source.getText());
+ return;
+ }
+ src.setToStart();
+ CollationIterator newIter;
+ boolean numeric = rbc_.settings.readOnly().isNumeric();
+ if (rbc_.settings.readOnly().dontCheckFCD()) {
+ newIter = new IterCollationIterator(rbc_.data, numeric, src);
+ } else {
+ newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0);
+ }
+ iter_ = newIter;
+ otherHalf_ = 0;
+ dir_ = 0;
}
/**
* @param source the new source string iterator for iteration.
* @stable ICU 2.8
*/
- public void setText(CharacterIterator source)
- {
- m_source_ = new CharacterIteratorWrapper(source);
- m_source_.setToStart();
- updateInternalState();
+ public void setText(CharacterIterator source) {
+ // Note: In C++, we just setText(source.getText()).
+ // In Java, we actually operate on a character iterator.
+ // TODO: do we need to remember the iterator in a field?
+ // TODO: apparently we don't clone a CharacterIterator in Java,
+ // we only clone the text for a UCharacterIterator?? see the old code in the constructors
+ UCharacterIterator src = new CharacterIteratorWrapper(source);
+ src.setToStart();
+ string_ = src.getText(); // TODO: do we need to remember the source string in a field?
+ CollationIterator newIter;
+ boolean numeric = rbc_.settings.readOnly().isNumeric();
+ if (rbc_.settings.readOnly().dontCheckFCD()) {
+ newIter = new IterCollationIterator(rbc_.data, numeric, src);
+ } else {
+ newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0);
+ }
+ iter_ = newIter;
+ otherHalf_ = 0;
+ dir_ = 0;
+ }
+
+ // Java porting note: This method is @stable ICU 2.0 in ICU4C, but not available
+ // in ICU4J. For now, keep it package local.
+ /**
+ * Gets the comparison order in the desired strength. Ignore the other
+ * differences.
+ * @param order The order value
+ */
+ int strengthOrder(int order) {
+ int s = rbc_.settings.readOnly().getStrength();
+ // Mask off the unwanted differences.
+ if (s == Collator.PRIMARY) {
+ order &= 0xffff0000;
+ }
+ else if (s == Collator.SECONDARY) {
+ order &= 0xffffff00;
+ }
+
+ return order;
+ }
+
+
+ private static final class MaxExpSink implements ContractionsAndExpansions.CESink {
+ MaxExpSink(Map<Integer, Integer> h) {
+ maxExpansions = h;
+ }
+
+ // Java 6: @Override
+ public void handleCE(long ce) {
+ }
+
+ // Java 6: @Override
+ public void handleExpansion(long ces[], int start, int length) {
+ if (length <= 1) {
+ // We do not need to add single CEs into the map.
+ return;
+ }
+ int count = 0; // number of CE "halves"
+ for (int i = 0; i < length; ++i) {
+ count += ceNeedsTwoParts(ces[start + i]) ? 2 : 1;
+ }
+ // last "half" of the last CE
+ long ce = ces[start + length - 1];
+ long p = ce >>> 32;
+ int lower32 = (int) ce;
+ int lastHalf = getSecondHalf(p, lower32);
+ if (lastHalf == 0) {
+ lastHalf = getFirstHalf(p, lower32);
+ assert (lastHalf != 0);
+ } else {
+ lastHalf |= 0xc0; // old-style continuation CE
+ }
+ Integer oldCount = maxExpansions.get(lastHalf);
+ if (oldCount == null || count > oldCount) {
+ maxExpansions.put(lastHalf, count);
+ }
+ }
+
+ private Map<Integer, Integer> maxExpansions;
+ }
+
+ static final Map<Integer, Integer> computeMaxExpansions(CollationData data) {
+ Map<Integer, Integer> maxExpansions = new HashMap<Integer, Integer>();
+ MaxExpSink sink = new MaxExpSink(maxExpansions);
+ new ContractionsAndExpansions(null, null, sink, true).forData(data);
+ return maxExpansions;
+ }
+
+ /**
+ * <p> Returns the maximum length of any expansion sequence that ends with
+ * the specified collation element. If there is no expansion with this
+ * collation element as the last element, returns 1.
+ * </p>
+ * @param ce a collation element returned by previous() or next().
+ * @return the maximum length of any expansion sequence ending
+ * with the specified collation element.
+ * @stable ICU 2.8
+ */
+ public int getMaxExpansion(int ce) {
+ return getMaxExpansion(rbc_.tailoring.maxExpansions, ce);
+ }
- m_direction = 0; // reset to initial state
+ static int getMaxExpansion(Map<Integer, Integer> maxExpansions, int order) {
+ if (order == 0) {
+ return 1;
+ }
+ Integer max;
+ if (maxExpansions != null && (max = maxExpansions.get(order)) != null) {
+ return max;
+ }
+ if ((order & 0xc0) == 0xc0) {
+ // old-style continuation CE
+ return 2;
+ } else {
+ return 1;
+ }
}
- // public miscellaneous methods -----------------------------------------
+ /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
+ private byte normalizeDir() {
+ return dir_ == 1 ? 0 : dir_;
+ }
/**
* Tests that argument object is equals to this CollationElementIterator.
* CollationElementIterator
* @stable ICU 2.8
*/
- public boolean equals(Object that)
- {
+ public boolean equals(Object that) {
if (that == this) {
return true;
}
if (that instanceof CollationElementIterator) {
- CollationElementIterator thatceiter
- = (CollationElementIterator)that;
- if (!m_collator_.equals(thatceiter.m_collator_)) {
- return false;
- }
- // checks the text
- return m_source_.getIndex() == thatceiter.m_source_.getIndex()
- && m_source_.getText().equals(
- thatceiter.m_source_.getText());
+ CollationElementIterator thatceiter = (CollationElementIterator) that;
+ return rbc_.equals(thatceiter.rbc_)
+ && otherHalf_ == thatceiter.otherHalf_
+ && normalizeDir() == thatceiter.normalizeDir()
+ && string_.equals(thatceiter.string_)
+ && iter_.equals(thatceiter.iter_);
}
return false;
}
-
+
/**
* Mock implementation of hashCode(). This implementation always returns a constant
* value. When Java assertion is enabled, this method triggers an assertion failure.
* @internal
* @deprecated This API is ICU internal only.
*/
- @Deprecated
public int hashCode() {
assert false : "hashCode not designed";
return 42;
}
- // package private constructors ------------------------------------------
-
- private CollationElementIterator(RuleBasedCollator collator) {
- m_utilStringBuffer_ = new StringBuilder();
- m_collator_ = collator;
- m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
- m_buffer_ = new StringBuilder();
- m_utilSpecialBackUp_ = new Backup();
- }
-
/**
- * <p>CollationElementIterator constructor. This takes a source
- * string and a RuleBasedCollator. The iterator will walk through
- * the source string based on the rules defined by the
- * collator. If the source string is empty, NULLORDER will be
- * returned on the first call to next().</p>
- *
- * @param source the source string.
- * @param collator the RuleBasedCollator
- * @stable ICU 2.8
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- CollationElementIterator(String source, RuleBasedCollator collator)
- {
- this(collator);
- m_source_ = m_srcUtilIter_ = new StringUCharacterIterator(source);
- updateInternalState();
- }
-
- /**
- * <p>CollationElementIterator constructor. This takes a source
- * character iterator and a RuleBasedCollator. The iterator will
- * walk through the source string based on the rules defined by
- * the collator. If the source string is empty, NULLORDER will be
- * returned on the first call to next().</p>
- *
- * @param source the source string iterator.
- * @param collator the RuleBasedCollator
- * @stable ICU 2.8
- */
- CollationElementIterator(CharacterIterator source,
- RuleBasedCollator collator)
- {
- this(collator);
- m_srcUtilIter_ = new StringUCharacterIterator();
- m_source_ = new CharacterIteratorWrapper(source);
- updateInternalState();
- }
-
- /**
- * <p>CollationElementIterator constructor. This takes a source
- * character iterator and a RuleBasedCollator. The iterator will
- * walk through the source string based on the rules defined by
- * the collator. If the source string is empty, NULLORDER will be
- * returned on the first call to next().</p>
- *
- * @param source the source string iterator.
- * @param collator the RuleBasedCollator
- * @stable ICU 2.8
- */
- CollationElementIterator(UCharacterIterator source,
- RuleBasedCollator collator)
- {
- this(collator);
- m_srcUtilIter_ = new StringUCharacterIterator();
- m_srcUtilIter_.setText(source.getText());
- m_source_ = m_srcUtilIter_;
- updateInternalState();
- }
-
- // package private data members -----------------------------------------
-
- /**
- * true if current codepoint was Hiragana
- */
- boolean m_isCodePointHiragana_;
- /**
- * Position in the original string that starts with a non-FCD sequence
- */
- int m_FCDStart_;
- /**
- * This is the CE from CEs buffer that should be returned.
- * Initial value is 0.
- * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
- * backwards will end with m_CEBufferOffset_ == 0.
- * The next/previous after we reach the end/beginning of the m_CEBuffer_
- * will cause this value to be reset to 0.
- */
- int m_CEBufferOffset_;
-
- /**
- * This is the position to which we have stored processed CEs.
- * Initial value is 0.
- * The next/previous after we reach the end/beginning of the m_CEBuffer_
- * will cause this value to be reset to 0.
- */
- int m_CEBufferSize_;
- static final int CE_NOT_FOUND_ = 0xF0000000;
- static final int CE_EXPANSION_TAG_ = 1;
- static final int CE_CONTRACTION_TAG_ = 2;
- /**
- * Collate Digits As Numbers (CODAN) implementation
- */
- static final int CE_DIGIT_TAG_ = 13;
-
- // package private methods ----------------------------------------------
-
- /**
- * Sets the collator used.
- * Internal use, all data members will be reset to the default values
- * @param collator to set
- */
- void setCollator(RuleBasedCollator collator)
- {
- m_collator_ = collator;
- updateInternalState();
- }
-
- /**
- * <p>Sets the iterator to point to the collation element corresponding to
- * the specified character (the parameter is a CHARACTER offset in the
- * original string, not an offset into its corresponding sequence of
- * collation elements). The value returned by the next call to next()
- * will be the collation element corresponding to the specified position
- * in the text. Unlike the public method setOffset(int), this method does
- * not try to readjust the offset to the start of a contracting sequence.
- * getOffset() is guaranteed to return the same value as was passed to a
- * preceding call to setOffset().</p>
- * @param offset new character offset into the original text to set.
- */
- void setExactOffset(int offset)
- {
- m_source_.setIndex(offset);
- updateInternalState();
-
- m_direction = 0; // reset to initial state
- }
-
- /**
- * Checks if iterator is in the buffer zone
- * @return true if iterator is in buffer zone, false otherwise
- */
- boolean isInBuffer()
- {
- return m_bufferOffset_ > 0;
- }
-
-
- /**
- * <p>Sets the iterator to point to the collation element corresponding to
- * the specified character (the parameter is a CHARACTER offset in the
- * original string, not an offset into its corresponding sequence of
- * collation elements). The value returned by the next call to next()
- * will be the collation element corresponding to the specified position
- * in the text. Unlike the public method setOffset(int), this method does
- * not try to readjust the offset to the start of a contracting sequence.
- * getOffset() is guaranteed to return the same value as was passed to a
- * preceding call to setOffset().</p>
- * </p>
- * @param source the new source string iterator for iteration.
- * @param offset to the source
- */
- void setText(UCharacterIterator source, int offset)
- {
- m_srcUtilIter_.setText(source.getText());
- m_source_ = m_srcUtilIter_;
- m_source_.setIndex(offset);
- updateInternalState();
-
- m_direction = 0; // reset to initial state
- }
-
- // private inner class --------------------------------------------------
-
- /**
- * Backup data class
- */
- private static final class Backup
- {
- // protected data members -------------------------------------------
-
- /**
- * Backup non FCD sequence limit
- */
- protected int m_FCDLimit_;
- /**
- * Backup non FCD sequence start
- */
- protected int m_FCDStart_;
- /**
- * Backup if previous Codepoint is Hiragana quatenary
- */
- protected boolean m_isCodePointHiragana_;
- /**
- * Backup buffer position
- */
- protected int m_bufferOffset_;
- /**
- * Backup source iterator offset
- */
- protected int m_offset_;
- /**
- * Backup buffer contents
- */
- protected StringBuffer m_buffer_;
-
- // protected constructor --------------------------------------------
-
- /**
- * Empty constructor
- */
- protected Backup()
- {
- m_buffer_ = new StringBuffer();
- }
- }
- // end inner class ------------------------------------------------------
-
- /**
- * Direction of travel
- */
- private boolean m_isForwards_;
- /**
- * Source string iterator
- */
- private UCharacterIterator m_source_;
- /**
- * This is position to the m_buffer_, -1 if iterator is not in m_buffer_
- */
- private int m_bufferOffset_;
- /**
- * Buffer for temporary storage of normalized characters, discontiguous
- * characters and Thai characters
- */
- private StringBuilder m_buffer_;
- /**
- * Position in the original string to continue forward FCD check from.
- */
- private int m_FCDLimit_;
- /**
- * The collator this iterator is based on
- */
- private RuleBasedCollator m_collator_;
- /**
- * true if Hiragana quatenary is on
- */
- //private boolean m_isHiragana4_;
- /**
- * CE buffer
- */
- private int m_CEBuffer_[];
- /**
- * In reality we should not have to deal with expansion sequences longer
- * then 16. However this value can be change if a bigger buffer is needed.
- * Note, if the size is change to too small a number, BIG trouble.
- * Reasonable small value is around 10, if there's no Arabic or other
- * funky collations that have long expansion sequence. This is the longest
- * expansion sequence this can handle without bombing out.
- */
- private static final int CE_BUFFER_INIT_SIZE_ = 512;
- /**
- * Backup storage for special processing inner cases
- */
- private Backup m_utilSpecialBackUp_;
- /**
- * Backup storage in special processing entry state
- */
- private Backup m_utilSpecialEntryBackUp_;
- /**
- * Backup storage in special processing discontiguous state
- */
- private Backup m_utilSpecialDiscontiguousBackUp_;
- /**
- * Utility
- */
- private StringUCharacterIterator m_srcUtilIter_;
- private StringBuilder m_utilStringBuffer_;
- private StringBuilder m_utilSkippedBuffer_;
- private CollationElementIterator m_utilColEIter_;
- private static final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
- private StringBuilder m_unnormalized_;
- private Normalizer2Impl.ReorderingBuffer m_n2Buffer_;
- /**
- * The first non-zero combining class character
- */
- private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
- /**
- * One character before the first character with leading non-zero combining
- * class
- */
- private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
- /**
- * Mask for the last byte
- */
- private static final int LAST_BYTE_MASK_ = 0xFF;
- /**
- * Shift value for the second last byte
- */
- private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
-
- // special ce values and tags -------------------------------------------
-
-// private static final int CE_EXPANSION_ = 0xF1000000;
- private static final int CE_CONTRACTION_ = 0xF2000000;
- /**
- * Indicates the last ce has been consumed. Compare with NULLORDER.
- * NULLORDER is returned if error occurs.
- */
-/* private static final int CE_NO_MORE_CES_ = 0x00010101;
- private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
- private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
- private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
-*/
- private static final int CE_NOT_FOUND_TAG_ = 0;
- /**
- * Charset processing, not yet implemented
- */
- private static final int CE_CHARSET_TAG_ = 4;
- /**
- * AC00-D7AF
- */
- private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
- /**
- * D800-DBFF
- */
- private static final int CE_LEAD_SURROGATE_TAG_ = 7;
- /**
- * DC00-DFFF
- */
- private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
- /**
- * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
- */
- private static final int CE_CJK_IMPLICIT_TAG_ = 9;
- private static final int CE_IMPLICIT_TAG_ = 10;
- static final int CE_SPEC_PROC_TAG_ = 11;
- /**
- * This is a 3 byte primary with starting secondaries and tertiaries.
- * It fits in a single 32 bit CE and is used instead of expansion to save
- * space without affecting the performance (hopefully).
- */
- private static final int CE_LONG_PRIMARY_TAG_ = 12;
-
-// private static final int CE_CE_TAGS_COUNT = 14;
- private static final int CE_BYTE_COMMON_ = 0x05;
-
- // end special ce values and tags ---------------------------------------
-
- private static final int HANGUL_SBASE_ = 0xAC00;
- private static final int HANGUL_LBASE_ = 0x1100;
- private static final int HANGUL_VBASE_ = 0x1161;
- private static final int HANGUL_TBASE_ = 0x11A7;
- private static final int HANGUL_VCOUNT_ = 21;
- private static final int HANGUL_TCOUNT_ = 28;
-
- // CJK stuff ------------------------------------------------------------
-
-/* private static final int CJK_BASE_ = 0x4E00;
- private static final int CJK_LIMIT_ = 0x9FFF+1;
- private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
- private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
- private static final int CJK_A_BASE_ = 0x3400;
- private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
- private static final int CJK_B_BASE_ = 0x20000;
- private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
- private static final int NON_CJK_OFFSET_ = 0x110000;
-*/
- private static final boolean DEBUG = ICUDebug.enabled("collator");
-
- // Field tracking the current direction. This field was added
- // just for making sure that reset()/setOffset()/setText() is called
- // before switching the iterator direction.
- // We used to allow changing direction without calling reset()/setOffset()
- // setText() in ICU4J, but the API specification was updated to match the
- // ICU4C's specification. The current implementation seems to handle
- // direction change (or not), but it will be completely replaced with
- // a new implementation not allowing this. Until then, we use this field
- // to trigger assertion and make sure our implementation is not depending on
- // the assumption. See ticket#9104.
- private byte m_direction = 0; // -1: backward, 0: initial state, 1: forward
-
- // private methods ------------------------------------------------------
-
- /**
- * Reset the iterator internally
- */
- private void updateInternalState()
- {
- m_isCodePointHiragana_ = false;
- m_buffer_.setLength(0);
- m_bufferOffset_ = -1;
- m_CEBufferOffset_ = 0;
- m_CEBufferSize_ = 0;
- m_FCDLimit_ = -1;
- m_FCDStart_ = m_source_.getLength();
- //m_isHiragana4_ = m_collator_.m_isHiragana4_;
- m_isForwards_ = true;
- }
-
- /**
- * Backup the current internal state
- * @param backup object to store the data
- */
- private void backupInternalState(Backup backup)
- {
- backup.m_offset_ = m_source_.getIndex();
- backup.m_FCDLimit_ = m_FCDLimit_;
- backup.m_FCDStart_ = m_FCDStart_;
- backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
- backup.m_bufferOffset_ = m_bufferOffset_;
- backup.m_buffer_.setLength(0);
- if (m_bufferOffset_ >= 0) {
- backup.m_buffer_.append(m_buffer_);
- }
- }
-
- /**
- * Update the iterator internally with backed-up state
- * @param backup object that stored the data
- */
- private void updateInternalState(Backup backup)
- {
- m_source_.setIndex(backup.m_offset_);
- m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
- m_bufferOffset_ = backup.m_bufferOffset_;
- m_FCDLimit_ = backup.m_FCDLimit_;
- m_FCDStart_ = backup.m_FCDStart_;
- m_buffer_.setLength(0);
- if (m_bufferOffset_ >= 0) {
- m_buffer_.append(backup.m_buffer_);
- }
- }
-
- /**
- * A fast combining class retrieval system.
- * @param ch UTF16 character
- * @return combining class of ch
- */
- private int getCombiningClass(int ch)
- {
- if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
- m_collator_.isUnsafe((char)ch) || ch > 0xFFFF
- ) {
- return m_nfcImpl_.getCC(m_nfcImpl_.getNorm16(ch));
- }
- return 0;
- }
-
- /**
- * <p>Incremental normalization, this is an essential optimization.
- * Assuming FCD checks has been done, normalize the non-FCD characters into
- * the buffer.
- * Source offsets points to the current processing character.
- * </p>
- */
- private void normalize()
- {
- if (m_unnormalized_ == null) {
- m_unnormalized_ = new StringBuilder();
- m_n2Buffer_ = new Normalizer2Impl.ReorderingBuffer(m_nfcImpl_, m_buffer_, 10);
- } else {
- m_unnormalized_.setLength(0);
- m_n2Buffer_.remove();
- }
- int size = m_FCDLimit_ - m_FCDStart_;
- m_source_.setIndex(m_FCDStart_);
- for (int i = 0; i < size; i ++) {
- m_unnormalized_.append((char)m_source_.next());
- }
- m_nfcImpl_.decomposeShort(m_unnormalized_, 0, size, m_n2Buffer_);
- }
-
- /**
- * <p>Incremental FCD check and normalization. Gets the next base character
- * position and determines if the in-between characters needs normalization.
- * </p>
- * <p>When entering, the state is known to be this:
- * <ul>
- * <li>We are working on source string, not the buffer.
- * <li>The leading combining class from the current character is 0 or the
- * trailing combining class of the previous char was zero.
- * </ul>
- * Incoming source offsets points to the current processing character.
- * Return source offsets points to the current processing character.
- * </p>
- * @param ch current character (lead unit)
- * @param offset offset of ch +1
- * @return true if FCDCheck passes, false otherwise
- */
- private boolean FCDCheck(int ch, int offset)
- {
- boolean result = true;
-
- // Get the trailing combining class of the current character.
- // If it's zero, we are OK.
- m_FCDStart_ = offset - 1;
- m_source_.setIndex(offset);
- // trie access
- int fcd;
- if (ch < 0x180) {
- fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
- } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
- if (Character.isHighSurrogate((char)ch)) {
- int c2 = m_source_.next();
- if (c2 < 0) {
- fcd = 0; // end of input
- } else if (Character.isLowSurrogate((char)c2)) {
- fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint((char)ch, (char)c2));
- } else {
- m_source_.moveIndex(-1);
- fcd = 0;
- }
- } else {
- fcd = m_nfcImpl_.getFCD16FromNormData(ch);
- }
- } else {
- fcd = 0;
- }
-
- int prevTrailCC = fcd & LAST_BYTE_MASK_;
-
- if (prevTrailCC == 0) {
- offset = m_source_.getIndex();
- } else {
- // The current char has a non-zero trailing CC. Scan forward until
- // we find a char with a leading cc of zero.
- while (true) {
- ch = m_source_.nextCodePoint();
- if (ch < 0) {
- offset = m_source_.getIndex();
- break;
- }
- // trie access
- fcd = m_nfcImpl_.getFCD16(ch);
- int leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
- if (leadCC == 0) {
- // this is a base character, we stop the FCD checks
- offset = m_source_.getIndex() - Character.charCount(ch);
- break;
- }
-
- if (leadCC < prevTrailCC) {
- result = false;
- }
-
- prevTrailCC = fcd & LAST_BYTE_MASK_;
- }
- }
- m_FCDLimit_ = offset;
- m_source_.setIndex(m_FCDStart_ + 1);
- return result;
- }
-
- /**
- * <p>Method tries to fetch the next character that is in fcd form.</p>
- * <p>Normalization is done if required.</p>
- * <p>Offsets are returned at the next character.</p>
- * @return next fcd character
- */
- private int nextChar()
- {
- int result;
-
- // loop handles the next character whether it is in the buffer or not.
- if (m_bufferOffset_ < 0) {
- // we're working on the source and not normalizing. fast path.
- // note Thai pre-vowel reordering uses buffer too
- result = m_source_.next();
- }
- else {
- // we are in the buffer, buffer offset will never be 0 here
- if (m_bufferOffset_ >= m_buffer_.length()) {
- // Null marked end of buffer, revert to the source string and
- // loop back to top to try again to get a character.
- m_source_.setIndex(m_FCDLimit_);
- m_bufferOffset_ = -1;
- m_buffer_.setLength(0);
- return nextChar();
- }
- return m_buffer_.charAt(m_bufferOffset_ ++);
- }
- int startoffset = m_source_.getIndex();
- if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
- // Fast fcd safe path. trail combining class == 0.
- || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
- || m_bufferOffset_ >= 0 || m_FCDLimit_ >= startoffset) {
- // skip the fcd checks
- return result;
- }
-
- if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
- // We need to peek at the next character in order to tell if we are
- // FCD
- int next = m_source_.current();
- if (next == UCharacterIterator.DONE
- || next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
- return result; // end of source string and if next character
- // starts with a base character is always fcd.
- }
- }
-
- // Need a more complete FCD check and possible normalization.
- if (!FCDCheck(result, startoffset)) {
- normalize();
- result = m_buffer_.charAt(0);
- m_bufferOffset_ = 1;
- }
- return result;
- }
-
- /**
- * <p>Incremental normalization, this is an essential optimization.
- * Assuming FCD checks has been done, normalize the non-FCD characters into
- * the buffer.
- * Source offsets points to the current processing character.</p>
- */
- private void normalizeBackwards()
- {
- normalize();
- m_bufferOffset_ = m_buffer_.length();
- }
-
- /**
- * <p>Incremental backwards FCD check and normalization. Gets the previous
- * base character position and determines if the in-between characters
- * needs normalization.
- * </p>
- * <p>When entering, the state is known to be this:
- * <ul>
- * <li>We are working on source string, not the buffer.
- * <li>The trailing combining class from the current character is 0 or the
- * leading combining class of the next char was zero.
- * </ul>
- * Input source offsets points to the previous character.
- * Return source offsets points to the current processing character.
- * </p>
- * @param ch current character
- * @param offset current character offset
- * @return true if FCDCheck passes, false otherwise
- */
- private boolean FCDCheckBackwards(int ch, int offset)
- {
- int fcd;
- m_FCDLimit_ = offset + 1;
- m_source_.setIndex(offset);
- if (ch < 0x180) {
- fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
- } else if (!Character.isLowSurrogate((char)ch)) {
- if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
- fcd = m_nfcImpl_.getFCD16FromNormData(ch);
- } else {
- fcd = 0;
- }
- } else {
- int c2 = m_source_.previous();
- if (c2 < 0) {
- fcd = 0; // start of input
- } else if (Character.isHighSurrogate((char)c2)) {
- ch = Character.toCodePoint((char)c2, (char)ch);
- fcd = m_nfcImpl_.getFCD16FromNormData(ch);
- --offset;
- } else {
- m_source_.moveIndex(1);
- fcd = 0;
- }
- }
-
- // Scan backward until we find a char with a leading cc of zero.
- boolean result = true;
- if (fcd != 0) {
- int leadCC;
- for (;;) {
- leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
- if (leadCC == 0 || (ch = m_source_.previousCodePoint()) < 0) {
- offset = m_source_.getIndex();
- break;
- }
- fcd = m_nfcImpl_.getFCD16(ch);
- int prevTrailCC = fcd & LAST_BYTE_MASK_;
- if (leadCC < prevTrailCC) {
- result = false;
- } else if (fcd == 0) {
- offset = m_source_.getIndex() + Character.charCount(ch);
- break;
- }
- }
- }
-
- // storing character with 0 lead fcd or the 1st accent with a base
- // character before it
- m_FCDStart_ = offset;
- m_source_.setIndex(m_FCDLimit_);
- return result;
- }
-
- /**
- * <p>Method tries to fetch the previous character that is in fcd form.</p>
- * <p>Normalization is done if required.</p>
- * <p>Offsets are returned at the current character.</p>
- * @return previous fcd character
- */
- private int previousChar()
- {
- if (m_bufferOffset_ >= 0) {
- m_bufferOffset_ --;
- if (m_bufferOffset_ >= 0) {
- return m_buffer_.charAt(m_bufferOffset_);
- }
- else {
- // At the start of buffer, route back to string.
- m_buffer_.setLength(0);
- if (m_FCDStart_ == 0) {
- m_FCDStart_ = -1;
- m_source_.setIndex(0);
- return UCharacterIterator.DONE;
- }
- else {
- m_FCDLimit_ = m_FCDStart_;
- m_source_.setIndex(m_FCDStart_);
- return previousChar();
- }
- }
- }
- int result = m_source_.previous();
- int startoffset = m_source_.getIndex();
- if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
- || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
- || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
- return result;
- }
- int ch = m_source_.previous();
- if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
- // if previous character is FCD
- m_source_.next();
- return result;
- }
- // Need a more complete FCD check and possible normalization.
- if (!FCDCheckBackwards(result, startoffset)) {
- normalizeBackwards();
- m_bufferOffset_ --;
- result = m_buffer_.charAt(m_bufferOffset_);
- }
- else {
- // fcd checks always reset m_source_ to the limit of the FCD
- m_source_.setIndex(startoffset);
- }
- return result;
- }
-
- /**
- * Determines if it is at the start of source iteration
- * @return true if iterator at the start, false otherwise
- */
- private final boolean isBackwardsStart()
- {
- return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
- || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
- }
-
- /**
- * Checks if iterator is at the end of its source string.
- * @return true if it is at the end, false otherwise
- */
- private final boolean isEnd()
- {
- if (m_bufferOffset_ >= 0) {
- if (m_bufferOffset_ != m_buffer_.length()) {
- return false;
- }
- else {
- // at end of buffer. check if fcd is at the end
- return m_FCDLimit_ == m_source_.getLength();
- }
- }
- return m_source_.getLength() == m_source_.getIndex();
- }
-
- /**
- * <p>Special CE management for surrogates</p>
- * <p>Lead surrogate is encountered. CE to be retrieved by using the
- * following code unit. If the next code unit is a trail surrogate, both
- * units will be combined to retrieve the CE,
- * otherwise we treat it like an unassigned code point.</p>
- * @param collator collator to use
- * @param ce current CE
- * @param trail character
- * @return next CE for the surrogate characters
- */
- private final int nextSurrogate(RuleBasedCollator collator, int ce,
- char trail)
- {
- if (!UTF16.isTrailSurrogate(trail)) {
- updateInternalState(m_utilSpecialBackUp_);
- return CE_NOT_FOUND_;
- }
- // TODO: CE contain the data from the previous CE + the mask.
- // It should at least be unmasked
- int result = collator.m_trie_.getTrailValue(ce, trail);
- if (result == CE_NOT_FOUND_) {
- updateInternalState(m_utilSpecialBackUp_);
- }
- return result;
- }
-
- /**
- * Gets the CE expansion offset
- * @param collator current collator
- * @param ce ce to test
- * @return expansion offset
- */
- private int getExpansionOffset(RuleBasedCollator collator, int ce)
- {
- return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
- }
-
-
- /**
- * Gets the contraction ce offset
- * @param collator current collator
- * @param ce current ce
- * @return contraction offset
- */
- private int getContractionOffset(RuleBasedCollator collator, int ce)
- {
- return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
- }
-
- /**
- * Checks if CE is a special tag CE
- * @param ce to check
- * @return true if CE is a special tag CE, false otherwise
- */
- private boolean isSpecialPrefixTag(int ce)
- {
- return RuleBasedCollator.isSpecial(ce) &&
- RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
- }
-
- /**
- * <p>Special processing getting a CE that is preceded by a certain
- * prefix.</p>
- * <p>Used for optimizing Japanese length and iteration marks. When a
- * special processing tag is encountered, iterate backwards to see if
- * there's a match.</p>
- * <p>Contraction tables are used, prefix data is stored backwards in the
- * table.</p>
- * @param collator collator to use
- * @param ce current ce
- * @param entrybackup entry backup iterator status
- * @return next collation element
- */
- private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
- Backup entrybackup)
- {
- backupInternalState(m_utilSpecialBackUp_);
- updateInternalState(entrybackup);
- previousChar();
- // We want to look at the character where we entered
-
- while (true) {
- // This loop will run once per source string character, for as
- // long as we are matching a potential contraction sequence
- // First we position ourselves at the begining of contraction
- // sequence
- int entryoffset = getContractionOffset(collator, ce);
- int offset = entryoffset;
- if (isBackwardsStart()) {
- ce = collator.m_contractionCE_[offset];
- break;
- }
- char previous = (char)previousChar();
- while (previous > collator.m_contractionIndex_[offset]) {
- // contraction characters are ordered, skip smaller characters
- offset ++;
- }
-
- if (previous == collator.m_contractionIndex_[offset]) {
- // Found the source string char in the table.
- // Pick up the corresponding CE from the table.
- ce = collator.m_contractionCE_[offset];
- }
- else {
- // Source string char was not in the table, prefix not found
- ce = collator.m_contractionCE_[entryoffset];
- }
-
- if (!isSpecialPrefixTag(ce)) {
- // The source string char was in the contraction table, and
- // the corresponding CE is not a prefix CE. We found the
- // prefix, break out of loop, this CE will end up being
- // returned. This is the normal way out of prefix handling
- // when the source actually contained the prefix.
- break;
- }
- }
- if (ce != CE_NOT_FOUND_) {
- // we found something and we can merilly continue
- updateInternalState(m_utilSpecialBackUp_);
- }
- else { // prefix search was a failure, we have to backup all the way to
- // the start
- updateInternalState(entrybackup);
- }
- return ce;
- }
-
- /**
- * Checks if the ce is a contraction tag
- * @param ce ce to check
- * @return true if ce is a contraction tag, false otherwise
- */
- private boolean isContractionTag(int ce)
- {
- return RuleBasedCollator.isSpecial(ce) &&
- RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
- }
-
- /**
- * Method to copy skipped characters into the buffer and sets the fcd
- * position. To ensure that the skipped characters are considered later,
- * we need to place it in the appropriate position in the buffer and
- * reassign the source index. simple case if index reside in string,
- * simply copy to buffer and fcdposition = pos, pos = start of buffer.
- * if pos in normalization buffer, we'll insert the copy infront of pos
- * and point pos to the start of the buffer. why am i doing these copies?
- * well, so that the whole chunk of codes in the getNextCE,
- * ucol_prv_getSpecialCE does not require any changes, which will be
- * really painful.
- * @param skipped character buffer
- */
- private void setDiscontiguous(StringBuilder skipped)
- {
- if (m_bufferOffset_ >= 0) {
- m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
- }
- else {
- m_FCDLimit_ = m_source_.getIndex();
- m_buffer_.setLength(0);
- m_buffer_.append(skipped.toString());
- }
-
- m_bufferOffset_ = 0;
- }
-
- /**
- * Returns the current character for forward iteration
- * @return current character
- */
- private int currentChar()
- {
- if (m_bufferOffset_ < 0) {
- m_source_.previousCodePoint();
- return m_source_.nextCodePoint();
- }
-
- // m_bufferOffset_ is never 0 in normal circumstances except after a
- // discontiguous contraction since it is always returned and moved
- // by 1 when we do nextChar()
- return UTF16.charAt(m_buffer_, m_bufferOffset_ - 1);
- }
-
- /**
- * Method to get the discontiguous collation element within the source.
- * Note this function will set the position to the appropriate places.
- * Passed in character offset points to the second combining character
- * after the start character.
- * @param collator current collator used
- * @param entryoffset index to the start character in the contraction table
- * @return discontiguous collation element offset
- */
- private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
- {
- int offset = entryoffset;
- boolean multicontraction = false;
- // since it will be stuffed into this iterator and ran over again
- if (m_utilSkippedBuffer_ == null) {
- m_utilSkippedBuffer_ = new StringBuilder();
- }
- else {
- m_utilSkippedBuffer_.setLength(0);
- }
- int ch = currentChar();
- m_utilSkippedBuffer_.appendCodePoint(ch);
- int prevCC = 0;
- int cc = getCombiningClass(ch);
- // accent after the first character
- if (m_utilSpecialDiscontiguousBackUp_ == null) {
- m_utilSpecialDiscontiguousBackUp_ = new Backup();
- }
- backupInternalState(m_utilSpecialDiscontiguousBackUp_);
- boolean prevWasLead = false;
- while (true) {
- // We read code units for contraction table matching
- // but have to get combining classes for code points
- // to figure out where to stop with discontiguous contraction.
- int ch_int = nextChar();
- char nextch = (char)ch_int;
- if (UTF16.isSurrogate(nextch)) {
- if (prevWasLead) {
- // trail surrogate of surrogate pair, keep previous and current cc
- prevWasLead = false;
- } else {
- prevCC = cc;
- cc = 0; // default cc for an unpaired surrogate
- prevWasLead = false;
- if (Character.isHighSurrogate(nextch)) {
- int trail = nextChar();
- if (Character.isLowSurrogate((char)trail)) {
- cc = getCombiningClass(Character.toCodePoint(nextch, (char)trail));
- prevWasLead = true;
- }
- if (trail >= 0) {
- previousChar(); // restore index after having peeked at the next code unit
- }
- }
- }
- } else {
- prevCC = cc;
- cc = getCombiningClass(ch_int);
- prevWasLead = false;
- }
- if (ch_int < 0 || cc == 0) {
- // if there are no more accents to move around
- // we don't have to shift previousChar, since we are resetting
- // the offset later
- if (multicontraction) {
- if (ch_int >= 0) {
- previousChar(); // backtrack
- }
- setDiscontiguous(m_utilSkippedBuffer_);
- return collator.m_contractionCE_[offset];
- }
- break;
- }
-
- offset ++; // skip the combining class offset
- while ((offset < collator.m_contractionIndex_.length) &&
- (nextch > collator.m_contractionIndex_[offset])) {
- offset ++;
- }
-
- int ce = CE_NOT_FOUND_;
- if ( offset >= collator.m_contractionIndex_.length) {
- break;
- }
- if (nextch != collator.m_contractionIndex_[offset] || cc == prevCC) {
- // unmatched or blocked character
- if ( (m_utilSkippedBuffer_.length()!= 1) ||
- ((m_utilSkippedBuffer_.charAt(0)!= nextch) &&
- (m_bufferOffset_<0) )) { // avoid push to skipped buffer twice
- m_utilSkippedBuffer_.append(nextch);
- }
- offset = entryoffset; // Restore the offset before checking next character.
- continue;
- }
- else {
- ce = collator.m_contractionCE_[offset];
- }
-
- if (ce == CE_NOT_FOUND_) {
- break;
- }
- else if (isContractionTag(ce)) {
- // this is a multi-contraction
- offset = getContractionOffset(collator, ce);
- if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
- multicontraction = true;
- backupInternalState(m_utilSpecialDiscontiguousBackUp_);
- }
- }
- else {
- setDiscontiguous(m_utilSkippedBuffer_);
- return ce;
- }
- }
-
- updateInternalState(m_utilSpecialDiscontiguousBackUp_);
- // backup is one forward of the base character, we need to move back
- // one more
- previousChar();
- return collator.m_contractionCE_[entryoffset];
- }
-
- /**
- * Gets the next contraction ce
- * @param collator collator to use
- * @param ce current ce
- * @return ce of the next contraction
- */
- private int nextContraction(RuleBasedCollator collator, int ce)
- {
- backupInternalState(m_utilSpecialBackUp_);
- int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
- while (true) {
- int entryoffset = getContractionOffset(collator, ce);
- int offset = entryoffset;
-
- if (isEnd()) {
- ce = collator.m_contractionCE_[offset];
- if (ce == CE_NOT_FOUND_) {
- // back up the source over all the chars we scanned going
- // into this contraction.
- ce = entryce;
- updateInternalState(m_utilSpecialBackUp_);
- }
- break;
- }
-
- // get the discontiguos maximum combining class
- int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
- // checks if all characters have the same combining class
- byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
- char ch = (char)nextChar();
- offset ++;
- while (ch > collator.m_contractionIndex_[offset]) {
- // contraction characters are ordered, skip all smaller
- offset ++;
- }
-
- if (ch == collator.m_contractionIndex_[offset]) {
- // Found the source string char in the contraction table.
- // Pick up the corresponding CE from the table.
- ce = collator.m_contractionCE_[offset];
- }
- else {
- // Source string char was not in contraction table.
- // Unless it is a discontiguous contraction, we are done
- int miss = ch;
- // ticket 8484 - porting changes from C for 6101
- // We test whether the next two char are surrogate pairs.
- // This test is done if the iterator is not in the end.
- // If there is no surrogate pair, the iterator
- // goes back one if needed.
- if(UTF16.isLeadSurrogate(ch) && !isEnd()) {
- char surrNextChar = (char)nextChar();
- if (UTF16.isTrailSurrogate(surrNextChar)) {
- miss = UCharacterProperty.getRawSupplementary(ch, surrNextChar);
- } else {
- previousChar();
- }
- }
- int sCC;
- if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
- || sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
- isEnd()) {
- // Contraction can not be discontiguous, back up by one
- previousChar();
- if(miss > 0xFFFF) {
- previousChar();
- }
- ce = collator.m_contractionCE_[entryoffset];
- }
- else {
- // Contraction is possibly discontiguous.
- // find the next character if ch is not a base character
- int ch_int = nextChar();
- if (ch_int != UCharacterIterator.DONE) {
- previousChar();
- }
- char nextch = (char)ch_int;
- if (getCombiningClass(nextch) == 0) {
- previousChar();
- if(miss > 0xFFFF) {
- previousChar();
- }
- // base character not part of discontiguous contraction
- ce = collator.m_contractionCE_[entryoffset];
- }
- else {
- ce = nextDiscontiguous(collator, entryoffset);
- }
- }
- }
-
- if (ce == CE_NOT_FOUND_) {
- // source did not match the contraction, revert back original
- updateInternalState(m_utilSpecialBackUp_);
- ce = entryce;
- break;
- }
-
- // source was a contraction
- if (!isContractionTag(ce)) {
- break;
- }
-
- // ccontinue looping to check for the remaining contraction.
- if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
- // there are further contractions to be performed, so we store
- // the so-far completed ce, so that if we fail in the next
- // round we just return this one.
- entryce = collator.m_contractionCE_[entryoffset];
- backupInternalState(m_utilSpecialBackUp_);
- if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
- m_utilSpecialBackUp_.m_bufferOffset_ --;
- }
- else {
- m_utilSpecialBackUp_.m_offset_ --;
- }
- }
- }
- return ce;
- }
-
- /**
- * Gets the next ce for long primaries, stuffs the rest of the collation
- * elements into the ce buffer
- * @param ce current ce
- * @return next ce
- */
- private int nextLongPrimary(int ce)
- {
- m_CEBuffer_[1] = ((ce & 0xFF) << 24)
- | RuleBasedCollator.CE_CONTINUATION_MARKER_;
- m_CEBufferOffset_ = 1;
- m_CEBufferSize_ = 2;
- m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
- CE_BYTE_COMMON_;
- return m_CEBuffer_[0];
- }
-
- /**
- * Gets the number of expansion
- * @param ce current ce
- * @return number of expansion
- */
- private int getExpansionCount(int ce)
- {
- return ce & 0xF;
- }
-
- /**
- * Gets the next expansion ce and stuffs the rest of the collation elements
- * into the ce buffer
- * @param collator current collator
- * @param ce current ce
- * @return next expansion ce
- */
- private int nextExpansion(RuleBasedCollator collator, int ce)
- {
- // NOTE: we can encounter both continuations and expansions in an
- // expansion!
- // I have to decide where continuations are going to be dealt with
- int offset = getExpansionOffset(collator, ce);
- m_CEBufferSize_ = getExpansionCount(ce);
- m_CEBufferOffset_ = 1;
- m_CEBuffer_[0] = collator.m_expansion_[offset];
- if (m_CEBufferSize_ != 0) {
- // if there are less than 16 elements in expansion
- for (int i = 1; i < m_CEBufferSize_; i ++) {
- m_CEBuffer_[i] = collator.m_expansion_[offset + i];
- }
- }
- else {
- // ce are terminated
- m_CEBufferSize_ = 1;
- while (collator.m_expansion_[offset] != 0) {
- m_CEBuffer_[m_CEBufferSize_ ++] =
- collator.m_expansion_[++ offset];
- }
- }
- // in case of one element expansion, we
- // want to immediately return CEpos
- if (m_CEBufferSize_ == 1) {
- m_CEBufferSize_ = 0;
- m_CEBufferOffset_ = 0;
- }
- return m_CEBuffer_[0];
- }
-
- /**
- * Gets the next digit ce
- * @param collator current collator
- * @param ce current collation element
- * @param cp current codepoint
- * @return next digit ce
- */
- private int nextDigit(RuleBasedCollator collator, int ce, int cp)
- {
- // We do a check to see if we want to collate digits as numbers;
- // if so we generate a custom collation key. Otherwise we pull out
- // the value stored in the expansion table.
-
- if (m_collator_.m_isNumericCollation_){
- int collateVal = 0;
- int trailingZeroIndex = 0;
- boolean nonZeroValReached = false;
-
- // I just need a temporary place to store my generated CEs.
- // icu4c uses a unsigned byte array, i'll use a stringbuffer here
- // to avoid dealing with the sign problems and array allocation
- // clear and set initial string buffer length
- m_utilStringBuffer_.setLength(3);
-
- // We parse the source string until we hit a char that's NOT a
- // digit.
- // Use this u_charDigitValue. This might be slow because we have
- // to handle surrogates...
- int digVal = UCharacter.digit(cp);
- // if we have arrived here, we have already processed possible
- // supplementaries that trigered the digit tag -
- // all supplementaries are marked in the UCA.
- // We pad a zero in front of the first element anyways.
- // This takes care of the (probably) most common case where
- // people are sorting things followed by a single digit
- int digIndx = 1;
- for (;;) {
- // Make sure we have enough space.
- if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
- m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
- << 1);
- }
- // Skipping over leading zeroes.
- if (digVal != 0 || nonZeroValReached) {
- if (digVal != 0 && !nonZeroValReached) {
- nonZeroValReached = true;
- }
- // We parse the digit string into base 100 numbers
- // (this fits into a byte).
- // We only add to the buffer in twos, thus if we are
- // parsing an odd character, that serves as the
- // 'tens' digit while the if we are parsing an even
- // one, that is the 'ones' digit. We dumped the
- // parsed base 100 value (collateVal) into a buffer.
- // We multiply each collateVal by 2 (to give us room)
- // and add 5 (to avoid overlapping magic CE byte
- // values). The last byte we subtract 1 to ensure it is
- // less than all the other bytes.
- if (digIndx % 2 != 0) {
- collateVal += digVal;
- // This removes trailing zeroes.
- if (collateVal == 0 && trailingZeroIndex == 0) {
- trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
- }
- else if (trailingZeroIndex != 0) {
- trailingZeroIndex = 0;
- }
- m_utilStringBuffer_.setCharAt(
- ((digIndx - 1) >>> 1) + 2,
- (char)((collateVal << 1) + 6));
- collateVal = 0;
- }
- else {
- // We drop the collation value into the buffer so if
- // we need to do a "front patch" we don't have to
- // check to see if we're hitting the last element.
-
- collateVal = digVal * 10;
- if (collateVal == 0) {
- if (trailingZeroIndex != 0) {
- trailingZeroIndex = (digIndx >>> 1) + 2;
- }
- } else {
- trailingZeroIndex = 0;
- }
-
- m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
- (char)((collateVal << 1) + 6));
- }
- digIndx ++;
- }
-
- // Get next character.
- if (!isEnd()){
- backupInternalState(m_utilSpecialBackUp_);
- int char32 = nextChar();
- char ch = (char)char32;
- if (UTF16.isLeadSurrogate(ch)){
- if (!isEnd()) {
- char trail = (char)nextChar();
- if (UTF16.isTrailSurrogate(trail)) {
- char32 = UCharacterProperty.getRawSupplementary(
- ch, trail);
- }
- else {
- goBackOne();
- }
- }
- }
-
- digVal = UCharacter.digit(char32);
- if (digVal == -1) {
- // Resetting position to point to the next unprocessed
- // char. We overshot it when doing our test/set for
- // numbers.
- updateInternalState(m_utilSpecialBackUp_);
- break;
- }
- }
- else {
- break;
- }
- }
-
- if (nonZeroValReached == false){
- digIndx = 2;
- m_utilStringBuffer_.setCharAt(2, (char)6);
- }
-
- int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
- : (digIndx >>> 1) + 2;
-
- if (digIndx % 2 != 0){
- // We missed a value. Since digIndx isn't even, stuck too many
- // values into the buffer (this is what we get for padding the
- // first byte with a zero). "Front-patch" now by pushing all
- // nybbles forward.
- // Doing it this way ensures that at least 50% of the time
- // (statistically speaking) we'll only be doing a single pass
- // and optimizes for strings with single digits. I'm just
- // assuming that's the more common case.
- for (int i = 2; i < endIndex; i ++){
- m_utilStringBuffer_.setCharAt(i,
- (char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
- % 10) * 10)
- + (((m_utilStringBuffer_.charAt(i + 1) - 6)
- >>> 1) / 10) << 1) + 6));
- }
- -- digIndx;
- }
-
- // Subtract one off of the last byte.
- m_utilStringBuffer_.setCharAt(endIndex - 1,
- (char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
-
- // We want to skip over the first two slots in the buffer.
- // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
- // The second slot is for the sign/exponent byte:
- // 0x80 + (decimalPos/2) & 7f.
- m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
- m_utilStringBuffer_.setCharAt(1,
- (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
-
- // Now transfer the collation key to our collIterate struct.
- // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
- ce = (((m_utilStringBuffer_.charAt(0) << 8)
- // Primary weight
- | m_utilStringBuffer_.charAt(1))
- << RuleBasedCollator.CE_PRIMARY_SHIFT_)
- // Secondary weight
- | (RuleBasedCollator.BYTE_COMMON_
- << RuleBasedCollator.CE_SECONDARY_SHIFT_)
- | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
- int i = 2; // Reset the index into the buffer.
-
- m_CEBuffer_[0] = ce;
- m_CEBufferSize_ = 1;
- m_CEBufferOffset_ = 1;
- while (i < endIndex)
- {
- int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
- if (i < endIndex) {
- primWeight |= m_utilStringBuffer_.charAt(i ++);
- }
- m_CEBuffer_[m_CEBufferSize_ ++]
- = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
- | RuleBasedCollator.CE_CONTINUATION_MARKER_;
- }
- return ce;
- }
-
- // no numeric mode, we'll just switch to whatever we stashed and
- // continue
- // find the offset to expansion table
- return collator.m_expansion_[getExpansionOffset(collator, ce)];
- }
-
- /**
- * Gets the next implicit ce for codepoints
- * @param codepoint current codepoint
- * @return implicit ce
- */
- private int nextImplicit(int codepoint)
- {
- int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
- m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
- | 0x00000505;
- m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
- m_CEBufferOffset_ = 1;
- m_CEBufferSize_ = 2;
- return m_CEBuffer_[0];
- }
-
- /**
- * Returns the next ce associated with the following surrogate characters
- * @param ch current character
- * @return ce
- */
- private int nextSurrogate(char ch)
- {
- int ch_int = nextChar();
- char nextch = (char)ch_int;
- if (ch_int != CharacterIterator.DONE &&
- UTF16.isTrailSurrogate(nextch)) {
- int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
- return nextImplicit(codepoint);
- }
- if (nextch != CharacterIterator.DONE) {
- previousChar(); // reverts back to the original position
- }
- return CE_NOT_FOUND_; // treat like unassigned
- }
-
- /**
- * Returns the next ce for a hangul character, this is an implicit
- * calculation
- * @param collator current collator
- * @param ch current character
- * @return hangul ce
- */
- private int nextHangul(RuleBasedCollator collator, char ch)
- {
- char L = (char)(ch - HANGUL_SBASE_);
-
- // divide into pieces
- // do it in this order since some compilers can do % and / in one
- // operation
- char T = (char)(L % HANGUL_TCOUNT_);
- L /= HANGUL_TCOUNT_;
- char V = (char)(L % HANGUL_VCOUNT_);
- L /= HANGUL_VCOUNT_;
-
- // offset them
- L += HANGUL_LBASE_;
- V += HANGUL_VBASE_;
- T += HANGUL_TBASE_;
-
- // return the first CE, but first put the rest into the expansion
- // buffer
- m_CEBufferSize_ = 0;
- if (!m_collator_.m_isJamoSpecial_) { // FAST PATH
- m_CEBuffer_[m_CEBufferSize_ ++] =
- collator.m_trie_.getLeadValue(L);
- m_CEBuffer_[m_CEBufferSize_ ++] =
- collator.m_trie_.getLeadValue(V);
-
- if (T != HANGUL_TBASE_) {
- m_CEBuffer_[m_CEBufferSize_ ++] =
- collator.m_trie_.getLeadValue(T);
- }
- m_CEBufferOffset_ = 1;
- return m_CEBuffer_[0];
- }
- else {
- // Jamo is Special
- // Since Hanguls pass the FCD check, it is guaranteed that we
- // won't be in the normalization buffer if something like this
- // happens
- // Move Jamos into normalization buffer
- m_buffer_.append(L);
- m_buffer_.append(V);
- if (T != HANGUL_TBASE_) {
- m_buffer_.append(T);
- }
- m_bufferOffset_ = 0;
- m_FCDLimit_ = m_source_.getIndex();
- m_FCDStart_ = m_FCDLimit_ - 1;
- // Indicate where to continue in main input string after
- // exhausting the buffer
- return IGNORABLE;
- }
- }
-
- /**
- * <p>Special CE management. Expansions, contractions etc...</p>
- * @param collator can be plain UCA
- * @param ce current ce
- * @param ch current character
- * @return next special ce
- */
- private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
- {
- int codepoint = ch;
- Backup entrybackup = m_utilSpecialEntryBackUp_;
- // this is to handle recursive looping
- if (entrybackup != null) {
- m_utilSpecialEntryBackUp_ = null;
- }
- else {
- entrybackup = new Backup();
- }
- backupInternalState(entrybackup);
- try { // forces it to assign m_utilSpecialEntryBackup_
- while (true) {
- // This loop will repeat only in the case of contractions,
- // surrogate
- switch(RuleBasedCollator.getTag(ce)) {
- case CE_NOT_FOUND_TAG_:
- // impossible case for icu4j
- return ce;
- case RuleBasedCollator.CE_SURROGATE_TAG_:
- if (isEnd()) {
- return CE_NOT_FOUND_;
- }
- backupInternalState(m_utilSpecialBackUp_);
- char trail = (char)nextChar();
- ce = nextSurrogate(collator, ce, trail);
- // calculate the supplementary code point value,
- // if surrogate was not tailored we go one more round
- codepoint =
- UCharacterProperty.getRawSupplementary(ch, trail);
- break;
- case CE_SPEC_PROC_TAG_:
- ce = nextSpecialPrefix(collator, ce, entrybackup);
- break;
- case CE_CONTRACTION_TAG_:
- ce = nextContraction(collator, ce);
- break;
- case CE_LONG_PRIMARY_TAG_:
- return nextLongPrimary(ce);
- case CE_EXPANSION_TAG_:
- return nextExpansion(collator, ce);
- case CE_DIGIT_TAG_:
- ce = nextDigit(collator, ce, codepoint);
- break;
- // various implicits optimization
- case CE_CJK_IMPLICIT_TAG_:
- // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
- return nextImplicit(codepoint);
- case CE_IMPLICIT_TAG_: // everything that is not defined
- return nextImplicit(codepoint);
- case CE_TRAIL_SURROGATE_TAG_:
- return CE_NOT_FOUND_; // DC00-DFFF broken surrogate, treat like unassigned
- case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
- return nextSurrogate(ch);
- case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
- return nextHangul(collator, ch);
- case CE_CHARSET_TAG_:
- // not yet implemented probably after 1.8
- return CE_NOT_FOUND_;
- default:
- ce = IGNORABLE;
- // synwee todo, throw exception or something here.
- }
- if (!RuleBasedCollator.isSpecial(ce)) {
- break;
- }
- }
- }
- finally {
- m_utilSpecialEntryBackUp_ = entrybackup;
- }
- return ce;
- }
-
- /**
- * Special processing is getting a CE that is preceded by a certain prefix.
- * Currently this is only needed for optimizing Japanese length and
- * iteration marks. When we encouter a special processing tag, we go
- * backwards and try to see if we have a match. Contraction tables are used
- * - so the whole process is not unlike contraction. prefix data is stored
- * backwards in the table.
- * @param collator current collator
- * @param ce current ce
- * @return previous ce
- */
- private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
- {
- backupInternalState(m_utilSpecialBackUp_);
- while (true) {
- // position ourselves at the begining of contraction sequence
- int offset = getContractionOffset(collator, ce);
- int entryoffset = offset;
- if (isBackwardsStart()) {
- ce = collator.m_contractionCE_[offset];
- break;
- }
- char prevch = (char)previousChar();
- while (prevch > collator.m_contractionIndex_[offset]) {
- // since contraction codepoints are ordered, we skip all that
- // are smaller
- offset ++;
- }
- if (prevch == collator.m_contractionIndex_[offset]) {
- ce = collator.m_contractionCE_[offset];
- }
- else {
- // if there is a completely ignorable code point in the middle
- // of a prefix, we need to act as if it's not there assumption:
- // 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
- // zero)
- // lone surrogates cannot be set to zero as it would break
- // other processing
- int isZeroCE = collator.m_trie_.getLeadValue(prevch);
- // it's easy for BMP code points
- if (isZeroCE == 0) {
- continue;
- }
- else if (UTF16.isTrailSurrogate(prevch)
- || UTF16.isLeadSurrogate(prevch)) {
- // for supplementary code points, we have to check the next one
- // situations where we are going to ignore
- // 1. beginning of the string: schar is a lone surrogate
- // 2. schar is a lone surrogate
- // 3. schar is a trail surrogate in a valid surrogate
- // sequence that is explicitly set to zero.
- if (!isBackwardsStart()) {
- char lead = (char)previousChar();
- if (UTF16.isLeadSurrogate(lead)) {
- isZeroCE = collator.m_trie_.getLeadValue(lead);
- if (RuleBasedCollator.getTag(isZeroCE)
- == RuleBasedCollator.CE_SURROGATE_TAG_) {
- int finalCE = collator.m_trie_.getTrailValue(
- isZeroCE,
- prevch);
- if (finalCE == 0) {
- // this is a real, assigned completely
- // ignorable code point
- continue;
- }
- }
- }
- else {
- nextChar(); // revert to original offset
- // lone surrogate, completely ignorable
- continue;
- }
- nextChar(); // revert to original offset
- }
- else {
- // lone surrogate at the beggining, completely ignorable
- continue;
- }
- }
-
- // char was not in the table. prefix not found
- ce = collator.m_contractionCE_[entryoffset];
- }
-
- if (!isSpecialPrefixTag(ce)) {
- // char was in the contraction table, and the corresponding ce
- // is not a prefix ce. We found the prefix, break out of loop,
- // this ce will end up being returned.
- break;
- }
- }
- updateInternalState(m_utilSpecialBackUp_);
- return ce;
- }
-
- /**
- * Retrieves the previous contraction ce. To ensure that the backwards and
- * forwards iteration matches, we take the current region of most possible
- * match and pass it through the forward iteration. This will ensure that
- * the obstinate problem of overlapping contractions will not occur.
- * @param collator current collator
- * @param ce current ce
- * @param ch current character
- * @return previous contraction ce
- */
- private int previousContraction(RuleBasedCollator collator, int ce, char ch)
- {
- m_utilStringBuffer_.setLength(0);
- // since we might encounter normalized characters (from the thai
- // processing) we can't use peekCharacter() here.
- char prevch = (char)previousChar();
- boolean atStart = false;
- // TODO: address the comment above - maybe now we *can* use peekCharacter
- //while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
- while (collator.isUnsafe(ch)) {
- m_utilStringBuffer_.insert(0, ch);
- ch = prevch;
- if (isBackwardsStart()) {
- atStart = true;
- break;
- }
- prevch = (char)previousChar();
- }
- if (!atStart) {
- // undo the previousChar() if we didn't reach the beginning
- nextChar();
- }
- // adds the initial base character to the string
- m_utilStringBuffer_.insert(0, ch);
-
- // a new collation element iterator is used to simply things, since
- // using the current collation element iterator will mean that the
- // forward and backwards iteration will share and change the same
- // buffers. it is going to be painful.
- int originaldecomp = collator.getDecomposition();
- // for faster access, since string would have been normalized above
- collator.setDecomposition(Collator.NO_DECOMPOSITION);
- if (m_utilColEIter_ == null) {
- m_utilColEIter_ = new CollationElementIterator(
- m_utilStringBuffer_.toString(),
- collator);
- }
- else {
- m_utilColEIter_.m_collator_ = collator;
- m_utilColEIter_.setText(m_utilStringBuffer_.toString());
- }
- ce = m_utilColEIter_.next();
- m_CEBufferSize_ = 0;
- while (ce != NULLORDER) {
- if (m_CEBufferSize_ == m_CEBuffer_.length) {
- try {
- // increasing cebuffer size
- int tempbuffer[] = new int[m_CEBuffer_.length + 50];
- System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
- m_CEBuffer_.length);
- m_CEBuffer_ = tempbuffer;
- }
- catch( MissingResourceException e)
- {
- throw e;
- }
- catch (Exception e) {
- if(DEBUG){
- e.printStackTrace();
- }
- return NULLORDER;
- }
- }
- m_CEBuffer_[m_CEBufferSize_ ++] = ce;
- ce = m_utilColEIter_.next();
- }
- collator.setDecomposition(originaldecomp);
- m_CEBufferOffset_ = m_CEBufferSize_ - 1;
- return m_CEBuffer_[m_CEBufferOffset_];
- }
-
- /**
- * Returns the previous long primary ces
- * @param ce long primary ce
- * @return previous long primary ces
- */
- private int previousLongPrimary(int ce)
- {
- m_CEBufferSize_ = 0;
- m_CEBuffer_[m_CEBufferSize_ ++] =
- ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
- m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24)
- | RuleBasedCollator.CE_CONTINUATION_MARKER_;
- m_CEBufferOffset_ = m_CEBufferSize_ - 1;
- return m_CEBuffer_[m_CEBufferOffset_];
- }
-
- /**
- * Returns the previous expansion ces
- * @param collator current collator
- * @param ce current ce
- * @return previous expansion ce
- */
- private int previousExpansion(RuleBasedCollator collator, int ce)
- {
- // find the offset to expansion table
- int offset = getExpansionOffset(collator, ce);
- m_CEBufferSize_ = getExpansionCount(ce);
- if (m_CEBufferSize_ != 0) {
- // less than 16 elements in expansion
- for (int i = 0; i < m_CEBufferSize_; i ++) {
- m_CEBuffer_[i] = collator.m_expansion_[offset + i];
- }
-
- }
- else {
- // null terminated ces
- while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
- m_CEBuffer_[m_CEBufferSize_] =
- collator.m_expansion_[offset + m_CEBufferSize_];
- m_CEBufferSize_ ++;
- }
- }
- m_CEBufferOffset_ = m_CEBufferSize_ - 1;
- return m_CEBuffer_[m_CEBufferOffset_];
- }
-
- /**
- * Getting the digit collation elements
- * @param collator
- * @param ce current collation element
- * @param ch current code point
- * @return digit collation element
- */
- private int previousDigit(RuleBasedCollator collator, int ce, char ch)
- {
- // We do a check to see if we want to collate digits as numbers; if so we generate
- // a custom collation key. Otherwise we pull out the value stored in the expansion table.
- if (m_collator_.m_isNumericCollation_){
- int leadingZeroIndex = 0;
- int collateVal = 0;
- boolean nonZeroValReached = false;
-
- // clear and set initial string buffer length
- m_utilStringBuffer_.setLength(3);
-
- // We parse the source string until we hit a char that's NOT a digit
- // Use this u_charDigitValue. This might be slow because we have to
- // handle surrogates...
- int char32 = ch;
- if (UTF16.isTrailSurrogate(ch)) {
- if (!isBackwardsStart()){
- char lead = (char)previousChar();
- if (UTF16.isLeadSurrogate(lead)) {
- char32 = UCharacterProperty.getRawSupplementary(lead,
- ch);
- }
- else {
- goForwardOne();
- }
- }
- }
- int digVal = UCharacter.digit(char32);
- int digIndx = 0;
- for (;;) {
- // Make sure we have enough space.
- if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
- m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
- << 1);
- }
- // Skipping over "trailing" zeroes but we still add to digIndx.
- if (digVal != 0 || nonZeroValReached) {
- if (digVal != 0 && !nonZeroValReached) {
- nonZeroValReached = true;
- }
-
- // We parse the digit string into base 100 numbers (this
- // fits into a byte).
- // We only add to the buffer in twos, thus if we are
- // parsing an odd character, that serves as the 'tens'
- // digit while the if we are parsing an even one, that is
- // the 'ones' digit. We dumped the parsed base 100 value
- // (collateVal) into a buffer. We multiply each collateVal
- // by 2 (to give us room) and add 5 (to avoid overlapping
- // magic CE byte values). The last byte we subtract 1 to
- // ensure it is less than all the other bytes.
- // Since we're doing in this reverse we want to put the
- // first digit encountered into the ones place and the
- // second digit encountered into the tens place.
-
- if (digIndx % 2 != 0){
- collateVal += digVal * 10;
-
- // This removes leading zeroes.
- if (collateVal == 0 && leadingZeroIndex == 0) {
- leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
- }
- else if (leadingZeroIndex != 0) {
- leadingZeroIndex = 0;
- }
-
- m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
- (char)((collateVal << 1) + 6));
- collateVal = 0;
- }
- else {
- collateVal = digVal;
- }
- }
- digIndx ++;
-
- if (!isBackwardsStart()){
- backupInternalState(m_utilSpecialBackUp_);
- char32 = previousChar();
- if (UTF16.isTrailSurrogate(ch)){
- if (!isBackwardsStart()) {
- char lead = (char)previousChar();
- if (UTF16.isLeadSurrogate(lead)) {
- char32
- = UCharacterProperty.getRawSupplementary(
- lead, ch);
- }
- else {
- updateInternalState(m_utilSpecialBackUp_);
- }
- }
- }
-
- digVal = UCharacter.digit(char32);
- if (digVal == -1) {
- updateInternalState(m_utilSpecialBackUp_);
- break;
- }
- }
- else {
- break;
- }
- }
-
- if (nonZeroValReached == false) {
- digIndx = 2;
- m_utilStringBuffer_.setCharAt(2, (char)6);
- }
-
- if (digIndx % 2 != 0) {
- if (collateVal == 0 && leadingZeroIndex == 0) {
- // This removes the leading 0 in a odd number sequence of
- // numbers e.g. avery001
- leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
- }
- else {
- // this is not a leading 0, we add it in
- m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
- (char)((collateVal << 1) + 6));
- digIndx ++;
- }
- }
-
- int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
- : ((digIndx >>> 1) + 2) ;
- digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
- // Subtract one off of the last byte.
- // Really the first byte here, but it's reversed...
- m_utilStringBuffer_.setCharAt(2,
- (char)(m_utilStringBuffer_.charAt(2) - 1));
- // We want to skip over the first two slots in the buffer.
- // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
- // The second slot is for the sign/exponent byte:
- // 0x80 + (decimalPos/2) & 7f.
- m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
- m_utilStringBuffer_.setCharAt(1,
- (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
-
- // Now transfer the collation key to our collIterate struct.
- // The total size for our collation key is endIndx bumped up to the
- // next largest even value divided by two.
- m_CEBufferSize_ = 0;
- m_CEBuffer_[m_CEBufferSize_ ++]
- = (((m_utilStringBuffer_.charAt(0) << 8)
- // Primary weight
- | m_utilStringBuffer_.charAt(1))
- << RuleBasedCollator.CE_PRIMARY_SHIFT_)
- // Secondary weight
- | (RuleBasedCollator.BYTE_COMMON_
- << RuleBasedCollator.CE_SECONDARY_SHIFT_)
- // Tertiary weight.
- | RuleBasedCollator.BYTE_COMMON_;
- int i = endIndex - 1; // Reset the index into the buffer.
- while (i >= 2) {
- int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
- if (i >= 2) {
- primWeight |= m_utilStringBuffer_.charAt(i --);
- }
- m_CEBuffer_[m_CEBufferSize_ ++]
- = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
- | RuleBasedCollator.CE_CONTINUATION_MARKER_;
- }
- m_CEBufferOffset_ = m_CEBufferSize_ - 1;
- return m_CEBuffer_[m_CEBufferOffset_];
- }
- else {
- return collator.m_expansion_[getExpansionOffset(collator, ce)];
- }
- }
-
- /**
- * Returns previous hangul ces
- * @param collator current collator
- * @param ch current character
- * @return previous hangul ce
- */
- private int previousHangul(RuleBasedCollator collator, char ch)
- {
- char L = (char)(ch - HANGUL_SBASE_);
- // we do it in this order since some compilers can do % and / in one
- // operation
- char T = (char)(L % HANGUL_TCOUNT_);
- L /= HANGUL_TCOUNT_;
- char V = (char)(L % HANGUL_VCOUNT_);
- L /= HANGUL_VCOUNT_;
-
- // offset them
- L += HANGUL_LBASE_;
- V += HANGUL_VBASE_;
- T += HANGUL_TBASE_;
-
- m_CEBufferSize_ = 0;
- if (!m_collator_.m_isJamoSpecial_) {
- m_CEBuffer_[m_CEBufferSize_ ++] =
- collator.m_trie_.getLeadValue(L);
- m_CEBuffer_[m_CEBufferSize_ ++] =
- collator.m_trie_.getLeadValue(V);
- if (T != HANGUL_TBASE_) {
- m_CEBuffer_[m_CEBufferSize_ ++] =
- collator.m_trie_.getLeadValue(T);
- }
- m_CEBufferOffset_ = m_CEBufferSize_ - 1;
- return m_CEBuffer_[m_CEBufferOffset_];
- }
- else {
- // Since Hanguls pass the FCD check, it is guaranteed that we won't
- // be in the normalization buffer if something like this happens
- // Move Jamos into normalization buffer
- m_buffer_.append(L);
- m_buffer_.append(V);
- if (T != HANGUL_TBASE_) {
- m_buffer_.append(T);
- }
- m_bufferOffset_ = m_buffer_.length();
- m_FCDStart_ = m_source_.getIndex();
- m_FCDLimit_ = m_FCDStart_ + 1;
- return IGNORABLE;
- }
- }
-
- /**
- * Gets implicit codepoint ces
- * @param codepoint current codepoint
- * @return implicit codepoint ces
- */
- private int previousImplicit(int codepoint)
- {
- int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
- m_CEBufferSize_ = 2;
- m_CEBufferOffset_ = 1;
- m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
- | 0x00000505;
- m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
- return m_CEBuffer_[1];
- }
-
- /**
- * Gets the previous surrogate ce
- * @param ch current character
- * @return previous surrogate ce
- */
- private int previousSurrogate(char ch)
- {
- if (isBackwardsStart()) {
- // we are at the start of the string, wrong place to be at
- return CE_NOT_FOUND_;
- }
- char prevch = (char)previousChar();
- // Handles Han and Supplementary characters here.
- if (UTF16.isLeadSurrogate(prevch)) {
- return previousImplicit(
- UCharacterProperty.getRawSupplementary(prevch, ch));
- }
- if (prevch != CharacterIterator.DONE) {
- nextChar();
- }
- return CE_NOT_FOUND_; // treat like unassigned
- }
-
- /**
- * <p>Special CE management. Expansions, contractions etc...</p>
- * @param collator can be plain UCA
- * @param ce current ce
- * @param ch current character
- * @return previous special ce
- */
- private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
- {
- while(true) {
- // the only ces that loops are thai, special prefix and
- // contractions
- switch (RuleBasedCollator.getTag(ce)) {
- case CE_NOT_FOUND_TAG_: // this tag always returns
- return ce;
- case RuleBasedCollator.CE_SURROGATE_TAG_: // unpaired lead surrogate
- return CE_NOT_FOUND_;
- case CE_SPEC_PROC_TAG_:
- ce = previousSpecialPrefix(collator, ce);
- break;
- case CE_CONTRACTION_TAG_:
- // may loop for first character e.g. "0x0f71" for english
- if (isBackwardsStart()) {
- // start of string or this is not the end of any contraction
- ce = collator.m_contractionCE_[
- getContractionOffset(collator, ce)];
- break;
- }
- return previousContraction(collator, ce, ch); // else
- case CE_LONG_PRIMARY_TAG_:
- return previousLongPrimary(ce);
- case CE_EXPANSION_TAG_: // always returns
- return previousExpansion(collator, ce);
- case CE_DIGIT_TAG_:
- ce = previousDigit(collator, ce, ch);
- break;
- case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
- return previousHangul(collator, ch);
- case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
- return CE_NOT_FOUND_; // broken surrogate sequence, treat like unassigned
- case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
- return previousSurrogate(ch);
- case CE_CJK_IMPLICIT_TAG_:
- // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
- return previousImplicit(ch);
- case CE_IMPLICIT_TAG_: // everything that is not defined
- // UCA is filled with these. Tailorings are NOT_FOUND
- return previousImplicit(ch);
- case CE_CHARSET_TAG_: // this tag always returns
- return CE_NOT_FOUND_;
- default: // this tag always returns
- ce = IGNORABLE;
- }
- if (!RuleBasedCollator.isSpecial(ce)) {
- break;
- }
- }
- return ce;
- }
-
-// /**
-// * Gets a character from the source string at a given offset.
-// * Handles both normal and iterative cases.
-// * No error checking and does not access the normalization buffer
-// * - caller beware!
-// * @param offset offset from current position which character is to be
-// * retrieved
-// * @return character at current position + offset
-// */
-// private char peekCharacter(int offset)
-// {
-// if (offset != 0) {
-// int currentoffset = m_source_.getIndex();
-// m_source_.setIndex(currentoffset + offset);
-// char result = (char)m_source_.current();
-// m_source_.setIndex(currentoffset);
-// return result;
-// }
-// else {
-// return (char)m_source_.current();
-// }
-// }
-
- /**
- * Moves back 1 position in the source string. This is slightly less
- * complicated than previousChar in that it doesn't normalize while
- * moving back. Boundary checks are not performed.
- * This method is to be used with caution, with the assumption that
- * moving back one position will not exceed the source limits.
- * Use only with nextChar() and never call this API twice in a row without
- * nextChar() in the middle.
- */
- private void goBackOne()
- {
- if (m_bufferOffset_ >= 0) {
- m_bufferOffset_ --;
- }
- else {
- m_source_.setIndex(m_source_.getIndex() - 1);
- }
- }
-
- /**
- * Moves forward 1 position in the source string. This is slightly less
- * complicated than nextChar in that it doesn't normalize while
- * moving back. Boundary checks are not performed.
- * This method is to be used with caution, with the assumption that
- * moving back one position will not exceed the source limits.
- * Use only with previousChar() and never call this API twice in a row
- * without previousChar() in the middle.
- */
- private void goForwardOne()
- {
- if (m_bufferOffset_ < 0) {
- // we're working on the source and not normalizing. fast path.
- // note Thai pre-vowel reordering uses buffer too
- m_source_.setIndex(m_source_.getIndex() + 1);
- }
- else {
- // we are in the buffer, buffer offset will never be 0 here
- m_bufferOffset_ ++;
- }
+ public RuleBasedCollator getRuleBasedCollator() {
+ return rbc_;
}
}
/**
*******************************************************************************
-* Copyright (C) 1996-2012, International Business Machines Corporation and
+* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
+import com.ibm.icu.impl.coll.Collation;
+
/**
* <p>A <code>CollationKey</code> represents a <code>String</code>
* under the rules of a specific <code>Collator</code>
*/
public byte[] toByteArray()
{
- int length = 0;
- while (true) {
- if (m_key_[length] == 0) {
- break;
- }
- length ++;
- }
- length ++;
+ int length = getLength() + 1;
byte result[] = new byte[length];
System.arraycopy(m_key_, 0, result, 0, length);
return result;
if (noOfLevels > Collator.PRIMARY) {
while (offset < m_key_.length && m_key_[offset] != 0) {
if (m_key_[offset ++]
- == RuleBasedCollator.SORT_LEVEL_TERMINATOR_) {
+ == Collation.LEVEL_SEPARATOR_BYTE) {
keystrength ++;
noOfLevels --;
if (noOfLevels == Collator.PRIMARY
// if both sort keys have another level, then add a 01 level
// separator and continue
- if (m_key_[index] == RuleBasedCollator.SORT_LEVEL_TERMINATOR_
+ if (m_key_[index] == Collation.LEVEL_SEPARATOR_BYTE
&& source.m_key_[sourceindex]
- == RuleBasedCollator.SORT_LEVEL_TERMINATOR_) {
+ == Collation.LEVEL_SEPARATOR_BYTE) {
++index;
++sourceindex;
- result[rindex++] = RuleBasedCollator.SORT_LEVEL_TERMINATOR_;
+ result[rindex++] = Collation.LEVEL_SEPARATOR_BYTE;
}
else {
break;
+++ /dev/null
-/**
- *******************************************************************************
- * Copyright (C) 1996-2011, International Business Machines Corporation and
- * others. All Rights Reserved.
- *******************************************************************************
- */
-package com.ibm.icu.text;
-
-import java.io.IOException;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import com.ibm.icu.impl.IntTrieBuilder;
-import com.ibm.icu.impl.Norm2AllModes;
-import com.ibm.icu.impl.Normalizer2Impl;
-import com.ibm.icu.impl.TrieBuilder;
-import com.ibm.icu.impl.TrieIterator;
-import com.ibm.icu.impl.UCharacterProperty;
-import com.ibm.icu.impl.Utility;
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UCharacterCategory;
-import com.ibm.icu.util.RangeValueIterator;
-import com.ibm.icu.util.VersionInfo;
-
-/**
- * Class for building a collator from a list of collation rules. This class is
- * uses CollationRuleParser
- *
- * @author Syn Wee Quek
- * @since release 2.2, June 11 2002
- */
-final class CollationParsedRuleBuilder {
- // package private constructors ------------------------------------------
-
- /**
- * Constructor
- *
- * @param rules
- * collation rules
- * @exception ParseException
- * thrown when argument rules have an invalid syntax
- */
- CollationParsedRuleBuilder(String rules) throws ParseException {
- m_parser_ = new CollationRuleParser(rules);
- m_parser_.assembleTokenList();
- m_utilColEIter_ = RuleBasedCollator.UCA_
- .getCollationElementIterator("");
- }
-
- // package private inner classes -----------------------------------------
-
- /**
- * Inverse UCA wrapper
- */
- static class InverseUCA {
- // package private constructor ---------------------------------------
-
- InverseUCA() {
- }
-
- // package private data member ---------------------------------------
-
- /**
- * Array list of characters
- */
- int m_table_[];
- /**
- * Array list of continuation characters
- */
- char m_continuations_[];
-
- /**
- * UCA version of inverse UCA table
- */
- VersionInfo m_UCA_version_;
-
- // package private method --------------------------------------------
-
- /**
- * Returns the previous inverse ces of the argument ces
- *
- * @param ce
- * ce to test
- * @param contce
- * continuation ce to test
- * @param strength
- * collation strength
- * @param prevresult
- * an array to store the return results previous inverse ce
- * and previous inverse continuation ce
- * @return result of the inverse ce
- */
- final int getInversePrevCE(int ce, int contce, int strength,
- int prevresult[]) {
- int result = findInverseCE(ce, contce);
-
- if (result < 0) {
- prevresult[0] = CollationElementIterator.NULLORDER;
- return -1;
- }
-
- ce &= STRENGTH_MASK_[strength];
- contce &= STRENGTH_MASK_[strength];
-
- prevresult[0] = ce;
- prevresult[1] = contce;
-
- while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce
- && (prevresult[1] & STRENGTH_MASK_[strength]) == contce
- && result > 0) {
- // this condition should prevent falling off the edge of the
- // world
- // here, we end up in a singularity - zero
- prevresult[0] = m_table_[3 * (--result)];
- prevresult[1] = m_table_[3 * result + 1];
- }
- return result;
- }
-
- final int getCEStrengthDifference(int CE, int contCE, int prevCE,
- int prevContCE) {
- int strength = Collator.TERTIARY;
- while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))
- && (strength != 0)) {
- strength--;
- }
- return strength;
- }
-
- private int compareCEs(int source0, int source1, int target0,
- int target1) {
- int s1 = source0, s2, t1 = target0, t2;
- if (RuleBasedCollator.isContinuation(source1)) {
- s2 = source1;
- } else {
- s2 = 0;
- }
- if (RuleBasedCollator.isContinuation(target1)) {
- t2 = target1;
- } else {
- t2 = 0;
- }
-
- int s = 0, t = 0;
- if (s1 == t1 && s2 == t2) {
- return 0;
- }
- s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >>> 16);
- t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >>> 16);
- if (s == t) {
- s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8;
- t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8;
- if (s == t) {
- s = (s1 & 0x000000FF) << 8 | (s2 & 0x000000FF);
- t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF);
- return Utility.compareUnsigned(s, t);
- } else {
- return Utility.compareUnsigned(s, t);
- }
- } else {
- return Utility.compareUnsigned(s, t);
- }
- }
-
- /**
- * Finding the inverse CE of the argument CEs
- *
- * @param ce
- * CE to be tested
- * @param contce
- * continuation CE
- * @return inverse CE
- */
- int findInverseCE(int ce, int contce) {
- int bottom = 0;
- int top = m_table_.length / 3;
- int result = 0;
-
- while (bottom < top - 1) {
- result = (top + bottom) >> 1;
- int first = m_table_[3 * result];
- int second = m_table_[3 * result + 1];
- int comparison = compareCEs(first, second, ce, contce);
- if (comparison > 0) {
- top = result;
- } else if (comparison < 0) {
- bottom = result;
- } else {
- break;
- }
- }
-
- return result;
- }
-
- /**
- * Getting gap offsets in the inverse UCA
- *
- * @param listheader
- * parsed token lists
- * @exception Exception
- * thrown when error occurs while finding the collation
- * gaps
- */
- void getInverseGapPositions(
- CollationRuleParser.TokenListHeader listheader)
- throws Exception {
- // reset all the gaps
- CollationRuleParser.Token token = listheader.m_first_;
- int tokenstrength = token.m_strength_;
-
- for (int i = 0; i < 3; i++) {
- listheader.m_gapsHi_[3 * i] = 0;
- listheader.m_gapsHi_[3 * i + 1] = 0;
- listheader.m_gapsHi_[3 * i + 2] = 0;
- listheader.m_gapsLo_[3 * i] = 0;
- listheader.m_gapsLo_[3 * i + 1] = 0;
- listheader.m_gapsLo_[3 * i + 2] = 0;
- listheader.m_numStr_[i] = 0;
- listheader.m_fStrToken_[i] = null;
- listheader.m_lStrToken_[i] = null;
- listheader.m_pos_[i] = -1;
- }
-
- if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_
- && (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) {
- // implicits -
- listheader.m_pos_[0] = 0;
- int t1 = listheader.m_baseCE_;
- int t2 = listheader.m_baseContCE_;
- listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
- listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
- listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
- int primaryCE = t1 & RuleBasedCollator.CE_PRIMARY_MASK_
- | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;
- primaryCE = RuleBasedCollator.impCEGen_
- .getImplicitFromRaw(RuleBasedCollator.impCEGen_
- .getRawFromImplicit(primaryCE) + 1);
-
- t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
- t2 = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_
- | RuleBasedCollator.CE_CONTINUATION_MARKER_;
-
- // if (listheader.m_baseCE_ < 0xEF000000) {
- // // first implicits have three byte primaries, with a gap of
- // // one so we esentially need to add 2 to the top byte in
- // // listheader.m_baseContCE_
- // t2 += 0x02000000;
- // }
- // else {
- // // second implicits have four byte primaries, with a gap of
- // // IMPLICIT_LAST2_MULTIPLIER_
- // // Now, this guy is not really accessible here, so until we
- // // find a better way to pass it around, assume that the gap
- // is 1
- // t2 += 0x00020000;
- // }
- listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
- listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
- listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
- } else if (listheader.m_indirect_ == true
- && listheader.m_nextCE_ != 0) {
- listheader.m_pos_[0] = 0;
- int t1 = listheader.m_baseCE_;
- int t2 = listheader.m_baseContCE_;
- listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
- listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
- listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
- t1 = listheader.m_nextCE_;
- t2 = listheader.m_nextContCE_;
- listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
- listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
- listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
- } else {
- while (true) {
- if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
- listheader.m_pos_[tokenstrength] = getInverseNext(
- listheader, tokenstrength);
- if (listheader.m_pos_[tokenstrength] >= 0) {
- listheader.m_fStrToken_[tokenstrength] = token;
- } else {
- // The CE must be implicit, since it's not in the
- // table
- // Error
- throw new Exception("Internal program error");
- }
- }
-
- while (token != null && token.m_strength_ >= tokenstrength) {
- if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
- listheader.m_lStrToken_[tokenstrength] = token;
- }
- token = token.m_next_;
- }
- if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {
- // check if previous interval is the same and merge the
- // intervals if it is so
- if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) {
- listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1];
- listheader.m_fStrToken_[tokenstrength + 1] = null;
- listheader.m_lStrToken_[tokenstrength + 1] = null;
- listheader.m_pos_[tokenstrength + 1] = -1;
- }
- }
- if (token != null) {
- tokenstrength = token.m_strength_;
- } else {
- break;
- }
- }
- for (int st = 0; st < 3; st++) {
- int pos = listheader.m_pos_[st];
- if (pos >= 0) {
- int t1 = m_table_[3 * pos];
- int t2 = m_table_[3 * pos + 1];
- listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,
- Collator.PRIMARY);
- listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2,
- Collator.SECONDARY);
- listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24
- | (t2 & 0x3f) << 16;
- // pos --;
- // t1 = m_table_[3 * pos];
- // t2 = m_table_[3 * pos + 1];
- t1 = listheader.m_baseCE_;
- t2 = listheader.m_baseContCE_;
-
- listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,
- Collator.PRIMARY);
- listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2,
- Collator.SECONDARY);
- listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24
- | (t2 & 0x3f) << 16;
- }
- }
- }
- }
-
- /**
- * Gets the next CE in the inverse table
- *
- * @param listheader
- * token list header
- * @param strength
- * collation strength
- * @return next ce
- */
- private final int getInverseNext(
- CollationRuleParser.TokenListHeader listheader, int strength) {
- int ce = listheader.m_baseCE_;
- int secondce = listheader.m_baseContCE_;
- int result = findInverseCE(ce, secondce);
-
- if (result < 0) {
- return -1;
- }
-
- ce &= STRENGTH_MASK_[strength];
- secondce &= STRENGTH_MASK_[strength];
-
- int nextce = ce;
- int nextcontce = secondce;
-
- while ((nextce & STRENGTH_MASK_[strength]) == ce
- && (nextcontce & STRENGTH_MASK_[strength]) == secondce) {
- nextce = m_table_[3 * (++result)];
- nextcontce = m_table_[3 * result + 1];
- }
-
- listheader.m_nextCE_ = nextce;
- listheader.m_nextContCE_ = nextcontce;
-
- return result;
- }
- }
-
- // package private data members ------------------------------------------
-
- /**
- * Inverse UCA, instantiate only when required
- */
- static final InverseUCA INVERSE_UCA_;
-
- /**
- * UCA and Inverse UCA version do not match
- */
- private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match";
-
- /**
- * UCA and Inverse UCA version do not match
- */
- private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!";
-
- /**
- * Initializing the inverse UCA
- */
- static {
- InverseUCA temp = null;
- try {
- temp = CollatorReader.getInverseUCA();
- } catch (IOException e) {
- }
- /*
- * try { String invdat = "/com/ibm/icu/impl/data/invuca.icu";
- * InputStream i =
- * CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
- * BufferedInputStream b = new BufferedInputStream(i, 110000);
- * INVERSE_UCA_ = CollatorReader.readInverseUCA(b); b.close();
- * i.close(); } catch (Exception e) { e.printStackTrace(); throw new
- * RuntimeException(e.getMessage()); }
- */
-
- if (temp != null && RuleBasedCollator.UCA_ != null) {
- if (!temp.m_UCA_version_
- .equals(RuleBasedCollator.UCA_.m_UCA_version_)) {
- throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);
- }
- } else {
- throw new RuntimeException(UCA_NOT_INSTANTIATED_);
- }
-
- INVERSE_UCA_ = temp;
- }
-
- // package private methods -----------------------------------------------
-
- /**
- * Parse and sets the collation rules in the argument collator
- *
- * @param collator
- * to set
- * @exception Exception
- * thrown when internal program error occurs
- */
- void setRules(RuleBasedCollator collator) throws Exception {
- if (m_parser_.m_resultLength_ > 0 || m_parser_.m_removeSet_ != null) {
- // we have a set of rules, let's make something of it
- assembleTailoringTable(collator);
- } else { // no rules, but no error either must be only options
- // We will init the collator from UCA
- collator.setWithUCATables();
- }
- // And set only the options
- m_parser_.setDefaultOptionsInCollator(collator);
- }
-
- private void copyRangeFromUCA(BuildTable t, int start, int end) {
- int u = 0;
- for (u = start; u <= end; u++) {
- // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND
- int CE = t.m_mapping_.getValue(u);
- if (CE == CE_NOT_FOUND_
- // this test is for contractions that are missing the starting
- // element. Looks like latin-1 should be done before
- // assembling the table, even if it results in more false
- // closure elements
- || (isContractionTableElement(CE) && getCE(
- t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {
- // m_utilElement_.m_uchars_ = str.toString();
- m_utilElement_.m_uchars_ = UCharacter.toString(u);
- m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
- m_utilElement_.m_prefix_ = 0;
- m_utilElement_.m_CELength_ = 0;
- m_utilElement_.m_prefixChars_ = null;
- m_utilColEIter_.setText(m_utilElement_.m_uchars_);
- while (CE != CollationElementIterator.NULLORDER) {
- CE = m_utilColEIter_.next();
- if (CE != CollationElementIterator.NULLORDER) {
- m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
- }
- }
- addAnElement(t, m_utilElement_);
- }
- }
- }
-
- /**
- * 2. Eliminate the negative lists by doing the following for each non-null
- * negative list: o if previousCE(baseCE, strongestN) != some ListHeader X's
- * baseCE, create new ListHeader X o reverse the list, add to the end of X's
- * positive list. Reset the strength of the first item you add, based on the
- * stronger strength levels of the two lists.
- *
- * 3. For each ListHeader with a non-null positive list: o Find all
- * character strings with CEs between the baseCE and the next/previous CE,
- * at the strength of the first token. Add these to the tailoring. ? That
- * is, if UCA has ... x <<< X << x' <<< X' < y ..., and the tailoring has &
- * x < z... ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
- *
- * It is possible that this part should be done even while constructing list
- * The problem is that it is unknown what is going to be the strongest
- * weight. So we might as well do it here o Allocate CEs for each token in
- * the list, based on the total number N of the largest level difference,
- * and the gap G between baseCE and nextCE at that level. The relation *
- * between the last item and nextCE is the same as the strongest strength. o
- * Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) ? There are 3
- * primary items: a, d, e. Fit them into the primary gap. Then fit b and c
- * into the secondary gap between a and d, then fit q into the tertiary gap
- * between b and c. o Example: baseCE << b <<< q << c * nextCE(X,2) ? There
- * are 2 secondary items: b, c. Fit them into the secondary gap. Then fit q
- * into the tertiary gap between b and c. o When incrementing primary
- * values, we will not cross high byte boundaries except where there is only
- * a single-byte primary. That is to ensure that the script reordering will
- * continue to work.
- *
- * @param collator
- * the rule based collator to update
- * @exception Exception
- * thrown when internal program error occurs
- */
- void assembleTailoringTable(RuleBasedCollator collator) throws Exception {
-
- for (int i = 0; i < m_parser_.m_resultLength_; i++) {
- // now we need to generate the CEs
- // We stuff the initial value in the buffers, and increase the
- // appropriate buffer according to strength
- if (m_parser_.m_listHeader_[i].m_first_ != null) {
- // if there are any elements
- // due to the way parser works, subsequent tailorings
- // may remove all the elements from a sequence, therefore
- // leaving an empty tailoring sequence.
- initBuffers(m_parser_.m_listHeader_[i]);
- }
- }
-
- if (m_parser_.m_variableTop_ != null) {
- // stuff the variable top value
- m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16;
- // remove it from the list
- if (m_parser_.m_variableTop_.m_listHeader_.m_first_ == m_parser_.m_variableTop_) { // first
- // in
- // list
- m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_;
- }
- if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) {
- // first in list
- m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_;
- }
- if (m_parser_.m_variableTop_.m_next_ != null) {
- m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_;
- }
- if (m_parser_.m_variableTop_.m_previous_ != null) {
- m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_;
- }
- }
-
- BuildTable t = new BuildTable(m_parser_);
-
- // After this, we have assigned CE values to all regular CEs now we
- // will go through list once more and resolve expansions, make
- // UCAElements structs and add them to table
- for (int i = 0; i < m_parser_.m_resultLength_; i++) {
- // now we need to generate the CEs
- // We stuff the initial value in the buffers, and increase the
- // appropriate buffer according to strength */
- createElements(t, m_parser_.m_listHeader_[i]);
- }
-
- m_utilElement_.clear();
-
- // add latin-1 stuff
- copyRangeFromUCA(t, 0, 0xFF);
-
- // add stuff for copying
- if (m_parser_.m_copySet_ != null) {
- int i = 0;
- for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {
- copyRangeFromUCA(t, m_parser_.m_copySet_.getRangeStart(i),
- m_parser_.m_copySet_.getRangeEnd(i));
- }
- }
-
- // copy contractions from the UCA - this is felt mostly for cyrillic
- char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;
- int maxUCAContractionLength = RuleBasedCollator.MAX_UCA_CONTRACTION_LENGTH;
- int offset = 0;
- while (conts[offset] != 0) {
- // A continuation is NUL-terminated and NUL-padded
- // except if it has the maximum length.
- int contractionLength = maxUCAContractionLength;
- while (contractionLength > 0 && conts[offset + contractionLength - 1] == 0) {
- --contractionLength;
- }
- int first = Character.codePointAt(conts, offset);
- int firstLength = Character.charCount(first);
- int tailoredCE = t.m_mapping_.getValue(first);
- Elements prefixElm = null;
- if (tailoredCE != CE_NOT_FOUND_) {
- boolean needToAdd = true;
- if (isContractionTableElement(tailoredCE)) {
- if (isTailored(t.m_contractions_, tailoredCE, conts,
- offset + firstLength) == true) {
- needToAdd = false;
- }
- }
- if (!needToAdd && isPrefix(tailoredCE)
- && conts[offset + 1] == 0) {
- // pre-context character in UCA
- // The format for pre-context character is
- // conts[0]: baseCP conts[1]:0 conts[2]:pre-context CP
- Elements elm = new Elements();
- elm.m_CELength_ = 0;
- elm.m_uchars_ = Character.toString(conts[offset]);
- elm.m_cPoints_ = m_utilElement_.m_uchars_;
- elm.m_prefixChars_ = Character.toString(conts[offset + 2]);
- elm.m_prefix_ = 0; // TODO(claireho) : confirm!
- prefixElm = t.m_prefixLookup_.get(elm);
- if ((prefixElm == null)
- || (prefixElm.m_prefixChars_.charAt(0) != conts[offset + 2])) {
- needToAdd = true;
- }
- }
- if (m_parser_.m_removeSet_ != null
- && m_parser_.m_removeSet_.contains(first)) {
- needToAdd = false;
- }
-
- if (needToAdd == true) {
- // we need to add if this contraction is not tailored.
- if (conts[offset + 1] != 0) { // not precontext
- m_utilElement_.m_prefix_ = 0;
- m_utilElement_.m_prefixChars_ = null;
- m_utilElement_.m_uchars_ = new String(conts, offset, contractionLength);
- m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
- m_utilElement_.m_CELength_ = 0;
- m_utilColEIter_.setText(m_utilElement_.m_uchars_);
- } else { // add a pre-context element
- int preKeyLen = 0;
- m_utilElement_.m_uchars_ = Character.toString(conts[offset]);
- m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
- m_utilElement_.m_CELength_ = 0;
- m_utilElement_.m_prefixChars_ = Character.toString(conts[offset + 2]);
- if (prefixElm == null) {
- m_utilElement_.m_prefix_ = 0;
- } else { // TODO (claireho): confirm!
- m_utilElement_.m_prefix_ = m_utilElement_.m_prefix_;
- // m_utilElement_.m_prefix_= prefixElm.m_prefix_;
- }
- m_utilColEIter_.setText(m_utilElement_.m_prefixChars_);
- while (m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
- // count number of keys for pre-context char.
- preKeyLen++;
- }
- m_utilColEIter_.setText(m_utilElement_.m_prefixChars_ + m_utilElement_.m_uchars_);
- // Skip the keys for prefix character, then copy the
- // rest to el.
- while ((preKeyLen-- > 0)
- && m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
- continue;
- }
-
- }
- while (true) {
- int CE = m_utilColEIter_.next();
- if (CE != CollationElementIterator.NULLORDER) {
- m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
- } else {
- break;
- }
- }
- addAnElement(t, m_utilElement_);
- }
- } else if (m_parser_.m_removeSet_ != null
- && m_parser_.m_removeSet_.contains(first)) {
- copyRangeFromUCA(t, first, first);
- }
-
- offset += maxUCAContractionLength;
- }
-
- // Add completely ignorable elements
- processUCACompleteIgnorables(t);
-
- // canonical closure
- canonicalClosure(t);
-
- // still need to produce compatibility closure
- assembleTable(t, collator);
- }
-
- // private inner classes -------------------------------------------------
-
- @SuppressWarnings("unused")
- private static class CEGenerator {
- // package private data members --------------------------------------
-
- WeightRange m_ranges_[];
- int m_rangesLength_;
- int m_byteSize_;
- int m_start_;
- int m_limit_;
- int m_maxCount_;
- int m_count_;
- int m_current_;
- int m_fLow_; // forbidden Low
- int m_fHigh_; // forbidden High
-
- // package private constructor ---------------------------------------
-
- CEGenerator() {
- m_ranges_ = new WeightRange[7];
- for (int i = 6; i >= 0; i--) {
- m_ranges_[i] = new WeightRange();
- }
- }
- }
-
- private static class WeightRange implements Comparable<WeightRange> {
- // public methods ----------------------------------------------------
-
- /**
- * Compares this object with target
- *
- * @param target object to compare with
- * @return 0 if equals, 1 if this is > target, -1 otherwise
- */
- public int compareTo(WeightRange target) {
- return Utility.compareUnsigned(m_start_, target.m_start_);
- }
-
- /**
- * Initialize
- */
- public void clear() {
- m_start_ = 0;
- m_end_ = 0;
- m_length_ = 0;
- m_count_ = 0;
- m_length2_ = 0;
- m_count2_ = 0;
- }
-
- // package private data members --------------------------------------
-
- int m_start_;
- int m_end_;
- int m_length_;
- int m_count_;
- int m_length2_;
- int m_count2_;
-
- // package private constructor ---------------------------------------
-
- WeightRange() {
- clear();
- }
-
- /**
- * Copy constructor. Cloneable is troublesome, needs to check for
- * exception
- *
- * @param source
- * to clone
- */
- WeightRange(WeightRange source) {
- m_start_ = source.m_start_;
- m_end_ = source.m_end_;
- m_length_ = source.m_length_;
- m_count_ = source.m_count_;
- m_length2_ = source.m_length2_;
- m_count2_ = source.m_count2_;
- }
- }
-
- private static class MaxJamoExpansionTable {
- // package private data members --------------------------------------
-
- List<Integer> m_endExpansionCE_;
- // vector of booleans
- List<Boolean> m_isV_;
- byte m_maxLSize_;
- byte m_maxVSize_;
- byte m_maxTSize_;
-
- // package private constructor ---------------------------------------
-
- MaxJamoExpansionTable() {
- m_endExpansionCE_ = new ArrayList<Integer>();
- m_isV_ = new ArrayList<Boolean>();
- m_endExpansionCE_.add(Integer.valueOf(0));
- m_isV_.add(Boolean.FALSE);
- m_maxLSize_ = 1;
- m_maxVSize_ = 1;
- m_maxTSize_ = 1;
- }
-
- MaxJamoExpansionTable(MaxJamoExpansionTable table) {
- m_endExpansionCE_ = new ArrayList<Integer>(table.m_endExpansionCE_);
- m_isV_ = new ArrayList<Boolean>(table.m_isV_);
- m_maxLSize_ = table.m_maxLSize_;
- m_maxVSize_ = table.m_maxVSize_;
- m_maxTSize_ = table.m_maxTSize_;
- }
- }
-
- private static class MaxExpansionTable {
- // package private constructor --------------------------------------
-
- MaxExpansionTable() {
- m_endExpansionCE_ = new ArrayList<Integer>();
- m_expansionCESize_ = new ArrayList<Byte>();
- m_endExpansionCE_.add(Integer.valueOf(0));
- m_expansionCESize_.add(Byte.valueOf((byte) 0));
- }
-
- MaxExpansionTable(MaxExpansionTable table) {
- m_endExpansionCE_ = new ArrayList<Integer>(table.m_endExpansionCE_);
- m_expansionCESize_ = new ArrayList<Byte>(table.m_expansionCESize_);
- }
-
- // package private data member --------------------------------------
-
- List<Integer> m_endExpansionCE_;
- List<Byte> m_expansionCESize_;
- }
-
- private static class BasicContractionTable {
- // package private constructors -------------------------------------
-
- BasicContractionTable() {
- m_CEs_ = new ArrayList<Integer>();
- m_codePoints_ = new StringBuilder();
- }
-
- // package private data members -------------------------------------
-
- StringBuilder m_codePoints_;
- List<Integer> m_CEs_;
- }
-
- private static class ContractionTable {
- // package private constructor --------------------------------------
-
- /**
- * Builds a contraction table
- *
- * @param mapping
- */
- ContractionTable(IntTrieBuilder mapping) {
- m_mapping_ = mapping;
- m_elements_ = new ArrayList<BasicContractionTable>();
- m_CEs_ = new ArrayList<Integer>();
- m_codePoints_ = new StringBuilder();
- m_offsets_ = new ArrayList<Integer>();
- m_currentTag_ = CE_NOT_FOUND_TAG_;
- }
-
- /**
- * Copies a contraction table. Not all data will be copied into their
- * own object.
- *
- * @param table
- */
- ContractionTable(ContractionTable table) {
- m_mapping_ = table.m_mapping_;
- m_elements_ = new ArrayList<BasicContractionTable>(table.m_elements_);
- m_codePoints_ = new StringBuilder(table.m_codePoints_);
- m_CEs_ = new ArrayList<Integer>(table.m_CEs_);
- m_offsets_ = new ArrayList<Integer>(table.m_offsets_);
- m_currentTag_ = table.m_currentTag_;
- }
-
- // package private data members ------------------------------------
-
- /**
- * Vector of BasicContractionTable
- */
- List<BasicContractionTable> m_elements_;
- IntTrieBuilder m_mapping_;
- StringBuilder m_codePoints_;
- List<Integer> m_CEs_;
- List<Integer> m_offsets_;
- int m_currentTag_;
- }
-
- /**
- * Private class for combining mark table. The table is indexed by the class
- * value(0-255).
- */
- @SuppressWarnings("unused")
- private static class CombinClassTable {
- /**
- * accumulated numbers of combining marks.
- */
- int[] index = new int[256];
-
- /**
- * code point array for combining marks.
- */
- char[] cPoints;
-
- /**
- * size of cPoints.
- */
- int size;
-
- // constructor
- CombinClassTable() {
- cPoints = null;
- size = 0;
- pos = 0;
- curClass = 1;
- }
-
- /**
- * Copy the combining mark table from ccc and index in compact way.
- *
- * @param cps
- * : code point array
- * @param size
- * : size of ccc
- * @param index
- * : index of combining classes(0-255)
- */
- void generate(char[] cps, int numOfCM, int[] ccIndex) {
- int count = 0;
-
- cPoints = new char[numOfCM];
- for (int i = 0; i < 256; i++) {
- for (int j = 0; j < ccIndex[i]; j++) {
- cPoints[count++] = cps[(i << 8) + j];
- }
- index[i] = count;
- }
- size = count;
- }
-
- /**
- * Get first CM(combining mark) with the combining class value cClass.
- *
- * @param cClass
- * : combining class value.
- * @return combining mark codepoint or 0 if no combining make with class
- * value cClass
- */
- char GetFirstCM(int cClass) {
- curClass = cClass;
- if (cPoints == null || cClass == 0
- || index[cClass] == index[cClass - 1]) {
- return 0;
- }
- pos = 1;
- return cPoints[index[cClass - 1]];
- }
-
- /**
- * Get next CM(combining mark) with the combining class value cClass.
- * Return combining mark codepoint or 0 if no next CM.
- */
- char GetNextCM() {
- if (cPoints == null
- || index[curClass] == (index[curClass - 1] + pos)) {
- return 0;
- }
- return cPoints[index[curClass - 1] + (pos++)];
- }
-
- // private data members
- int pos;
- int curClass;
- }
-
- private static final class BuildTable implements TrieBuilder.DataManipulate {
- // package private methods ------------------------------------------
-
- /**
- * For construction of the Trie tables. Has to be labeled public
- *
- * @param cp The value of the code point.
- * @param offset The value of the offset.
- * @return data offset or 0
- */
- public int getFoldedValue(int cp, int offset) {
- int limit = cp + 0x400;
- while (cp < limit) {
- int value = m_mapping_.getValue(cp);
- boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
- int tag = getCETag(value);
- if (inBlockZero == true) {
- cp += TrieBuilder.DATA_BLOCK_LENGTH;
- } else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) {
- // These are values that are starting in either UCA
- // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).
- // Presence of these tags means that there is nothing in
- // this position and that it should be skipped.
- return RuleBasedCollator.CE_SPECIAL_FLAG_
- | (CE_SURROGATE_TAG_ << 24) | offset;
- } else {
- ++cp;
- }
- }
- return 0;
- }
-
- // package private constructor --------------------------------------
-
- /**
- * Returns a table
- */
- BuildTable(CollationRuleParser parser) {
- m_collator_ = new RuleBasedCollator();
- m_collator_.setWithUCAData();
- MaxExpansionTable maxet = new MaxExpansionTable();
- MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();
- m_options_ = parser.m_options_;
- m_expansions_ = new ArrayList<Integer>();
- // Do your own mallocs for the structure, array and have linear
- // Latin 1
- int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
- | (CE_NOT_FOUND_TAG_ << 24);
- // temporary fix for jb3822, 0x100000 -> 30000
- m_mapping_ = new IntTrieBuilder(null, 0x30000, trieinitialvalue,
- trieinitialvalue, true);
- m_prefixLookup_ = new HashMap<Elements, Elements>();
- // uhash_open(prefixLookupHash, prefixLookupComp);
- m_contractions_ = new ContractionTable(m_mapping_);
- // copy UCA's maxexpansion and merge as we go along
- m_maxExpansions_ = maxet;
- // adding an extra initial value for easier manipulation
- for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) {
- maxet.m_endExpansionCE_.add(Integer.valueOf(
- RuleBasedCollator.UCA_.m_expansionEndCE_[i]));
- maxet.m_expansionCESize_.add(Byte.valueOf(
- RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));
- }
- m_maxJamoExpansions_ = maxjet;
-
- m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];
- m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];
- Arrays.fill(m_unsafeCP_, (byte) 0);
- Arrays.fill(m_contrEndCP_, (byte) 0);
- }
-
- /**
- * Duplicating a BuildTable. Not all data will be duplicated into their
- * own object.
- *
- * @param table
- * to clone
- */
- BuildTable(BuildTable table) {
- m_collator_ = table.m_collator_;
- m_mapping_ = new IntTrieBuilder(table.m_mapping_);
- m_expansions_ = new ArrayList<Integer>(table.m_expansions_);
- m_contractions_ = new ContractionTable(table.m_contractions_);
- m_contractions_.m_mapping_ = m_mapping_;
- m_options_ = table.m_options_;
- m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_);
- m_maxJamoExpansions_ = new MaxJamoExpansionTable(
- table.m_maxJamoExpansions_);
- m_unsafeCP_ = new byte[table.m_unsafeCP_.length];
- System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,
- m_unsafeCP_.length);
- m_contrEndCP_ = new byte[table.m_contrEndCP_.length];
- System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,
- m_contrEndCP_.length);
- }
-
- // package private data members -------------------------------------
-
- RuleBasedCollator m_collator_;
- IntTrieBuilder m_mapping_;
- List<Integer> m_expansions_;
- ContractionTable m_contractions_;
- // UCATableHeader image;
- CollationRuleParser.OptionSet m_options_;
- MaxExpansionTable m_maxExpansions_;
- MaxJamoExpansionTable m_maxJamoExpansions_;
- byte m_unsafeCP_[];
- byte m_contrEndCP_[];
- Map<Elements, Elements> m_prefixLookup_;
- CombinClassTable cmLookup = null;
- }
-
- private static class Elements {
- // package private data members -------------------------------------
-
- String m_prefixChars_;
- int m_prefix_;
- String m_uchars_;
- /**
- * Working string
- */
- String m_cPoints_;
- /**
- * Offset to the working string
- */
- int m_cPointsOffset_;
- /**
- * These are collation elements - there could be more than one - in case
- * of expansion
- */
- int m_CEs_[];
- int m_CELength_;
- /**
- * This is the value element maps in original table
- */
- int m_mapCE_;
- int m_sizePrim_[];
- int m_sizeSec_[];
- int m_sizeTer_[];
- boolean m_variableTop_;
- boolean m_caseBit_;
-
- // package private constructors -------------------------------------
-
- /**
- * Package private constructor
- */
- Elements() {
- m_sizePrim_ = new int[128];
- m_sizeSec_ = new int[128];
- m_sizeTer_ = new int[128];
- m_CEs_ = new int[256];
- m_CELength_ = 0;
- }
-
- /**
- * Package private constructor
- */
- Elements(Elements element) {
- m_prefixChars_ = element.m_prefixChars_;
- m_prefix_ = element.m_prefix_;
- m_uchars_ = element.m_uchars_;
- m_cPoints_ = element.m_cPoints_;
- m_cPointsOffset_ = element.m_cPointsOffset_;
- m_CEs_ = element.m_CEs_;
- m_CELength_ = element.m_CELength_;
- m_mapCE_ = element.m_mapCE_;
- m_sizePrim_ = element.m_sizePrim_;
- m_sizeSec_ = element.m_sizeSec_;
- m_sizeTer_ = element.m_sizeTer_;
- m_variableTop_ = element.m_variableTop_;
- m_caseBit_ = element.m_caseBit_;
- }
-
- // package private methods -------------------------------------------
-
- /**
- * Initializing the elements
- */
- public void clear() {
- m_prefixChars_ = null;
- m_prefix_ = 0;
- m_uchars_ = null;
- m_cPoints_ = null;
- m_cPointsOffset_ = 0;
- m_CELength_ = 0;
- m_mapCE_ = 0;
- Arrays.fill(m_sizePrim_, 0);
- Arrays.fill(m_sizeSec_, 0);
- Arrays.fill(m_sizeTer_, 0);
- m_variableTop_ = false;
- m_caseBit_ = false;
- }
-
- /**
- * Hashcode calculation for token
- *
- * @return the hashcode
- */
- public int hashCode() {
- String str = m_cPoints_.substring(m_cPointsOffset_);
- return str.hashCode();
- }
-
- /**
- * Equals calculation
- *
- * @param target Object to compare
- * @return true if target is the same as this object
- */
- public boolean equals(Object target) {
- if (target == this) {
- return true;
- }
- if (target instanceof Elements) {
- Elements t = (Elements) target;
- int size = m_cPoints_.length() - m_cPointsOffset_;
- if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
- return t.m_cPoints_.regionMatches(t.m_cPointsOffset_,
- m_cPoints_, m_cPointsOffset_, size);
- }
- }
- return false;
- }
- }
-
- // private data member ---------------------------------------------------
-
/**
 * Maximum strength used in CE building
 */
private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
/**
 * Maximum collation strength
 */
private static final int CE_STRENGTH_LIMIT_ = 16;
/**
 * Strength mask array, used in inverse UCA
 */
private static final int STRENGTH_MASK_[] = { 0xFFFF0000, 0xFFFFFF00,
        0xFFFFFFFF };
/**
 * Sentinel CE value for a code point that has no mapping (this is a full
 * CE value, unlike the *_TAG_ constants below)
 */
private static final int CE_NOT_FOUND_ = 0xF0000000;
/**
 * CE tag for not found
 */
private static final int CE_NOT_FOUND_TAG_ = 0;
/**
 * This code point results in an expansion
 */
private static final int CE_EXPANSION_TAG_ = 1;
/**
 * Start of a contraction
 */
private static final int CE_CONTRACTION_TAG_ = 2;
/*
 * Thai character - do the reordering
 */
// private static final int CE_THAI_TAG_ = 3;
/*
 * Charset processing, not yet implemented
 */
// private static final int CE_CHARSET_TAG_ = 4;
/**
 * Lead surrogate that is tailored and doesn't start a contraction
 */
private static final int CE_SURROGATE_TAG_ = 5;
/*
 * AC00-D7AF
 */
// private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
/*
 * D800-DBFF
 */
// private static final int CE_LEAD_SURROGATE_TAG_ = 7;
/*
 * DC00-DFFF
 */
// private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
/*
 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
 */
// private static final int CE_CJK_IMPLICIT_TAG_ = 9;
/**
 * Implicit CE tag (a value starting in the UCA)
 */
private static final int CE_IMPLICIT_TAG_ = 10;
/**
 * Special-processing CE tag; presumably prefix/pre-context handling -
 * confirm against its users
 */
private static final int CE_SPEC_PROC_TAG_ = 11;
/**
 * This is a three byte primary with starting secondaries and tertiaries. It
 * fits in a single 32 bit CE and is used instead of expansion to save space
 * without affecting the performance (hopefully)
 */
private static final int CE_LONG_PRIMARY_TAG_ = 12;
/**
 * Unsafe UChar hash table table size. Size is 32 bytes for 1 bit for each
 * latin 1 char + some power of two for hashing the rest of the chars. Size
 * in bytes
 */
private static final int UNSAFECP_TABLE_SIZE_ = 1056;
/**
 * Mask value down to "some power of two" -1. Number of bits, not num of
 * bytes.
 */
private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
/**
 * Case values
 */
private static final int UPPER_CASE_ = 0x80;
private static final int MIXED_CASE_ = 0x40;
private static final int LOWER_CASE_ = 0x00;
/*
 * Initial table size
 */
// private static final int INIT_TABLE_SIZE_ = 1028;
/*
 * Header size, copied from ICU4C, to be changed when that value changes
 */
// private static final int HEADER_SIZE_ = 0xC4;
/**
 * Contraction table new element indicator
 */
private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
/**
 * Parser for the rules
 */
private CollationRuleParser m_parser_;
/**
 * Utility UCA collation element iterator
 */
private CollationElementIterator m_utilColEIter_;
/**
 * Utility data members (scratch objects reused across builder calls to
 * avoid reallocation)
 */
private CEGenerator m_utilGens_[] = { new CEGenerator(), new CEGenerator(),
        new CEGenerator() };
private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
private Elements m_utilElement_ = new Elements();
private Elements m_utilElement2_ = new Elements();
private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();
private int m_utilCountBuffer_[] = new int[6];
private long m_utilLongBuffer_[] = new long[5];
private WeightRange m_utilLowerWeightRange_[] = { new WeightRange(),
        new WeightRange(), new WeightRange(), new WeightRange(),
        new WeightRange() };
private WeightRange m_utilUpperWeightRange_[] = { new WeightRange(),
        new WeightRange(), new WeightRange(), new WeightRange(),
        new WeightRange() };
private WeightRange m_utilWeightRange_ = new WeightRange();
private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
private StringBuilder m_utilStringBuffer_ = new StringBuilder("");
// Flag indicating a combining marks table is required or not.
// NOTE(review): static mutable state shared by all builder instances;
// presumably builds are serialized - confirm before concurrent use.
private static boolean buildCMTabFlag = false;
-
- // private methods -------------------------------------------------------
-
/**
 * Counts, for each token in a list, how many CEs must be inserted at its
 * strength, then walks the list allocating CE values from the CE
 * generators.
 *
 * @param listheader
 *            parsed rule tokens
 * @exception Exception
 *                thrown when internal error occurs
 */
private void initBuffers(CollationRuleParser.TokenListHeader listheader)
        throws Exception {
    CollationRuleParser.Token token = listheader.m_last_;
    Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);

    // Walk backwards from the last token, accumulating per-strength insert
    // counts into m_utilIntBuffer_ and stamping each token's m_toInsert_.
    token.m_toInsert_ = 1;
    m_utilIntBuffer_[token.m_strength_] = 1;
    while (token.m_previous_ != null) {
        if (token.m_previous_.m_strength_ < token.m_strength_) {
            // going up
            m_utilIntBuffer_[token.m_strength_] = 0;
            m_utilIntBuffer_[token.m_previous_.m_strength_]++;
        } else if (token.m_previous_.m_strength_ > token.m_strength_) {
            // going down
            m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
        } else {
            // same strength as the next token
            m_utilIntBuffer_[token.m_strength_]++;
        }
        token = token.m_previous_;
        token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
    }

    token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
    INVERSE_UCA_.getInverseGapPositions(listheader);

    token = listheader.m_first_;
    int fstrength = Collator.IDENTICAL;
    int initstrength = Collator.IDENTICAL;

    // Seed the CE buffer from the base CE and its continuation.
    m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_,
            listheader.m_baseContCE_, Collator.PRIMARY);
    m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_,
            listheader.m_baseContCE_, Collator.SECONDARY);
    m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_,
            listheader.m_baseContCE_, Collator.TERTIARY);
    while (token != null) {
        fstrength = token.m_strength_;
        if (fstrength < initstrength) {
            // First token at a stronger strength: (re)initialize the
            // generators from the inverse-UCA gaps.
            initstrength = fstrength;
            if (listheader.m_pos_[fstrength] == -1) {
                // no gap recorded at this strength; fall back to the
                // nearest weaker strength that has one
                while (listheader.m_pos_[fstrength] == -1 && fstrength > 0) {
                    fstrength--;
                }
                if (listheader.m_pos_[fstrength] == -1) {
                    throw new Exception("Internal program error");
                }
            }
            if (initstrength == Collator.TERTIARY) {
                // starting with tertiary
                m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
                m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];
                m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
                        m_utilGens_[Collator.TERTIARY],
                        listheader.m_gapsLo_, listheader.m_gapsHi_, token,
                        fstrength);
            } else if (initstrength == Collator.SECONDARY) {
                // secondaries
                m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
                m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(
                        m_utilGens_[Collator.SECONDARY],
                        listheader.m_gapsLo_, listheader.m_gapsHi_, token,
                        fstrength);
                m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                        m_utilGens_[Collator.TERTIARY], token,
                        Collator.TERTIARY);
            } else {
                // primaries
                m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(
                        m_utilGens_[Collator.PRIMARY],
                        listheader.m_gapsLo_, listheader.m_gapsHi_, token,
                        fstrength);
                m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
                        m_utilGens_[Collator.SECONDARY], token,
                        Collator.SECONDARY);
                m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                        m_utilGens_[Collator.TERTIARY], token,
                        Collator.TERTIARY);
            }
        } else {
            // continue at the current strength; weaker generators restart
            if (token.m_strength_ == Collator.TERTIARY) {
                m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
            } else if (token.m_strength_ == Collator.SECONDARY) {
                m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
                m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                        m_utilGens_[Collator.TERTIARY], token,
                        Collator.TERTIARY);
            } else if (token.m_strength_ == Collator.PRIMARY) {
                m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);
                m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
                        m_utilGens_[Collator.SECONDARY], token,
                        Collator.SECONDARY);
                m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                        m_utilGens_[Collator.TERTIARY], token,
                        Collator.TERTIARY);
            }
        }
        doCE(m_utilCEBuffer_, token);
        token = token.m_next_;
    }
}
-
- /**
- * Get the next generated ce
- *
- * @param g
- * ce generator
- * @return next generated ce
- */
- private int getNextGenerated(CEGenerator g) {
- g.m_current_ = nextWeight(g);
- return g.m_current_;
- }
-
- /**
- * @param g
- * CEGenerator
- * @param token
- * rule token
- * @param strength
- * @return ce generator
- * @exception Exception
- * thrown when internal error occurs
- */
- private int getSimpleCEGenerator(CEGenerator g,
- CollationRuleParser.Token token, int strength) throws Exception {
- int high, low, count = 1;
- int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;
-
- if (strength == Collator.SECONDARY) {
- low = RuleBasedCollator.COMMON_TOP_2_ << 24;
- high = 0xFFFFFFFF;
- count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;
- } else {
- low = RuleBasedCollator.BYTE_COMMON_ << 24; // 0x05000000;
- high = 0x40000000;
- count = 0x40 - RuleBasedCollator.BYTE_COMMON_;
- }
-
- if (token.m_next_ != null && token.m_next_.m_strength_ == strength) {
- count = token.m_next_.m_toInsert_;
- }
-
- g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
- g.m_ranges_);
- g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;
-
- if (g.m_rangesLength_ == 0) {
- throw new Exception("Internal program error");
- }
- return g.m_current_;
- }
-
- /**
- * Combines 2 ce into one with respect to the argument strength
- *
- * @param ce1
- * first ce
- * @param ce2
- * second ce
- * @param strength
- * strength to use
- * @return combined ce
- */
- private static int mergeCE(int ce1, int ce2, int strength) {
- int mask = RuleBasedCollator.CE_TERTIARY_MASK_;
- if (strength == Collator.SECONDARY) {
- mask = RuleBasedCollator.CE_SECONDARY_MASK_;
- } else if (strength == Collator.PRIMARY) {
- mask = RuleBasedCollator.CE_PRIMARY_MASK_;
- }
- ce1 &= mask;
- ce2 &= mask;
- switch (strength) {
- case Collator.PRIMARY:
- return ce1 | ce2 >>> 16;
- case Collator.SECONDARY:
- return ce1 << 16 | ce2 << 8;
- default:
- return ce1 << 24 | ce2 << 16;
- }
- }
-
- /**
- * @param g
- * CEGenerator
- * @param lows
- * low gap array
- * @param highs
- * high gap array
- * @param token
- * rule token
- * @param fstrength
- * @exception Exception
- * thrown when internal error occurs
- */
- private int getCEGenerator(CEGenerator g, int lows[], int highs[],
- CollationRuleParser.Token token, int fstrength) throws Exception {
- int strength = token.m_strength_;
- int low = lows[fstrength * 3 + strength];
- int high = highs[fstrength * 3 + strength];
- int maxbyte = 0;
- if (strength == Collator.TERTIARY) {
- maxbyte = 0x3F;
- } else if (strength == Collator.PRIMARY) {
- maxbyte = 0xFE;
- } else {
- maxbyte = 0xFF;
- }
-
- int count = token.m_toInsert_;
-
- if (Utility.compareUnsigned(low, high) >= 0
- && strength > Collator.PRIMARY) {
- int s = strength;
- while (true) {
- s--;
- if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {
- if (strength == Collator.SECONDARY) {
- if (low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) {
- // Override if low range is less than
- // UCOL_COMMON_TOP2.
- low = RuleBasedCollator.COMMON_TOP_2_ << 24;
- }
- high = 0xFFFFFFFF;
- } else {
- if (low < RuleBasedCollator.COMMON_BOTTOM_3 << 24) {
- // Override if low range is less than
- // UCOL_COMMON_BOT3.
- low = RuleBasedCollator.COMMON_BOTTOM_3 << 24;
- }
- high = 0x40000000;
- }
- break;
- }
- if (s < 0) {
- throw new Exception("Internal program error");
- }
- }
- }
- if(0 <= low && low < 0x02000000) { // unsigned comparison < 0x02000000
- // We must not use CE weight byte 02, so we set it as the minimum lower bound.
- // See http://site.icu-project.org/design/collation/bytes
- low = 0x02000000;
- }
-
- if (strength == Collator.SECONDARY) { // similar as simple
- if (Utility.compareUnsigned(low,
- RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0
- && Utility.compareUnsigned(low,
- RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
- low = RuleBasedCollator.COMMON_TOP_2_ << 24;
- }
- if (Utility.compareUnsigned(high,
- RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0
- && Utility.compareUnsigned(high,
- RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
- high = RuleBasedCollator.COMMON_TOP_2_ << 24;
- }
- if (Utility.compareUnsigned(low,
- RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {
- g.m_rangesLength_ = allocateWeights(
- RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24, high,
- count, maxbyte, g.m_ranges_);
- g.m_current_ = nextWeight(g);
- // g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;
- return g.m_current_;
- }
- }
-
- g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
- g.m_ranges_);
- if (g.m_rangesLength_ == 0) {
- throw new Exception("Internal program error");
- }
- g.m_current_ = nextWeight(g);
- return g.m_current_;
- }
-
/**
 * Packs the primary/secondary/tertiary weight parts into 32-bit CEs for a
 * token, then sets the token's case bits.
 *
 * @param ceparts
 *            list of collation elements parts
 * @param token
 *            rule token
 * @exception Exception
 *                thrown when forming case bits for expansions fails
 */
private void doCE(int ceparts[], CollationRuleParser.Token token)
        throws Exception {
    // this one makes the table and stuff
    // int noofbytes[] = new int[3];
    for (int i = 0; i < 3; i++) {
        // noofbytes[i] = countBytes(ceparts[i]);
        m_utilIntBuffer_[i] = countBytes(ceparts[i]);
    }

    // Here we have to pack CEs from parts.
    // Each CE carries 16 primary bits, 8 secondary bits and 6 tertiary
    // bits; follow-on CEs are flagged with the continuation marker.
    int cei = 0;
    int value = 0;

    while ((cei << 1) < m_utilIntBuffer_[0] || cei < m_utilIntBuffer_[1]
            || cei < m_utilIntBuffer_[2]) {
        if (cei > 0) {
            value = RuleBasedCollator.CE_CONTINUATION_MARKER_;
        } else {
            value = 0;
        }

        if ((cei << 1) < m_utilIntBuffer_[0]) {
            // next 16 bits of the primary part
            value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16;
        }
        if (cei < m_utilIntBuffer_[1]) {
            // next 8 bits of the secondary part
            value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;
        }

        if (cei < m_utilIntBuffer_[2]) {
            // next 6 bits of the tertiary part
            value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F);
        }
        token.m_CE_[cei] = value;
        cei++;
    }
    if (cei == 0) { // totally ignorable
        token.m_CELength_ = 1;
        token.m_CE_[0] = 0;
    } else { // there is at least something
        token.m_CELength_ = cei;
    }

    // Case bits handling for expansion
    if (token.m_CE_[0] != 0) { // case bits should be set only for
        // non-ignorables
        token.m_CE_[0] &= 0xFFFFFF3F; // Clean the case bits field
        // m_source_ packs the token's length in the high byte and its
        // start offset in the rule string in the low 24 bits
        int cSize = (token.m_source_ & 0xFF000000) >>> 24;
        int startoftokenrule = token.m_source_ & 0x00FFFFFF;

        if (cSize > 1) {
            // Do it manually
            String tokenstr = token.m_rules_.substring(startoftokenrule,
                    startoftokenrule + cSize);
            token.m_CE_[0] |= getCaseBits(tokenstr);
        } else {
            // Copy it from the UCA
            int caseCE = getFirstCE(token.m_rules_.charAt(startoftokenrule));
            token.m_CE_[0] |= (caseCE & 0xC0);
        }
    }
}
-
- /**
- * Count the number of non-zero bytes used in the ce
- *
- * @param ce
- * @return number of non-zero bytes used in ce
- */
- private static final int countBytes(int ce) {
- int mask = 0xFFFFFFFF;
- int result = 0;
- while (mask != 0) {
- if ((ce & mask) != 0) {
- result++;
- }
- mask >>>= 8;
- }
- return result;
- }
-
- /**
- * We are ready to create collation elements
- *
- * @param t
- * build table to insert
- * @param lh
- * rule token list header
- */
- private void createElements(BuildTable t,
- CollationRuleParser.TokenListHeader lh) {
- CollationRuleParser.Token tok = lh.m_first_;
- m_utilElement_.clear();
- while (tok != null) {
- // first, check if there are any expansions
- // if there are expansions, we need to do a little bit more
- // processing since parts of expansion can be tailored, while
- // others are not
- if (tok.m_expansion_ != 0) {
- int len = tok.m_expansion_ >>> 24;
- int currentSequenceLen = len;
- int expOffset = tok.m_expansion_ & 0x00FFFFFF;
- m_utilToken_.m_source_ = currentSequenceLen | expOffset;
- m_utilToken_.m_rules_ = m_parser_.m_source_;
-
- while (len > 0) {
- currentSequenceLen = len;
- while (currentSequenceLen > 0) {
- m_utilToken_.m_source_ = (currentSequenceLen << 24)
- | expOffset;
- CollationRuleParser.Token expt = m_parser_.m_hashTable_.get(m_utilToken_);
- if (expt != null
- && expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) {
- // expansion is tailored
- int noOfCEsToCopy = expt.m_CELength_;
- for (int j = 0; j < noOfCEsToCopy; j++) {
- tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j];
- }
- tok.m_expCELength_ += noOfCEsToCopy;
- // never try to add codepoints and CEs.
- // For some odd reason, it won't work.
- expOffset += currentSequenceLen; // noOfCEsToCopy;
- len -= currentSequenceLen; // noOfCEsToCopy;
- break;
- } else {
- currentSequenceLen--;
- }
- }
- if (currentSequenceLen == 0) {
- // couldn't find any tailored subsequence, will have to
- // get one from UCA. first, get the UChars from the
- // rules then pick CEs out until there is no more and
- // stuff them into expansion
- m_utilColEIter_.setText(m_parser_.m_source_.substring(
- expOffset, expOffset + 1));
- while (true) {
- int order = m_utilColEIter_.next();
- if (order == CollationElementIterator.NULLORDER) {
- break;
- }
- tok.m_expCE_[tok.m_expCELength_++] = order;
- }
- expOffset++;
- len--;
- }
- }
- } else {
- tok.m_expCELength_ = 0;
- }
-
- // set the ucaelement with obtained values
- m_utilElement_.m_CELength_ = tok.m_CELength_ + tok.m_expCELength_;
-
- // copy CEs
- System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,
- tok.m_CELength_);
- System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,
- tok.m_CELength_, tok.m_expCELength_);
-
- // copy UChars
- // We kept prefix and source kind of together, as it is a kind of a
- // contraction.
- // However, now we have to slice the prefix off the main thing -
- m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
- m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
- if (tok.m_prefix_ != 0) {
- // we will just copy the prefix here, and adjust accordingly in
- // the addPrefix function in ucol_elm. The reason is that we
- // need to add both composed AND decomposed elements to the
- // unsafe table.
- int size = tok.m_prefix_ >> 24;
- int offset = tok.m_prefix_ & 0x00FFFFFF;
- m_utilElement_.m_prefixChars_ = m_parser_.m_source_.substring(
- offset, offset + size);
- size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
- offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24);
- m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
- offset, offset + size);
- } else {
- m_utilElement_.m_prefixChars_ = null;
- int offset = tok.m_source_ & 0x00FFFFFF;
- int size = tok.m_source_ >>> 24;
- m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
- offset, offset + size);
- }
- m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
-
- boolean containCombinMarks = false;
- for (int i = 0; i < m_utilElement_.m_cPoints_.length()
- - m_utilElement_.m_cPointsOffset_; i++) {
- if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {
- t.m_collator_.m_isJamoSpecial_ = true;
- break;
- }
- if (!buildCMTabFlag) {
- // check combining class
- int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters
- if ((fcd & 0xff) == 0) {
- // reset flag when current char is not combining mark.
- containCombinMarks = false;
- } else {
- containCombinMarks = true;
- }
- }
- }
-
- if (!buildCMTabFlag && containCombinMarks) {
- buildCMTabFlag = true;
- }
-
- /***
- * // Case bits handling m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F; //
- * Clean the case bits field if (m_utilElement_.m_cPoints_.length()
- * - m_utilElement_.m_cPointsOffset_ > 1) { // Do it manually
- * m_utilElement_.m_CEs_[0] |=
- * getCaseBits(m_utilElement_.m_cPoints_); } else { // Copy it from
- * the UCA int caseCE =
- * getFirstCE(m_utilElement_.m_cPoints_.charAt(0));
- * m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0); }
- ***/
- // and then, add it
- addAnElement(t, m_utilElement_);
- tok = tok.m_next_;
- }
- }
-
- /**
- * Testing if the string argument has case
- *
- * @param src
- * string
- * @return the case for this char array
- * @exception Exception
- * thrown when internal program error occurs
- */
- private final int getCaseBits(String src) throws Exception {
- int uCount = 0;
- int lCount = 0;
- src = Normalizer.decompose(src, true);
- m_utilColEIter_.setText(src);
- for (int i = 0; i < src.length(); i++) {
- m_utilColEIter_.setText(src.substring(i, i + 1));
- int order = m_utilColEIter_.next();
- if (RuleBasedCollator.isContinuation(order)) {
- throw new Exception("Internal program error");
- }
- if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) {
- uCount++;
- } else {
- char ch = src.charAt(i);
- if (UCharacter.isLowerCase(ch)) {
- lCount++;
- } else {
- if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {
- lCount++;
- }
- }
- }
- }
-
- if (uCount != 0 && lCount != 0) {
- return MIXED_CASE_;
- } else if (uCount != 0) {
- return UPPER_CASE_;
- } else {
- return LOWER_CASE_;
- }
- }
-
- /**
- * Converts a char to the uppercase Kana
- *
- * @param ch
- * character to convert
- * @return the converted Kana character
- */
- private static final char toLargeKana(char ch) {
- if (0x3042 < ch && ch < 0x30ef) { // Kana range
- switch (ch - 0x3000) {
- case 0x41:
- case 0x43:
- case 0x45:
- case 0x47:
- case 0x49:
- case 0x63:
- case 0x83:
- case 0x85:
- case 0x8E:
- case 0xA1:
- case 0xA3:
- case 0xA5:
- case 0xA7:
- case 0xA9:
- case 0xC3:
- case 0xE3:
- case 0xE5:
- case 0xEE:
- ch++;
- break;
- case 0xF5:
- ch = 0x30AB;
- break;
- case 0xF6:
- ch = 0x30B1;
- break;
- }
- }
- return ch;
- }
-
- /**
- * Converts a char to the lowercase Kana
- *
- * @param ch
- * character to convert
- * @return the converted Kana character
- */
- private static final char toSmallKana(char ch) {
- if (0x3042 < ch && ch < 0x30ef) { // Kana range
- switch (ch - 0x3000) {
- case 0x42:
- case 0x44:
- case 0x46:
- case 0x48:
- case 0x4A:
- case 0x64:
- case 0x84:
- case 0x86:
- case 0x8F:
- case 0xA2:
- case 0xA4:
- case 0xA6:
- case 0xA8:
- case 0xAA:
- case 0xC4:
- case 0xE4:
- case 0xE6:
- case 0xEF:
- ch--;
- break;
- case 0xAB:
- ch = 0x30F5;
- break;
- case 0xB1:
- ch = 0x30F6;
- break;
- }
- }
- return ch;
- }
-
- /**
- * This should be connected to special Jamo handling.
- */
- private int getFirstCE(char ch) {
- m_utilColEIter_.setText(UCharacter.toString(ch));
- return m_utilColEIter_.next();
- }
-
- /**
- * This adds a read element, while testing for existence
- *
- * @param t
- * build table
- * @param element
- * @return ce
- */
- private int addAnElement(BuildTable t, Elements element) {
- List<Integer> expansions = t.m_expansions_;
- element.m_mapCE_ = 0;
-
- if (element.m_CELength_ == 1) {
- element.m_mapCE_ = element.m_CEs_[0];
-
- } else {
- // unfortunately, it looks like we have to look for a long primary
- // here since in canonical closure we are going to hit some long
- // primaries from the first phase, and they will come back as
- // continuations/expansions destroying the effect of the previous
- // opitimization. A long primary is a three byte primary with
- // starting secondaries and tertiaries. It can appear in long runs
- // of only primary differences (like east Asian tailorings) also,
- // it should not be an expansion, as expansions would break with
- // this
- if (element.m_CELength_ == 2 // a two CE expansion
- && RuleBasedCollator.isContinuation(element.m_CEs_[1])
- && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that
- // has
- // only
- // primaries
- // in
- // continuation
- && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
- // a common secondary
- && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // and
- // a
- // common
- // tertiary
- ) {
- element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
- // a long primary special
- | (CE_LONG_PRIMARY_TAG_ << 24)
- // first and second byte of primary
- | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
- // third byte of primary
- | ((element.m_CEs_[1] >> 24) & 0xFF);
- } else {
- // omitting expansion offset in builder
- // (HEADER_SIZE_ >> 2)
- int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
- | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
- | (addExpansion(expansions, element.m_CEs_[0]) << 4)
- & 0xFFFFF0;
-
- for (int i = 1; i < element.m_CELength_; i++) {
- addExpansion(expansions, element.m_CEs_[i]);
- }
- if (element.m_CELength_ <= 0xF) {
- expansion |= element.m_CELength_;
- } else {
- addExpansion(expansions, 0);
- }
- element.m_mapCE_ = expansion;
- setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
- (byte) element.m_CELength_, t.m_maxExpansions_);
- if (isJamo(element.m_cPoints_.charAt(0))) {
- t.m_collator_.m_isJamoSpecial_ = true;
- setMaxJamoExpansion(element.m_cPoints_.charAt(0),
- element.m_CEs_[element.m_CELength_ - 1],
- (byte) element.m_CELength_, t.m_maxJamoExpansions_);
- }
- }
- }
-
- // We treat digits differently - they are "uber special" and should be
- // processed differently if numeric collation is on.
- int uniChar = 0;
- if ((element.m_uchars_.length() == 2)
- && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
- uniChar = UCharacterProperty.getRawSupplementary(element.m_uchars_
- .charAt(0), element.m_uchars_.charAt(1));
- } else if (element.m_uchars_.length() == 1) {
- uniChar = element.m_uchars_.charAt(0);
- }
-
- // Here, we either have one normal CE OR mapCE is set. Therefore, we
- // stuff only one element to the expansion buffer. When we encounter a
- // digit and we don't do numeric collation, we will just pick the CE
- // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
- // && ucol_prv_getSpecialPrevCE). If we picked a special, further
- // processing will occur. If it's a simple CE, we'll return due
- // to how the loop is constructed.
- if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
- // prepare the element
- int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
- | (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
- | 1;
- if (element.m_mapCE_ != 0) {
- // if there is an expansion, we'll pick it here
- expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
- } else {
- expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
- }
- element.m_mapCE_ = expansion;
- }
-
- // here we want to add the prefix structure.
- // I will try to process it as a reverse contraction, if possible.
- // prefix buffer is already reversed.
-
- if (element.m_prefixChars_ != null
- && element.m_prefixChars_.length() - element.m_prefix_ > 0) {
- // We keep the seen prefix starter elements in a hashtable we need
- // it to be able to distinguish between the simple codepoints and
- // prefix starters. Also, we need to use it for canonical closure.
- m_utilElement2_.m_caseBit_ = element.m_caseBit_;
- m_utilElement2_.m_CELength_ = element.m_CELength_;
- m_utilElement2_.m_CEs_ = element.m_CEs_;
- m_utilElement2_.m_mapCE_ = element.m_mapCE_;
- // m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
- m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
- m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
- m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
- m_utilElement2_.m_variableTop_ = element.m_variableTop_;
- m_utilElement2_.m_prefix_ = element.m_prefix_;
- m_utilElement2_.m_prefixChars_ = Normalizer.compose(
- element.m_prefixChars_, false);
- m_utilElement2_.m_uchars_ = element.m_uchars_;
- m_utilElement2_.m_cPoints_ = element.m_cPoints_;
- m_utilElement2_.m_cPointsOffset_ = 0;
-
- if (t.m_prefixLookup_ != null) {
- Elements uCE = t.m_prefixLookup_.get(element);
- if (uCE != null) {
- // there is already a set of code points here
- element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_, element);
- } else { // no code points, so this spot is clean
- element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_, element);
- uCE = new Elements(element);
- uCE.m_cPoints_ = uCE.m_uchars_;
- t.m_prefixLookup_.put(uCE, uCE);
- }
- if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_
- .length()
- - element.m_prefix_
- || !m_utilElement2_.m_prefixChars_.regionMatches(0,
- element.m_prefixChars_, element.m_prefix_,
- m_utilElement2_.m_prefixChars_.length())) {
- // do it!
- m_utilElement2_.m_mapCE_ = addPrefix(t, element.m_mapCE_,
- m_utilElement2_);
- }
- }
- }
-
- // We need to use the canonical iterator here
- // the way we do it is to generate the canonically equivalent strings
- // for the contraction and then add the sequences that pass FCD check
- if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
- && !(element.m_cPoints_.length() - element.m_cPointsOffset_ == 2
- && UTF16.isLeadSurrogate(element.m_cPoints_.charAt(0)) && UTF16
- .isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
- // this is a contraction, we should check whether a composed form
- // should also be included
- m_utilCanIter_.setSource(element.m_cPoints_);
- String source = m_utilCanIter_.next();
- while (source != null && source.length() > 0) {
- if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {
- element.m_uchars_ = source;
- element.m_cPoints_ = element.m_uchars_;
- finalizeAddition(t, element);
- }
- source = m_utilCanIter_.next();
- }
-
- return element.m_mapCE_;
- } else {
- return finalizeAddition(t, element);
- }
- }
-
- /**
- * Adds an expansion ce to the expansion vector
- *
- * @param expansions
- * vector to add to
- * @param value
- * of the expansion
- * @return the current position of the new element
- */
- private static final int addExpansion(List<Integer> expansions, int value) {
- expansions.add(Integer.valueOf(value));
- return expansions.size() - 1;
- }
-
- /**
- * Looks for the maximum length of all expansion sequences ending with the
- * same collation element. The size required for maxexpansion and maxsize is
- * returned if the arrays are too small.
- *
- * @param endexpansion
- * the last expansion collation element to be added
- * @param expansionsize
- * size of the expansion
- * @param maxexpansion
- * data structure to store the maximum expansion data.
- * @returns size of the maxexpansion and maxsize used.
- */
- private static int setMaxExpansion(int endexpansion, byte expansionsize,
- MaxExpansionTable maxexpansion) {
- int start = 0;
- int limit = maxexpansion.m_endExpansionCE_.size();
- long unsigned = (long) endexpansion;
- unsigned &= 0xFFFFFFFFl;
-
- // using binary search to determine if last expansion element is
- // already in the array
- int result = -1;
- if (limit > 0) {
- while (start < limit - 1) {
- int mid = (start + limit) >> 1;
- long unsignedce = (maxexpansion.m_endExpansionCE_
- .get(mid)).intValue();
- unsignedce &= 0xFFFFFFFFl;
- if (unsigned < unsignedce) {
- limit = mid;
- } else {
- start = mid;
- }
- }
-
- if ((maxexpansion.m_endExpansionCE_.get(start)).intValue() == endexpansion) {
- result = start;
- }
- }
- if (result > -1) {
- // found the ce in expansion, we'll just modify the size if it
- // is smaller
- Object currentsize = maxexpansion.m_expansionCESize_.get(result);
- if (((Byte) currentsize).byteValue() < expansionsize) {
- maxexpansion.m_expansionCESize_.set(result, Byte.valueOf(
- expansionsize));
- }
- } else {
- // we'll need to squeeze the value into the array. initial
- // implementation. shifting the subarray down by 1
- maxexpansion.m_endExpansionCE_.add(start + 1, Integer.valueOf(endexpansion));
- maxexpansion.m_expansionCESize_.add(start + 1, Byte.valueOf(expansionsize));
- }
- return maxexpansion.m_endExpansionCE_.size();
- }
-
- /**
- * Sets the maximum length of all jamo expansion sequences ending with the
- * same collation element. The size required for maxexpansion and maxsize is
- * returned if the arrays are too small.
- *
- * @param ch
- * the jamo codepoint
- * @param endexpansion
- * the last expansion collation element to be added
- * @param expansionsize
- * size of the expansion
- * @param maxexpansion
- * data structure to store the maximum expansion data.
- * @returns size of the maxexpansion and maxsize used.
- */
- private static int setMaxJamoExpansion(char ch, int endexpansion,
- byte expansionsize, MaxJamoExpansionTable maxexpansion) {
- boolean isV = true;
- if (ch >= 0x1100 && ch <= 0x1112) {
- // determines L for Jamo, doesn't need to store this since it is
- // never at the end of a expansion
- if (maxexpansion.m_maxLSize_ < expansionsize) {
- maxexpansion.m_maxLSize_ = expansionsize;
- }
- return maxexpansion.m_endExpansionCE_.size();
- }
-
- if (ch >= 0x1161 && ch <= 0x1175) {
- // determines V for Jamo
- if (maxexpansion.m_maxVSize_ < expansionsize) {
- maxexpansion.m_maxVSize_ = expansionsize;
- }
- }
-
- if (ch >= 0x11A8 && ch <= 0x11C2) {
- isV = false;
- // determines T for Jamo
- if (maxexpansion.m_maxTSize_ < expansionsize) {
- maxexpansion.m_maxTSize_ = expansionsize;
- }
- }
-
- int pos = maxexpansion.m_endExpansionCE_.size();
- while (pos > 0) {
- pos--;
- if ((maxexpansion.m_endExpansionCE_.get(pos)).intValue() == endexpansion) {
- return maxexpansion.m_endExpansionCE_.size();
- }
- }
- maxexpansion.m_endExpansionCE_.add(Integer.valueOf(endexpansion));
- maxexpansion.m_isV_.add(isV ? Boolean.TRUE : Boolean.FALSE);
-
- return maxexpansion.m_endExpansionCE_.size();
- }
-
- /**
- * Adds a prefix to the table
- *
- * @param t
- * build table to update
- * @param CE
- * collation element to add
- * @param element
- * rule element to add
- * @return modified ce
- */
- private int addPrefix(BuildTable t, int CE, Elements element) {
- // currently the longest prefix we're supporting in Japanese is two
- // characters long. Although this table could quite easily mimic
- // complete contraction stuff there is no good reason to make a general
- // solution, as it would require some error prone messing.
- ContractionTable contractions = t.m_contractions_;
- String oldCP = element.m_cPoints_;
- int oldCPOffset = element.m_cPointsOffset_;
-
- contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;
- // here, we will normalize & add prefix to the table.
- int size = element.m_prefixChars_.length() - element.m_prefix_;
- for (int j = 1; j < size; j++) {
- // First add NFD prefix chars to unsafe CP hash table
- // Unless it is a trail surrogate, which is handled algoritmically
- // and shouldn't take up space in the table.
- char ch = element.m_prefixChars_.charAt(j + element.m_prefix_);
- if (!UTF16.isTrailSurrogate(ch)) {
- unsafeCPSet(t.m_unsafeCP_, ch);
- }
- }
-
- // StringBuffer reversed = new StringBuffer();
- m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
- for (int j = 0; j < size; j++) {
- // prefixes are going to be looked up backwards
- // therefore, we will promptly reverse the prefix buffer...
- int offset = element.m_prefixChars_.length() - j - 1;
- m_utilStringBuffer_.append(element.m_prefixChars_.charAt(offset));
- }
- element.m_prefixChars_ = m_utilStringBuffer_.toString();
- element.m_prefix_ = 0;
-
- // the first codepoint is also unsafe, as it forms a 'contraction' with
- // the prefix
- if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {
- unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));
- }
-
- element.m_cPoints_ = element.m_prefixChars_;
- element.m_cPointsOffset_ = element.m_prefix_;
-
- // Add the last char of the contraction to the contraction-end hash
- // table. unless it is a trail surrogate, which is handled
- // algorithmically and shouldn't be in the table
- if (!UTF16.isTrailSurrogate(element.m_cPoints_
- .charAt(element.m_cPoints_.length() - 1))) {
- ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
- .charAt(element.m_cPoints_.length() - 1));
- }
- // First we need to check if contractions starts with a surrogate
- // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
-
- // If there are any Jamos in the contraction, we should turn on special
- // processing for Jamos
- if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {
- t.m_collator_.m_isJamoSpecial_ = true;
- }
- // then we need to deal with it
- // we could aready have something in table - or we might not
- if (!isPrefix(CE)) {
- // if it wasn't contraction, we wouldn't end up here
- int firstContractionOffset = addContraction(contractions,
- CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
- int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
- addContraction(contractions, firstContractionOffset,
- element.m_prefixChars_.charAt(element.m_prefix_), newCE);
- addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
- CE);
- CE = constructSpecialCE(CE_SPEC_PROC_TAG_, firstContractionOffset);
- } else {
- // we are adding to existing contraction
- // there were already some elements in the table, so we need to add
- // a new contraction
- // Two things can happen here: either the codepoint is already in
- // the table, or it is not
- char ch = element.m_prefixChars_.charAt(element.m_prefix_);
- int position = findCP(contractions, CE, ch);
- if (position > 0) {
- // if it is we just continue down the chain
- int eCE = getCE(contractions, CE, position);
- int newCE = processContraction(contractions, element, eCE);
- setContraction(contractions, CE, position, ch, newCE);
- } else {
- // if it isn't, we will have to create a new sequence
- processContraction(contractions, element, CE_NOT_FOUND_);
- insertContraction(contractions, CE, ch, element.m_mapCE_);
- }
- }
-
- element.m_cPoints_ = oldCP;
- element.m_cPointsOffset_ = oldCPOffset;
-
- return CE;
- }
-
- /**
- * Checks if the argument ce is a contraction
- *
- * @param CE
- * collation element
- * @return true if argument ce is a contraction
- */
- private static final boolean isContraction(int CE) {
- return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);
- }
-
- /**
- * Checks if the argument ce has a prefix
- *
- * @param CE
- * collation element
- * @return true if argument ce has a prefix
- */
- private static final boolean isPrefix(int CE) {
- return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);
- }
-
- /**
- * Checks if the argument ce is special
- *
- * @param CE
- * collation element
- * @return true if argument ce is special
- */
- private static final boolean isSpecial(int CE) {
- return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;
- }
-
- /**
- * Checks if the argument ce has a prefix
- *
- * @param CE
- * collation element
- * @return true if argument ce has a prefix
- */
- private static final int getCETag(int CE) {
- return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_;
- }
-
- /**
- * Gets the ce at position in contraction table
- *
- * @param table
- * contraction table
- * @param position
- * offset to the contraction table
- * @return ce
- */
- private static final int getCE(ContractionTable table, int element,
- int position) {
- element &= 0xFFFFFF;
- BasicContractionTable tbl = getBasicContractionTable(table, element);
-
- if (tbl == null) {
- return CE_NOT_FOUND_;
- }
- if (position > tbl.m_CEs_.size() || position == -1) {
- return CE_NOT_FOUND_;
- } else {
- return tbl.m_CEs_.get(position).intValue();
- }
- }
-
- /**
- * Sets the unsafe character
- *
- * @param table
- * unsafe table
- * @param c
- * character to be added
- */
- private static final void unsafeCPSet(byte table[], char c) {
- int hash = c;
- if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
- if (hash >= 0xd800 && hash <= 0xf8ff) {
- // Part of a surrogate, or in private use area.
- // These don't go in the table
- return;
- }
- hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
- }
- table[hash >> 3] |= (1 << (hash & 7));
- }
-
- /**
- * Sets the contraction end character
- *
- * @param table
- * contraction end table
- * @param c
- * character to be added
- */
- private static final void ContrEndCPSet(byte table[], char c) {
- int hash = c;
- if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
- hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
- }
- table[hash >> 3] |= (1 << (hash & 7));
- }
-
- /**
- * Adds more contractions in table. If element is non existant, it creates
- * on. Returns element handle
- *
- * @param table
- * contraction table
- * @param element
- * offset to the contraction table
- * @param codePoint
- * codepoint to add
- * @param value
- * @return collation element
- */
- private static int addContraction(ContractionTable table, int element,
- char codePoint, int value) {
- BasicContractionTable tbl = getBasicContractionTable(table, element);
- if (tbl == null) {
- tbl = addAContractionElement(table);
- element = table.m_elements_.size() - 1;
- }
-
- tbl.m_CEs_.add(Integer.valueOf(value));
- tbl.m_codePoints_.append(codePoint);
- return constructSpecialCE(table.m_currentTag_, element);
- }
-
- /**
- * Adds a contraction element to the table
- *
- * @param table
- * contraction table to update
- * @return contraction
- */
- private static BasicContractionTable addAContractionElement(
- ContractionTable table) {
- BasicContractionTable result = new BasicContractionTable();
- table.m_elements_.add(result);
- return result;
- }
-
- /**
- * Constructs a special ce
- *
- * @param tag
- * special tag
- * @param CE
- * collation element
- * @return a contraction ce
- */
- private static final int constructSpecialCE(int tag, int CE) {
- return RuleBasedCollator.CE_SPECIAL_FLAG_
- | (tag << RuleBasedCollator.CE_TAG_SHIFT_) | (CE & 0xFFFFFF);
- }
-
- /**
- * Sets and inserts the element that has a contraction
- *
- * @param contractions
- * contraction table
- * @param element
- * contracting element
- * @param existingCE
- * @return contraction ce
- */
- private static int processContraction(ContractionTable contractions,
- Elements element, int existingCE) {
- int firstContractionOffset = 0;
- // end of recursion
- if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {
- if (isContractionTableElement(existingCE)
- && getCETag(existingCE) == contractions.m_currentTag_) {
- changeContraction(contractions, existingCE, (char) 0,
- element.m_mapCE_);
- changeContraction(contractions, existingCE, (char) 0xFFFF,
- element.m_mapCE_);
- return existingCE;
- } else {
- // can't do just that. existingCe might be a contraction,
- // meaning that we need to do another step
- return element.m_mapCE_;
- }
- }
-
- // this recursion currently feeds on the only element we have...
- // We will have to copy it in order to accomodate for both backward
- // and forward cycles
- // we encountered either an empty space or a non-contraction element
- // this means we are constructing a new contraction sequence
- element.m_cPointsOffset_++;
- if (!isContractionTableElement(existingCE)) {
- // if it wasn't contraction, we wouldn't end up here
- firstContractionOffset = addContraction(contractions,
- CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, existingCE);
- int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
- addContraction(contractions, firstContractionOffset,
- element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE);
- addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
- existingCE);
- existingCE = constructSpecialCE(contractions.m_currentTag_,
- firstContractionOffset);
- } else {
- // we are adding to existing contraction
- // there were already some elements in the table, so we need to add
- // a new contraction
- // Two things can happen here: either the codepoint is already in
- // the table, or it is not
- int position = findCP(contractions, existingCE, element.m_cPoints_
- .charAt(element.m_cPointsOffset_));
- if (position > 0) {
- // if it is we just continue down the chain
- int eCE = getCE(contractions, existingCE, position);
- int newCE = processContraction(contractions, element, eCE);
- setContraction(contractions, existingCE, position,
- element.m_cPoints_.charAt(element.m_cPointsOffset_),
- newCE);
- } else {
- // if it isn't, we will have to create a new sequence
- int newCE = processContraction(contractions, element,
- CE_NOT_FOUND_);
- insertContraction(contractions, existingCE, element.m_cPoints_
- .charAt(element.m_cPointsOffset_), newCE);
- }
- }
- element.m_cPointsOffset_--;
- return existingCE;
- }
-
- /**
- * Checks if CE belongs to the contraction table
- *
- * @param CE
- * collation element to test
- * @return true if CE belongs to the contraction table
- */
- private static final boolean isContractionTableElement(int CE) {
- return isSpecial(CE)
- && (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_);
- }
-
- /**
- * Gets the codepoint
- *
- * @param table
- * contraction table
- * @param element
- * offset to the contraction element in the table
- * @param codePoint
- * code point to look for
- * @return the offset to the code point
- */
- private static int findCP(ContractionTable table, int element,
- char codePoint) {
- BasicContractionTable tbl = getBasicContractionTable(table, element);
- if (tbl == null) {
- return -1;
- }
-
- int position = 0;
- while (codePoint > tbl.m_codePoints_.charAt(position)) {
- position++;
- if (position > tbl.m_codePoints_.length()) {
- return -1;
- }
- }
- if (codePoint == tbl.m_codePoints_.charAt(position)) {
- return position;
- } else {
- return -1;
- }
- }
-
- /**
- * Gets the contraction element out of the contraction table
- *
- * @param table
- * contraction table
- * @param offset
- * to the element in the contraction table
- * @return basic contraction element at offset in the contraction table
- */
- private static final BasicContractionTable getBasicContractionTable(
- ContractionTable table, int offset) {
- offset &= 0xFFFFFF;
- if (offset == 0xFFFFFF) {
- return null;
- }
- return table.m_elements_.get(offset);
- }
-
- /**
- * Changes the contraction element
- *
- * @param table
- * contraction table
- * @param element
- * offset to the element in the contraction table
- * @param codePoint
- * codepoint
- * @param newCE
- * new collation element
- * @return basic contraction element at offset in the contraction table
- */
- private static final int changeContraction(ContractionTable table,
- int element, char codePoint, int newCE) {
- BasicContractionTable tbl = getBasicContractionTable(table, element);
- if (tbl == null) {
- return 0;
- }
- int position = 0;
- while (codePoint > tbl.m_codePoints_.charAt(position)) {
- position++;
- if (position > tbl.m_codePoints_.length()) {
- return CE_NOT_FOUND_;
- }
- }
- if (codePoint == tbl.m_codePoints_.charAt(position)) {
- tbl.m_CEs_.set(position, Integer.valueOf(newCE));
- return element & 0xFFFFFF;
- } else {
- return CE_NOT_FOUND_;
- }
- }
-
- /**
- * Sets a part of contraction sequence in table. If element is non existant,
- * it creates on. Returns element handle.
- *
- * @param table
- * contraction table
- * @param element
- * offset to the contraction table
- * @param offset
- * @param codePoint
- * contraction character
- * @param value
- * ce value
- * @return new contraction ce
- */
- private static final int setContraction(ContractionTable table,
- int element, int offset, char codePoint, int value) {
- element &= 0xFFFFFF;
- BasicContractionTable tbl = getBasicContractionTable(table, element);
- if (tbl == null) {
- tbl = addAContractionElement(table);
- element = table.m_elements_.size() - 1;
- }
-
- tbl.m_CEs_.set(offset, Integer.valueOf(value));
- tbl.m_codePoints_.setCharAt(offset, codePoint);
- return constructSpecialCE(table.m_currentTag_, element);
- }
-
- /**
- * Inserts a part of contraction sequence in table. Sequences behind the
- * offset are moved back. If element is non existent, it creates on.
- *
- * @param table
- * contraction
- * @param element
- * offset to the table contraction
- * @param codePoint
- * code point
- * @param value
- * collation element value
- * @return contraction collation element
- */
- private static final int insertContraction(ContractionTable table,
- int element, char codePoint, int value) {
- element &= 0xFFFFFF;
- BasicContractionTable tbl = getBasicContractionTable(table, element);
- if (tbl == null) {
- tbl = addAContractionElement(table);
- element = table.m_elements_.size() - 1;
- }
-
- int offset = 0;
- while (tbl.m_codePoints_.charAt(offset) < codePoint
- && offset < tbl.m_codePoints_.length()) {
- offset++;
- }
-
- tbl.m_CEs_.add(offset, Integer.valueOf(value));
- tbl.m_codePoints_.insert(offset, codePoint);
-
- return constructSpecialCE(table.m_currentTag_, element);
- }
-
- /**
- * Finalize addition
- *
- * @param t
- * build table
- * @param element
- * to add
- */
- private final static int finalizeAddition(BuildTable t, Elements element) {
- int CE = CE_NOT_FOUND_;
- // This should add a completely ignorable element to the
- // unsafe table, so that backward iteration will skip
- // over it when treating contractions.
- if (element.m_mapCE_ == 0) {
- for (int i = 0; i < element.m_cPoints_.length(); i++) {
- char ch = element.m_cPoints_.charAt(i);
- if (!UTF16.isTrailSurrogate(ch)) {
- unsafeCPSet(t.m_unsafeCP_, ch);
- }
- }
- }
-
- if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {
- // we're adding a contraction
- int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
- CE = t.m_mapping_.getValue(cp);
- CE = addContraction(t, CE, element);
- } else {
- // easy case
- CE = t.m_mapping_.getValue(element.m_cPoints_
- .charAt(element.m_cPointsOffset_));
-
- if (CE != CE_NOT_FOUND_) {
- if (isContractionTableElement(CE)) {
- // adding a non contraction element (thai, expansion,
- // single) to already existing contraction
- if (!isPrefix(element.m_mapCE_)) {
- // we cannot reenter prefix elements - as we are going
- // to create a dead loop
- // Only expansions and regular CEs can go here...
- // Contractions will never happen in this place
- setContraction(t.m_contractions_, CE, 0, (char) 0,
- element.m_mapCE_);
- // This loop has to change the CE at the end of
- // contraction REDO!
- changeLastCE(t.m_contractions_, CE, element.m_mapCE_);
- }
- } else {
- t.m_mapping_
- .setValue(element.m_cPoints_
- .charAt(element.m_cPointsOffset_),
- element.m_mapCE_);
- if (element.m_prefixChars_ != null
- && element.m_prefixChars_.length() > 0
- && getCETag(CE) != CE_IMPLICIT_TAG_) {
- // Add CE for standalone precontext char.
- Elements origElem = new Elements();
- origElem.m_prefixChars_ = null;
- origElem.m_uchars_ = element.m_cPoints_;
- origElem.m_cPoints_ = origElem.m_uchars_;
- origElem.m_CEs_[0] = CE;
- origElem.m_mapCE_ = CE;
- origElem.m_CELength_ = 1;
- finalizeAddition(t, origElem);
- }
- }
- } else {
- t.m_mapping_.setValue(element.m_cPoints_
- .charAt(element.m_cPointsOffset_), element.m_mapCE_);
- }
- }
- return CE;
- }
-
- /**
- * Note regarding surrogate handling: We are interested only in the single
- * or leading surrogates in a contraction. If a surrogate is somewhere else
- * in the contraction, it is going to be handled as a pair of code units, as
- * it doesn't affect the performance AND handling surrogates specially would
- * complicate code way too much.
- */
- private static int addContraction(BuildTable t, int CE, Elements element) {
- ContractionTable contractions = t.m_contractions_;
- contractions.m_currentTag_ = CE_CONTRACTION_TAG_;
-
- // First we need to check if contractions starts with a surrogate
- int cp = UTF16.charAt(element.m_cPoints_, 0);
- int cpsize = 1;
- if (UCharacter.isSupplementary(cp)) {
- cpsize = 2;
- }
- if (cpsize < element.m_cPoints_.length()) {
- // This is a real contraction, if there are other characters after
- // the first
- int size = element.m_cPoints_.length() - element.m_cPointsOffset_;
- for (int j = 1; j < size; j++) {
- // First add contraction chars to unsafe CP hash table
- // Unless it is a trail surrogate, which is handled
- // algoritmically and shouldn't take up space in the table.
- if (!UTF16.isTrailSurrogate(element.m_cPoints_
- .charAt(element.m_cPointsOffset_ + j))) {
- unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_
- .charAt(element.m_cPointsOffset_ + j));
- }
- }
- // Add the last char of the contraction to the contraction-end
- // hash table. unless it is a trail surrogate, which is handled
- // algorithmically and shouldn't be in the table
- if (!UTF16.isTrailSurrogate(element.m_cPoints_
- .charAt(element.m_cPoints_.length() - 1))) {
- ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
- .charAt(element.m_cPoints_.length() - 1));
- }
-
- // If there are any Jamos in the contraction, we should turn on
- // special processing for Jamos
- if (isJamo(element.m_cPoints_.charAt(element.m_cPointsOffset_))) {
- t.m_collator_.m_isJamoSpecial_ = true;
- }
- // then we need to deal with it
- // we could aready have something in table - or we might not
- element.m_cPointsOffset_ += cpsize;
- if (!isContraction(CE)) {
- // if it wasn't contraction, we wouldn't end up here
- int firstContractionOffset = addContraction(contractions,
- CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
- int newCE = processContraction(contractions, element,
- CE_NOT_FOUND_);
- addContraction(contractions, firstContractionOffset,
- element.m_cPoints_.charAt(element.m_cPointsOffset_),
- newCE);
- addContraction(contractions, firstContractionOffset,
- (char) 0xFFFF, CE);
- CE = constructSpecialCE(CE_CONTRACTION_TAG_,
- firstContractionOffset);
- } else {
- // we are adding to existing contraction
- // there were already some elements in the table, so we need to
- // add a new contraction
- // Two things can happen here: either the codepoint is already
- // in the table, or it is not
- int position = findCP(contractions, CE, element.m_cPoints_
- .charAt(element.m_cPointsOffset_));
- if (position > 0) {
- // if it is we just continue down the chain
- int eCE = getCE(contractions, CE, position);
- int newCE = processContraction(contractions, element, eCE);
- setContraction(
- contractions,
- CE,
- position,
- element.m_cPoints_.charAt(element.m_cPointsOffset_),
- newCE);
- } else {
- // if it isn't, we will have to create a new sequence
- int newCE = processContraction(contractions, element,
- CE_NOT_FOUND_);
- insertContraction(contractions, CE, element.m_cPoints_
- .charAt(element.m_cPointsOffset_), newCE);
- }
- }
- element.m_cPointsOffset_ -= cpsize;
- t.m_mapping_.setValue(cp, CE);
- } else if (!isContraction(CE)) {
- // this is just a surrogate, and there is no contraction
- t.m_mapping_.setValue(cp, element.m_mapCE_);
- } else {
- // fill out the first stage of the contraction with the surrogate
- // CE
- changeContraction(contractions, CE, (char) 0, element.m_mapCE_);
- changeContraction(contractions, CE, (char) 0xFFFF, element.m_mapCE_);
- }
- return CE;
- }
-
- /**
- * this is for adding non contractions
- *
- * @param table
- * contraction table
- * @param element
- * offset to the contraction table
- * @param value
- * collation element value
- * @return new collation element
- */
- private static final int changeLastCE(ContractionTable table, int element,
- int value) {
- BasicContractionTable tbl = getBasicContractionTable(table, element);
- if (tbl == null) {
- return 0;
- }
-
- tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, Integer.valueOf(value));
- return constructSpecialCE(table.m_currentTag_, element & 0xFFFFFF);
- }
-
- /**
- * Given a set of ranges calculated by allocWeights(), iterate through the
- * weights. Sets the next weight in cegenerator.m_current_.
- *
- * @param cegenerator
- * object that contains ranges weight range array and its
- * rangeCount
- * @return the next weight
- */
- private static int nextWeight(CEGenerator cegenerator) {
- if (cegenerator.m_rangesLength_ > 0) {
- // get maxByte from the .count field
- int maxByte = cegenerator.m_ranges_[0].m_count_;
- // get the next weight
- int weight = cegenerator.m_ranges_[0].m_start_;
- if (weight == cegenerator.m_ranges_[0].m_end_) {
- // this range is finished, remove it and move the following
- // ones up
- cegenerator.m_rangesLength_--;
- if (cegenerator.m_rangesLength_ > 0) {
- System.arraycopy(cegenerator.m_ranges_, 1,
- cegenerator.m_ranges_, 0,
- cegenerator.m_rangesLength_);
- cegenerator.m_ranges_[0].m_count_ = maxByte;
- // keep maxByte in ranges[0]
- }
- } else {
- // increment the weight for the next value
- cegenerator.m_ranges_[0].m_start_ = incWeight(weight,
- cegenerator.m_ranges_[0].m_length2_, maxByte);
- }
- return weight;
- }
- return -1;
- }
-
- /**
- * Increment the collation weight
- *
- * @param weight
- * to increment
- * @param length
- * @param maxByte
- * @return new incremented weight
- */
- private static final int incWeight(int weight, int length, int maxByte) {
- while (true) {
- int b = getWeightByte(weight, length);
- if (b < maxByte) {
- return setWeightByte(weight, length, b + 1);
- } else {
- // roll over, set this byte to BYTE_FIRST_TAILORED_ and
- // increment the previous one
- weight = setWeightByte(weight, length,
- RuleBasedCollator.BYTE_FIRST_TAILORED_);
- --length;
- }
- }
- }
-
- /**
- * Gets the weight byte
- *
- * @param weight
- * @param index
- * @return byte
- */
- private static final int getWeightByte(int weight, int index) {
- return (weight >> ((4 - index) << 3)) & 0xff;
- }
-
- /**
- * Set the weight byte in table
- *
- * @param weight
- * @param index
- * @param b
- * byte
- */
- private static final int setWeightByte(int weight, int index, int b) {
- index <<= 3;
- // 0xffffffff except a 00 "hole" for the index-th byte
- int mask;
- if (index < 32) {
- mask = 0xffffffff >>> index;
- } else {
- // Do not use int>>>32 because that does not shift at all
- // while we need it to become 0.
- //
- // Java Language Specification (Third Edition) 15.19 Shift Operators:
- // "If the promoted type of the left-hand operand is int,
- // only the five lowest-order bits of the right-hand operand
- // are used as the shift distance.
- // It is as if the right-hand operand were subjected to
- // a bitwise logical AND operator & (§15.22.1) with the mask value 0x1f.
- // The shift distance actually used is therefore
- // always in the range 0 to 31, inclusive."
- mask = 0;
- }
- index = 32 - index;
- mask |= 0xffffff00 << index;
- return (weight & mask) | (b << index);
- }
-
- /**
- * Call getWeightRanges and then determine heuristically which ranges to use
- * for a given number of weights between (excluding) two limits
- *
- * @param lowerLimit
- * @param upperLimit
- * @param n
- * @param maxByte
- * @param ranges
- * @return
- */
- private int allocateWeights(int lowerLimit, int upperLimit, int n,
- int maxByte, WeightRange ranges[]) {
- // number of usable byte values 3..maxByte
- int countBytes = maxByte - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;
- // [0] unused, [5] to make index checks unnecessary, m_utilCountBuffer_
- // countBytes to the power of index, m_utilLongBuffer_ for unsignedness
- // gcc requires explicit initialization
- m_utilLongBuffer_[0] = 1;
- m_utilLongBuffer_[1] = countBytes;
- m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;
- m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;
- m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;
- int rangeCount = getWeightRanges(lowerLimit, upperLimit, maxByte,
- countBytes, ranges);
- if (rangeCount <= 0) {
- return 0;
- }
- // what is the maximum number of weights with these ranges?
- long maxCount = 0;
- for (int i = 0; i < rangeCount; ++i) {
- maxCount += (long) ranges[i].m_count_
- * m_utilLongBuffer_[4 - ranges[i].m_length_];
- }
- if (maxCount < n) {
- return 0;
- }
- // set the length2 and count2 fields
- for (int i = 0; i < rangeCount; ++i) {
- ranges[i].m_length2_ = ranges[i].m_length_;
- ranges[i].m_count2_ = ranges[i].m_count_;
- }
- // try until we find suitably large ranges
- while (true) {
- // get the smallest number of bytes in a range
- int minLength = ranges[0].m_length2_;
- // sum up the number of elements that fit into ranges of each byte
- // length
- Arrays.fill(m_utilCountBuffer_, 0);
- for (int i = 0; i < rangeCount; ++i) {
- m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;
- }
- // now try to allocate n elements in the available short ranges
- if (n <= m_utilCountBuffer_[minLength]
- + m_utilCountBuffer_[minLength + 1]) {
- // trivial cases, use the first few ranges
- maxCount = 0;
- rangeCount = 0;
- do {
- maxCount += ranges[rangeCount].m_count2_;
- ++rangeCount;
- } while (n > maxCount);
- break;
- } else if (n <= ranges[0].m_count2_ * countBytes) {
- // easy case, just make this one range large enough by
- // lengthening it once more, possibly split it
- rangeCount = 1;
- // calculate how to split the range between maxLength-1
- // (count1) and maxLength (count2)
- long power_1 = m_utilLongBuffer_[minLength
- - ranges[0].m_length_];
- long power = power_1 * countBytes;
- int count2 = (int) ((n + power - 1) / power);
- int count1 = ranges[0].m_count_ - count2;
- // split the range
- if (count1 < 1) {
- // lengthen the entire range to maxLength
- lengthenRange(ranges, 0, maxByte, countBytes);
- } else {
- // really split the range
- // create a new range with the end and initial and current
- // length of the old one
- rangeCount = 2;
- ranges[1].m_end_ = ranges[0].m_end_;
- ranges[1].m_length_ = ranges[0].m_length_;
- ranges[1].m_length2_ = minLength;
- // set the end of the first range according to count1
- int i = ranges[0].m_length_;
- int b = getWeightByte(ranges[0].m_start_, i) + count1 - 1;
- // ranges[0].count and count1 may be >countBytes from
- // merging adjacent ranges; b > maxByte is possible
- if (b <= maxByte) {
- ranges[0].m_end_ = setWeightByte(ranges[0].m_start_, i,
- b);
- } else {
- ranges[0].m_end_ = setWeightByte(incWeight(
- ranges[0].m_start_, i - 1, maxByte), i, b
- - countBytes);
- }
- // set the bytes in the end weight at length + 1..length2
- // to maxByte
- b = (maxByte << 24) | (maxByte << 16) | (maxByte << 8)
- | maxByte; // this used to be 0xffffffff
- ranges[0].m_end_ = truncateWeight(ranges[0].m_end_, i)
- | (b >>> (i << 3)) & (b << ((4 - minLength) << 3));
- // set the start of the second range to immediately follow
- // the end of the first one
- ranges[1].m_start_ = incWeight(ranges[0].m_end_, minLength,
- maxByte);
- // set the count values (informational)
- ranges[0].m_count_ = count1;
- ranges[1].m_count_ = count2;
-
- ranges[0].m_count2_ = (int) (count1 * power_1);
- // will be *countBytes when lengthened
- ranges[1].m_count2_ = (int) (count2 * power_1);
-
- // lengthen the second range to maxLength
- lengthenRange(ranges, 1, maxByte, countBytes);
- }
- break;
- }
- // no good match, lengthen all minLength ranges and iterate
- for (int i = 0; ranges[i].m_length2_ == minLength; ++i) {
- lengthenRange(ranges, i, maxByte, countBytes);
- }
- }
-
- if (rangeCount > 1) {
- // sort the ranges by weight values
- Arrays.sort(ranges, 0, rangeCount);
- }
-
- // set maxByte in ranges[0] for ucol_nextWeight()
- ranges[0].m_count_ = maxByte;
-
- return rangeCount;
- }
-
- /**
- * Updates the range length
- *
- * @param range
- * weight range array
- * @param offset
- * to weight range array
- * @param maxByte
- * @param countBytes
- * @return new length
- */
- private static final int lengthenRange(WeightRange range[], int offset,
- int maxByte, int countBytes) {
- int length = range[offset].m_length2_ + 1;
- range[offset].m_start_ = setWeightTrail(range[offset].m_start_, length,
- RuleBasedCollator.BYTE_FIRST_TAILORED_);
- range[offset].m_end_ = setWeightTrail(range[offset].m_end_, length,
- maxByte);
- range[offset].m_count2_ *= countBytes;
- range[offset].m_length2_ = length;
- return length;
- }
-
- /**
- * Gets the weight
- *
- * @param weight
- * @param length
- * @param trail
- * @return new weight
- */
- private static final int setWeightTrail(int weight, int length, int trail) {
- length = (4 - length) << 3;
- return (weight & (0xffffff00 << length)) | (trail << length);
- }
-
- /**
- * take two CE weights and calculate the possible ranges of weights between
- * the two limits, excluding them for weights with up to 4 bytes there are
- * up to 2*4-1=7 ranges
- *
- * @param lowerLimit
- * @param upperLimit
- * @param maxByte
- * @param countBytes
- * @param ranges
- * @return weight ranges
- */
- private int getWeightRanges(int lowerLimit, int upperLimit, int maxByte,
- int countBytes, WeightRange ranges[]) {
- // assume that both lowerLimit & upperLimit are not 0
- // get the lengths of the limits
- int lowerLength = lengthOfWeight(lowerLimit);
- int upperLength = lengthOfWeight(upperLimit);
- if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {
- return 0;
- }
- // check that neither is a prefix of the other
- if (lowerLength < upperLength) {
- if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {
- return 0;
- }
- }
- // if the upper limit is a prefix of the lower limit then the earlier
- // test lowerLimit >= upperLimit has caught it
- // reset local variables
- // With the limit lengths of 1..4, there are up to 7 ranges for
- // allocation:
- // range minimum length
- // lower[4] 4
- // lower[3] 3
- // lower[2] 2
- // middle 1
- // upper[2] 2
- // upper[3] 3
- // upper[4] 4
- // We are now going to calculate up to 7 ranges.
- // Some of them will typically overlap, so we will then have to merge
- // and eliminate ranges.
-
- // We have to clean cruft from previous invocations
- // before doing anything. C++ already does that
- for (int length = 0; length < 5; length++) {
- m_utilLowerWeightRange_[length].clear();
- m_utilUpperWeightRange_[length].clear();
- }
- m_utilWeightRange_.clear();
-
- int weight = lowerLimit;
- for (int length = lowerLength; length >= 2; --length) {
- m_utilLowerWeightRange_[length].clear();
- int trail = getWeightByte(weight, length);
- if (trail < maxByte) {
- m_utilLowerWeightRange_[length].m_start_ = incWeightTrail(
- weight, length);
- m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(weight,
- length, maxByte);
- m_utilLowerWeightRange_[length].m_length_ = length;
- m_utilLowerWeightRange_[length].m_count_ = maxByte - trail;
- }
- weight = truncateWeight(weight, length - 1);
- }
- m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);
-
- weight = upperLimit;
- // [0] and [1] are not used - this simplifies indexing,
- // m_utilUpperWeightRange_
-
- for (int length = upperLength; length >= 2; length--) {
- int trail = getWeightByte(weight, length);
- if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {
- m_utilUpperWeightRange_[length].m_start_ = setWeightTrail(
- weight, length, RuleBasedCollator.BYTE_FIRST_TAILORED_);
- m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(weight,
- length);
- m_utilUpperWeightRange_[length].m_length_ = length;
- m_utilUpperWeightRange_[length].m_count_ = trail
- - RuleBasedCollator.BYTE_FIRST_TAILORED_;
- }
- weight = truncateWeight(weight, length - 1);
- }
- m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);
-
- // set the middle range
- m_utilWeightRange_.m_length_ = 1;
- if (Utility.compareUnsigned(m_utilWeightRange_.m_end_,
- m_utilWeightRange_.m_start_) >= 0) {
- // if (m_utilWeightRange_.m_end_ >= m_utilWeightRange_.m_start_) {
- m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1;
- } else {
- // eliminate overlaps
- // remove the middle range
- m_utilWeightRange_.m_count_ = 0;
- // reduce or remove the lower ranges that go beyond upperLimit
- for (int length = 4; length >= 2; --length) {
- if (m_utilLowerWeightRange_[length].m_count_ > 0
- && m_utilUpperWeightRange_[length].m_count_ > 0) {
- int start = m_utilUpperWeightRange_[length].m_start_;
- int end = m_utilLowerWeightRange_[length].m_end_;
- if (end >= start
- || incWeight(end, length, maxByte) == start) {
- // lower and upper ranges collide or are directly
- // adjacent: merge these two and remove all shorter
- // ranges
- start = m_utilLowerWeightRange_[length].m_start_;
- end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_;
- // merging directly adjacent ranges needs to subtract
- // the 0/1 gaps in between;
- // it may result in a range with count>countBytes
- m_utilLowerWeightRange_[length].m_count_ = getWeightByte(
- end, length)
- - getWeightByte(start, length)
- + 1
- + countBytes
- * (getWeightByte(end, length - 1) - getWeightByte(
- start, length - 1));
- m_utilUpperWeightRange_[length].m_count_ = 0;
- while (--length >= 2) {
- m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0;
- }
- break;
- }
- }
- }
- }
-
- // copy the ranges, shortest first, into the result array
- int rangeCount = 0;
- if (m_utilWeightRange_.m_count_ > 0) {
- ranges[0] = new WeightRange(m_utilWeightRange_);
- rangeCount = 1;
- }
- for (int length = 2; length <= 4; ++length) {
- // copy upper first so that later the middle range is more likely
- // the first one to use
- if (m_utilUpperWeightRange_[length].m_count_ > 0) {
- ranges[rangeCount] = new WeightRange(
- m_utilUpperWeightRange_[length]);
- ++rangeCount;
- }
- if (m_utilLowerWeightRange_[length].m_count_ > 0) {
- ranges[rangeCount] = new WeightRange(
- m_utilLowerWeightRange_[length]);
- ++rangeCount;
- }
- }
- return rangeCount;
- }
-
- /**
- * Truncates the weight with length
- *
- * @param weight
- * @param length
- * @return truncated weight
- */
- private static final int truncateWeight(int weight, int length) {
- return weight & (0xffffffff << ((4 - length) << 3));
- }
-
- /**
- * Length of the weight
- *
- * @param weight
- * @return length of the weight
- */
- private static final int lengthOfWeight(int weight) {
- if ((weight & 0xffffff) == 0) {
- return 1;
- } else if ((weight & 0xffff) == 0) {
- return 2;
- } else if ((weight & 0xff) == 0) {
- return 3;
- }
- return 4;
- }
-
- /**
- * Increment the weight trail
- *
- * @param weight
- * @param length
- * @return new weight
- */
- private static final int incWeightTrail(int weight, int length) {
- return weight + (1 << ((4 - length) << 3));
- }
-
- /**
- * Decrement the weight trail
- *
- * @param weight
- * @param length
- * @return new weight
- */
- private static int decWeightTrail(int weight, int length) {
- return weight - (1 << ((4 - length) << 3));
- }
-
- /**
- * Gets the codepoint
- *
- * @param tbl
- * contraction table
- * @param codePoint
- * code point to look for
- * @return the offset to the code point
- */
- private static int findCP(BasicContractionTable tbl, char codePoint) {
- int position = 0;
- while (codePoint > tbl.m_codePoints_.charAt(position)) {
- position++;
- if (position > tbl.m_codePoints_.length()) {
- return -1;
- }
- }
- if (codePoint == tbl.m_codePoints_.charAt(position)) {
- return position;
- } else {
- return -1;
- }
- }
-
- /**
- * Finds a contraction ce
- *
- * @param table
- * @param element
- * @param ch
- * @return ce
- */
- private static int findCE(ContractionTable table, int element, char ch) {
- if (table == null) {
- return CE_NOT_FOUND_;
- }
- BasicContractionTable tbl = getBasicContractionTable(table, element);
- if (tbl == null) {
- return CE_NOT_FOUND_;
- }
- int position = findCP(tbl, ch);
- if (position > tbl.m_CEs_.size() || position < 0) {
- return CE_NOT_FOUND_;
- }
- return tbl.m_CEs_.get(position).intValue();
- }
-
- /**
- * Checks if the string is tailored in the contraction
- *
- * @param table
- * contraction table
- * @param element
- * @param array
- * character array to check
- * @param offset
- * array offset
- * @return true if it is tailored
- */
- private static boolean isTailored(ContractionTable table, int element,
- char array[], int offset) {
- while (array[offset] != 0) {
- element = findCE(table, element, array[offset]);
- if (element == CE_NOT_FOUND_) {
- return false;
- }
- if (!isContractionTableElement(element)) {
- return true;
- }
- offset++;
- }
- if (getCE(table, element, 0) != CE_NOT_FOUND_) {
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Assemble RuleBasedCollator
- *
- * @param t
- * build table
- * @param collator
- * to update
- */
- private void assembleTable(BuildTable t, RuleBasedCollator collator) {
- IntTrieBuilder mapping = t.m_mapping_;
- List<Integer> expansions = t.m_expansions_;
- ContractionTable contractions = t.m_contractions_;
- MaxExpansionTable maxexpansion = t.m_maxExpansions_;
-
- // contraction offset has to be in since we are building on the
- // UCA contractions
- // int beforeContractions = (HEADER_SIZE_
- // + paddedsize(expansions.size() << 2)) >>> 1;
- collator.m_contractionOffset_ = 0;
- int contractionsSize = constructTable(contractions);
-
- // the following operation depends on the trie data. Therefore, we have
- // to do it before the trie is compacted
- // sets jamo expansions
- getMaxExpansionJamo(mapping, maxexpansion, t.m_maxJamoExpansions_,
- collator.m_isJamoSpecial_);
-
- // TODO: LATIN1 array is now in the utrie - it should be removed from
- // the calculation
- setAttributes(collator, t.m_options_);
- // copy expansions
- int size = expansions.size();
- collator.m_expansion_ = new int[size];
- for (int i = 0; i < size; i++) {
- collator.m_expansion_[i] = expansions.get(i).intValue();
- }
- // contractions block
- if (contractionsSize != 0) {
- // copy contraction index
- collator.m_contractionIndex_ = new char[contractionsSize];
- contractions.m_codePoints_.getChars(0, contractionsSize,
- collator.m_contractionIndex_, 0);
- // copy contraction collation elements
- collator.m_contractionCE_ = new int[contractionsSize];
- for (int i = 0; i < contractionsSize; i++) {
- collator.m_contractionCE_[i] = contractions.m_CEs_.get(i).intValue();
- }
- }
- // copy mapping table
- collator.m_trie_ = mapping.serialize(t,
- RuleBasedCollator.DataManipulate.getInstance());
- // copy max expansion table
- // not copying the first element which is a dummy
- // to be in synch with icu4c's builder, we continue to use the
- // expansion offset
- // omitting expansion offset in builder
- collator.m_expansionOffset_ = 0;
- size = maxexpansion.m_endExpansionCE_.size();
- collator.m_expansionEndCE_ = new int[size - 1];
- for (int i = 1; i < size; i++) {
- collator.m_expansionEndCE_[i - 1] = maxexpansion.m_endExpansionCE_
- .get(i).intValue();
- }
- collator.m_expansionEndCEMaxSize_ = new byte[size - 1];
- for (int i = 1; i < size; i++) {
- collator.m_expansionEndCEMaxSize_[i - 1] = maxexpansion.m_expansionCESize_
- .get(i).byteValue();
- }
- // Unsafe chars table. Finish it off, then copy it.
- unsafeCPAddCCNZ(t);
- // Or in unsafebits from UCA, making a combined table.
- for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
- t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];
- }
- collator.m_unsafe_ = t.m_unsafeCP_;
-
- // Finish building Contraction Ending chars hash table and then copy it
- // out.
- // Or in unsafebits from UCA, making a combined table
- for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
- t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];
- }
- collator.m_contractionEnd_ = t.m_contrEndCP_;
- }
-
- /**
- * Sets this collator to use the all options and tables in UCA.
- *
- * @param collator
- * which attribute is to be set
- * @param option
- * to set with
- */
- private static final void setAttributes(RuleBasedCollator collator,
- CollationRuleParser.OptionSet option) {
- collator.latinOneFailed_ = true;
- collator.m_caseFirst_ = option.m_caseFirst_;
- collator.setDecomposition(option.m_decomposition_);
- collator
- .setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_);
- collator.setCaseLevel(option.m_isCaseLevel_);
- collator.setFrenchCollation(option.m_isFrenchCollation_);
- collator.m_isHiragana4_ = option.m_isHiragana4_;
- collator.setStrength(option.m_strength_);
- collator.m_variableTopValue_ = option.m_variableTopValue_;
- collator.m_reorderCodes_ = option.m_scriptOrder_;
- collator.latinOneFailed_ = false;
- }
-
- /**
- * Constructing the contraction table
- *
- * @param table
- * contraction table
- * @return
- */
- private int constructTable(ContractionTable table) {
- // See how much memory we need
- int tsize = table.m_elements_.size();
- if (tsize == 0) {
- return 0;
- }
- table.m_offsets_.clear();
- int position = 0;
- for (int i = 0; i < tsize; i++) {
- table.m_offsets_.add(Integer.valueOf(position));
- position += table.m_elements_.get(i).m_CEs_
- .size();
- }
- table.m_CEs_.clear();
- table.m_codePoints_.delete(0, table.m_codePoints_.length());
- // Now stuff the things in
- StringBuilder cpPointer = table.m_codePoints_;
- List<Integer> CEPointer = table.m_CEs_;
- for (int i = 0; i < tsize; i++) {
- BasicContractionTable bct = table.m_elements_.get(i);
- int size = bct.m_CEs_.size();
- char ccMax = 0;
- char ccMin = 255;
- int offset = CEPointer.size();
- CEPointer.add(bct.m_CEs_.get(0));
- for (int j = 1; j < size; j++) {
- char ch = bct.m_codePoints_.charAt(j);
- char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF);
- if (cc > ccMax) {
- ccMax = cc;
- }
- if (cc < ccMin) {
- ccMin = cc;
- }
- cpPointer.append(ch);
- CEPointer.add(bct.m_CEs_.get(j));
- }
- cpPointer.insert(offset,
- (char) (((ccMin == ccMax) ? 1 : 0 << 8) | ccMax));
- for (int j = 0; j < size; j++) {
- if (isContractionTableElement(CEPointer.get(offset + j).intValue())) {
- int ce = CEPointer.get(offset + j).intValue();
- CEPointer.set(offset + j,
- Integer.valueOf(constructSpecialCE(getCETag(ce),
- table.m_offsets_.get(getContractionOffset(ce))
- .intValue())));
- }
- }
- }
-
- for (int i = 0; i <= 0x10FFFF; i++) {
- int CE = table.m_mapping_.getValue(i);
- if (isContractionTableElement(CE)) {
- CE = constructSpecialCE(getCETag(CE),
- table.m_offsets_.get(getContractionOffset(CE)).intValue());
- table.m_mapping_.setValue(i, CE);
- }
- }
- return position;
- }
-
- /**
- * Get contraction offset
- *
- * @param ce
- * collation element
- * @return contraction offset
- */
- private static final int getContractionOffset(int ce) {
- return ce & 0xFFFFFF;
- }
-
- /**
- * Gets the maximum Jamo expansion
- *
- * @param mapping
- * trie table
- * @param maxexpansion
- * maximum expansion table
- * @param maxjamoexpansion
- * maximum jamo expansion table
- * @param jamospecial
- * is jamo special?
- */
- private static void getMaxExpansionJamo(IntTrieBuilder mapping,
- MaxExpansionTable maxexpansion,
- MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) {
- int VBASE = 0x1161;
- int TBASE = 0x11A8;
- int VCOUNT = 21;
- int TCOUNT = 28;
- int v = VBASE + VCOUNT - 1;
- int t = TBASE + TCOUNT - 1;
-
- while (v >= VBASE) {
- int ce = mapping.getValue(v);
- if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
- setMaxExpansion(ce, (byte) 2, maxexpansion);
- }
- v--;
- }
-
- while (t >= TBASE) {
- int ce = mapping.getValue(t);
- if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
- setMaxExpansion(ce, (byte) 3, maxexpansion);
- }
- t--;
- }
- // According to the docs, 99% of the time, the Jamo will not be special
- if (jamospecial) {
- // gets the max expansion in all unicode characters
- int count = maxjamoexpansion.m_endExpansionCE_.size();
- byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_
- + maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_);
- byte maxVSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_);
-
- while (count > 0) {
- count--;
- if ((maxjamoexpansion.m_isV_.get(count))
- .booleanValue() == true) {
- setMaxExpansion(
- (maxjamoexpansion.m_endExpansionCE_
- .get(count)).intValue(), maxVSize,
- maxexpansion);
- } else {
- setMaxExpansion(
- (maxjamoexpansion.m_endExpansionCE_
- .get(count)).intValue(), maxTSize,
- maxexpansion);
- }
- }
- }
- }
-
- /**
- * To the UnsafeCP hash table, add all chars with combining class != 0
- *
- * @param t
- * build table
- */
- private final void unsafeCPAddCCNZ(BuildTable t) {
- boolean buildCMTable = (buildCMTabFlag & (t.cmLookup == null));
- char[] cm = null; // combining mark array
- int[] index = new int[256];
- int count = 0;
-
- if (buildCMTable) {
- cm = new char[0x10000];
- }
- for (char c = 0; c < 0xffff; c++) {
- int fcd;
- if (UTF16.isLeadSurrogate(c)) {
- fcd = 0;
- if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
- int supp = Character.toCodePoint(c, (char)0xdc00);
- int suppLimit = supp + 0x400;
- while (supp < suppLimit) {
- fcd |= m_nfcImpl_.getFCD16FromNormData(supp++);
- }
- }
- } else {
- fcd = m_nfcImpl_.getFCD16(c);
- }
- // TODO: review for handling supplementary characters
- if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
- (UTF16.isLeadSurrogate(c) && fcd != 0)) {
- // c is a leading surrogate with some FCD data
- unsafeCPSet(t.m_unsafeCP_, c);
- if (buildCMTable) {
- int cc = (fcd & 0xff);
- int pos = (cc << 8) + index[cc];
- cm[pos] = c;
- index[cc]++;
- count++;
- }
- }
- }
-
- if (t.m_prefixLookup_ != null) {
- Enumeration<Elements> els = Collections.enumeration(t.m_prefixLookup_.values());
- while (els.hasMoreElements()) {
- Elements e = els.nextElement();
- // codepoints here are in the NFD form. We need to add the
- // first code point of the NFC form to unsafe, because
- // strcoll needs to backup over them.
- // weiv: This is wrong! See the comment above.
- // String decomp = Normalizer.decompose(e.m_cPoints_, true);
- // unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));
- // it should be:
- String comp = Normalizer.compose(e.m_cPoints_, false);
- unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));
- }
- }
-
- if (buildCMTable) {
- t.cmLookup = new CombinClassTable();
- t.cmLookup.generate(cm, count, index);
- }
- }
-
- /**
- * Create closure
- *
- * @param t
- * build table
- * @param collator
- * RuleBasedCollator
- * @param colEl
- * collation element iterator
- * @param start
- * @param limit
- * @param type
- * character type
- * @return
- */
- private boolean enumCategoryRangeClosureCategory(BuildTable t,
- RuleBasedCollator collator, CollationElementIterator colEl,
- int start, int limit, int type) {
- if (type != UCharacterCategory.UNASSIGNED
- && type != UCharacterCategory.PRIVATE_USE) {
- // if the range is assigned - we might ommit more categories later
-
- for (int u32 = start; u32 < limit; u32++) {
- String decomp = m_nfcImpl_.getDecomposition(u32);
- if (decomp != null) {
- String comp = UCharacter.toString(u32);
- if (!collator.equals(comp, decomp)) {
- m_utilElement_.m_cPoints_ = decomp;
- m_utilElement_.m_prefix_ = 0;
- Elements prefix = t.m_prefixLookup_.get(m_utilElement_);
- if (prefix == null) {
- m_utilElement_.m_cPoints_ = comp;
- m_utilElement_.m_prefix_ = 0;
- m_utilElement_.m_prefixChars_ = null;
- colEl.setText(decomp);
- int ce = colEl.next();
- m_utilElement_.m_CELength_ = 0;
- while (ce != CollationElementIterator.NULLORDER) {
- m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce;
- ce = colEl.next();
- }
- } else {
- m_utilElement_.m_cPoints_ = comp;
- m_utilElement_.m_prefix_ = 0;
- m_utilElement_.m_prefixChars_ = null;
- m_utilElement_.m_CELength_ = 1;
- m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;
- // This character uses a prefix. We have to add it
- // to the unsafe table, as it decomposed form is
- // already in. In Japanese, this happens for \u309e
- // & \u30fe
- // Since unsafeCPSet is static in ucol_elm, we are
- // going to wrap it up in the unsafeCPAddCCNZ
- // function
- }
- addAnElement(t, m_utilElement_);
- }
- }
- }
- }
- return true;
- }
-
- /**
- * Determine if a character is a Jamo
- *
- * @param ch
- * character to test
- * @return true if ch is a Jamo, false otherwise
- */
- private static final boolean isJamo(char ch) {
- return (ch >= 0x1100 && ch <= 0x1112) || (ch >= 0x1175 && ch <= 0x1161)
- || (ch >= 0x11A8 && ch <= 0x11C2);
- }
-
- /**
- * Produces canonical closure
- */
- private void canonicalClosure(BuildTable t) {
- BuildTable temp = new BuildTable(t);
- assembleTable(temp, temp.m_collator_);
- // produce canonical closure
- CollationElementIterator coleiter = temp.m_collator_
- .getCollationElementIterator("");
- RangeValueIterator typeiter = UCharacter.getTypeIterator();
- RangeValueIterator.Element element = new RangeValueIterator.Element();
- while (typeiter.next(element)) {
- enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter,
- element.start, element.limit, element.value);
- }
-
- t.cmLookup = temp.cmLookup;
- temp.cmLookup = null;
-
- for (int i = 0; i < m_parser_.m_resultLength_; i++) {
- char baseChar, firstCM;
- // now we need to generate the CEs
- // We stuff the initial value in the buffers, and increase the
- // appropriate buffer according to strength */
- // createElements(t, m_parser_.m_listHeader_[i]);
- CollationRuleParser.Token tok = m_parser_.m_listHeader_[i].m_first_;
- m_utilElement_.clear();
- while (tok != null) {
- m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
- m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
- if (tok.m_prefix_ != 0) {
- // we will just copy the prefix here, and adjust accordingly
- // in
- // the addPrefix function in ucol_elm. The reason is that we
- // need to add both composed AND decomposed elements to the
- // unsafe table.
- int size = tok.m_prefix_ >> 24;
- int offset = tok.m_prefix_ & 0x00FFFFFF;
- m_utilElement_.m_prefixChars_ = m_parser_.m_source_
- .substring(offset, offset + size);
- size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
- offset = (tok.m_source_ & 0x00FFFFFF)
- + (tok.m_prefix_ >> 24);
- m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
- offset, offset + size);
- } else {
- m_utilElement_.m_prefixChars_ = null;
- int offset = tok.m_source_ & 0x00FFFFFF;
- int size = tok.m_source_ >>> 24;
- m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
- offset, offset + size);
- }
- m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
-
- baseChar = firstCM = 0; // reset
- for (int j = 0; j < m_utilElement_.m_cPoints_.length()
- - m_utilElement_.m_cPointsOffset_; j++) {
-
- int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters
- if ((fcd & 0xff) == 0) {
- baseChar = m_utilElement_.m_cPoints_.charAt(j);
- } else {
- if ((baseChar != 0) && (firstCM == 0)) {
- firstCM = m_utilElement_.m_cPoints_.charAt(j); // first
- // combining
- // mark
- }
- }
- }
-
- if ((baseChar != 0) && (firstCM != 0)) {
- addTailCanonicalClosures(t, temp.m_collator_, coleiter,
- baseChar, firstCM);
- }
- tok = tok.m_next_;
- }
- }
- }
-
- private void addTailCanonicalClosures(BuildTable t,
- RuleBasedCollator m_collator, CollationElementIterator colEl,
- char baseChar, char cMark) {
- if (t.cmLookup == null) {
- return;
- }
- CombinClassTable cmLookup = t.cmLookup;
- int[] index = cmLookup.index;
- int cClass = m_nfcImpl_.getFCD16(cMark) & 0xff; // TODO: review for handling supplementary characters
- int maxIndex = 0;
- char[] precompCh = new char[256];
- int[] precompClass = new int[256];
- int precompLen = 0;
- Elements element = new Elements();
-
- if (cClass > 0) {
- maxIndex = index[cClass - 1];
- }
- for (int i = 0; i < maxIndex; i++) {
- StringBuilder decompBuf = new StringBuilder();
- decompBuf.append(baseChar).append(cmLookup.cPoints[i]);
- String comp = Normalizer.compose(decompBuf.toString(), false);
- if (comp.length() == 1) {
- precompCh[precompLen] = comp.charAt(0);
- precompClass[precompLen] = m_nfcImpl_.getFCD16(cmLookup.cPoints[i]) & 0xff; // TODO: review for handling supplementary characters
- precompLen++;
- StringBuilder decomp = new StringBuilder();
- for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) {
- if (m_utilElement_.m_cPoints_.charAt(j) == cMark) {
- decomp.append(cmLookup.cPoints[i]);
- } else {
- decomp.append(m_utilElement_.m_cPoints_.charAt(j));
- }
- }
- comp = Normalizer.compose(decomp.toString(), false);
- StringBuilder buf = new StringBuilder(comp);
- buf.append(cMark);
- decomp.append(cMark);
- comp = buf.toString();
-
- element.m_cPoints_ = decomp.toString();
- element.m_CELength_ = 0;
- element.m_prefix_ = 0;
- Elements prefix = t.m_prefixLookup_.get(element);
- element.m_cPoints_ = comp;
- element.m_uchars_ = comp;
-
- if (prefix == null) {
- element.m_prefix_ = 0;
- element.m_prefixChars_ = null;
- colEl.setText(decomp.toString());
- int ce = colEl.next();
- element.m_CELength_ = 0;
- while (ce != CollationElementIterator.NULLORDER) {
- element.m_CEs_[element.m_CELength_++] = ce;
- ce = colEl.next();
- }
- } else {
- element.m_cPoints_ = comp;
- element.m_prefix_ = 0;
- element.m_prefixChars_ = null;
- element.m_CELength_ = 1;
- element.m_CEs_[0] = prefix.m_mapCE_;
- }
- setMapCE(t, element);
- finalizeAddition(t, element);
-
- if (comp.length() > 2) {
- // This is a fix for tailoring contractions with accented
- // character at the end of contraction string.
- addFCD4AccentedContractions(t, colEl, comp, element);
- }
- if (precompLen > 1) {
- precompLen = addMultiCMontractions(t, colEl, element,
- precompCh, precompClass, precompLen, cMark, i,
- decomp.toString());
- }
- }
- }
-
- }
-
- private void setMapCE(BuildTable t, Elements element) {
- List<Integer> expansions = t.m_expansions_;
- element.m_mapCE_ = 0;
-
- if (element.m_CELength_ == 2 // a two CE expansion
- && RuleBasedCollator.isContinuation(element.m_CEs_[1])
- && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that
- // has
- // only
- // primaries
- // in
- // continuation
- && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
- // a common secondary
- && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)) { // and
- // a
- // common
- // tertiary
-
- element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
- // a long primary special
- | (CE_LONG_PRIMARY_TAG_ << 24)
- // first and second byte of primary
- | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
- // third byte of primary
- | ((element.m_CEs_[1] >> 24) & 0xFF);
- } else {
- // omitting expansion offset in builder
- // (HEADER_SIZE_ >> 2)
- int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
- | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
- | (addExpansion(expansions, element.m_CEs_[0]) << 4)
- & 0xFFFFF0;
-
- for (int i = 1; i < element.m_CELength_; i++) {
- addExpansion(expansions, element.m_CEs_[i]);
- }
- if (element.m_CELength_ <= 0xF) {
- expansion |= element.m_CELength_;
- } else {
- addExpansion(expansions, 0);
- }
- element.m_mapCE_ = expansion;
- setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
- (byte) element.m_CELength_, t.m_maxExpansions_);
- }
- }
-
- private int addMultiCMontractions(BuildTable t,
- CollationElementIterator colEl, Elements element, char[] precompCh,
- int[] precompClass, int maxComp, char cMark, int cmPos,
- String decomp) {
-
- CombinClassTable cmLookup = t.cmLookup;
- char[] combiningMarks = { cMark };
- int cMarkClass = UCharacter.getCombiningClass(cMark) & 0xFF;
- String comMark = new String(combiningMarks);
- int noOfPrecomposedChs = maxComp;
-
- for (int j = 0; j < maxComp; j++) {
- int count = 0;
- StringBuilder temp;
-
- do {
- String newDecomp, comp;
-
- if (count == 0) { // Decompose the saved precomposed char.
- newDecomp = Normalizer.decompose(
- new String(precompCh, j, 1), false);
- temp = new StringBuilder(newDecomp);
- temp.append(cmLookup.cPoints[cmPos]);
- newDecomp = temp.toString();
- } else {
- temp = new StringBuilder(decomp);
- temp.append(precompCh[j]);
- newDecomp = temp.toString();
- }
- comp = Normalizer.compose(newDecomp, false);
- if (comp.length() == 1) {
- temp.append(cMark);
- element.m_cPoints_ = temp.toString();
- element.m_CELength_ = 0;
- element.m_prefix_ = 0;
- Elements prefix = t.m_prefixLookup_.get(element);
- element.m_cPoints_ = comp + comMark;
- if (prefix == null) {
- element.m_prefix_ = 0;
- element.m_prefixChars_ = null;
- colEl.setText(temp.toString());
- int ce = colEl.next();
- element.m_CELength_ = 0;
- while (ce != CollationElementIterator.NULLORDER) {
- element.m_CEs_[element.m_CELength_++] = ce;
- ce = colEl.next();
- }
- } else {
- element.m_cPoints_ = comp;
- element.m_prefix_ = 0;
- element.m_prefixChars_ = null;
- element.m_CELength_ = 1;
- element.m_CEs_[0] = prefix.m_mapCE_;
- }
- setMapCE(t, element);
- finalizeAddition(t, element);
- precompCh[noOfPrecomposedChs] = comp.charAt(0);
- precompClass[noOfPrecomposedChs] = cMarkClass;
- noOfPrecomposedChs++;
- }
- } while (++count < 2 && (precompClass[j] == cMarkClass));
- }
- return noOfPrecomposedChs;
- }
-
- private void addFCD4AccentedContractions(BuildTable t,
- CollationElementIterator colEl, String data, Elements element) {
- String decomp = Normalizer.decompose(data, false);
- String comp = Normalizer.compose(data, false);
-
- element.m_cPoints_ = decomp;
- element.m_CELength_ = 0;
- element.m_prefix_ = 0;
- Elements prefix = t.m_prefixLookup_.get(element);
- if (prefix == null) {
- element.m_cPoints_ = comp;
- element.m_prefix_ = 0;
- element.m_prefixChars_ = null;
- element.m_CELength_ = 0;
- colEl.setText(decomp);
- int ce = colEl.next();
- element.m_CELength_ = 0;
- while (ce != CollationElementIterator.NULLORDER) {
- element.m_CEs_[element.m_CELength_++] = ce;
- ce = colEl.next();
- }
- addAnElement(t, element);
- }
- }
-
- private void processUCACompleteIgnorables(BuildTable t) {
- TrieIterator trieiterator = new TrieIterator(
- RuleBasedCollator.UCA_.m_trie_);
- RangeValueIterator.Element element = new RangeValueIterator.Element();
- while (trieiterator.next(element)) {
- int start = element.start;
- int limit = element.limit;
- if (element.value == 0) {
- while (start < limit) {
- int CE = t.m_mapping_.getValue(start);
- if (CE == CE_NOT_FOUND_) {
- m_utilElement_.m_prefix_ = 0;
- m_utilElement_.m_uchars_ = UCharacter.toString(start);
- m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
- m_utilElement_.m_cPointsOffset_ = 0;
- m_utilElement_.m_CELength_ = 1;
- m_utilElement_.m_CEs_[0] = 0;
- addAnElement(t, m_utilElement_);
- }
- start++;
- }
- }
- }
- }
-}
+++ /dev/null
-/**
-*******************************************************************************
-* Copyright (C) 1996-2011, International Business Machines Corporation and *
-* others. All Rights Reserved. *
-*******************************************************************************
-*/
-package com.ibm.icu.text;
-
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-
-import com.ibm.icu.impl.ICUResourceBundle;
-import com.ibm.icu.impl.PatternProps;
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UProperty;
-import com.ibm.icu.lang.UScript;
-import com.ibm.icu.text.Collator.ReorderCodes;
-import com.ibm.icu.util.ULocale;
-import com.ibm.icu.util.UResourceBundle;
-
-/**
-* Class for parsing collation rules, produces a list of tokens that will be
-* turned into collation elements
-* @author Syn Wee Quek
-* @since release 2.2, June 7 2002
-*/
-final class CollationRuleParser
-{
- // public data members ---------------------------------------------------
-
- // package private constructors ------------------------------------------
-
- /**
- * <p>RuleBasedCollator constructor that takes the rules.
- * Please see RuleBasedCollator class description for more details on the
- * collation rule syntax.</p>
- * @see java.util.Locale
- * @param rules the collation rules to build the collation table from.
- * @exception ParseException thrown when argument rules have an invalid
- * syntax.
- */
- CollationRuleParser(String rules) throws ParseException
- {
- // Prepares m_copySet_ and m_removeSet_.
- rules = preprocessRules(rules);
-
- // Save the rules as a long string. The StringBuilder object is
- // used to store the result of token parsing as well.
- m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
- m_rules_ = m_source_.toString();
-
- // Index of the next unparsed character.
- m_current_ = 0;
-
- // Index of the next unwritten character in the parsed result.
- m_extraCurrent_ = m_source_.length();
-
- m_variableTop_ = null;
- m_parsedToken_ = new ParsedToken();
- m_hashTable_ = new HashMap<Token, Token>();
- m_options_ = new OptionSet(RuleBasedCollator.UCA_);
- m_listHeader_ = new TokenListHeader[512];
- m_resultLength_ = 0;
- // call assembleTokenList() manually, so that we can
- // init a parser and manually parse tokens
- //assembleTokenList();
- }
-
- // package private inner classes -----------------------------------------
-
- /**
- * Collation options set
- */
- static class OptionSet
- {
- // package private constructor ---------------------------------------
-
- /**
- * Initializes the option set with the argument collators
- * @param collator option to use
- */
- OptionSet(RuleBasedCollator collator)
- {
- m_variableTopValue_ = collator.m_variableTopValue_;
- m_isFrenchCollation_ = collator.isFrenchCollation();
- m_isAlternateHandlingShifted_
- = collator.isAlternateHandlingShifted();
- m_caseFirst_ = collator.m_caseFirst_;
- m_isCaseLevel_ = collator.isCaseLevel();
- m_decomposition_ = collator.getDecomposition();
- m_strength_ = collator.getStrength();
- m_isHiragana4_ = collator.m_isHiragana4_;
-
- if(collator.m_reorderCodes_ != null){
- m_scriptOrder_ = new int[collator.m_reorderCodes_.length];
- for(int i = 0; i < m_scriptOrder_.length; i++){
- m_scriptOrder_[i] = collator.m_reorderCodes_[i];
- }
- }
-
- }
-
- // package private data members --------------------------------------
-
- int m_variableTopValue_;
- boolean m_isFrenchCollation_;
- /**
- * Attribute for handling variable elements
- */
- boolean m_isAlternateHandlingShifted_;
- /**
- * who goes first, lower case or uppercase
- */
- int m_caseFirst_;
- /**
- * do we have an extra case level
- */
- boolean m_isCaseLevel_;
- /**
- * attribute for normalization
- */
- int m_decomposition_;
- /**
- * attribute for strength
- */
- int m_strength_;
- /**
- * attribute for special Hiragana
- */
- boolean m_isHiragana4_;
-
- /**
- * the ordering of the scripts
- */
- int[] m_scriptOrder_;
- }
-
- /**
- * List of tokens used by the collation rules
- */
- static class TokenListHeader
- {
- Token m_first_;
- Token m_last_;
- Token m_reset_;
- boolean m_indirect_;
- int m_baseCE_;
- int m_baseContCE_;
- int m_nextCE_;
- int m_nextContCE_;
- int m_previousCE_;
- int m_previousContCE_;
- int m_pos_[] = new int[Collator.IDENTICAL + 1];
- int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
- int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
- int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
- Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
- Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
- }
-
- /**
- * Token wrapper for collation rules
- */
- static class Token
- {
- // package private data members ---------------------------------------
-
- int m_CE_[];
- int m_CELength_;
- int m_expCE_[];
- int m_expCELength_;
- int m_source_;
- int m_expansion_;
- int m_prefix_;
- int m_strength_;
- int m_toInsert_;
- int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
- TokenListHeader m_listHeader_;
- Token m_previous_;
- Token m_next_;
- StringBuilder m_rules_;
- char m_flags_;
-
- // package private constructors ---------------------------------------
-
- Token()
- {
- m_CE_ = new int[128];
- m_expCE_ = new int[128];
- // TODO: this should also handle reverse
- m_polarity_ = TOKEN_POLARITY_POSITIVE_;
- m_next_ = null;
- m_previous_ = null;
- m_CELength_ = 0;
- m_expCELength_ = 0;
- }
-
- // package private methods --------------------------------------------
-
- /**
- * Hashcode calculation for token
- * @return the hashcode
- */
- public int hashCode()
- {
- int result = 0;
- int len = (m_source_ & 0xFF000000) >>> 24;
- int inc = ((len - 32) / 32) + 1;
-
- int start = m_source_ & 0x00FFFFFF;
- int limit = start + len;
-
- while (start < limit) {
- result = (result * 37) + m_rules_.charAt(start);
- start += inc;
- }
- return result;
- }
-
- /**
- * Equals calculation
- * @param target object to compare
- * @return true if target is the same as this object
- */
- public boolean equals(Object target)
- {
- if (target == this) {
- return true;
- }
- if (target instanceof Token) {
- Token t = (Token)target;
- int sstart = m_source_ & 0x00FFFFFF;
- int tstart = t.m_source_ & 0x00FFFFFF;
- int slimit = (m_source_ & 0xFF000000) >> 24;
- int tlimit = (m_source_ & 0xFF000000) >> 24;
-
- int end = sstart + slimit - 1;
-
- if (m_source_ == 0 || t.m_source_ == 0) {
- return false;
- }
- if (slimit != tlimit) {
- return false;
- }
- if (m_source_ == t.m_source_) {
- return true;
- }
-
- while (sstart < end
- && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))
- {
- ++ sstart;
- ++ tstart;
- }
- if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
- return true;
- }
- }
- return false;
- }
- }
-
- // package private data member -------------------------------------------
-
- /**
- * Indicator that the token is resetted yet, ie & in the rules
- */
- static final int TOKEN_RESET_ = 0xDEADBEEF;
-
- /**
- * Size of the number of tokens
- */
- int m_resultLength_;
- /**
- * List of parsed tokens
- */
- TokenListHeader m_listHeader_[];
- /**
- * Variable top token
- */
- Token m_variableTop_;
- /**
- * Collation options
- */
- OptionSet m_options_;
- /**
- * Normalized collation rules with some extra characters
- */
- StringBuilder m_source_;
- /**
- * Hash table to keep all tokens
- */
- Map<Token, Token> m_hashTable_;
-
- // package private method ------------------------------------------------
-
- void setDefaultOptionsInCollator(RuleBasedCollator collator)
- {
- collator.m_defaultStrength_ = m_options_.m_strength_;
- collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
- collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
- collator.m_defaultIsAlternateHandlingShifted_
- = m_options_.m_isAlternateHandlingShifted_;
- collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
- collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
- collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
- collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
- if(m_options_.m_scriptOrder_ != null) {
- collator.m_defaultReorderCodes_ = m_options_.m_scriptOrder_.clone();
- } else {
- collator.m_defaultReorderCodes_ = null;
- }
- }
-
- // private inner classes -------------------------------------------------
-
- /**
- * This is a token that has been parsed but not yet processed. Used to
- * reduce the number of arguments in the parser
- */
- private static class ParsedToken
- {
- // private constructor ----------------------------------------------
-
- /**
- * Empty constructor
- */
- ParsedToken()
- {
- m_charsLen_ = 0;
- m_charsOffset_ = 0;
- m_extensionLen_ = 0;
- m_extensionOffset_ = 0;
- m_prefixLen_ = 0;
- m_prefixOffset_ = 0;
- m_flags_ = 0;
- m_strength_ = TOKEN_UNSET_;
- }
-
- // private data members ---------------------------------------------
-
- int m_strength_;
- int m_charsOffset_;
- int m_charsLen_;
- int m_extensionOffset_;
- int m_extensionLen_;
- int m_prefixOffset_;
- int m_prefixLen_;
- char m_flags_;
- char m_indirectIndex_;
- }
-
- /**
- * Boundary wrappers
- */
- private static class IndirectBoundaries
- {
- // package private constructor ---------------------------------------
-
- IndirectBoundaries(int startce[], int limitce[])
- {
- // Set values for the top - TODO: once we have values for all the
- // indirects, we are going to initalize here.
- m_startCE_ = startce[0];
- m_startContCE_ = startce[1];
- if (limitce != null) {
- m_limitCE_ = limitce[0];
- m_limitContCE_ = limitce[1];
- }
- else {
- m_limitCE_ = 0;
- m_limitContCE_ = 0;
- }
- }
-
- // package private data members --------------------------------------
-
- int m_startCE_;
- int m_startContCE_;
- int m_limitCE_;
- int m_limitContCE_;
- }
-
- /**
- * Collation option rule tag
- */
- private static class TokenOption
- {
- // package private constructor ---------------------------------------
-
- TokenOption(String name, int attribute, String suboptions[],
- int suboptionattributevalue[])
- {
- m_name_ = name;
- m_attribute_ = attribute;
- m_subOptions_ = suboptions;
- m_subOptionAttributeValues_ = suboptionattributevalue;
- }
-
- // package private data member ---------------------------------------
-
- private String m_name_;
- private int m_attribute_;
- private String m_subOptions_[];
- private int m_subOptionAttributeValues_[];
- }
-
- // private variables -----------------------------------------------------
-
- /**
- * Current parsed token
- */
- private ParsedToken m_parsedToken_;
- /**
- * Collation rule
- */
- private String m_rules_;
- private int m_current_;
- /**
- * End of the option while reading.
- * Need it for UnicodeSet reading support.
- */
- private int m_optionEnd_;
- /*
- * Current offset in m_source
- */
- //private int m_sourceLimit_;
- /**
- * Offset to m_source_ ofr the extra expansion characters
- */
- private int m_extraCurrent_;
-
- /**
- * UnicodeSet that contains code points to be copied from the UCA
- */
- UnicodeSet m_copySet_;
-
- /**
- * UnicodeSet that contains code points for which we want to remove
- * UCA contractions. It implies copying of these code points from
- * the UCA.
- */
- UnicodeSet m_removeSet_;
-
- /*
- * This is space for the extra strings that need to be unquoted during the
- * parsing of the rules
- */
- //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
- /**
- * Indicator that the token is not set yet
- */
- private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
- /*
- * Indicator that the rule is in the > polarity, ie everything on the
- * right of the rule is less than
- */
- //private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
- /**
- * Indicator that the rule is in the < polarity, ie everything on the
- * right of the rule is greater than
- */
- private static final int TOKEN_POLARITY_POSITIVE_ = 1;
- /**
- * Flag mask to determine if top is set
- */
- private static final int TOKEN_TOP_MASK_ = 0x04;
- /**
- * Flag mask to determine if variable top is set
- */
- private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
- /**
- * Flag mask to determine if a before attribute is set
- */
- private static final int TOKEN_BEFORE_ = 0x03;
- /**
- * For use in parsing token options
- */
- private static final int TOKEN_SUCCESS_MASK_ = 0x10;
-
- /**
- * These values are used for finding CE values for indirect positioning.
- * Indirect positioning is a mechanism for allowing resets on symbolic
- * values. It only works for resets and you cannot tailor indirect names.
- * An indirect name can define either an anchor point or a range. An anchor
- * point behaves in exactly the same way as a code point in reset would,
- * except that it cannot be tailored. A range (we currently only know for
- * the [top] range will explicitly set the upper bound for generated CEs,
- * thus allowing for better control over how many CEs can be squeezed
- * between in the range without performance penalty. In that respect, we use
- * [top] for tailoring of locales that use CJK characters. Other indirect
- * values are currently a pure convenience, they can be used to assure that
- * the CEs will be always positioned in the same place relative to a point
- * with known properties (e.g. first primary ignorable).
- */
- private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];
-
-// /**
-// * Inverse UCA constants
-// */
-// private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
-// private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
-// private static final int INVERSE_SHIFT_VALUE_ = 20;
-
- /**
- * Collation option tags
- * [last variable] last variable value
- * [last primary ignorable] largest CE for primary ignorable
- * [last secondary ignorable] largest CE for secondary ignorable
- * [last tertiary ignorable] largest CE for tertiary ignorable
- * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
- */
- private static final TokenOption RULES_OPTIONS_[];
-
- // Static initializer: builds the 15-entry indirect-boundary table and the
- // 20-entry rule-option table used when parsing "[...]" settings.
- static
- {
- INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
- // UCOL_RESET_TOP_VALUE
- INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
- // UCOL_FIRST_PRIMARY_IGNORABLE
- INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
- null);
- // UCOL_LAST_PRIMARY_IGNORABLE
- INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
- null);
-
- // UCOL_FIRST_SECONDARY_IGNORABLE
- INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
- null);
- // UCOL_LAST_SECONDARY_IGNORABLE
- INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
- null);
- // UCOL_FIRST_TERTIARY_IGNORABLE
- INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
- null);
- // UCOL_LAST_TERTIARY_IGNORABLE
- INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
- null);
- // UCOL_FIRST_VARIABLE;
- INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
- null);
- // UCOL_LAST_VARIABLE
- INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
- null);
- // UCOL_FIRST_NON_VARIABLE
- INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
- null);
- // UCOL_LAST_NON_VARIABLE
- INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
- // UCOL_FIRST_IMPLICIT
- INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
- null);
- // UCOL_LAST_IMPLICIT
- INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
- // UCOL_FIRST_TRAILING
- INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
- null);
- // UCOL_LAST_TRAILING
- INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
- RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
- null);
- // [last trailing] also has an explicit upper limit CE (special primaries).
- INDIRECT_BOUNDARIES_[14].m_limitCE_
- = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;
-
- RULES_OPTIONS_ = new TokenOption[20];
- String option[] = {"non-ignorable", "shifted"};
- int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
- RuleBasedCollator.AttributeValue.SHIFTED_};
- RULES_OPTIONS_[0] = new TokenOption("alternate",
- RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
- option, value);
- option = new String[1];
- option[0] = "2";
- value = new int[1];
- value[0] = RuleBasedCollator.AttributeValue.ON_;
- RULES_OPTIONS_[1] = new TokenOption("backwards",
- RuleBasedCollator.Attribute.FRENCH_COLLATION_,
- option, value);
- String offonoption[] = new String[2];
- offonoption[0] = "off";
- offonoption[1] = "on";
- int offonvalue[] = new int[2];
- offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
- offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
- RULES_OPTIONS_[2] = new TokenOption("caseLevel",
- RuleBasedCollator.Attribute.CASE_LEVEL_,
- offonoption, offonvalue);
- option = new String[3];
- option[0] = "lower";
- option[1] = "upper";
- option[2] = "off";
- value = new int[3];
- value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
- value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
- value[2] = RuleBasedCollator.AttributeValue.OFF_;
- RULES_OPTIONS_[3] = new TokenOption("caseFirst",
- RuleBasedCollator.Attribute.CASE_FIRST_,
- option, value);
- RULES_OPTIONS_[4] = new TokenOption("normalization",
- RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
- offonoption, offonvalue);
- RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
- RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
- offonoption, offonvalue);
- option = new String[5];
- option[0] = "1";
- option[1] = "2";
- option[2] = "3";
- option[3] = "4";
- option[4] = "I";
- value = new int[5];
- value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
- value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
- value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
- value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
- value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
- RULES_OPTIONS_[6] = new TokenOption("strength",
- RuleBasedCollator.Attribute.STRENGTH_,
- option, value);
- // Options below pass Attribute.LIMIT_ — apparently a placeholder for
- // options that are handled specially rather than mapped to an attribute.
- RULES_OPTIONS_[7] = new TokenOption("variable top",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- RULES_OPTIONS_[8] = new TokenOption("rearrange",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- option = new String[3];
- option[0] = "1";
- option[1] = "2";
- option[2] = "3";
- value = new int[3];
- value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
- value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
- value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
- RULES_OPTIONS_[9] = new TokenOption("before",
- RuleBasedCollator.Attribute.LIMIT_,
- option, value);
- RULES_OPTIONS_[10] = new TokenOption("top",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- String firstlastoption[] = new String[7];
- firstlastoption[0] = "primary";
- firstlastoption[1] = "secondary";
- firstlastoption[2] = "tertiary";
- firstlastoption[3] = "variable";
- firstlastoption[4] = "regular";
- firstlastoption[5] = "implicit";
- firstlastoption[6] = "trailing";
-
- int firstlastvalue[] = new int[7];
- // every "first"/"last" variant shares PRIMARY_ as its stored value
- Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);
-
- RULES_OPTIONS_[11] = new TokenOption("first",
- RuleBasedCollator.Attribute.LIMIT_,
- firstlastoption, firstlastvalue);
- RULES_OPTIONS_[12] = new TokenOption("last",
- RuleBasedCollator.Attribute.LIMIT_,
- firstlastoption, firstlastvalue);
- RULES_OPTIONS_[13] = new TokenOption("optimize",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- RULES_OPTIONS_[15] = new TokenOption("undefined",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- RULES_OPTIONS_[16] = new TokenOption("reorder",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- RULES_OPTIONS_[17] = new TokenOption("charsetname",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- RULES_OPTIONS_[18] = new TokenOption("charset",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- RULES_OPTIONS_[19] = new TokenOption("import",
- RuleBasedCollator.Attribute.LIMIT_,
- null, null);
- }
-
- /**
- * Utility data members
- */
- private Token m_utilToken_ = new Token();
- private CollationElementIterator m_UCAColEIter_
- = RuleBasedCollator.UCA_.getCollationElementIterator("");
- private int m_utilCEBuffer_[] = new int[2];
-
- private boolean m_isStarred_;
-
- private int m_currentStarredCharIndex_;
-
-
- private int m_lastStarredCharIndex_;
-
- private int m_currentRangeCp_;
-
- private int m_lastRangeCp_;
-
- private boolean m_inRange_;
-
- private int m_previousCp_;
-
- private boolean m_savedIsStarred_;
-
-
- // private methods -------------------------------------------------------
-
- /**
- * Assembles the token list
- * @exception ParseException thrown when rules syntax fails
- */
- int assembleTokenList() throws ParseException
- {
- Token lastToken = null;
- m_parsedToken_.m_strength_ = TOKEN_UNSET_;
- int sourcelimit = m_source_.length();
- int expandNext = 0;
-
- m_isStarred_ = false;
-
- while (m_current_ < sourcelimit || m_isStarred_) {
- m_parsedToken_.m_prefixOffset_ = 0;
- if (parseNextToken(lastToken == null) < 0) {
- // we have reached the end
- continue;
- }
- char specs = m_parsedToken_.m_flags_;
- boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
- boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
- int lastStrength = TOKEN_UNSET_;
- if (lastToken != null) {
- lastStrength = lastToken.m_strength_;
- }
- m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
- | m_parsedToken_.m_charsOffset_;
- m_utilToken_.m_rules_ = m_source_;
- // 4 Lookup each source in the CharsToToken map, and find a
- // sourcetoken
- Token sourceToken = m_hashTable_.get(m_utilToken_);
- if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
- if (lastToken == null) {
- // this means that rules haven't started properly
- throwParseException(m_source_.toString(), 0);
- }
- // 6 Otherwise (when relation != reset)
- if (sourceToken == null) {
- // If sourceToken is null, create new one
- sourceToken = new Token();
- sourceToken.m_rules_ = m_source_;
- sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
- | m_parsedToken_.m_charsOffset_;
- sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
- | m_parsedToken_.m_prefixOffset_;
- // TODO: this should also handle reverse
- sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
- sourceToken.m_next_ = null;
- sourceToken.m_previous_ = null;
- sourceToken.m_CELength_ = 0;
- sourceToken.m_expCELength_ = 0;
- m_hashTable_.put(sourceToken, sourceToken);
- }
- else {
- // we could have fished out a reset here
- if (sourceToken.m_strength_ != TOKEN_RESET_
- && lastToken != sourceToken) {
- // otherwise remove sourceToken from where it was.
-
- // Take care of the next node
- if (sourceToken.m_next_ != null) {
- if (sourceToken.m_next_.m_strength_
- > sourceToken.m_strength_) {
- sourceToken.m_next_.m_strength_
- = sourceToken.m_strength_;
- }
- sourceToken.m_next_.m_previous_
- = sourceToken.m_previous_;
- }
- else {
- // sourcetoken is the last token.
- // Redefine the tail token.
- sourceToken.m_listHeader_.m_last_
- = sourceToken.m_previous_;
- }
-
- // Take care of the previous node.
- if (sourceToken.m_previous_ != null) {
- sourceToken.m_previous_.m_next_
- = sourceToken.m_next_;
- }
- else {
- // sourcetoken is the first token.
- // Redefine the head node.
- sourceToken.m_listHeader_.m_first_
- = sourceToken.m_next_;
- }
- sourceToken.m_next_ = null;
- sourceToken.m_previous_ = null;
- }
- }
- sourceToken.m_strength_ = m_parsedToken_.m_strength_;
- sourceToken.m_listHeader_ = lastToken.m_listHeader_;
-
- // 1. Find the strongest strength in each list, and set
- // strongestP and strongestN accordingly in the headers.
- if (lastStrength == TOKEN_RESET_
- || sourceToken.m_listHeader_.m_first_ == null) {
- // If LAST is a reset insert sourceToken in the list.
- if (sourceToken.m_listHeader_.m_first_ == null) {
- sourceToken.m_listHeader_.m_first_ = sourceToken;
- sourceToken.m_listHeader_.m_last_ = sourceToken;
- }
- else { // we need to find a place for us
- // and we'll get in front of the same strength
- if (sourceToken.m_listHeader_.m_first_.m_strength_
- <= sourceToken.m_strength_) {
- sourceToken.m_next_
- = sourceToken.m_listHeader_.m_first_;
- sourceToken.m_next_.m_previous_ = sourceToken;
- sourceToken.m_listHeader_.m_first_ = sourceToken;
- sourceToken.m_previous_ = null;
- }
- else {
- lastToken = sourceToken.m_listHeader_.m_first_;
- while (lastToken.m_next_ != null
- && lastToken.m_next_.m_strength_
- > sourceToken.m_strength_) {
- lastToken = lastToken.m_next_;
- }
- if (lastToken.m_next_ != null) {
- lastToken.m_next_.m_previous_ = sourceToken;
- }
- else {
- sourceToken.m_listHeader_.m_last_
- = sourceToken;
- }
- sourceToken.m_previous_ = lastToken;
- sourceToken.m_next_ = lastToken.m_next_;
- lastToken.m_next_ = sourceToken;
- }
- }
- }
- else {
- // Otherwise (when LAST is not a reset)
- // if polarity (LAST) == polarity(relation), insert
- // sourceToken after LAST, otherwise insert before.
- // when inserting after or before, search to the next
- // position with the same strength in that direction.
- // (This is called postpone insertion).
- if (sourceToken != lastToken) {
- if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
- while (lastToken.m_next_ != null
- && lastToken.m_next_.m_strength_
- > sourceToken.m_strength_) {
- lastToken = lastToken.m_next_;
- }
- sourceToken.m_previous_ = lastToken;
- if (lastToken.m_next_ != null) {
- lastToken.m_next_.m_previous_ = sourceToken;
- }
- else {
- sourceToken.m_listHeader_.m_last_ = sourceToken;
- }
- sourceToken.m_next_ = lastToken.m_next_;
- lastToken.m_next_ = sourceToken;
- }
- else {
- while (lastToken.m_previous_ != null
- && lastToken.m_previous_.m_strength_
- > sourceToken.m_strength_) {
- lastToken = lastToken.m_previous_;
- }
- sourceToken.m_next_ = lastToken;
- if (lastToken.m_previous_ != null) {
- lastToken.m_previous_.m_next_ = sourceToken;
- }
- else {
- sourceToken.m_listHeader_.m_first_
- = sourceToken;
- }
- sourceToken.m_previous_ = lastToken.m_previous_;
- lastToken.m_previous_ = sourceToken;
- }
- }
- else { // repeated one thing twice in rules, stay with the
- // stronger strength
- if (lastStrength < sourceToken.m_strength_) {
- sourceToken.m_strength_ = lastStrength;
- }
- }
- }
- // if the token was a variable top, we're gonna put it in
- if (variableTop == true && m_variableTop_ == null) {
- variableTop = false;
- m_variableTop_ = sourceToken;
- }
- // Treat the expansions.
- // There are two types of expansions: explicit (x / y) and
- // reset based propagating expansions
- // (&abc * d * e <=> &ab * d / c * e / c)
- // if both of them are in effect for a token, they are combined.
- sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
- | m_parsedToken_.m_extensionOffset_;
- if (expandNext != 0) {
- if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
- // primary strength kills off the implicit expansion
- expandNext = 0;
- }
- else if (sourceToken.m_expansion_ == 0) {
- // if there is no expansion, implicit is just added to
- // the token
- sourceToken.m_expansion_ = expandNext;
- }
- else {
- // there is both explicit and implicit expansion.
- // We need to make a combination
- int start = expandNext & 0xFFFFFF;
- int size = expandNext >>> 24;
- if (size > 0) {
- m_source_.append(m_source_.substring(start,
- start + size));
- }
- start = m_parsedToken_.m_extensionOffset_;
- m_source_.append(m_source_.substring(start,
- start + m_parsedToken_.m_extensionLen_));
- sourceToken.m_expansion_ = (size
- + m_parsedToken_.m_extensionLen_) << 24
- | m_extraCurrent_;
- m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
- }
- }
- // if the previous token was a reset before, the strength of this
- // token must match the strength of before. Otherwise we have an
- // undefined situation.
- // In other words, we currently have a cludge which we use to
- // represent &a >> x. This is written as &[before 2]a << x.
- if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
- int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;
- if(beforeStrength != sourceToken.m_strength_) {
- throwParseException(m_source_.toString(), m_current_);
- }
- }
-
- }
- else {
- if (lastToken != null && lastStrength == TOKEN_RESET_) {
- // if the previous token was also a reset, this means that
- // we have two consecutive resets and we want to remove the
- // previous one if empty
- if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
- m_resultLength_ --;
- }
- }
- if (sourceToken == null) {
- // this is a reset, but it might still be somewhere in the
- // tailoring, in shorter form
- int searchCharsLen = m_parsedToken_.m_charsLen_;
- while (searchCharsLen > 1 && sourceToken == null) {
- searchCharsLen --;
- // key = searchCharsLen << 24 | charsOffset;
- m_utilToken_.m_source_ = searchCharsLen << 24
- | m_parsedToken_.m_charsOffset_;
- m_utilToken_.m_rules_ = m_source_;
- sourceToken = m_hashTable_.get(m_utilToken_);
- }
- if (sourceToken != null) {
- expandNext = (m_parsedToken_.m_charsLen_
- - searchCharsLen) << 24
- | (m_parsedToken_.m_charsOffset_
- + searchCharsLen);
- }
- }
- if ((specs & TOKEN_BEFORE_) != 0) {
- if (top == false) {
- // we're doing before & there is no indirection
- int strength = (specs & TOKEN_BEFORE_) - 1;
- if (sourceToken != null
- && sourceToken.m_strength_ != TOKEN_RESET_) {
- // this is a before that is already ordered in the UCA
- // - so we need to get the previous with good strength
- while (sourceToken.m_strength_ > strength
- && sourceToken.m_previous_ != null) {
- sourceToken = sourceToken.m_previous_;
- }
- // here, either we hit the strength or NULL
- if (sourceToken.m_strength_ == strength) {
- if (sourceToken.m_previous_ != null) {
- sourceToken = sourceToken.m_previous_;
- }
- else { // start of list
- sourceToken
- = sourceToken.m_listHeader_.m_reset_;
- }
- }
- else { // we hit NULL, we should be doing the else part
- sourceToken
- = sourceToken.m_listHeader_.m_reset_;
- sourceToken = getVirginBefore(sourceToken,
- strength);
- }
- }
- else {
- sourceToken
- = getVirginBefore(sourceToken, strength);
- }
- }
- else {
- // this is both before and indirection
- top = false;
- m_listHeader_[m_resultLength_] = new TokenListHeader();
- m_listHeader_[m_resultLength_].m_previousCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
- m_listHeader_[m_resultLength_].m_indirect_ = true;
- // we need to do slightly more work. we need to get the
- // baseCE using the inverse UCA & getPrevious. The next
- // bound is not set, and will be decided in ucol_bld
- int strength = (specs & TOKEN_BEFORE_) - 1;
- int baseCE = INDIRECT_BOUNDARIES_[
- m_parsedToken_.m_indirectIndex_].m_startCE_;
- int baseContCE = INDIRECT_BOUNDARIES_[
- m_parsedToken_.m_indirectIndex_].m_startContCE_;
- int ce[] = new int[2];
- if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
- && (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
- int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
- int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
- int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
- ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
- ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
- } else {
- CollationParsedRuleBuilder.InverseUCA invuca
- = CollationParsedRuleBuilder.INVERSE_UCA_;
- invuca.getInversePrevCE(baseCE, baseContCE, strength,
- ce);
- }
- m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
- m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
- m_listHeader_[m_resultLength_].m_nextCE_ = 0;
- m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
-
- sourceToken = new Token();
- expandNext = initAReset(0, sourceToken);
- }
- }
- // 5 If the relation is a reset:
- // If sourceToken is null
- // Create new list, create new sourceToken, make the baseCE
- // from source, put the sourceToken in ListHeader of the new
- // list
- if (sourceToken == null) {
- if (m_listHeader_[m_resultLength_] == null) {
- m_listHeader_[m_resultLength_] = new TokenListHeader();
- }
- // 3 Consider each item: relation, source, and expansion:
- // e.g. ...< x / y ...
- // First convert all expansions into normal form.
- // Examples:
- // If "xy" doesn't occur earlier in the list or in the UCA,
- // convert &xy * c * d * ... into &x * c/y * d * ...
- // Note: reset values can never have expansions, although
- // they can cause the very next item to have one. They may
- // be contractions, if they are found earlier in the list.
- if (top == false) {
- CollationElementIterator coleiter
- = RuleBasedCollator.UCA_.getCollationElementIterator(
- m_source_.substring(m_parsedToken_.m_charsOffset_,
- m_parsedToken_.m_charsOffset_
- + m_parsedToken_.m_charsLen_));
-
- int CE = coleiter.next();
- // offset to the character in the full rule string
- int expand = coleiter.getOffset()
- + m_parsedToken_.m_charsOffset_;
- int SecondCE = coleiter.next();
-
- m_listHeader_[m_resultLength_].m_baseCE_
- = CE & 0xFFFFFF3F;
- if (RuleBasedCollator.isContinuation(SecondCE)) {
- m_listHeader_[m_resultLength_].m_baseContCE_
- = SecondCE;
- }
- else {
- m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
- }
- m_listHeader_[m_resultLength_].m_nextCE_ = 0;
- m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
- m_listHeader_[m_resultLength_].m_indirect_ = false;
- sourceToken = new Token();
- expandNext = initAReset(expand, sourceToken);
- }
- else { // top == TRUE
- top = false;
- m_listHeader_[m_resultLength_].m_previousCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
- m_listHeader_[m_resultLength_].m_indirect_ = true;
- IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
- m_parsedToken_.m_indirectIndex_];
- m_listHeader_[m_resultLength_].m_baseCE_
- = ib.m_startCE_;
- m_listHeader_[m_resultLength_].m_baseContCE_
- = ib.m_startContCE_;
- m_listHeader_[m_resultLength_].m_nextCE_
- = ib.m_limitCE_;
- m_listHeader_[m_resultLength_].m_nextContCE_
- = ib.m_limitContCE_;
- sourceToken = new Token();
- expandNext = initAReset(0, sourceToken);
- }
- }
- else { // reset to something already in rules
- top = false;
- }
- }
- // 7 After all this, set LAST to point to sourceToken, and goto
- // step 3.
- lastToken = sourceToken;
- }
-
- if (m_resultLength_ > 0
- && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
- m_resultLength_ --;
- }
- return m_resultLength_;
- }
-
- /**
- * Formats and throws a ParseException
- * @param rules collation rule that failed
- * @param offset failed offset in rules
- * @throws ParseException with failure information
- */
- private static final void throwParseException(String rules, int offset)
- throws ParseException
- {
- // for pre-context
- String precontext = rules.substring(0, offset);
- String postcontext = rules.substring(offset, rules.length());
- StringBuilder error = new StringBuilder(
- "Parse error occurred in rule at offset ");
- error.append(offset);
- error.append("\n after the prefix \"");
- error.append(precontext);
- error.append("\" before the suffix \"");
- error.append(postcontext);
- throw new ParseException(error.toString(), offset);
- }
-
- /**
- * Materializes an indirect boundary as synthetic token text: appends a
- * 0xFFFE marker followed by the boundary's start CE (split into two
- * chars), and, when nonzero, its start continuation CE, to m_source_,
- * then points m_parsedToken_ at that appended text (length 3 or 5).
- * @return always true
- */
- private final boolean doSetTop() {
- m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
- m_source_.append((char)0xFFFE);
- IndirectBoundaries ib =
- INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
- // start CE stored as two UTF-16 code units: high half, then low half
- m_source_.append((char)(ib.m_startCE_ >> 16));
- m_source_.append((char)(ib.m_startCE_ & 0xFFFF));
- m_extraCurrent_ += 3;
- if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_
- ].m_startContCE_ == 0) {
- m_parsedToken_.m_charsLen_ = 3;
- }
- else {
- // continuation CE present: append its two halves as well
- m_source_.append((char)(INDIRECT_BOUNDARIES_[
- m_parsedToken_.m_indirectIndex_
- ].m_startContCE_ >> 16));
- m_source_.append((char)(INDIRECT_BOUNDARIES_[
- m_parsedToken_.m_indirectIndex_
- ].m_startContCE_ & 0xFFFF));
- m_extraCurrent_ += 2;
- m_parsedToken_.m_charsLen_ = 5;
- }
- return true;
- }
-
- /**
- * Tests whether a character is one of the six Unicode line terminators
- * (LF, CR, FF, NEL, LS, PS).
- * @param c character to test
- * @return true if c is a line terminator
- */
- private static boolean isCharNewLine(char c) {
- switch (c) {
- case 0x000A: /* LF */
- case 0x000D: /* CR */
- case 0x000C: /* FF */
- case 0x0085: /* NEL */
- case 0x2028: /* LS */
- case 0x2029: /* PS */
- return true;
- default:
- return false;
- }
- }
-
- /**
- * Parses the next token.
- *
- * It updates/accesses the following member variables:
- * m_current_: Index to the next unparsed character (not code point)
- * in the character array (a StringBuilder object) m_source_.
- * m_parsedToken_: The parsed token. The following of the token are updated.
- * .m_strength: The strength of the token.
- * .m_charsOffset, m_charsLen_: Index to the first character (after operators),
- * and number of characters in the token.
- * This may be in the main string, or in the appended string.
- * .m_extensionOffset_, .m_extensionLen_:
- * .m_flags:
- * .m_prefixOffset, .m_prefixLen: Used when "|" is used to specify "context before".
- * .m_indirectIndex:
- * @param startofrules
- * flag indicating if we are at the start of rules
- * @return the offset of the next unparsed char
- * @exception ParseException
- * thrown when rule parsing fails
- */
- private int parseNextToken(boolean startofrules) throws ParseException
- {
-
- if (m_inRange_) {
- // We are not done processing a range. Continue it.
- return processNextCodePointInRange();
- } else if (m_isStarred_) {
- // We are not done processing a starred token. Continue it.
- return processNextTokenInTheStarredList();
- }
-
- // Get the next token.
- int nextOffset = parseNextTokenInternal(startofrules);
-
- // If the next token is starred and/or in range, we need to handle it here.
- if (m_inRange_) {
- // A new range has started.
- // Check whether it is a chain of ranges with more than one hyphen.
- if (m_lastRangeCp_ > 0 && m_lastRangeCp_ == m_previousCp_) {
- throw new ParseException("Chained range syntax", m_current_);
- }
-
- // The current token is the first character of the second code point of the range.
- // Process just that, and then proceed with the star.
- m_lastRangeCp_ = m_source_.codePointAt(this.m_parsedToken_.m_charsOffset_);
- if (m_lastRangeCp_ <= m_previousCp_) {
- throw new ParseException("Invalid range", m_current_);
- }
-
- // Set current range code point to process the range loop
- m_currentRangeCp_ = m_previousCp_ + 1;
-
- // Set current starred char index to continue processing the starred
- // expression after the range is done.
- m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_
- + Character.charCount(m_lastRangeCp_);
- m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1;
-
- return processNextCodePointInRange();
- } else if (m_isStarred_) {
- // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
- // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
- // separated into several tokens and returned.
- m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_;
- m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1;
-
- return processNextTokenInTheStarredList();
- }
- return nextOffset;
- }
-
- /**
- * Emits the next code point of an active range ("a-e") as a token by
- * appending it to m_source_ and pointing m_parsedToken_ at it, then
- * advances the range state. When the range is exhausted, the range flag
- * is cleared and any remaining starred processing is resumed.
- * Note: m_previousCp_ is advanced to the code point that will be emitted
- * next; on the final iteration it is left equal to m_lastRangeCp_, which
- * is what the chained-range check in parseNextToken relies on.
- * @return the offset of the next unparsed char (m_current_)
- * @throws ParseException declared for caller consistency; no throw here
- */
- private int processNextCodePointInRange() throws ParseException {
- int nChars = Character.charCount(m_currentRangeCp_);
- m_source_.appendCodePoint(m_currentRangeCp_);
-
- // token text lives in the appended portion of m_source_
- m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
- m_parsedToken_.m_charsLen_ = nChars;
-
- m_extraCurrent_ += nChars;
- ++m_currentRangeCp_;
- if (m_currentRangeCp_ > m_lastRangeCp_) {
- // All the code points in the range are processed.
- // Turn the range flag off.
- m_inRange_ = false;
-
- // If there is a starred portion remaining in the current
- // parsed token, resume the starred operation.
- if (m_currentStarredCharIndex_ <= m_lastStarredCharIndex_) {
- m_isStarred_ = true;
- } else {
- m_isStarred_ = false;
- }
- } else {
- m_previousCp_ = m_currentRangeCp_;
- }
- return m_current_;
- }
-
-
- /**
- * Extracts the next token from the starred token from
- * m_currentStarredCharIndex_ and returns it.
- * @return the offset of the next unparsed char
- * @throws ParseException
- */
- private int processNextTokenInTheStarredList() throws ParseException {
- // Extract the characters corresponding to the next code point.
- int cp = m_source_.codePointAt(m_currentStarredCharIndex_);
- int nChars = Character.charCount(cp);
-
- m_parsedToken_.m_charsLen_ = nChars;
- m_parsedToken_.m_charsOffset_ = m_currentStarredCharIndex_;
- m_currentStarredCharIndex_ += nChars;
-
- // When we are done parsing the starred string, turn the flag off so that
- // the normal processing is restored.
- if (m_currentStarredCharIndex_ > m_lastStarredCharIndex_) {
- m_isStarred_ = false;
- }
- m_previousCp_ = cp;
- return m_current_;
- }
-
- /**
- * Handles a strength operator that appears at the very start of the
- * rules by treating it as a reset to an indirect boundary: sets the
- * parsed token's indirect index to 5, materializes the boundary text
- * via doSetTop(), and finishes the token as a TOKEN_RESET_.
- * Note: the incoming 'top' argument is overwritten — doSetTop() always
- * returns true, so true is what is passed on.
- * @return the offset of the next unparsed char
- * @throws ParseException propagated from doEndParseNextToken
- */
- private int resetToTop(boolean top, boolean variableTop,
- int extensionOffset, int newExtensionLen,
- byte byteBefore) throws ParseException {
- m_parsedToken_.m_indirectIndex_ = 5;
- top = doSetTop();
- return doEndParseNextToken(TOKEN_RESET_,
- top,
- extensionOffset,
- newExtensionLen,
- variableTop, byteBefore);
- }
-
- /**
- * Gets the next token and sets the necessary internal variables.
- * This function parses a starred string as a single token, which will be separated
- * in the calling function.
- * @param startofrules Boolean value indicating whether this is the first rule
- * @return the offset of the next unparsed char
- * @throws ParseException
- */
- @SuppressWarnings("fallthrough")
- private int parseNextTokenInternal(boolean startofrules) throws ParseException {
- boolean variabletop = false;
- boolean top = false;
- boolean inchars = true;
- boolean inquote = false;
- boolean wasinquote = false;
- byte before = 0;
- boolean isescaped = false;
- int /*newcharslen = 0,*/ newextensionlen = 0;
- int /*charsoffset = 0,*/ extensionoffset = 0;
- int newstrength = TOKEN_UNSET_;
-
- initializeParsedToken();
-
- int limit = m_rules_.length();
- while (m_current_ < limit) {
- char ch = m_source_.charAt(m_current_);
- if (inquote) {
- if (ch == 0x0027) { // '\''
- inquote = false;
- }
- else {
- if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
- if (m_parsedToken_.m_charsLen_ == 0) {
- m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
- }
- m_parsedToken_.m_charsLen_ ++;
- }
- else {
- if (newextensionlen == 0) {
- extensionoffset = m_extraCurrent_;
- }
- newextensionlen ++;
- }
- }
- }
- else if (isescaped) {
- isescaped = false;
- if (newstrength == TOKEN_UNSET_) {
- throwParseException(m_rules_, m_current_);
- }
- if (ch != 0 && m_current_ != limit) {
- if (inchars) {
- if (m_parsedToken_.m_charsLen_ == 0) {
- m_parsedToken_.m_charsOffset_ = m_current_;
- }
- m_parsedToken_.m_charsLen_ ++;
- }
- else {
- if (newextensionlen == 0) {
- extensionoffset = m_current_;
- }
- newextensionlen ++;
- }
- }
- }
- else {
- if (!PatternProps.isWhiteSpace(ch)) {
- // Sets the strength for this entry
- switch (ch) {
- case 0x003D : // '='
- if (newstrength != TOKEN_UNSET_) {
- return doEndParseNextToken(newstrength,
- top,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
- // if we start with strength, we'll reset to top
- if (startofrules == true) {
- return resetToTop(top, variabletop, extensionoffset,
- newextensionlen, before);
- }
- newstrength = Collator.IDENTICAL;
- if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
- m_current_++;
- m_isStarred_ = true;
- }
- break;
- case 0x002C : // ','
- if (newstrength != TOKEN_UNSET_) {
- return doEndParseNextToken(newstrength,
- top,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
- // if we start with strength, we'll reset to top
- if (startofrules == true) {
- return resetToTop(top, variabletop, extensionoffset,
- newextensionlen, before);
- }
- newstrength = Collator.TERTIARY;
- break;
- case 0x003B : // ';'
- if (newstrength != TOKEN_UNSET_) {
- return doEndParseNextToken(newstrength,
- top,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
- //if we start with strength, we'll reset to top
- if(startofrules == true) {
- return resetToTop(top, variabletop, extensionoffset,
- newextensionlen, before);
- }
- newstrength = Collator.SECONDARY;
- break;
- case 0x003C : // '<'
- if (newstrength != TOKEN_UNSET_) {
- return doEndParseNextToken(newstrength,
- top,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
- // if we start with strength, we'll reset to top
- if (startofrules == true) {
- return resetToTop(top, variabletop, extensionoffset,
- newextensionlen, before);
- }
- // before this, do a scan to verify whether this is
- // another strength
- if (m_source_.charAt(m_current_ + 1) == 0x003C) {
- m_current_ ++;
- if (m_source_.charAt(m_current_ + 1) == 0x003C) {
- m_current_ ++; // three in a row!
- newstrength = Collator.TERTIARY;
- }
- else { // two in a row
- newstrength = Collator.SECONDARY;
- }
- }
- else { // just one
- newstrength = Collator.PRIMARY;
- }
- if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
- m_current_++;
- m_isStarred_ = true;
- }
- break;
-
- case 0x0026 : // '&'
- if (newstrength != TOKEN_UNSET_) {
- return doEndParseNextToken(newstrength,
- top,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
- newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
- break;
- case 0x005b : // '['
- // options - read an option, analyze it
- m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);
- if (m_optionEnd_ != -1) { // ']'
- byte result = readAndSetOption();
- m_current_ = m_optionEnd_;
- if ((result & TOKEN_TOP_MASK_) != 0) {
- if (newstrength == TOKEN_RESET_) {
- doSetTop();
- if (before != 0) {
- // This is a combination of before and
- // indirection like
- // '&[before 2][first regular]<b'
- m_source_.append((char)0x002d);
- m_source_.append((char)before);
- m_extraCurrent_ += 2;
- m_parsedToken_.m_charsLen_ += 2;
- }
- m_current_ ++;
- return doEndParseNextToken(newstrength,
- true,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
- else {
- throwParseException(m_rules_, m_current_);
- }
- }
- else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {
- if (newstrength != TOKEN_RESET_
- && newstrength != TOKEN_UNSET_) {
- variabletop = true;
- m_parsedToken_.m_charsOffset_
- = m_extraCurrent_;
- m_source_.append((char)0xFFFF);
- m_extraCurrent_ ++;
- m_current_ ++;
- m_parsedToken_.m_charsLen_ = 1;
- return doEndParseNextToken(newstrength,
- top,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
- else {
- throwParseException(m_rules_, m_current_);
- }
- }
- else if ((result & TOKEN_BEFORE_) != 0){
- if (newstrength == TOKEN_RESET_) {
- before = (byte)(result & TOKEN_BEFORE_);
- }
- else {
- throwParseException(m_rules_, m_current_);
- }
- }
- }
- break;
- case 0x002F : // '/'
- wasinquote = false; // if we were copying source
- // characters, we want to stop now
- inchars = false; // we're now processing expansion
- break;
- case 0x005C : // back slash for escaped chars
- isescaped = true;
- break;
- // found a quote, we're gonna start copying
- case 0x0027 : //'\''
- if (newstrength == TOKEN_UNSET_) {
- // quote is illegal until we have a strength
- throwParseException(m_rules_, m_current_);
- }
- inquote = true;
- if (inchars) { // we're doing characters
- if (wasinquote == false) {
- m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
- }
- if (m_parsedToken_.m_charsLen_ != 0) {
- // We are processing characters in quote together.
- // Copy whatever is in the current token, so that
- // the unquoted string can be appended to that.
- m_source_.append(m_source_.substring(
- m_current_ - m_parsedToken_.m_charsLen_,
- m_current_));
- m_extraCurrent_ += m_parsedToken_.m_charsLen_;
- }
- m_parsedToken_.m_charsLen_ ++;
- }
- else { // we're doing an expansion
- if (wasinquote == false) {
- extensionoffset = m_extraCurrent_;
- }
- if (newextensionlen != 0) {
- m_source_.append(m_source_.substring(
- m_current_ - newextensionlen,
- m_current_));
- m_extraCurrent_ += newextensionlen;
- }
- newextensionlen ++;
- }
- wasinquote = true;
- m_current_ ++;
- ch = m_source_.charAt(m_current_);
- if (ch == 0x0027) { // copy the double quote
- m_source_.append(ch);
- m_extraCurrent_ ++;
- inquote = false;
- }
- break;
- // '@' is french only if the strength is not currently set
- // if it is, it's just a regular character in collation
- case 0x0040 : // '@'
- if (newstrength == TOKEN_UNSET_) {
- m_options_.m_isFrenchCollation_ = true;
- break;
- }
- // fall through
- case 0x007C : //|
- // this means we have actually been reading prefix part
- // we want to store read characters to the prefix part
- // and continue reading the characters (proper way
- // would be to restart reading the chars, but in that
- // case we would have to complicate the token hasher,
- // which I do not intend to play with. Instead, we will
- // do prefixes when prefixes are due (before adding the
- // elements).
- m_parsedToken_.m_prefixOffset_
- = m_parsedToken_.m_charsOffset_;
- m_parsedToken_.m_prefixLen_
- = m_parsedToken_.m_charsLen_;
- if (inchars) { // we're doing characters
- if (wasinquote == false) {
- m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
- }
- if (m_parsedToken_.m_charsLen_ != 0) {
- String prefix = m_source_.substring(
- m_current_ - m_parsedToken_.m_charsLen_,
- m_current_);
- m_source_.append(prefix);
- m_extraCurrent_ += m_parsedToken_.m_charsLen_;
- }
- m_parsedToken_.m_charsLen_ ++;
- }
- wasinquote = true;
- do {
- m_current_ ++;
- ch = m_source_.charAt(m_current_);
- // skip whitespace between '|' and the character
- } while (PatternProps.isWhiteSpace(ch));
- break;
- case 0x002D : // '-', indicates a range.
- if (newstrength != TOKEN_UNSET_) {
- m_savedIsStarred_ = m_isStarred_;
- return doEndParseNextToken(newstrength,
- top,
- extensionoffset,
- newextensionlen,
- variabletop, before);
- }
-
- m_isStarred_ = m_savedIsStarred_;
- // Ranges are valid only in starred tokens.
- if (!m_isStarred_) {
- throwParseException(m_rules_, m_current_);
- }
-
- newstrength = m_parsedToken_.m_strength_;
- m_inRange_ = true;
- break;
-
- case 0x0023: // '#' // this is a comment, skip everything through the end of line
- do {
- m_current_ ++;
- ch = m_source_.charAt(m_current_);
- } while (!isCharNewLine(ch));
- break;
- case 0x0021: // '!' // ignoring java set thai reordering
- break;
- default :
- if (newstrength == TOKEN_UNSET_) {
- throwParseException(m_rules_, m_current_);
- }
- if (isSpecialChar(ch) && (inquote == false)) {
- throwParseException(m_rules_, m_current_);
- }
- if (ch == 0x0000 && m_current_ + 1 == limit) {
- break;
- }
- if (inchars) {
- if (m_parsedToken_.m_charsLen_ == 0) {
- m_parsedToken_.m_charsOffset_ = m_current_;
- }
- m_parsedToken_.m_charsLen_++;
- }
- else {
- if (newextensionlen == 0) {
- extensionoffset = m_current_;
- }
- newextensionlen ++;
- }
- break;
- }
- }
- }
- if (wasinquote) {
- if (ch != 0x27) {
- m_source_.append(ch);
- m_extraCurrent_ ++;
- }
- }
- m_current_ ++;
- }
- return doEndParseNextToken(newstrength, top,
- extensionoffset, newextensionlen,
- variabletop, before);
- }
-
-
- /**
- *
- */
- private void initializeParsedToken() {
- m_parsedToken_.m_charsLen_ = 0;
- m_parsedToken_.m_charsOffset_ = 0;
- m_parsedToken_.m_prefixOffset_ = 0;
- m_parsedToken_.m_prefixLen_ = 0;
- m_parsedToken_.m_indirectIndex_ = 0;
- }
-
- /**
- * End the next parse token
- * @param newstrength new strength
- * @return offset in rules, -1 for end of rules
- */
- private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
- boolean top, /*int charsoffset,*/
- int extensionoffset, int newextensionlen,
- boolean variabletop, int before)
- throws ParseException
- {
- if (newstrength == TOKEN_UNSET_) {
- return -1;
- }
- if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
- throwParseException(m_rules_, m_current_);
- }
-
- m_parsedToken_.m_strength_ = newstrength;
- //m_parsedToken_.m_charsOffset_ = charsoffset;
- //m_parsedToken_.m_charsLen_ = newcharslen;
- m_parsedToken_.m_extensionOffset_ = extensionoffset;
- m_parsedToken_.m_extensionLen_ = newextensionlen;
- m_parsedToken_.m_flags_ = (char)
- ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)
- | (top ? TOKEN_TOP_MASK_ : 0) | before);
- return m_current_;
- }
-
- /**
- * Token before this element
- * @param sourcetoken
- * @param strength collation strength
- * @return the token before source token
- * @exception ParseException thrown when rules have the wrong syntax
- */
- private Token getVirginBefore(Token sourcetoken, int strength)
- throws ParseException
- {
- // this is a virgin before - we need to fish the anchor from the UCA
- if (sourcetoken != null) {
- int offset = sourcetoken.m_source_ & 0xFFFFFF;
- m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
- }
- else {
- m_UCAColEIter_.setText(
- m_source_.substring(m_parsedToken_.m_charsOffset_,
- m_parsedToken_.m_charsOffset_ + 1));
- }
-
- int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
- int basecontce = m_UCAColEIter_.next();
- if (basecontce == CollationElementIterator.NULLORDER) {
- basecontce = 0;
- }
-
- int ch = 0;
-
-
- if((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
- && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
-
- int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
- int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
- ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);
- int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
- m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
- m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
-
- m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
- m_source_.append('\uFFFE');
- m_source_.append((char)ch);
- m_extraCurrent_ += 2;
- m_parsedToken_.m_charsLen_++;
-
- m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
- | m_parsedToken_.m_charsOffset_;
- m_utilToken_.m_rules_ = m_source_;
- sourcetoken = m_hashTable_.get(m_utilToken_);
-
- if(sourcetoken == null) {
- m_listHeader_[m_resultLength_] = new TokenListHeader();
- m_listHeader_[m_resultLength_].m_baseCE_
- = m_utilCEBuffer_[0] & 0xFFFFFF3F;
- if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
- m_listHeader_[m_resultLength_].m_baseContCE_
- = m_utilCEBuffer_[1];
- }
- else {
- m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
- }
- m_listHeader_[m_resultLength_].m_nextCE_ = 0;
- m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
- m_listHeader_[m_resultLength_].m_indirect_ = false;
-
- sourcetoken = new Token();
- initAReset(-1, sourcetoken);
- }
-
- } else {
-
- // first ce and second ce m_utilCEBuffer_
- /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(
- basece, basecontce,
- strength, m_utilCEBuffer_);
- // we got the previous CE. Now we need to see if the difference between
- // the two CEs is really of the requested strength.
- // if it's a bigger difference (we asked for secondary and got primary), we
- // need to modify the CE.
- if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
- // adjust the strength
- // now we are in the situation where our baseCE should actually be modified in
- // order to get the CE in the right position.
- if(strength == Collator.SECONDARY) {
- m_utilCEBuffer_[0] = basece - 0x0200;
- } else { // strength == UCOL_TERTIARY
- m_utilCEBuffer_[0] = basece - 0x02;
- }
- if(RuleBasedCollator.isContinuation(basecontce)) {
- if(strength == Collator.SECONDARY) {
- m_utilCEBuffer_[1] = basecontce - 0x0200;
- } else { // strength == UCOL_TERTIARY
- m_utilCEBuffer_[1] = basecontce - 0x02;
- }
- }
- }
-
-/*
- // the code below relies on getting a code point from the inverse table, in order to be
- // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
- // 1. There are many code points that have the same CE
- // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
- // Also, in case when there is no equivalent strength before an element, we have to actually
- // construct one. For example, &[before 2]a << x won't result in x << a, because the element
- // before a is a primary difference.
- ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos
- + 2];
- if ((ch & INVERSE_SIZE_MASK_) != 0) {
- int offset = ch & INVERSE_OFFSET_MASK_;
- ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[
- offset];
- }
- m_source_.append((char)ch);
- m_extraCurrent_ ++;
- m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
- m_parsedToken_.m_charsLen_ = 1;
-
- // We got an UCA before. However, this might have been tailored.
- // example:
- // &\u30ca = \u306a
- // &[before 3]\u306a<<<\u306a|\u309d
-
- m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
- | m_parsedToken_.m_charsOffset_;
- m_utilToken_.m_rules_ = m_source_;
- sourcetoken = (Token)m_hashTable_.get(m_utilToken_);
-*/
-
- // here is how it should be. The situation such as &[before 1]a < x, should be
- // resolved exactly as if we wrote &a > x.
- // therefore, I don't really care if the UCA value before a has been changed.
- // However, I do care if the strength between my element and the previous element
- // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
- // have to construct the base CE.
-
- // if we found a tailored thing, we have to use the UCA value and
- // construct a new reset token with constructed name
- //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
- // character to which we want to anchor is already tailored.
- // We need to construct a new token which will be the anchor point
- //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
- //m_source_.append(ch);
- //m_extraCurrent_ ++;
- //m_parsedToken_.m_charsLen_ ++;
- // grab before
- m_parsedToken_.m_charsOffset_ -= 10;
- m_parsedToken_.m_charsLen_ += 10;
- m_listHeader_[m_resultLength_] = new TokenListHeader();
- m_listHeader_[m_resultLength_].m_baseCE_
- = m_utilCEBuffer_[0] & 0xFFFFFF3F;
- if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
- m_listHeader_[m_resultLength_].m_baseContCE_
- = m_utilCEBuffer_[1];
- }
- else {
- m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
- }
- m_listHeader_[m_resultLength_].m_nextCE_ = 0;
- m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousCE_ = 0;
- m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
- m_listHeader_[m_resultLength_].m_indirect_ = false;
- sourcetoken = new Token();
- initAReset(-1, sourcetoken);
- //}
- }
- return sourcetoken;
- }
-
- /**
- * Processing Description.
- * 1. Build a m_listHeader_. Each list has a header, which contains two lists
- * (positive and negative), a reset token, a baseCE, nextCE, and
- * previousCE. The lists and reset may be null.
- * 2. As you process, you keep a LAST pointer that points to the last token
- * you handled.
- * @param expand string offset, -1 for null strings
- * @param targetToken token to update
- * @return expandnext offset
- * @throws ParseException thrown when rules syntax failed
- */
- private int initAReset(int expand, Token targetToken) throws ParseException
- {
- if (m_resultLength_ == m_listHeader_.length - 1) {
- // Unfortunately, this won't work, as we store addresses of lhs in
- // token
- TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
- System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
- m_listHeader_ = temp;
- }
- // do the reset thing
- targetToken.m_rules_ = m_source_;
- targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
- | m_parsedToken_.m_charsOffset_;
- targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
- | m_parsedToken_.m_extensionOffset_;
- // keep the flags around so that we know about before
- targetToken.m_flags_ = m_parsedToken_.m_flags_;
-
- if (m_parsedToken_.m_prefixOffset_ != 0) {
- throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);
- }
-
- targetToken.m_prefix_ = 0;
- // TODO: this should also handle reverse
- targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
- targetToken.m_strength_ = TOKEN_RESET_;
- targetToken.m_next_ = null;
- targetToken.m_previous_ = null;
- targetToken.m_CELength_ = 0;
- targetToken.m_expCELength_ = 0;
- targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
- m_listHeader_[m_resultLength_].m_first_ = null;
- m_listHeader_[m_resultLength_].m_last_ = null;
- m_listHeader_[m_resultLength_].m_first_ = null;
- m_listHeader_[m_resultLength_].m_last_ = null;
- m_listHeader_[m_resultLength_].m_reset_ = targetToken;
-
- /* 3 Consider each item: relation, source, and expansion:
- * e.g. ...< x / y ...
- * First convert all expansions into normal form. Examples:
- * If "xy" doesn't occur earlier in the list or in the UCA, convert
- * &xy * c * d * ... into &x * c/y * d * ...
- * Note: reset values can never have expansions, although they can
- * cause the very next item to have one. They may be contractions, if
- * they are found earlier in the list.
- */
- int result = 0;
- if (expand > 0) {
- // check to see if there is an expansion
- if (m_parsedToken_.m_charsLen_ > 1) {
- targetToken.m_source_ = ((expand
- - m_parsedToken_.m_charsOffset_ )
- << 24)
- | m_parsedToken_.m_charsOffset_;
- result = ((m_parsedToken_.m_charsLen_
- + m_parsedToken_.m_charsOffset_ - expand) << 24)
- | expand;
- }
- }
-
- m_resultLength_ ++;
- m_hashTable_.put(targetToken, targetToken);
- return result;
- }
-
- /**
- * Checks if an character is special
- * @param ch character to test
- * @return true if the character is special
- */
- private static final boolean isSpecialChar(char ch)
- {
- return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A)
- || (ch <= 0x0060 && ch >= 0x005B)
- || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
- }
-
- private
- UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException
- {
- while(source.charAt(start) != '[') { /* advance while we find the first '[' */
- start++;
- }
- // now we need to get a balanced set of '[]'. The problem is that a set can have
- // many, and *end point to the first closing '['
- int noOpenBraces = 1;
- int current = 1; // skip the opening brace
- while(start+current < source.length() && noOpenBraces != 0) {
- if(source.charAt(start+current) == '[') {
- noOpenBraces++;
- } else if(source.charAt(start+current) == ']') { // closing brace
- noOpenBraces--;
- }
- current++;
- }
- //int nextBrace = -1;
-
- if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) {
- throwParseException(m_rules_, start);
- }
- return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
- }
-
- /** in C, optionarg is passed by reference to function.
- * We use a private int to simulate this.
- */
- private int m_optionarg_ = 0;
-
- private int readOption(String rules, int start, int optionend)
- {
- m_optionarg_ = 0;
- int i = 0;
- while (i < RULES_OPTIONS_.length) {
- String option = RULES_OPTIONS_[i].m_name_;
- int optionlength = option.length();
- if (rules.length() > start + optionlength
- && option.equalsIgnoreCase(rules.substring(start,
- start + optionlength))) {
- if (optionend - start > optionlength) {
- m_optionarg_ = start + optionlength;
- // start of the options, skip space
- while (m_optionarg_ < optionend && PatternProps.isWhiteSpace(rules.charAt(m_optionarg_)))
- { // eat whitespace
- m_optionarg_ ++;
- }
- }
- break;
- }
- i ++;
- }
- if(i == RULES_OPTIONS_.length) {
- i = -1;
- }
- return i;
- }
-
- /**
- * Reads and set collation options
- * @return TOKEN_SUCCESS if option is set correct, 0 otherwise
- * @exception ParseException thrown when options in rules are wrong
- */
- private byte readAndSetOption() throws ParseException
- {
- int start = m_current_ + 1; // skip opening '['
- int i = readOption(m_rules_, start, m_optionEnd_);
-
- int optionarg = m_optionarg_;
-
- if (i < 0) {
- throwParseException(m_rules_, start);
- }
-
- if (i < 7) {
- if (optionarg != 0) {
- for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
- j ++) {
- String subname = RULES_OPTIONS_[i].m_subOptions_[j];
- int size = optionarg + subname.length();
- if (m_rules_.length() > size
- && subname.equalsIgnoreCase(m_rules_.substring(
- optionarg, size))) {
- setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,
- RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
- return TOKEN_SUCCESS_MASK_;
- }
- }
- }
- throwParseException(m_rules_, optionarg);
- }
- else if (i == 7) { // variable top
- return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
- }
- else if (i == 8) { // rearrange
- return TOKEN_SUCCESS_MASK_;
- }
- else if (i == 9) { // before
- if (optionarg != 0) {
- for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
- j ++) {
- String subname = RULES_OPTIONS_[i].m_subOptions_[j];
- int size = optionarg + subname.length();
- if (m_rules_.length() > size
- && subname.equalsIgnoreCase(
- m_rules_.substring(optionarg,
- optionarg + subname.length()))) {
- return (byte)(TOKEN_SUCCESS_MASK_
- | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]
- + 1);
- }
- }
- }
- throwParseException(m_rules_, optionarg);
- }
- else if (i == 10) { // top, we are going to have an array with
- // structures of limit CEs index to this array will be
- // src->parsedToken.indirectIndex
- m_parsedToken_.m_indirectIndex_ = 0;
- return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
- }
- else if (i < 13) { // first, last
- for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) {
- String subname = RULES_OPTIONS_[i].m_subOptions_[j];
- int size = optionarg + subname.length();
- if (m_rules_.length() > size
- && subname.equalsIgnoreCase(m_rules_.substring(optionarg,
- size))) {
- m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));
- return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
- }
- }
- throwParseException(m_rules_, optionarg);
- }
- else if(i == 13 || i == 14) { // copy and remove are handled before normalization
- // we need to move end here
- int noOpenBraces = 1;
- m_current_++; // skip opening brace
- while(m_current_ < m_source_.length() && noOpenBraces != 0) {
- if(m_source_.charAt(m_current_) == '[') {
- noOpenBraces++;
- } else if(m_source_.charAt(m_current_) == ']') { // closing brace
- noOpenBraces--;
- }
- m_current_++;
- }
- m_optionEnd_ = m_current_-1;
- return TOKEN_SUCCESS_MASK_;
- }
- else if(i == 16) {
- m_current_ = m_optionarg_; // skip opening brace and name
- parseScriptReorder();
- return TOKEN_SUCCESS_MASK_;
- }
- else {
- throwParseException(m_rules_, optionarg);
- }
- return TOKEN_SUCCESS_MASK_; // we will never reach here.
- }
-
- /**
- * Set collation option
- * @param optionset option set to set
- * @param attribute type to set
- * @param value attribute value
- */
- private void setOptions(OptionSet optionset, int attribute, int value)
- {
- switch (attribute) {
- case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ :
- optionset.m_isHiragana4_
- = (value == RuleBasedCollator.AttributeValue.ON_);
- break;
- case RuleBasedCollator.Attribute.FRENCH_COLLATION_ :
- optionset.m_isFrenchCollation_
- = (value == RuleBasedCollator.AttributeValue.ON_);
- break;
- case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ :
- optionset.m_isAlternateHandlingShifted_
- = (value
- == RuleBasedCollator.AttributeValue.SHIFTED_);
- break;
- case RuleBasedCollator.Attribute.CASE_FIRST_ :
- optionset.m_caseFirst_ = value;
- break;
- case RuleBasedCollator.Attribute.CASE_LEVEL_ :
- optionset.m_isCaseLevel_
- = (value == RuleBasedCollator.AttributeValue.ON_);
- break;
- case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ :
- if (value == RuleBasedCollator.AttributeValue.ON_) {
- value = Collator.CANONICAL_DECOMPOSITION;
- }
- optionset.m_decomposition_ = value;
- break;
- case RuleBasedCollator.Attribute.STRENGTH_ :
- optionset.m_strength_ = value;
- break;
- default :
- break;
- }
- }
-
- UnicodeSet getTailoredSet() throws ParseException
- {
- boolean startOfRules = true;
- UnicodeSet tailored = new UnicodeSet();
- String pattern;
- CanonicalIterator it = new CanonicalIterator("");
-
- m_parsedToken_.m_strength_ = TOKEN_UNSET_;
- int sourcelimit = m_source_.length();
- //int expandNext = 0;
-
- while (m_current_ < sourcelimit) {
- m_parsedToken_.m_prefixOffset_ = 0;
- if (parseNextToken(startOfRules) < 0) {
- // we have reached the end
- continue;
- }
- startOfRules = false;
- // The idea is to tokenize the rule set. For each non-reset token,
- // we add all the canonicaly equivalent FCD sequences
- if(m_parsedToken_.m_strength_ != TOKEN_RESET_) {
- it.setSource(m_source_.substring(
- m_parsedToken_.m_charsOffset_,
- m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_));
- pattern = it.next();
- while(pattern != null) {
- if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) {
- tailored.add(pattern);
- }
- pattern = it.next();
- }
- }
- }
- return tailored;
- }
-
- final private String preprocessRules(String rules) throws ParseException {
- int optionNumber = -1;
- int setStart = 0;
- int i = 0;
- while(i < rules.length()) {
- if(rules.charAt(i) == 0x005B) { // [
- optionNumber = readOption(rules, i+1, rules.length());
- setStart = m_optionarg_;
- if(optionNumber == 13) { /* copy - parts of UCA to tailoring */
- UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
- if(m_copySet_ == null) {
- m_copySet_ = newSet;
- } else {
- m_copySet_.addAll(newSet);
- }
- } else if(optionNumber == 14) {
- UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
- if(m_removeSet_ == null) {
- m_removeSet_ = newSet;
- } else {
- m_removeSet_.addAll(newSet);
- }
- } else if(optionNumber == 19) {
- int optionEndOffset = rules.indexOf(']', i) + 1;
- ULocale locale = ULocale.forLanguageTag(rules.substring(setStart, optionEndOffset-1));
- UResourceBundle bundle = UResourceBundle.getBundleInstance(
- ICUResourceBundle.ICU_BASE_NAME + "/coll", locale.getBaseName());
-
- String type = locale.getKeywordValue("collation");
- if(type == null){
- type = "standard";
- }
-
- String importRules = bundle.get("collations")
- .get(type)
- .get("Sequence")
- .getString();
-
- rules = rules.substring(0, i) + importRules + rules.substring(optionEndOffset);
- }
- }
- i++;
- }
- return rules;
- }
-
- /* This is the data that is used for non-script reordering codes. These _must_ be kept
- * in order that they are to be applied as defaults and in synch with the Collator.ReorderCodes statics.
- */
- static final String ReorderingTokensArray[] = {
- "SPACE",
- "PUNCT",
- "SYMBOL",
- "CURRENCY",
- "DIGIT",
- };
-
- int findReorderingEntry(String name) {
- for (int tokenIndex = 0; tokenIndex < ReorderingTokensArray.length; tokenIndex++) {
- if (name.equalsIgnoreCase(ReorderingTokensArray[tokenIndex])) {
- return tokenIndex + ReorderCodes.FIRST;
- }
- }
- return UScript.INVALID_CODE;
- }
-
- private void parseScriptReorder() throws ParseException {
- ArrayList<Integer> tempOrder = new ArrayList<Integer>();
- int end = m_rules_.indexOf(']', m_current_);
- if (end == -1) {
- return;
- }
- String tokenString = m_rules_.substring(m_current_, end);
- String[] tokens = tokenString.split("\\s+", 0);
- String token;
- for (int tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
- token = tokens[tokenIndex];
- int reorderCode = findReorderingEntry(token);
- if (reorderCode == UScript.INVALID_CODE) {
- reorderCode = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, token);
- if (reorderCode < 0) {
- throw new ParseException(m_rules_, tokenIndex);
- }
- }
- tempOrder.add(reorderCode);
- }
- m_options_.m_scriptOrder_ = new int[tempOrder.size()];
- for(int i = 0; i < tempOrder.size(); i++) {
- m_options_.m_scriptOrder_[i] = tempOrder.get(i);
- }
- }
-}
/**
*******************************************************************************
-* Copyright (C) 1996-2014, International Business Machines Corporation and *
-* others. All Rights Reserved. *
+* Copyright (C) 1996-2014, International Business Machines Corporation and
+* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.ICUResourceBundle;
-import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.coll.CollationData;
+import com.ibm.icu.impl.coll.CollationRoot;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.Freezable;
import com.ibm.icu.util.ULocale;
* difference between large and small Kana. A tertiary difference is ignored
* when there is a primary or secondary difference anywhere in the strings.
* <li>QUATERNARY strength: When punctuation is ignored
-* <a href="http://www.icu-project.org/userguide/Collate_Concepts.html#Ignoring_Punctuation">
-* (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
+* (see <a href="http://userguide.icu-project.org/collation/concepts#TOC-Ignoring-Punctuation">
+* Ignoring Punctuations in the User Guide</a>) at PRIMARY to TERTIARY
* strength, an additional strength level can
* be used to distinguish words with and without punctuation (for example,
* "ab" < "a-b" < "aB").
* a comparison or before getting a CollationKey.</p>
*
* <p>For more information about the collation service see the
-* <a href="http://www.icu-project.org/userguide/Collate_Intro.html">users
-* guide</a>.</p>
+* <a href="http://userguide.icu-project.org/collation">User Guide</a>.</p>
*
* <p>Examples of use
* <pre>
* @author Syn Wee Quek
* @stable ICU 2.8
*/
-public abstract class Collator implements Comparator<Object>, Freezable<Collator>
+public abstract class Collator implements Comparator<Object>, Freezable<Collator>, Cloneable
{
// public data members ---------------------------------------------------
/**
* {@icu} Fourth level collator strength value.
* When punctuation is ignored
- * <a href="http://www.icu-project.org/userguide/Collate_Concepts.html#Ignoring_Punctuation">
- * (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
+ * (see <a href="http://userguide.icu-project.org/collation/concepts#TOC-Ignoring-Punctuation">
+ * Ignoring Punctuation in the User Guide</a>) at PRIMARY to TERTIARY
* strength, an additional strength level can
* be used to distinguish words with and without punctuation.
* See class documentation for more explanation.
// public methods --------------------------------------------------------
+ /**
+ * Compares the equality of two Collator objects. Collator objects are equal if they have the same
+ * collation (sorting & searching) behavior.
+ *
+ * <p>The base class checks for null and for equal types.
+ * Subclasses should override.
+ *
+ * @param obj the Collator to compare to.
+ * @return true if this Collator has exactly the same collation behavior as obj, false otherwise.
+ * @stable ICU 2.8
+ */
+ @Override
+ public boolean equals(Object obj) {
+ // Subclasses: Call this method and then add more specific checks.
+ return this == obj || (obj != null && getClass() == obj.getClass());
+ }
+
// public setters --------------------------------------------------------
+ private void checkNotFrozen() {
+ if (isFrozen()) {
+ throw new UnsupportedOperationException("Attempt to modify frozen Collator");
+ }
+ }
+
/**
- * Sets this Collator's strength property. The strength property
+ * Sets this Collator's strength attribute. The strength attribute
* determines the minimum level of difference considered significant
* during comparison.</p>
*
- * <p>The default strength for the Collator is TERTIARY, unless specified
- * otherwise by the locale used to create the Collator.</p>
+ * <p>The base class method does nothing. Subclasses should override it if appropriate.
*
* <p>See the Collator class description for an example of use.</p>
* @param newStrength the new strength value.
* @see #TERTIARY
* @see #QUATERNARY
* @see #IDENTICAL
- * @throws IllegalArgumentException if the new strength value is not one
- * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
+ * @throws IllegalArgumentException if the new strength value is not valid.
* @stable ICU 2.8
*/
public void setStrength(int newStrength)
{
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- if ((newStrength != PRIMARY) &&
- (newStrength != SECONDARY) &&
- (newStrength != TERTIARY) &&
- (newStrength != QUATERNARY) &&
- (newStrength != IDENTICAL)) {
- throw new IllegalArgumentException("Incorrect comparison level.");
- }
- m_strength_ = newStrength;
+ checkNotFrozen();
}
-
+
/**
- * @internal
+ * @return this, for chaining
+ * @internal Used in UnicodeTools
* @deprecated This API is ICU internal only.
*/
@Deprecated
/**
* Sets the decomposition mode of this Collator. Setting this
- * decomposition property with CANONICAL_DECOMPOSITION allows the
+ * decomposition attribute with CANONICAL_DECOMPOSITION allows the
* Collator to handle un-normalized text properly, producing the
* same results as if the text were normalized. If
* NO_DECOMPOSITION is set, it is the user's responsibility to
* text normalization, most locales set NO_DECOMPOSITION as the
* default decomposition mode.</p>
*
- * The default decompositon mode for the Collator is
- * NO_DECOMPOSITON, unless specified otherwise by the locale used
- * to create the Collator.</p>
+ * <p>The base class method does nothing. Subclasses should override it if appropriate.
*
* <p>See getDecomposition for a description of decomposition
* mode.</p>
*/
public void setDecomposition(int decomposition)
{
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
- internalSetDecomposition(decomposition);
- }
-
- /**
- * Internal set decompostion call to workaround frozen state because of self-modification
- * in the RuleBasedCollator. This method <b>must</b> only be called by code that has
- * passed the frozen check already <b>and</b> has the lock if the Collator is frozen.
- * Better still this method should go away and RuleBasedCollator.getSortKeyBytes()
- * should be fixed to not self-modify.
- * @param decomposition
- * @internal
- */
- protected void internalSetDecomposition(int decomposition)
- {
- if ((decomposition != NO_DECOMPOSITION) &&
- (decomposition != CANONICAL_DECOMPOSITION)) {
- throw new IllegalArgumentException("Wrong decomposition mode.");
- }
- m_decomposition_ = decomposition;
- if (decomposition != NO_DECOMPOSITION) {
- // ensure the FCD data is initialized
- Norm2AllModes.getFCDNormalizer2();
- }
+ checkNotFrozen();
}
/**
*/
public void setReorderCodes(int... order)
{
- throw new UnsupportedOperationException();
+ throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
// public getters --------------------------------------------------------
* The default locale is determined by java.util.Locale.getDefault().
* @return the Collator for the default locale (for example, en_US) if it
* is created successfully. Otherwise if there is no Collator
- * associated with the current locale, the default UCA collator
+ * associated with the current locale, the root collator
* will be returned.
* @see java.util.Locale#getDefault()
* @see #getInstance(Locale)
* @param locale the desired locale.
* @return Collator for the desired locale if it is created successfully.
* Otherwise if there is no Collator
- * associated with the current locale, a default UCA collator will
+ * associated with the current locale, the root collator will
* be returned.
* @see java.util.Locale
* @see java.util.ResourceBundle
* @param locale the desired locale.
* @return Collator for the desired locale if it is created successfully.
* Otherwise if there is no Collator
- * associated with the current locale, a default UCA collator will
+ * associated with the current locale, the root collator will
* be returned.
* @see java.util.Locale
* @see java.util.ResourceBundle
LinkedList<String> values = new LinkedList<String>();
UResourceBundle bundle = UResourceBundle.getBundleInstance(
- ICUResourceBundle.ICU_BASE_NAME + "/coll", baseLoc);
+ ICUResourceBundle.ICU_COLLATION_BASE_NAME, baseLoc);
String defcoll = null;
while (bundle != null) {
* applications who wish to cache collators, or otherwise reuse
* collators when possible. The functional equivalent may change
* over time. For more information, please see the <a
- * href="http://www.icu-project.org/userguide/locale.html#services">
+ * href="http://userguide.icu-project.org/locale#TOC-Locales-and-Services">
* Locales and Services</a> section of the ICU User Guide.
* @param keyword a particular keyword as enumerated by
* getKeywords.
}
/**
- * Returns this Collator's strength property. The strength property
+ * Returns this Collator's strength attribute. The strength attribute
* determines the minimum level of difference considered significant.
* </p>
* {@icunote} This can return QUATERNARY strength, which is not supported by the
* <p>
* See the Collator class description for more details.
* </p>
- * @return this Collator's current strength property.
+ * <p>The base class method always returns {@link #TERTIARY}.
+ * Subclasses should override it if appropriate.
+ *
+ * @return this Collator's current strength attribute.
* @see #setStrength
* @see #PRIMARY
* @see #SECONDARY
*/
public int getStrength()
{
- return m_strength_;
+ return TERTIARY;
}
/**
* <p>
* See the Collator class description for more details.
* </p>
+ * <p>The base class method always returns {@link #NO_DECOMPOSITION}.
+ * Subclasses should override it if appropriate.
+ *
* @return the decomposition mode
* @see #setDecomposition
* @see #NO_DECOMPOSITION
*/
public int getDecomposition()
{
- return m_decomposition_;
+ return NO_DECOMPOSITION;
}
// public other methods -------------------------------------------------
* in this collator.
* @return a pointer to a UnicodeSet object containing all the
* code points and sequences that may sort differently than
- * in the UCA.
+ * in the root collator.
* @stable ICU 2.4
*/
public UnicodeSet getTailoredSet()
* @return Returns an integer value. Value is less than zero if source is
* less than target, value is zero if source and target are equal,
* value is greater than zero if source is greater than target.
- * @throws ClassCastException thrown if either arguments cannot be cast to String.
+ * @throws ClassCastException thrown if either arguments cannot be cast to CharSequence.
* @stable ICU 4.2
*/
public int compare(Object source, Object target) {
- return compare((String)source, (String)target);
+ return doCompare((CharSequence)source, (CharSequence)target);
+ }
+
+ /**
+ * Compares two CharSequences.
+ * The base class just calls compare(left.toString(), right.toString()).
+ * Subclasses should instead implement this method and have the String API call this method.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ protected int doCompare(CharSequence left, CharSequence right) {
+ return compare(left.toString(), right.toString());
}
/**
RawCollationKey key);
/**
- * {@icu} Variable top is a two byte primary value which causes all the codepoints
- * with primary values that are less or equal than the variable top to be
- * shifted when alternate handling is set to SHIFTED.
- * </p>
- * <p>
- * Sets the variable top to a collation element value of a string supplied.
- * </p>
+ * {@icu} Sets the variable top to the top of the specified reordering group.
+ * The variable top determines the highest-sorting character
+ * which is affected by the alternate handling behavior.
+ * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
+ *
+ * <p>The base class implementation throws an UnsupportedOperationException.
+ * @param group one of Collator.ReorderCodes.SPACE, Collator.ReorderCodes.PUNCTUATION,
+ * Collator.ReorderCodes.SYMBOL, Collator.ReorderCodes.CURRENCY;
+ * or Collator.ReorderCodes.DEFAULT to restore the default max variable group
+ * @return this
+ * @see #getMaxVariable
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Collator setMaxVariable(int group) {
+ throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
+ }
+
+ /**
+ * {@icu} Returns the maximum reordering group whose characters are affected by
+ * the alternate handling behavior.
+ *
+ * <p>The base class implementation returns Collator.ReorderCodes.PUNCTUATION.
+ * @return the maximum variable reordering group.
+ * @see #setMaxVariable
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int getMaxVariable() {
+ return Collator.ReorderCodes.PUNCTUATION;
+ }
+
+ /**
+ * {@icu} Sets the variable top to the primary weight of the specified string.
+ *
+ * <p>Beginning with ICU 53, the variable top is pinned to
+ * the top of one of the supported reordering groups,
+ * and it must not be beyond the last of those groups.
+ * See {@link #setMaxVariable(int)}.
+ *
* @param varTop one or more (if contraction) characters to which the
* variable top should be set
- * @return a int value containing the value of the variable top in upper 16
- * bits. Lower 16 bits are undefined.
- * @throws IllegalArgumentException is thrown if varTop argument is not
- * a valid variable top element. A variable top element is
- * invalid when it is a contraction that does not exist in the
- * Collation order or when the PRIMARY strength collation
- * element for the variable top has more than two bytes
+ * @return variable top primary weight
+ * @exception IllegalArgumentException
+ * is thrown if varTop argument is not a valid variable top element. A variable top element is
+ * invalid when
+ * <ul>
+ * <li>it is a contraction that does not exist in the Collation order
+ * <li>the variable top is beyond
+ * the last reordering group supported by setMaxVariable()
+ * <li>when the varTop argument is null or zero in length.
+ * </ul>
* @see #getVariableTop
* @see RuleBasedCollator#setAlternateHandlingShifted
- * @stable ICU 2.6
+ * @deprecated ICU 53 Call {@link #setMaxVariable(int)} instead.
*/
public abstract int setVariableTop(String varTop);
/**
- * {@icu} Returns the variable top value of a Collator.
- * Lower 16 bits are undefined and should be ignored.
- * @return the variable top value of a Collator.
- * @see #setVariableTop
+ * {@icu} Gets the variable top value of a Collator.
+ *
+ * @return the variable top primary weight
+ * @see #getMaxVariable
* @stable ICU 2.6
*/
public abstract int getVariableTop();
/**
- * {@icu} Sets the variable top to a collation element value supplied.
- * Variable top is set to the upper 16 bits.
- * Lower 16 bits are ignored.
- * @param varTop Collation element value, as returned by setVariableTop or
- * getVariableTop
+ * {@icu} Sets the variable top to the specified primary weight.
+ *
+ * <p>Beginning with ICU 53, the variable top is pinned to
+ * the top of one of the supported reordering groups,
+ * and it must not be beyond the last of those groups.
+ * See {@link #setMaxVariable(int)}.
+ *
+ * @param varTop primary weight, as returned by setVariableTop or getVariableTop
* @see #getVariableTop
- * @see #setVariableTop
- * @stable ICU 2.6
+ * @see #setVariableTop(String)
+ * @deprecated ICU 53 Call setMaxVariable() instead.
*/
public abstract void setVariableTop(int varTop);
*/
public int[] getReorderCodes()
{
- throw new UnsupportedOperationException();
+ throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
/**
* Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder
* codes are grouped and must reorder together.
*
- * @param reorderCode code for which equivalents to be retrieved
+ * @param reorderCode The reorder code to determine equivalence for.
* @return the set of all reorder codes in the same group as the given reorder code.
* @see #setReorderCodes
* @see #getReorderCodes
* @see UScript
* @stable ICU 4.8
*/
- public static int[] getEquivalentReorderCodes(int reorderCode)
- {
- throw new UnsupportedOperationException();
+ public static int[] getEquivalentReorderCodes(int reorderCode) {
+ CollationData baseData = CollationRoot.getData();
+ return baseData.getEquivalentScripts(reorderCode);
}
throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
- // protected constructor -------------------------------------------------
-
/**
* Empty default constructor to make javadocs happy
* @stable ICU 2.4
{
}
- // package private methods -----------------------------------------------
-
- // private data members --------------------------------------------------
-
- /**
- * Collation strength
- */
- private int m_strength_ = TERTIARY;
-
- /**
- * Decomposition mode
- */
- private int m_decomposition_ = CANONICAL_DECOMPOSITION;
-
private static final boolean DEBUG = ICUDebug.enabled("collator");
- // private methods -------------------------------------------------------
-
- // end registry stuff
-
// -------- BEGIN ULocale boilerplate --------
/**
* contains a partial preview implementation. The * <i>actual</i>
* locale is returned correctly, but the <i>valid</i> locale is
* not, in most cases.
+ *
+ * <p>The base class method always returns {@link ULocale#ROOT}.
+ * Subclasses should override it if appropriate.
+ *
* @param type type of information requested, either {@link
* com.ibm.icu.util.ULocale#VALID_LOCALE} or {@link
* com.ibm.icu.util.ULocale#ACTUAL_LOCALE}.
* @draft ICU 2.8 (retain)
* @provisional This API might change or be removed in a future release.
*/
- public final ULocale getLocale(ULocale.Type type) {
- return type == ULocale.ACTUAL_LOCALE ?
- this.actualLocale : this.validLocale;
+ public ULocale getLocale(ULocale.Type type) {
+ return ULocale.ROOT;
}
- /*
+ /**
* Set information about the locales that were used to create this
* object. If the object was not constructed from locale data,
* both arguments should be set to null. Otherwise, neither
* less specific than the valid locale. This method is intended
* for use by factories or other entities that create objects of
* this class.
+ *
+ * <p>The base class method does nothing. Subclasses should override it if appropriate.
+ *
* @param valid the most specific locale containing any resource
* data, or null
* @param actual the locale containing data used to construct this
* @see com.ibm.icu.util.ULocale#VALID_LOCALE
* @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE
*/
- final void setLocale(ULocale valid, ULocale actual) {
- // Change the following to an assertion later
- ///CLOVER:OFF
- // The following would not happen since the method is called
- // by other protected functions that checks and makes sure that
- // valid and actual are not null before passing
- if ((valid == null) != (actual == null)) {
- throw new IllegalArgumentException();
- }
- ///CLOVER:ON
- // Another check we could do is that the actual locale is at
- // the same level or less specific than the valid locale.
- this.validLocale = valid;
- this.actualLocale = actual;
- }
-
- /*
- * The most specific locale containing any resource data, or null.
- * @see com.ibm.icu.util.ULocale
- */
- private ULocale validLocale;
-
- /*
- * The locale containing data used to construct this object, or
- * null.
- * @see com.ibm.icu.util.ULocale
- */
- private ULocale actualLocale;
+ void setLocale(ULocale valid, ULocale actual) {}
// -------- END ULocale boilerplate --------
}
+++ /dev/null
-/**
- *******************************************************************************
- * Copyright (C) 1996-2013, International Business Machines Corporation and
- * others. All Rights Reserved.
- *******************************************************************************
- */
-package com.ibm.icu.text;
-
-import java.io.BufferedInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-
-import com.ibm.icu.impl.ICUBinary;
-import com.ibm.icu.impl.ICUData;
-import com.ibm.icu.impl.ICUResourceBundle;
-import com.ibm.icu.impl.IntTrie;
-import com.ibm.icu.text.CollationParsedRuleBuilder.InverseUCA;
-import com.ibm.icu.text.RuleBasedCollator.LeadByteConstants;
-import com.ibm.icu.text.RuleBasedCollator.UCAConstants;
-import com.ibm.icu.util.Output;
-import com.ibm.icu.util.VersionInfo;
-
-/**
- * <p>
- * Internal reader class for ICU data file uca.icu containing Unicode Collation Algorithm data.
- * </p>
- * <p>
- * This class simply reads uca.icu, authenticates that it is a valid ICU data file and split its contents up into blocks
- * of data for use in <a href=Collator.html>com.ibm.icu.text.Collator</a>.
- * </p>
- * <p>
- * uca.icu which is in big-endian format is jared together with this package.
- * </p>
- *
- * @author Syn Wee Quek
- * @since release 2.2, April 18 2002
- */
-
-final class CollatorReader {
- static char[] read(RuleBasedCollator rbc, UCAConstants ucac,
- LeadByteConstants leadByteConstants, Output<Integer> maxUCAContractionLength)
- throws IOException {
- InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE + "/coll/ucadata.icu");
- BufferedInputStream b = new BufferedInputStream(i, 90000);
- CollatorReader reader = new CollatorReader(b);
- char[] ucaContractions = reader.readImp(rbc, ucac, leadByteConstants, maxUCAContractionLength);
- b.close();
- return ucaContractions;
- }
-
- public static InputStream makeByteBufferInputStream(final ByteBuffer buf) {
- return new InputStream() {
- public int read() throws IOException {
- if (!buf.hasRemaining()) {
- return -1;
- }
- return buf.get() & 0xff;
- }
-
- public int read(byte[] bytes, int off, int len) throws IOException {
- len = Math.min(len, buf.remaining());
- buf.get(bytes, off, len);
- return len;
- }
- };
- }
-
- static void initRBC(RuleBasedCollator rbc, ByteBuffer data) throws IOException {
- final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
- int dataLength = data.remaining();
- // TODO: Change the rest of this class to use the ByteBuffer directly, rather than
- // a DataInputStream, except for passing an InputStream to ICUBinary.readHeader().
- // Consider changing ICUBinary to also work with a ByteBuffer.
- CollatorReader reader = new CollatorReader(makeByteBufferInputStream(data), false);
- if (dataLength > MIN_BINARY_DATA_SIZE_) {
- reader.readImp(rbc, null, null, null);
- } else {
- reader.readHeader(rbc, null);
- reader.readOptions(rbc);
- // duplicating UCA_'s data
- rbc.setWithUCATables();
- }
- }
-
- static InverseUCA getInverseUCA() throws IOException {
- InverseUCA result = null;
- InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE + "/coll/invuca.icu");
- // try {
- // String invdat = "/com/ibm/icu/impl/data/invuca.icu";
- // InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
- BufferedInputStream b = new BufferedInputStream(i, 110000);
- result = CollatorReader.readInverseUCA(b);
- b.close();
- i.close();
- return result;
- // } catch (Exception e) {
- // throw new RuntimeException(e.getMessage());
- // }
- }
-
- // protected constructor ---------------------------------------------
-
- /**
- * <p>
- * Protected constructor.
- * </p>
- *
- * @param inputStream
- * ICU collator file input stream
- * @exception IOException
- * throw if data file fails authentication
- */
- private CollatorReader(InputStream inputStream) throws IOException {
- this(inputStream, true);
- /*
- * byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, UCA_AUTHENTICATE_); // weiv: check
- * that we have the correct Unicode version in // binary files VersionInfo UCDVersion =
- * UCharacter.getUnicodeVersion(); if(UnicodeVersion[0] != UCDVersion.getMajor() || UnicodeVersion[1] !=
- * UCDVersion.getMinor()) { throw new IOException(WRONG_UNICODE_VERSION_ERROR_); } m_dataInputStream_ = new
- * DataInputStream(inputStream);
- */
- }
-
- /**
- * <p>
- * Protected constructor.
- * </p>
- *
- * @param inputStream
- * ICU uprops.icu file input stream
- * @param readICUHeader
- * flag to indicate if the ICU header has to be read
- * @exception IOException
- * throw if data file fails authentication
- */
- private CollatorReader(InputStream inputStream, boolean readICUHeader) throws IOException {
- if (readICUHeader) {
- ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, UCA_AUTHENTICATE_);
- // Note: In ICU 51 and earlier,
- // we used to check that the UCA data version (readHeader() return value)
- // matches the UCD version (UCharacter.getUnicodeVersion())
- // but that complicated version updates, and
- // a mismatch is "only" a problem for handling canonical equivalence.
- // It need not be a fatal error.
- // throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
- }
- m_dataInputStream_ = new DataInputStream(inputStream);
- }
-
- // protected methods -------------------------------------------------
-
- /**
- * Read and break up the header stream of data passed in as arguments into meaningful Collator data.
- *
- * @param rbc
- * RuleBasedCollator to populate with header information
- * @exception IOException
- * thrown when there's a data error.
- */
- private void readHeader(RuleBasedCollator rbc, Output<Integer> maxUCAContractionLength) throws IOException {
- m_size_ = m_dataInputStream_.readInt();
- // all the offsets are in bytes
- // to get the address add to the header address and cast properly
- // Default options int options
- m_headerSize_ = m_dataInputStream_.readInt(); // start of options
- int readcount = 8; // for size and headersize
- // structure which holds values for indirect positioning and implicit
- // ranges
- m_UCAConstOffset_ = m_dataInputStream_.readInt();
- readcount += 4;
- // this one is needed only for UCA, to copy the appropriate
- // contractions
- /*int contractionUCACombos =*/ m_dataInputStream_.readInt();
- readcount += 4;
- // reserved for future use
- m_dataInputStream_.skipBytes(4);
- readcount += 4;
- // const uint8_t *mappingPosition;
- int mapping = m_dataInputStream_.readInt();
- readcount += 4;
- // uint32_t *expansion;
- rbc.m_expansionOffset_ = m_dataInputStream_.readInt();
- readcount += 4;
- // UChar *contractionIndex;
- rbc.m_contractionOffset_ = m_dataInputStream_.readInt();
- readcount += 4;
- // uint32_t *contractionCEs;
- int contractionCE = m_dataInputStream_.readInt();
- readcount += 4;
- // needed for various closures int contractionSize
- int contractionSize = m_dataInputStream_.readInt();
- readcount += 4;
- // array of last collation element in expansion
- int expansionEndCE = m_dataInputStream_.readInt();
- readcount += 4;
- // array of maximum expansion size corresponding to the expansion
- // collation elements with last element in expansionEndCE
- int expansionEndCEMaxSize = m_dataInputStream_.readInt();
- readcount += 4;
- // size of endExpansionCE int expansionEndCESize
- /* int endExpansionCECount = */m_dataInputStream_.readInt();
- readcount += 4;
- // hash table of unsafe code points
- int unsafe = m_dataInputStream_.readInt();
- readcount += 4;
- // hash table of final code points in contractions.
- int contractionEnd = m_dataInputStream_.readInt();
- readcount += 4;
- // int CEcount = m_dataInputStream_.readInt();
- int contractionUCACombosSize = m_dataInputStream_.readInt();
- readcount += 4;
- // is jamoSpecial
- rbc.m_isJamoSpecial_ = m_dataInputStream_.readBoolean();
- readcount++;
- // isBigEndian and charSetFamily
- m_dataInputStream_.skipBytes(2);
- readcount += 2;
- int contractionUCACombosWidth = m_dataInputStream_.readByte();
- if (maxUCAContractionLength != null) {
- maxUCAContractionLength.value = contractionUCACombosWidth;
- }
- // We want to be able to output this value if it's not 0.
- assert contractionUCACombosWidth == 0 || maxUCAContractionLength != null;
- readcount += 1;
- rbc.m_version_ = readVersion(m_dataInputStream_);
- readcount += 4;
- rbc.m_UCA_version_ = readVersion(m_dataInputStream_);
- readcount += 4;
- rbc.m_UCD_version_ = readVersion(m_dataInputStream_);
- readcount += 4;
- /*VersionInfo formatVersion =*/ readVersion(m_dataInputStream_);
- readcount += 4;
- rbc.m_scriptToLeadBytes = m_dataInputStream_.readInt();
- readcount += 4;
- rbc.m_leadByteToScripts = m_dataInputStream_.readInt();
- readcount += 4;
-
- // byte charsetName[] = new byte[32]; // for charset CEs
- m_dataInputStream_.skipBytes(32);
- readcount += 32;
-
- m_dataInputStream_.skipBytes(44); // for future use
- readcount += 44;
- if (m_headerSize_ < readcount) {
- // /CLOVER:OFF
- throw new IOException("Internal Error: Header size error");
- // /CLOVER:ON
- }
- m_dataInputStream_.skipBytes(m_headerSize_ - readcount);
-
- if (rbc.m_contractionOffset_ == 0) { // contraction can be null
- rbc.m_contractionOffset_ = mapping;
- contractionCE = mapping;
- }
- m_optionSize_ = rbc.m_expansionOffset_ - m_headerSize_;
- m_expansionSize_ = rbc.m_contractionOffset_ - rbc.m_expansionOffset_;
- m_contractionIndexSize_ = contractionCE - rbc.m_contractionOffset_;
- m_contractionCESize_ = mapping - contractionCE;
- // m_trieSize_ = expansionEndCE - mapping;
- m_expansionEndCESize_ = expansionEndCEMaxSize - expansionEndCE;
- m_expansionEndCEMaxSizeSize_ = unsafe - expansionEndCEMaxSize;
- m_unsafeSize_ = contractionEnd - unsafe;
- // m_UCAValuesSize_ = m_size_ - UCAConst; // UCA value, will be handled later
- m_UCAcontractionSize_ = contractionUCACombosSize * contractionUCACombosWidth * 2;
-
- // treat it as normal collator first
- // for normal collator there is no UCA contraction
- // contractions (UChar[contractionSize] + CE[contractionSize])
- m_contractionSize_ = contractionSize * 2 + contractionSize * 4;
-
- rbc.m_contractionOffset_ >>= 1; // casting to ints
- rbc.m_expansionOffset_ >>= 2; // casting to chars
- }
-
- /**
- * Read and break up the collation options passed in the stream of data and update the argument Collator with the
- * results
- *
- * @param rbc
- * RuleBasedCollator to populate
- * @exception IOException
- * thrown when there's a data error.
- */
- private void readOptions(RuleBasedCollator rbc) throws IOException {
- int readcount = 0;
- rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
- readcount += 4;
- rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt() == RuleBasedCollator.AttributeValue.ON_);
- readcount += 4;
- rbc.m_defaultIsAlternateHandlingShifted_ = (m_dataInputStream_.readInt() == RuleBasedCollator.AttributeValue.SHIFTED_);
- readcount += 4;
- rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt();
- readcount += 4;
- // rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt()
- // == RuleBasedCollator.AttributeValue.ON_);
- int defaultIsCaseLevel = m_dataInputStream_.readInt();
- rbc.m_defaultIsCaseLevel_ = (defaultIsCaseLevel == RuleBasedCollator.AttributeValue.ON_);
- readcount += 4;
- int value = m_dataInputStream_.readInt();
- readcount += 4;
- if (value == RuleBasedCollator.AttributeValue.ON_) {
- value = Collator.CANONICAL_DECOMPOSITION;
- } else {
- value = Collator.NO_DECOMPOSITION;
- }
- rbc.m_defaultDecomposition_ = value;
- rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
- readcount += 4;
- rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt() == RuleBasedCollator.AttributeValue.ON_);
- readcount += 4;
- rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt() == RuleBasedCollator.AttributeValue.ON_);
- readcount += 4;
- m_dataInputStream_.skip(60); // reserved for future use
- readcount += 60;
- m_dataInputStream_.skipBytes(m_optionSize_ - readcount);
- if (m_optionSize_ < readcount) {
- // /CLOVER:OFF
- throw new IOException("Internal Error: Option size error");
- // /CLOVER:ON
- }
- }
-
- /**
- * Read and break up the stream of data passed in as arguments into meaningful Collator data.
- *
- * @param rbc
- * RuleBasedCollator to populate
- * @param UCAConst
- * object to fill up with UCA constants if we are reading the UCA collator, if not use a null
- * @param leadByteConstants
- * @return UCAContractions array filled up with the UCA contractions if we are reading the UCA collator
- * @exception IOException
- * thrown when there's a data error.
- */
- private char[] readImp(RuleBasedCollator rbc, RuleBasedCollator.UCAConstants UCAConst,
- RuleBasedCollator.LeadByteConstants leadByteConstants,
- Output<Integer> maxUCAContractionLength) throws IOException {
- char ucaContractions[] = null; // return result
-
- readHeader(rbc, maxUCAContractionLength);
- // header size has been checked by readHeader
- int readcount = m_headerSize_;
- // option size has been checked by readOptions
- readOptions(rbc);
- readcount += m_optionSize_;
- m_expansionSize_ >>= 2;
- rbc.m_expansion_ = new int[m_expansionSize_];
- for (int i = 0; i < m_expansionSize_; i++) {
- rbc.m_expansion_[i] = m_dataInputStream_.readInt();
- }
- readcount += (m_expansionSize_ << 2);
- if (m_contractionIndexSize_ > 0) {
- m_contractionIndexSize_ >>= 1;
- rbc.m_contractionIndex_ = new char[m_contractionIndexSize_];
- for (int i = 0; i < m_contractionIndexSize_; i++) {
- rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar();
- }
- readcount += (m_contractionIndexSize_ << 1);
- m_contractionCESize_ >>= 2;
- rbc.m_contractionCE_ = new int[m_contractionCESize_];
- for (int i = 0; i < m_contractionCESize_; i++) {
- rbc.m_contractionCE_[i] = m_dataInputStream_.readInt();
- }
- readcount += (m_contractionCESize_ << 2);
- }
- rbc.m_trie_ = new IntTrie(m_dataInputStream_, RuleBasedCollator.DataManipulate.getInstance());
- if (!rbc.m_trie_.isLatin1Linear()) {
- throw new IOException("Data corrupted, " + "Collator Tries expected to have linear "
- + "latin one data arrays");
- }
- readcount += rbc.m_trie_.getSerializedDataSize();
- m_expansionEndCESize_ >>= 2;
- rbc.m_expansionEndCE_ = new int[m_expansionEndCESize_];
- for (int i = 0; i < m_expansionEndCESize_; i++) {
- rbc.m_expansionEndCE_[i] = m_dataInputStream_.readInt();
- }
- readcount += (m_expansionEndCESize_ << 2);
- rbc.m_expansionEndCEMaxSize_ = new byte[m_expansionEndCEMaxSizeSize_];
- for (int i = 0; i < m_expansionEndCEMaxSizeSize_; i++) {
- rbc.m_expansionEndCEMaxSize_[i] = m_dataInputStream_.readByte();
- }
- readcount += m_expansionEndCEMaxSizeSize_;
- rbc.m_unsafe_ = new byte[m_unsafeSize_];
- for (int i = 0; i < m_unsafeSize_; i++) {
- rbc.m_unsafe_[i] = m_dataInputStream_.readByte();
- }
- readcount += m_unsafeSize_;
- if (UCAConst != null) {
- // we are reading the UCA
- // unfortunately the UCA offset in any collator data is not 0 and
- // only refers to the UCA data
- // m_contractionSize_ -= m_UCAValuesSize_;
- m_contractionSize_ = m_UCAConstOffset_ - readcount;
- } else {
- m_contractionSize_ = m_size_ - readcount;
- }
- rbc.m_contractionEnd_ = new byte[m_contractionSize_];
- for (int i = 0; i < m_contractionSize_; i++) {
- rbc.m_contractionEnd_[i] = m_dataInputStream_.readByte();
- }
- readcount += m_contractionSize_;
- if (UCAConst != null) {
- UCAConst.FIRST_TERTIARY_IGNORABLE_[0] = m_dataInputStream_.readInt();
- int readUCAConstcount = 4;
- UCAConst.FIRST_TERTIARY_IGNORABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_TERTIARY_IGNORABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_TERTIARY_IGNORABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_PRIMARY_IGNORABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_PRIMARY_IGNORABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_SECONDARY_IGNORABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_SECONDARY_IGNORABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_SECONDARY_IGNORABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_SECONDARY_IGNORABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_PRIMARY_IGNORABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_PRIMARY_IGNORABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_VARIABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_VARIABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_VARIABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_VARIABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_NON_VARIABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_NON_VARIABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_NON_VARIABLE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_NON_VARIABLE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.RESET_TOP_VALUE_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.RESET_TOP_VALUE_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_IMPLICIT_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_IMPLICIT_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_IMPLICIT_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_IMPLICIT_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_TRAILING_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.FIRST_TRAILING_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_TRAILING_[0] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.LAST_TRAILING_[1] = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.PRIMARY_TOP_MIN_ = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.PRIMARY_IMPLICIT_MIN_ = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.PRIMARY_IMPLICIT_MAX_ = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.PRIMARY_TRAILING_MIN_ = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.PRIMARY_TRAILING_MAX_ = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.PRIMARY_SPECIAL_MIN_ = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
- UCAConst.PRIMARY_SPECIAL_MAX_ = m_dataInputStream_.readInt();
- readUCAConstcount += 4;
-
- readcount += readUCAConstcount;
-
- int resultsize = (rbc.m_scriptToLeadBytes - readcount) / 2;
- assert resultsize == m_UCAcontractionSize_ / 2;
- ucaContractions = new char[resultsize];
- for (int i = 0; i < resultsize; i++) {
- ucaContractions[i] = m_dataInputStream_.readChar();
- }
- readcount += m_UCAcontractionSize_;
- }
-
- if (leadByteConstants != null) {
- readcount += m_dataInputStream_.skip(rbc.m_scriptToLeadBytes - readcount);
- leadByteConstants.read(m_dataInputStream_);
- readcount += leadByteConstants.getSerializedDataSize();
- }
-
- if (readcount != m_size_) {
- // /CLOVER:OFF
- throw new IOException("Internal Error: Data file size error");
- // /CLOVER:ON
- }
- return ucaContractions;
- }
-
- /**
- * Reads in the inverse uca data
- *
- * @param input
- * input stream with the inverse uca data
- * @return an object containing the inverse uca data
- * @exception IOException
- * thrown when error occurs while reading the inverse uca
- */
- private static CollationParsedRuleBuilder.InverseUCA readInverseUCA(InputStream inputStream) throws IOException {
- ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, INVERSE_UCA_AUTHENTICATE_);
-
- // TODO: Check that the invuca data version (readHeader() return value)
- // matches the ucadata version.
- // throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
-
- CollationParsedRuleBuilder.InverseUCA result = new CollationParsedRuleBuilder.InverseUCA();
- DataInputStream input = new DataInputStream(inputStream);
- input.readInt(); // bytesize
- int tablesize = input.readInt(); // in int size
- int contsize = input.readInt(); // in char size
- input.readInt(); // table in bytes
- input.readInt(); // conts in bytes
- result.m_UCA_version_ = readVersion(input);
- input.skipBytes(8); // skip padding
-
- int size = tablesize * 3; // one column for each strength
- result.m_table_ = new int[size];
- result.m_continuations_ = new char[contsize];
-
- for (int i = 0; i < size; i++) {
- result.m_table_[i] = input.readInt();
- }
- for (int i = 0; i < contsize; i++) {
- result.m_continuations_[i] = input.readChar();
- }
- input.close();
- return result;
- }
-
- /**
- * Reads four bytes from the input and returns a VersionInfo object. Use it to read different collator versions.
- *
- * @param input
- * already instantiated DataInputStream, positioned at the start of four version bytes
- * @return a ready VersionInfo object
- * @throws IOException
- * thrown when error occurs while reading version bytes
- */
-
- protected static VersionInfo readVersion(DataInputStream input) throws IOException {
- byte[] version = new byte[4];
- version[0] = input.readByte();
- version[1] = input.readByte();
- version[2] = input.readByte();
- version[3] = input.readByte();
-
- VersionInfo result = VersionInfo.getInstance((int) version[0], (int) version[1], (int) version[2],
- (int) version[3]);
-
- return result;
- }
-
- // private inner class -----------------------------------------------
-
- // private variables -------------------------------------------------
-
- /**
- * Authenticate uca data format version
- */
- private static final ICUBinary.Authenticate UCA_AUTHENTICATE_ = new ICUBinary.Authenticate() {
- public boolean isDataVersionAcceptable(byte version[]) {
- return version[0] == DATA_FORMAT_VERSION_[0] && version[1] >= DATA_FORMAT_VERSION_[1];
- // Too harsh
- // && version[1] == DATA_FORMAT_VERSION_[1]
- // && version[2] == DATA_FORMAT_VERSION_[2]
- // && version[3] == DATA_FORMAT_VERSION_[3];
- }
- };
-
- /**
- * Authenticate uca data format version
- */
- private static final ICUBinary.Authenticate INVERSE_UCA_AUTHENTICATE_ = new ICUBinary.Authenticate() {
- public boolean isDataVersionAcceptable(byte version[]) {
- return version[0] == INVERSE_UCA_DATA_FORMAT_VERSION_[0]
- && version[1] >= INVERSE_UCA_DATA_FORMAT_VERSION_[1];
- }
- };
-
- /**
- * Data input stream for uca.icu
- */
- private DataInputStream m_dataInputStream_;
-
- /**
- * File format version and id that this class understands. No guarantees are made if a older version is used
- */
- private static final byte DATA_FORMAT_VERSION_[] = { (byte) 0x3, (byte) 0x0, (byte) 0x0, (byte) 0x0 };
- private static final byte DATA_FORMAT_ID_[] = { (byte) 0x55, (byte) 0x43, (byte) 0x6f, (byte) 0x6c };
- /**
- * Inverse UCA file format version and id that this class understands. No guarantees are made if a older version is
- * used
- */
- private static final byte INVERSE_UCA_DATA_FORMAT_VERSION_[] = { (byte) 0x2, (byte) 0x1, (byte) 0x0, (byte) 0x0 };
- private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = { (byte) 0x49, (byte) 0x6e, (byte) 0x76, (byte) 0x43 };
-
- /**
- * Wrong unicode version error string
- */
- // private static final String WRONG_UNICODE_VERSION_ERROR_ = "Unicode version in binary image is not compatible with the current Unicode version";
-
- /**
- * Size of expansion table in bytes
- */
- private int m_expansionSize_;
- /**
- * Size of contraction index table in bytes
- */
- private int m_contractionIndexSize_;
- /**
- * Size of contraction table in bytes
- */
- private int m_contractionCESize_;
- /*
- * Size of the Trie in bytes
- */
- // private int m_trieSize_;
- /**
- * Size of the table that contains information about collation elements that end with an expansion
- */
- private int m_expansionEndCESize_;
- /**
- * Size of the table that contains information about the maximum size of collation elements that end with a
- * particular expansion CE corresponding to the ones in expansionEndCE
- */
- private int m_expansionEndCEMaxSizeSize_;
- /**
- * Size of the option table that contains information about the collation options
- */
- private int m_optionSize_;
- /**
- * Size of the whole data file minusing the ICU header
- */
- private int m_size_;
- /**
- * Size of the collation data header
- */
- private int m_headerSize_;
- /**
- * Size of the table that contains information about the "Unsafe" codepoints
- */
- private int m_unsafeSize_;
- /**
- * Size in bytes of the table that contains information about codepoints that ends with a contraction
- */
- private int m_contractionSize_;
- /**
- * Size of the table that contains UCA contraction information in bytes
- */
- private int m_UCAcontractionSize_;
- /**
- * Offset of the UCA Const
- */
- private int m_UCAConstOffset_;
-
- // private methods ---------------------------------------------------
-
-}
/**
*******************************************************************************
-* Copyright (C) 2003-2010, International Business Machines Corporation and *
-* others. All Rights Reserved. *
+* Copyright (C) 2003-2014, International Business Machines Corporation and
+* others. All Rights Reserved.
*******************************************************************************
*/
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.ICUService;
import com.ibm.icu.impl.ICUService.Factory;
+import com.ibm.icu.impl.coll.CollationLoader;
+import com.ibm.icu.impl.coll.CollationTailoring;
import com.ibm.icu.text.Collator.CollatorFactory;
+import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
final class CollatorServiceShim extends Collator.ServiceShim {
throw new MissingResourceException("Could not locate Collator data", "", "");
///CLOVER:ON
}
- coll = (Collator) coll.clone();
- coll.setLocale(actualLoc[0], actualLoc[0]); // services make no distinction between actual & valid
- return coll;
+ return (Collator) coll.clone();
}
catch (CloneNotSupportedException e) {
///CLOVER:OFF
}
Object registerInstance(Collator collator, ULocale locale) {
+ // Set the collator locales while registering so that getInstance()
+ // need not guess whether the collator's locales are already set properly
+ // (as they are by the data loader).
+ collator.setLocale(locale, locale);
return service.registerObject(collator, locale);
}
}
protected Object handleCreate(ULocale uloc, int kind, ICUService srvc) {
- return new RuleBasedCollator(uloc);
+ return makeInstance(uloc);
}
}
actualIDReturn[0] = "root";
}
try {
- return new RuleBasedCollator(ULocale.ROOT);
+ return makeInstance(ULocale.ROOT);
}
catch (MissingResourceException e) {
return null;
}
///CLOVER:ON
}
+
+ // Ported from C++ Collator::makeInstance().
+ private static final Collator makeInstance(ULocale desiredLocale) {
+ Output<ULocale> validLocale = new Output<ULocale>(ULocale.ROOT);
+ CollationTailoring t =
+ CollationLoader.loadTailoring(desiredLocale, validLocale);
+ return new RuleBasedCollator(t, validLocale.value);
+ }
+
private static ICULocaleService service = new CService();
}
*/
package com.ibm.icu.text;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
import java.text.CharacterIterator;
import java.text.ParseException;
import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.MissingResourceException;
-import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
-import com.ibm.icu.impl.BOCU;
-import com.ibm.icu.impl.ICUDebug;
-import com.ibm.icu.impl.ICUResourceBundle;
-import com.ibm.icu.impl.ImplicitCEGenerator;
-import com.ibm.icu.impl.IntTrie;
-import com.ibm.icu.impl.StringUCharacterIterator;
-import com.ibm.icu.impl.Trie;
-import com.ibm.icu.impl.TrieIterator;
+import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Utility;
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UScript;
-import com.ibm.icu.util.Output;
-import com.ibm.icu.util.RangeValueIterator;
+import com.ibm.icu.impl.Normalizer2Impl.ReorderingBuffer;
+import com.ibm.icu.impl.coll.BOCSU;
+import com.ibm.icu.impl.coll.Collation;
+import com.ibm.icu.impl.coll.CollationCompare;
+import com.ibm.icu.impl.coll.CollationData;
+import com.ibm.icu.impl.coll.CollationFastLatin;
+import com.ibm.icu.impl.coll.CollationIterator;
+import com.ibm.icu.impl.coll.CollationKeys;
+import com.ibm.icu.impl.coll.CollationKeys.SortKeyByteSink;
+import com.ibm.icu.impl.coll.CollationLoader;
+import com.ibm.icu.impl.coll.CollationRoot;
+import com.ibm.icu.impl.coll.CollationSettings;
+import com.ibm.icu.impl.coll.CollationTailoring;
+import com.ibm.icu.impl.coll.ContractionsAndExpansions;
+import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
+import com.ibm.icu.impl.coll.SharedObject;
+import com.ibm.icu.impl.coll.TailoredSet;
+import com.ibm.icu.impl.coll.UTF16CollationIterator;
import com.ibm.icu.util.ULocale;
-import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.VersionInfo;
/**
* </p>
*
* <p>
- * Users are strongly encouraged to read <a href="http://www.icu-project.org/userguide/Collate_Intro.html"> the users
- * guide</a> for more information about the collation service before using this class.
+ * Users are strongly encouraged to read the <a href="http://userguide.icu-project.org/collation">User
+ * Guide</a> for more information about the collation service before using this class.
* </p>
*
* <p>
* Create a RuleBasedCollator from a locale by calling the getInstance(Locale) factory method in the base class
* Collator. Collator.getInstance(Locale) creates a RuleBasedCollator object based on the collation rules defined by the
- * argument locale. If a customized collation ordering ar attributes is required, use the RuleBasedCollator(String)
- * constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on UCA, while
+ * argument locale. If a customized collation ordering or attributes is required, use the RuleBasedCollator(String)
+ * constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on the CLDR root collation, while
* re-adjusting the attributes and orders of the characters in the specified rule accordingly.
* </p>
*
* <p>
* RuleBasedCollator provides correct collation orders for most locales supported in ICU. If specific data for a locale
- * is not available, the orders eventually falls back to the <a href="http://www.unicode.org/unicode/reports/tr10/">UCA
- * collation order </a>.
+ * is not available, the orders eventually fall back to the
+ * <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
* </p>
*
* <p>
* For information about the collation rule syntax and details about customization, please refer to the <a
- * href="http://www.icu-project.org/userguide/Collate_Customization.html"> Collation customization</a> section of the
- * user's guide.
+ * href="http://userguide.icu-project.org/collation/customization">Collation customization</a> section of the
+ * User Guide.
* </p>
*
* <p>
/**
* <p>
- * Constructor that takes the argument rules for customization. The collator will be based on UCA, with the
+ * Constructor that takes the argument rules for customization.
+ * The collator will be based on the CLDR root collation, with the
* attributes and re-ordering of the characters specified in the argument rules.
* </p>
* <p>
- * See the user guide's section on <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
+ * See the User Guide's section on <a href="http://userguide.icu-project.org/collation/customization">
* Collation Customization</a> for details on the rule syntax.
* </p>
*
* the collation rules to build the collation table from.
* @exception ParseException
* and IOException thrown. ParseException thrown when argument rules have an invalid syntax.
- * IOException thrown when an error occured while reading internal data.
+ * IOException thrown when an error occurred while reading internal data.
* @stable ICU 2.8
*/
public RuleBasedCollator(String rules) throws Exception {
- checkUCA();
if (rules == null) {
throw new IllegalArgumentException("Collation rules can not be null");
}
- init(rules);
+ validLocale = ULocale.ROOT;
+ internalBuildTailoring(rules);
+ }
+
+ /**
+ * Implements from-rule constructors.
+ * @param rules rule string
+ * @throws Exception
+ */
+ private final void internalBuildTailoring(String rules) throws Exception {
+ CollationTailoring base = CollationRoot.getRoot();
+ // Most code using Collator does not need to build a Collator from rules.
+ // By using reflection, most code will not have a static dependency on the builder code.
+ // CollationBuilder builder = new CollationBuilder(base);
+ ClassLoader classLoader = getClass().getClassLoader();
+ CollationTailoring t;
+ try {
+ Class<?> builderClass = classLoader.loadClass("com.ibm.icu.impl.coll.CollationBuilder");
+ Object builder = builderClass.getConstructor(CollationTailoring.class).newInstance(base);
+ // builder.parseAndBuild(rules);
+ Method parseAndBuild = builderClass.getMethod("parseAndBuild", String.class);
+ t = (CollationTailoring)parseAndBuild.invoke(builder, rules);
+ } catch(InvocationTargetException e) {
+ throw (Exception)e.getTargetException();
+ }
+ CollationSettings ts = t.settings.readOnly();
+ char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
+ int fastLatinOptions = CollationFastLatin.getOptions(t.data, ts, fastLatinPrimaries);
+ if(fastLatinOptions != ts.fastLatinOptions ||
+ (fastLatinOptions >= 0 &&
+ !Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
+ CollationSettings ownedSettings = t.settings.copyOnWrite();
+ ownedSettings.fastLatinOptions = CollationFastLatin.getOptions(
+ t.data, ownedSettings,
+ ownedSettings.fastLatinPrimaries);
+ }
+ t.actualLocale = null;
+ adoptTailoring(t);
}
// public methods --------------------------------------------------------
* @return a new instance of this RuleBasedCollator object
* @stable ICU 2.8
*/
+ @Override
public Object clone() throws CloneNotSupportedException {
- return clone(isFrozen());
+ if (isFrozen()) {
+ return this;
+ }
+ return cloneAsThawed();
}
- /**
- * Clones the RuleBasedCollator
- *
- * @param frozen should the clone be frozen or not
- * @return a new instance of this RuleBasedCollator object
- */
- private Object clone(boolean frozen) throws CloneNotSupportedException {
- //TODO: once buffer and threading issue is resolved have frozen clone just return itself
- RuleBasedCollator result = (RuleBasedCollator) super.clone();
- if (latinOneCEs_ != null) {
- result.m_reallocLatinOneCEs_ = true;
- result.m_ContInfo_ = new ContractionInfo();
+ private final void initMaxExpansions() {
+ synchronized(tailoring) {
+ if (tailoring.maxExpansions == null) {
+ tailoring.maxExpansions = CollationElementIterator.computeMaxExpansions(tailoring.data);
+ }
}
-
- // since all collation data in the RuleBasedCollator do not change
- // we can safely assign the result.fields to this collator
- // except in cases where we can't
- result.collationBuffer = null;
- result.frozenLock = frozen ? new ReentrantLock() : null;
- return result;
}
/**
* @stable ICU 2.8
*/
public CollationElementIterator getCollationElementIterator(String source) {
+ initMaxExpansions();
return new CollationElementIterator(source, this);
}
* @stable ICU 2.8
*/
public CollationElementIterator getCollationElementIterator(CharacterIterator source) {
+ initMaxExpansions();
CharacterIterator newsource = (CharacterIterator) source.clone();
return new CollationElementIterator(newsource, this);
}
* @stable ICU 2.8
*/
public CollationElementIterator getCollationElementIterator(UCharacterIterator source) {
+ initMaxExpansions();
return new CollationElementIterator(source, this);
}
* Determines whether the object has been frozen or not.
* @stable ICU 4.8
*/
+ @Override
public boolean isFrozen() {
return frozenLock != null;
}
* @return the collator itself.
* @stable ICU 4.8
*/
+ @Override
public Collator freeze() {
if (!isFrozen()) {
frozenLock = new ReentrantLock();
+ if (collationBuffer == null) {
+ collationBuffer = new CollationBuffer(data);
+ }
}
return this;
}
* Provides for the clone operation. Any clone is initially unfrozen.
* @stable ICU 4.8
*/
+ @Override
public RuleBasedCollator cloneAsThawed() {
- RuleBasedCollator clone = null;
try {
- clone = (RuleBasedCollator) clone(false);
+ RuleBasedCollator result = (RuleBasedCollator) super.clone();
+ // since all collation data in the RuleBasedCollator do not change
+ // we can safely assign the result.fields to this collator
+ // except in cases where we can't
+ result.settings = settings.clone();
+ result.collationBuffer = null;
+ result.frozenLock = null;
+ return result;
} catch (CloneNotSupportedException e) {
// Clone is implemented
+ return null;
}
- return clone;
}
// public setters --------------------------------------------------------
+ private void checkNotFrozen() {
+ if (isFrozen()) {
+ throw new UnsupportedOperationException("Attempt to modify frozen RuleBasedCollator");
+ }
+ }
+
+ private final CollationSettings getOwnedSettings() {
+ return settings.copyOnWrite();
+ }
+
+ private final CollationSettings getDefaultSettings() {
+ return tailoring.settings.readOnly();
+ }
+
/**
* Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator
* positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a
*/
@Deprecated
public void setHiraganaQuaternary(boolean flag) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
+ checkNotFrozen();
}
/**
*/
@Deprecated
public void setHiraganaQuaternaryDefault() {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
+ checkNotFrozen();
}
/**
* @stable ICU 2.8
*/
public void setUpperCaseFirst(boolean upperfirst) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- if (upperfirst) {
- if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
- latinOneRegenTable_ = true;
- }
- m_caseFirst_ = AttributeValue.UPPER_FIRST_;
- } else {
- if (m_caseFirst_ != AttributeValue.OFF_) {
- latinOneRegenTable_ = true;
- }
- m_caseFirst_ = AttributeValue.OFF_;
- }
- updateInternalState();
+ checkNotFrozen();
+ if (upperfirst == isUpperCaseFirst()) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setCaseFirst(upperfirst ? CollationSettings.CASE_FIRST_AND_UPPER_MASK : 0);
+ setFastLatinOptions(ownedSettings);
}
/**
* @stable ICU 2.8
*/
public void setLowerCaseFirst(boolean lowerfirst) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- if (lowerfirst) {
- if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
- latinOneRegenTable_ = true;
- }
- m_caseFirst_ = AttributeValue.LOWER_FIRST_;
- } else {
- if (m_caseFirst_ != AttributeValue.OFF_) {
- latinOneRegenTable_ = true;
- }
- m_caseFirst_ = AttributeValue.OFF_;
- }
- updateInternalState();
+ checkNotFrozen();
+ if (lowerfirst == isLowerCaseFirst()) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setCaseFirst(lowerfirst ? CollationSettings.CASE_FIRST : 0);
+ setFastLatinOptions(ownedSettings);
}
/**
* @stable ICU 2.8
*/
public final void setCaseFirstDefault() {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- if (m_caseFirst_ != m_defaultCaseFirst_) {
- latinOneRegenTable_ = true;
- }
- m_caseFirst_ = m_defaultCaseFirst_;
- updateInternalState();
+ checkNotFrozen();
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setCaseFirstDefault(defaultSettings.options);
+ setFastLatinOptions(ownedSettings);
}
/**
* @stable ICU 2.8
*/
public void setAlternateHandlingDefault() {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
- updateInternalState();
+ checkNotFrozen();
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setAlternateHandlingDefault(defaultSettings.options);
+ setFastLatinOptions(ownedSettings);
}
/**
* @stable ICU 2.8
*/
public void setCaseLevelDefault() {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- m_isCaseLevel_ = m_defaultIsCaseLevel_;
- updateInternalState();
+ checkNotFrozen();
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlagDefault(CollationSettings.CASE_LEVEL, defaultSettings.options);
+ setFastLatinOptions(ownedSettings);
}
/**
* @stable ICU 2.8
*/
public void setDecompositionDefault() {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- setDecomposition(m_defaultDecomposition_);
- updateInternalState();
+ checkNotFrozen();
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlagDefault(CollationSettings.CHECK_FCD, defaultSettings.options);
+ setFastLatinOptions(ownedSettings);
}
/**
* @stable ICU 2.8
*/
public void setFrenchCollationDefault() {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
- latinOneRegenTable_ = true;
- }
- m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
- updateInternalState();
+ checkNotFrozen();
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlagDefault(CollationSettings.BACKWARD_SECONDARY, defaultSettings.options);
+ setFastLatinOptions(ownedSettings);
}
/**
* @stable ICU 2.8
*/
public void setStrengthDefault() {
- setStrength(m_defaultStrength_);
- updateInternalState();
+ checkNotFrozen();
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setStrengthDefault(defaultSettings.options);
+ setFastLatinOptions(ownedSettings);
}
/**
- * Method to set numeric collation to its default value. When numeric collation is turned on, this Collator
- * generates a collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER
- * '2'
- *
+ * Method to set numeric collation to its default value.
+ *
* @see #getNumericCollation
* @see #setNumericCollation
* @stable ICU 2.8
*/
public void setNumericCollationDefault() {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- setNumericCollation(m_defaultIsNumericCollation_);
- updateInternalState();
+ checkNotFrozen();
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlagDefault(CollationSettings.NUMERIC, defaultSettings.options);
+ setFastLatinOptions(ownedSettings);
}
/**
* Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false,
* which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted
- * backwards. See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
+ * backwards. See the section on <a href="http://userguide.icu-project.org/collation/architecture">
* French collation</a> for more information.
*
* @param flag
* @see #setFrenchCollationDefault
*/
public void setFrenchCollation(boolean flag) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- if (m_isFrenchCollation_ != flag) {
- latinOneRegenTable_ = true;
- }
- m_isFrenchCollation_ = flag;
- updateInternalState();
+ checkNotFrozen();
+ if(flag == isFrenchCollation()) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlag(CollationSettings.BACKWARD_SECONDARY, flag);
+ setFastLatinOptions(ownedSettings);
}
/**
* Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition
- * on <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting"> Alternate Weighting</a>. This
+ * on <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">Variable Weighting</a>. This
* attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false,
- * corresponding to the NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the RuleBasedCollator will treats all
- * the codepoints with non-ignorable primary weights in the same way. If the mode is set to true, the behaviour
- * corresponds to SHIFTED defined in UCA, this causes codepoints with PRIMARY orders that are equal or below the
+ * corresponding to the NON_IGNORABLE mode in UCA. In the NON_IGNORABLE mode, the RuleBasedCollator treats all
+ * the code points with non-ignorable primary weights in the same way. If the mode is set to true, the behavior
+ * corresponds to SHIFTED defined in UCA, this causes code points with PRIMARY orders that are equal or below the
* variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order.
*
* @param shifted
- * true if SHIFTED behaviour for alternate handling is desired, false for the NON_IGNORABLE behaviour.
+ * true if SHIFTED behavior for alternate handling is desired, false for the NON_IGNORABLE behavior.
* @see #isAlternateHandlingShifted
* @see #setAlternateHandlingDefault
* @stable ICU 2.8
*/
public void setAlternateHandlingShifted(boolean shifted) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- m_isAlternateHandlingShifted_ = shifted;
- updateInternalState();
+ checkNotFrozen();
+ if(shifted == isAlternateHandlingShifted()) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setAlternateHandlingShifted(shifted);
+ setFastLatinOptions(ownedSettings);
}
/**
* case level.
* </p>
* <p>
- * See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html"> case
+ * See the section on <a href="http://userguide.icu-project.org/collation/architecture">case
* level</a> for more information.
* </p>
*
* @see #isCaseLevel
*/
public void setCaseLevel(boolean flag) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
- m_isCaseLevel_ = flag;
- updateInternalState();
- }
-
- /**
- * <p>
- * Sets this Collator's strength property. The strength property determines the minimum level of difference
+ checkNotFrozen();
+ if(flag == isCaseLevel()) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlag(CollationSettings.CASE_LEVEL, flag);
+ setFastLatinOptions(ownedSettings);
+ }
+
+ /**
+ * Sets the decomposition mode of this Collator. Setting this
+ * decomposition attribute with CANONICAL_DECOMPOSITION allows the
+ * Collator to handle un-normalized text properly, producing the
+ * same results as if the text were normalized. If
+ * NO_DECOMPOSITION is set, it is the user's responsibility to
+ * insure that all text is already in the appropriate form before
+ * a comparison or before getting a CollationKey. Adjusting
+ * decomposition mode allows the user to select between faster and
+ * more complete collation behavior.</p>
+ *
+ * <p>Since a great many of the world's languages do not require
+ * text normalization, most locales set NO_DECOMPOSITION as the
+ * default decomposition mode.</p>
+ *
+ * <p>The default decomposition mode for the Collator is
+ * NO_DECOMPOSITION, unless specified otherwise by the locale used
+ * to create the Collator.</p>
+ *
+ * <p>See getDecomposition for a description of decomposition
+ * mode.</p>
+ *
+ * @param decomposition the new decomposition mode
+ * @see #getDecomposition
+ * @see #NO_DECOMPOSITION
+ * @see #CANONICAL_DECOMPOSITION
+ * @throws IllegalArgumentException If the given value is not a valid
+ * decomposition mode.
+ * @stable ICU 2.8
+ */
+ @Override
+ public void setDecomposition(int decomposition)
+ {
+ checkNotFrozen();
+ boolean flag;
+ switch(decomposition) {
+ case NO_DECOMPOSITION:
+ flag = false;
+ break;
+ case CANONICAL_DECOMPOSITION:
+ flag = true;
+ break;
+ default:
+ throw new IllegalArgumentException("Wrong decomposition mode.");
+ }
+ if(flag == settings.readOnly().getFlag(CollationSettings.CHECK_FCD)) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlag(CollationSettings.CHECK_FCD, flag);
+ setFastLatinOptions(ownedSettings);
+ }
+
+ /**
+ * Sets this Collator's strength attribute. The strength attribute determines the minimum level of difference
* considered significant during comparison.
- * </p>
- * <p>
- * See the Collator class description for an example of use.
- * </p>
+ *
+ * <p>See the Collator class description for an example of use.
*
* @param newStrength
* the new strength value.
* If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
* @stable ICU 2.8
*/
+ @Override
public void setStrength(int newStrength) {
- super.setStrength(newStrength);
- updateInternalState();
+ checkNotFrozen();
+ if(newStrength == getStrength()) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setStrength(newStrength);
+ setFastLatinOptions(ownedSettings);
+ }
+
+ /**
+ * {@icu} Sets the variable top to the top of the specified reordering group.
+ * The variable top determines the highest-sorting character
+ * which is affected by the alternate handling behavior.
+ * If that attribute is set to NON_IGNORABLE, then the variable top has no effect.
+ * @param group one of Collator.ReorderCodes.SPACE, Collator.ReorderCodes.PUNCTUATION,
+ * Collator.ReorderCodes.SYMBOL, Collator.ReorderCodes.CURRENCY;
+ * or Collator.ReorderCodes.DEFAULT to restore the default max variable group
+ * @return this
+ * @see #getMaxVariable
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ @Override
+ public RuleBasedCollator setMaxVariable(int group) {
+ // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
+ int value;
+ if(group == Collator.ReorderCodes.DEFAULT) {
+ value = -1; // UCOL_DEFAULT
+ } else if(Collator.ReorderCodes.FIRST <= group && group <= Collator.ReorderCodes.CURRENCY) {
+ value = group - Collator.ReorderCodes.FIRST;
+ } else {
+ throw new IllegalArgumentException("illegal max variable group " + group);
+ }
+ int oldValue = settings.readOnly().getMaxVariable();
+ if(value == oldValue) {
+ return this;
+ }
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(settings.readOnly() == defaultSettings) {
+ if(value < 0) { // UCOL_DEFAULT
+ return this;
+ }
+ }
+ CollationSettings ownedSettings = getOwnedSettings();
+
+ if(group == Collator.ReorderCodes.DEFAULT) {
+ group = Collator.ReorderCodes.FIRST + defaultSettings.getMaxVariable();
+ }
+ long varTop = data.getLastPrimaryForGroup(group);
+ assert(varTop != 0);
+ ownedSettings.setMaxVariable(value, defaultSettings.options);
+ ownedSettings.variableTop = varTop;
+ setFastLatinOptions(ownedSettings);
+ return this;
}
/**
- * <p>
- * Variable top is a two byte primary value which causes all the codepoints with primary values that are less or
- * equal than the variable top to be shifted when alternate handling is set to SHIFTED.
- * </p>
- * <p>
- * Sets the variable top to a collation element value of a string supplied.
- * </p>
+ * {@icu} Returns the maximum reordering group whose characters are affected by
+ * the alternate handling behavior.
+ * @return the maximum variable reordering group.
+ * @see #setMaxVariable
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ @Override
+ public int getMaxVariable() {
+ return Collator.ReorderCodes.FIRST + settings.readOnly().getMaxVariable();
+ }
+
+ /**
+ * {@icu} Sets the variable top to the primary weight of the specified string.
+ *
+ * <p>Beginning with ICU 53, the variable top is pinned to
+ * the top of one of the supported reordering groups,
+ * and it must not be beyond the last of those groups.
+ * See {@link #setMaxVariable(int)}.
*
* @param varTop
* one or more (if contraction) characters to which the variable top should be set
- * @return a int value containing the value of the variable top in upper 16 bits. Lower 16 bits are undefined.
+ * @return variable top primary weight
* @exception IllegalArgumentException
* is thrown if varTop argument is not a valid variable top element. A variable top element is
* invalid when
* <ul>
* <li>it is a contraction that does not exist in the Collation order
- * <li>when the PRIMARY strength collation element for the variable top has more than two bytes
+ * <li>the variable top is beyond
+ * the last reordering group supported by setMaxVariable()
* <li>when the varTop argument is null or zero in length.
* </ul>
* @see #getVariableTop
* @see RuleBasedCollator#setAlternateHandlingShifted
- * @stable ICU 2.6
+ * @deprecated ICU 53 Call {@link #setMaxVariable(int)} instead.
*/
+ @Override
public int setVariableTop(String varTop) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
+ checkNotFrozen();
if (varTop == null || varTop.length() == 0) {
throw new IllegalArgumentException("Variable top argument string can not be null or zero in length.");
}
-
- CollationBuffer buffer = null;
- try {
- buffer = getCollationBuffer();
- return setVariableTop(varTop, buffer);
- } finally {
- releaseCollationBuffer(buffer);
- }
-
- }
-
- private int setVariableTop(String varTop, CollationBuffer buffer) {
- buffer.m_srcUtilColEIter_.setText(varTop);
- int ce = buffer.m_srcUtilColEIter_.next();
-
- // here we check if we have consumed all characters
- // you can put in either one character or a contraction
- // you shouldn't put more...
- if (buffer.m_srcUtilColEIter_.getOffset() != varTop.length() || ce == CollationElementIterator.NULLORDER) {
- throw new IllegalArgumentException("Variable top argument string is a contraction that does not exist "
- + "in the Collation order");
+ boolean numeric = settings.readOnly().isNumeric();
+ long ce1, ce2;
+ if(settings.readOnly().dontCheckFCD()) {
+ UTF16CollationIterator ci = new UTF16CollationIterator(data, numeric, varTop, 0);
+ ce1 = ci.nextCE();
+ ce2 = ci.nextCE();
+ } else {
+ FCDUTF16CollationIterator ci = new FCDUTF16CollationIterator(data, numeric, varTop, 0);
+ ce1 = ci.nextCE();
+ ce2 = ci.nextCE();
}
-
- int nextCE = buffer.m_srcUtilColEIter_.next();
-
- if ((nextCE != CollationElementIterator.NULLORDER)
- && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
- throw new IllegalArgumentException("Variable top argument string can only have a single collation "
- + "element that has less than or equal to two PRIMARY strength " + "bytes");
+ if(ce1 == Collation.NO_CE || ce2 != Collation.NO_CE) {
+ throw new IllegalArgumentException("Variable top argument string must map to exactly one collation element");
}
-
- m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
-
- return ce & CE_PRIMARY_MASK_;
+ internalSetVariableTop(ce1 >>> 32);
+ return (int)settings.readOnly().variableTop;
}
/**
- * Sets the variable top to a collation element value supplied. Variable top is set to the upper 16 bits. Lower 16
- * bits are ignored.
+ * {@icu} Sets the variable top to the specified primary weight.
+ *
+ * <p>Beginning with ICU 53, the variable top is pinned to
+ * the top of one of the supported reordering groups,
+ * and it must not be beyond the last of those groups.
+ * See {@link #setMaxVariable(int)}.
*
- * @param varTop
- * Collation element value, as returned by setVariableTop or getVariableTop
+ * @param varTop primary weight, as returned by setVariableTop or getVariableTop
* @see #getVariableTop
* @see #setVariableTop(String)
- * @stable ICU 2.6
+ * @deprecated ICU 53 Call setMaxVariable() instead.
*/
+ @Override
public void setVariableTop(int varTop) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
+ checkNotFrozen();
+ internalSetVariableTop(varTop & 0xffffffffL);
+ }
- m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
+ private void internalSetVariableTop(long varTop) {
+ if(varTop != settings.readOnly().variableTop) {
+ // Pin the variable top to the end of the reordering group which contains it.
+ // Only a few special groups are supported.
+ int group = data.getGroupForPrimary(varTop);
+ if(group < Collator.ReorderCodes.FIRST || Collator.ReorderCodes.CURRENCY < group) {
+ throw new IllegalArgumentException("The variable top must be a primary weight in " +
+ "the space/punctuation/symbols/currency symbols range");
+ }
+ long v = data.getLastPrimaryForGroup(group);
+ assert(v != 0 && v >= varTop);
+ varTop = v;
+ if(varTop != settings.readOnly().variableTop) {
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setMaxVariable(group - Collator.ReorderCodes.FIRST,
+ getDefaultSettings().options);
+ ownedSettings.variableTop = varTop;
+ setFastLatinOptions(ownedSettings);
+ }
+ }
}
/**
- * When numeric collation is turned on, this Collator generates a collation key for the numeric value of substrings
- * of digits. This is a way to get '100' to sort AFTER '2'
- *
+ * {@icu} When numeric collation is turned on, this Collator makes
+ * substrings of digits sort according to their numeric values.
+ *
+ * <p>This is a way to get '100' to sort AFTER '2'. Note that the longest
+ * digit substring that can be treated as a single unit is
+ * 254 digits (not counting leading zeros). If a digit substring is
+ * longer than that, the digits beyond the limit will be treated as a
+ * separate digit substring.
+ *
+ * <p>A "digit" in this sense is a code point with General_Category=Nd,
+ * which does not include circled numbers, roman numerals, etc.
+ * Only a contiguous digit substring is considered, that is,
+ * non-negative integers without separators.
+ * There is no support for plus/minus signs, decimals, exponents, etc.
+ *
* @param flag
* true to turn numeric collation on and false to turn it off
* @see #getNumericCollation
* @stable ICU 2.8
*/
public void setNumericCollation(boolean flag) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
- }
-
+ checkNotFrozen();
// sort substrings of digits as numbers
- m_isNumericCollation_ = flag;
- updateInternalState();
+ if(flag == getNumericCollation()) { return; }
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setFlag(CollationSettings.NUMERIC, flag);
+ setFastLatinOptions(ownedSettings);
}
/**
* then this clears any existing reordering
* @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts)
* @see #getReorderCodes
- * @see #getEquivalentReorderCodes
+ * @see Collator#getEquivalentReorderCodes
* @stable ICU 4.8
*/
+ @Override
public void setReorderCodes(int... order) {
- if (isFrozen()) {
- throw new UnsupportedOperationException("Attempt to modify frozen object");
+ checkNotFrozen();
+ if(order == null ?
+ settings.readOnly().reorderCodes.length == 0 :
+ Arrays.equals(order, settings.readOnly().reorderCodes)) {
+ return;
}
-
- if (order != null && order.length > 0) {
- m_reorderCodes_ = order.clone();
+ int length = (order != null) ? order.length : 0;
+ CollationSettings defaultSettings = getDefaultSettings();
+ if(length == 1 && order[0] == Collator.ReorderCodes.DEFAULT) {
+ if(settings.readOnly() != defaultSettings) {
+ CollationSettings ownedSettings = getOwnedSettings();
+ ownedSettings.setReordering(defaultSettings.reorderCodes,
+ defaultSettings.reorderTable);
+ setFastLatinOptions(ownedSettings);
+ }
+ return;
+ }
+ CollationSettings ownedSettings = getOwnedSettings();
+ if(length == 0) {
+ ownedSettings.resetReordering();
} else {
- m_reorderCodes_ = null;
+ byte[] reorderTable = new byte[256];
+ data.makeReorderTable(order, reorderTable);
+ ownedSettings.setReordering(order.clone(), reorderTable);
}
- buildPermutationTable();
+ setFastLatinOptions(ownedSettings);
+ }
+
+ private void setFastLatinOptions(CollationSettings ownedSettings) {
+ ownedSettings.fastLatinOptions = CollationFastLatin.getOptions(
+ data, ownedSettings, ownedSettings.fastLatinPrimaries);
}
// public getters --------------------------------------------------------
* @stable ICU 2.8
*/
public String getRules() {
- return m_rules_;
+ return tailoring.rules;
}
/**
- * Returns current rules. The argument defines whether full rules (UCA + tailored) rules are returned or just the
- * tailoring.
+ * Returns current rules.
+ * The argument defines whether full rules (root collation + tailored) rules are returned
+ * or just the tailoring.
*
- * <p>The "UCA rules" are an <i>approximation</i> of the root collator's sort order.
+ * <p>The root collation rules are an <i>approximation</i> of the root collator's sort order.
* They are almost never used or useful at runtime and can be removed from the data.
* See <a href="http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales">User Guide:
* Collation Customization, Building on Existing Locales</a>
*/
public String getRules(boolean fullrules) {
if (!fullrules) {
- return m_rules_;
+ return tailoring.rules;
}
- // take the UCA rules and append real rules at the end
- return UCA_.m_rules_.concat(m_rules_);
+ return CollationLoader.getRootRules() + tailoring.rules;
}
/**
- * Get an UnicodeSet that contains all the characters and sequences tailored in this collator.
+ * Get a UnicodeSet that contains all the characters and sequences tailored in this collator.
*
* @return a pointer to a UnicodeSet object containing all the code points and sequences that may sort differently
- * than in the UCA.
+ * than in the root collator.
* @stable ICU 2.4
*/
+ @Override
public UnicodeSet getTailoredSet() {
- try {
- CollationRuleParser src = new CollationRuleParser(getRules());
- return src.getTailoredSet();
- } catch (Exception e) {
- throw new IllegalStateException("A tailoring rule should not " + "have errors. Something is quite wrong!");
- }
- }
-
- private static class contContext {
- RuleBasedCollator coll;
- UnicodeSet contractions;
- UnicodeSet expansions;
- UnicodeSet removedContractions;
- boolean addPrefixes;
-
- contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
- UnicodeSet removedContractions, boolean addPrefixes) {
- this.coll = coll;
- this.contractions = contractions;
- this.expansions = expansions;
- this.removedContractions = removedContractions;
- this.addPrefixes = addPrefixes;
- }
- }
-
- private void addSpecial(contContext c, StringBuilder buffer, int CE) {
- StringBuilder b = new StringBuilder();
- int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
- int newCE = c.coll.m_contractionCE_[offset];
- // we might have a contraction that ends from previous level
- if (newCE != CollationElementIterator.CE_NOT_FOUND_) {
- if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ && isSpecial(newCE)
- && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
- addSpecial(c, buffer, newCE);
- }
- if (buffer.length() > 1) {
- if (c.contractions != null) {
- c.contractions.add(buffer.toString());
- }
- if (c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
- c.expansions.add(buffer.toString());
- }
- }
- }
-
- offset++;
- // check whether we're doing contraction or prefix
- if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
- while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
- b.delete(0, b.length());
- b.append(buffer);
- newCE = c.coll.m_contractionCE_[offset];
- b.insert(0, c.coll.m_contractionIndex_[offset]);
- if (isSpecial(newCE)
- && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
- addSpecial(c, b, newCE);
- } else {
- if (c.contractions != null) {
- c.contractions.add(b.toString());
- }
- if (c.expansions != null && isSpecial(newCE)
- && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
- c.expansions.add(b.toString());
- }
- }
- offset++;
- }
- } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
- while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
- b.delete(0, b.length());
- b.append(buffer);
- newCE = c.coll.m_contractionCE_[offset];
- b.append(c.coll.m_contractionIndex_[offset]);
- if (isSpecial(newCE)
- && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
- addSpecial(c, b, newCE);
- } else {
- if (c.contractions != null) {
- c.contractions.add(b.toString());
- }
- if (c.expansions != null && isSpecial(newCE)
- && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
- c.expansions.add(b.toString());
- }
- }
- offset++;
- }
- }
- }
-
- private void processSpecials(contContext c) {
- int internalBufferSize = 512;
- TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_);
- RangeValueIterator.Element element = new RangeValueIterator.Element();
- while (trieiterator.next(element)) {
- int start = element.start;
- int limit = element.limit;
- int CE = element.value;
- StringBuilder contraction = new StringBuilder(internalBufferSize);
-
- if (isSpecial(CE)) {
- if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
- while (start < limit) {
- // if there are suppressed contractions, we don't
- // want to add them.
- if (c.removedContractions != null && c.removedContractions.contains(start)) {
- start++;
- continue;
- }
- // we start our contraction from middle, since we don't know if it
- // will grow toward right or left
- contraction.append((char) start);
- addSpecial(c, contraction, CE);
- start++;
- }
- } else if (c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
- while (start < limit) {
- c.expansions.add(start++);
- }
- }
- }
+ UnicodeSet tailored = new UnicodeSet();
+ if(data.base != null) {
+ new TailoredSet(tailored).forData(data);
}
+ return tailored;
}
/**
* @stable ICU 3.4
*/
public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes)
- throws Exception {
+ throws Exception {
if (contractions != null) {
contractions.clear();
}
if (expansions != null) {
expansions.clear();
}
- String rules = getRules();
- try {
- CollationRuleParser src = new CollationRuleParser(rules);
- contContext c = new contContext(RuleBasedCollator.UCA_, contractions, expansions, src.m_removeSet_,
- addPrefixes);
+ new ContractionsAndExpansions(contractions, expansions, null, addPrefixes).forData(data);
+ }
- // Add the UCA contractions
- processSpecials(c);
- // This is collator specific. Add contractions from a collator
- c.coll = this;
- c.removedContractions = null;
- processSpecials(c);
- } catch (Exception e) {
- throw e;
- }
+ /**
+ * Adds the contractions that start with character c to the set.
+ * Ignores prefixes. Used by AlphabeticIndex.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ void internalAddContractions(int c, UnicodeSet set) {
+ new ContractionsAndExpansions(set, null, null, false).forCodePoint(data, c);
}
/**
* @see #getRawCollationKey
* @stable ICU 2.8
*/
+ @Override
public CollationKey getCollationKey(String source) {
if (source == null) {
return null;
}
private CollationKey getCollationKey(String source, CollationBuffer buffer) {
- buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_, buffer);
- return new CollationKey(source, buffer.m_utilRawCollationKey_);
+ buffer.rawCollationKey = getRawCollationKey(source, buffer.rawCollationKey, buffer);
+ return new CollationKey(source, buffer.rawCollationKey);
}
/**
* @see RawCollationKey
* @stable ICU 2.8
*/
+ @Override
public RawCollationKey getRawCollationKey(String source, RawCollationKey key) {
if (source == null) {
return null;
}
}
- private RawCollationKey getRawCollationKey(String source, RawCollationKey key, CollationBuffer buffer) {
- int strength = getStrength();
- buffer.m_utilCompare0_ = m_isCaseLevel_;
- // m_utilCompare1_ = true;
- buffer.m_utilCompare2_ = strength >= SECONDARY;
- buffer.m_utilCompare3_ = strength >= TERTIARY;
- buffer.m_utilCompare4_ = strength >= QUATERNARY;
- buffer.m_utilCompare5_ = strength == IDENTICAL;
+ private static final class CollationKeyByteSink extends SortKeyByteSink {
+ CollationKeyByteSink(RawCollationKey key) {
+ super(key.bytes);
+ key_ = key;
+ }
- boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_;
- // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
- // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
- // high.
- int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
- byte hiragana4 = 0;
- if (m_isHiragana4_ && buffer.m_utilCompare4_) {
- // allocate one more space for hiragana, value for hiragana
- hiragana4 = (byte) commonBottom4;
- commonBottom4++;
+ @Override
+ protected void AppendBeyondCapacity(byte[] bytes, int start, int n, int length) {
+ // n > 0 && appended_ > capacity_
+ if (Resize(n, length)) {
+ System.arraycopy(bytes, start, buffer_, length, n);
+ }
}
- int bottomCount4 = 0xFF - commonBottom4;
- // If we need to normalize, we'll do it all at once at the beginning!
- if (buffer.m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
- // if it is identical strength, we have to normalize the string to
- // NFD so that it will be appended correctly to the end of the sort
- // key
- source = Normalizer.decompose(source, false);
- } else if (getDecomposition() != NO_DECOMPOSITION
- && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) {
- // for the rest of the strength, if decomposition is on, FCD is
- // enough for us to work on.
- source = Normalizer.normalize(source, Normalizer.FCD);
+ @Override
+ protected boolean Resize(int appendCapacity, int length) {
+ int newCapacity = 2 * buffer_.length;
+ int altCapacity = length + 2 * appendCapacity;
+ if (newCapacity < altCapacity) {
+ newCapacity = altCapacity;
+ }
+ if (newCapacity < 200) {
+ newCapacity = 200;
+ }
+ // Do not call key_.ensureCapacity(newCapacity) because we do not
+ // keep key_.size in sync with appended_.
+ // We only set it when we are done.
+ byte[] newBytes = new byte[newCapacity];
+ System.arraycopy(buffer_, 0, newBytes, 0, length);
+ buffer_ = key_.bytes = newBytes;
+ return true;
}
- getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, bottomCount4, buffer);
+
+ private RawCollationKey key_;
+ }
+
+ private RawCollationKey getRawCollationKey(CharSequence source, RawCollationKey key, CollationBuffer buffer) {
if (key == null) {
- key = new RawCollationKey();
+ key = new RawCollationKey(simpleKeyLengthEstimate(source));
+ } else if (key.bytes == null) {
+ key.bytes = new byte[simpleKeyLengthEstimate(source)];
}
- getSortKey(source, doFrench, commonBottom4, bottomCount4, key, buffer);
+ CollationKeyByteSink sink = new CollationKeyByteSink(key);
+ writeSortKey(source, sink, buffer);
+ key.size = sink.NumberOfBytesAppended();
return key;
}
+ private int simpleKeyLengthEstimate(CharSequence source) {
+ return 2 * source.length() + 10;
+ }
+
+ private void writeSortKey(CharSequence s, CollationKeyByteSink sink, CollationBuffer buffer) {
+ boolean numeric = settings.readOnly().isNumeric();
+ if(settings.readOnly().dontCheckFCD()) {
+ buffer.leftUTF16CollIter.setText(numeric, s, 0);
+ CollationKeys.writeSortKeyUpToQuaternary(
+ buffer.leftUTF16CollIter, data.compressibleBytes, settings.readOnly(),
+ sink, Collation.PRIMARY_LEVEL,
+ CollationKeys.SIMPLE_LEVEL_FALLBACK, true);
+ } else {
+ buffer.leftFCDUTF16Iter.setText(numeric, s, 0);
+ CollationKeys.writeSortKeyUpToQuaternary(
+ buffer.leftFCDUTF16Iter, data.compressibleBytes, settings.readOnly(),
+ sink, Collation.PRIMARY_LEVEL,
+ CollationKeys.SIMPLE_LEVEL_FALLBACK, true);
+ }
+ if(settings.readOnly().getStrength() == IDENTICAL) {
+ writeIdenticalLevel(s, sink);
+ }
+ sink.Append(Collation.TERMINATOR_BYTE);
+ }
+
+ private void writeIdenticalLevel(CharSequence s, CollationKeyByteSink sink) {
+ // NFD quick check
+ int nfdQCYesLimit = data.nfcImpl.decompose(s, 0, s.length(), null);
+ sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
+ // Sync the ByteArrayWrapper size with the key length.
+ sink.key_.size = sink.NumberOfBytesAppended();
+ int prev = 0;
+ if(nfdQCYesLimit != 0) {
+ prev = BOCSU.writeIdenticalLevelRun(prev, s, 0, nfdQCYesLimit, sink.key_);
+ }
+ // Is there non-NFD text?
+ if(nfdQCYesLimit < s.length()) {
+ int destLengthEstimate = s.length() - nfdQCYesLimit;
+ StringBuilder nfd = new StringBuilder();
+ data.nfcImpl.decompose(s, nfdQCYesLimit, s.length(), nfd, destLengthEstimate);
+ BOCSU.writeIdenticalLevelRun(prev, nfd, 0, nfd.length(), sink.key_);
+ }
+ // Sync the key with the buffer again which got bytes appended and may have been reallocated.
+ sink.setBufferAndAppended(sink.key_.bytes, sink.key_.size);
+ }
+
+ /**
+ * Returns the CEs for the string.
+ * @param str the string
+ * @internal for tests & tools
+ * @deprecated This API is ICU internal only.
+ */
+ public long[] internalGetCEs(CharSequence str) {
+ CollationBuffer buffer = null;
+ try {
+ buffer = getCollationBuffer();
+ boolean numeric = settings.readOnly().isNumeric();
+ CollationIterator iter;
+ if(settings.readOnly().dontCheckFCD()) {
+ buffer.leftUTF16CollIter.setText(numeric, str, 0);
+ iter = buffer.leftUTF16CollIter;
+ } else {
+ buffer.leftFCDUTF16Iter.setText(numeric, str, 0);
+ iter = buffer.leftFCDUTF16Iter;
+ }
+ int length = iter.fetchCEs() - 1;
+ assert length >= 0 && iter.getCE(length) == Collation.NO_CE;
+ long[] ces = new long[length];
+ System.arraycopy(iter.getCEs(), 0, ces, 0, length);
+ return ces;
+ } finally {
+ releaseCollationBuffer(buffer);
+ }
+ }
+
+ /**
+ * Returns this Collator's strength attribute. The strength attribute
+ * determines the minimum level of difference considered significant.
+ *
+ * <p>{@icunote} This can return QUATERNARY strength, which is not supported by the
+ * JDK version.
+ *
+ * <p>See the Collator class description for more details.
+ *
+ * @return this Collator's current strength attribute.
+ * @see #setStrength
+ * @see #PRIMARY
+ * @see #SECONDARY
+ * @see #TERTIARY
+ * @see #QUATERNARY
+ * @see #IDENTICAL
+ * @stable ICU 2.8
+ */
+ @Override
+ public int getStrength() {
+ return settings.readOnly().getStrength();
+ }
+
+ /**
+ * Returns the decomposition mode of this Collator. The decomposition mode
+ * determines how Unicode composed characters are handled.
+ *
+ * <p>See the Collator class description for more details.
+ *
+ * @return the decomposition mode
+ * @see #setDecomposition
+ * @see #NO_DECOMPOSITION
+ * @see #CANONICAL_DECOMPOSITION
+ * @stable ICU 2.8
+ */
+ @Override
+ public int getDecomposition() {
+ return (settings.readOnly().options & CollationSettings.CHECK_FCD) != 0 ?
+ CANONICAL_DECOMPOSITION : NO_DECOMPOSITION;
+ }
+
/**
* Return true if an uppercase character is sorted before the corresponding lowercase character. See
* setCaseFirst(boolean) for details.
* @stable ICU 2.8
*/
public boolean isUpperCaseFirst() {
- return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
+ return (settings.readOnly().getCaseFirst() == CollationSettings.CASE_FIRST_AND_UPPER_MASK);
}
/**
* @stable ICU 2.8
*/
public boolean isLowerCaseFirst() {
- return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
+ return (settings.readOnly().getCaseFirst() == CollationSettings.CASE_FIRST);
}
/**
- * Checks if the alternate handling behaviour is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true,
+ * Checks if the alternate handling behavior is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true,
* then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the
* alternate handling attribute for the Collator is NON_IGNORABLE See setAlternateHandlingShifted(boolean) for more
* details.
* @stable ICU 2.8
*/
public boolean isAlternateHandlingShifted() {
- return m_isAlternateHandlingShifted_;
+ return settings.readOnly().getAlternateHandling();
}
/**
* @stable ICU 2.8
*/
public boolean isCaseLevel() {
- return m_isCaseLevel_;
+ return (settings.readOnly().options & CollationSettings.CASE_LEVEL) != 0;
}
/**
* @stable ICU 2.8
*/
public boolean isFrenchCollation() {
- return m_isFrenchCollation_;
+ return (settings.readOnly().options & CollationSettings.BACKWARD_SECONDARY) != 0;
}
/**
*/
@Deprecated
public boolean isHiraganaQuaternary() {
- return m_isHiragana4_;
+ return false; // TODO: change docs to say always returns false?
}
/**
- * Gets the variable top value of a Collator. Lower 16 bits are undefined and should be ignored.
+ * {@icu} Gets the variable top value of a Collator.
*
- * @return the variable top value of a Collator.
- * @see #setVariableTop
+ * @return the variable top primary weight
+ * @see #getMaxVariable
* @stable ICU 2.6
*/
+ @Override
public int getVariableTop() {
- return m_variableTopValue_ << 16;
+ return (int)settings.readOnly().variableTop;
}
/**
* @stable ICU 2.8
*/
public boolean getNumericCollation() {
- return m_isNumericCollation_;
+ return (settings.readOnly().options & CollationSettings.NUMERIC) != 0;
}
/**
* @return a copy of the reordering codes for this collator;
* if none are set then returns an empty array
* @see #setReorderCodes
- * @see #getEquivalentReorderCodes
+ * @see Collator#getEquivalentReorderCodes
* @stable ICU 4.8
*/
+ @Override
public int[] getReorderCodes() {
- if (m_reorderCodes_ != null) {
- return m_reorderCodes_.clone();
- } else {
- return LeadByteConstants.EMPTY_INT_ARRAY;
- }
+ return settings.readOnly().reorderCodes.clone();
}
+ // public other methods -------------------------------------------------
+
/**
- * Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder
- * codes are grouped and must reorder together.
- *
- * @param reorderCode code for which equivalents to be retrieved
- * @return the set of all reorder codes in the same group as the given reorder code.
- * @see #setReorderCodes
- * @see #getReorderCodes
- * @stable ICU 4.8
+ * {@inheritDoc}
*/
- public static int[] getEquivalentReorderCodes(int reorderCode) {
- Set<Integer> equivalentCodesSet = new HashSet<Integer>();
- int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode);
- for (int leadByte : leadBytes) {
- int[] codes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte);
- for (int code : codes) {
- equivalentCodesSet.add(code);
- }
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
}
- int[] equivalentCodes = new int[equivalentCodesSet.size()];
- int i = 0;
- for (int code : equivalentCodesSet) {
- equivalentCodes[i++] = code;
+ if (!super.equals(obj)) {
+ return false;
}
- return equivalentCodes;
+ RuleBasedCollator o = (RuleBasedCollator) obj;
+ if(!settings.readOnly().equals(o.settings.readOnly())) { return false; }
+ if(data == o.data) { return true; }
+ boolean thisIsRoot = data.base == null;
+ boolean otherIsRoot = o.data.base == null;
+ assert(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
+ if(thisIsRoot != otherIsRoot) { return false; }
+ if((thisIsRoot || tailoring.rules.length() != 0) &&
+ (otherIsRoot || o.tailoring.rules.length() != 0)) {
+ // Shortcut: If both collators have valid rule strings, then compare those.
+ if(tailoring.rules.equals(o.tailoring.rules)) { return true; }
+ }
+ // Different rule strings can result in the same or equivalent tailoring.
+ // The rule strings are optional in ICU resource bundles, although included by default.
+ // cloneBinary() drops the rule string.
+ UnicodeSet thisTailored = getTailoredSet();
+ UnicodeSet otherTailored = o.getTailoredSet();
+ if(!thisTailored.equals(otherTailored)) { return false; }
+ // For completeness, we should compare all of the mappings;
+ // or we should create a list of strings, sort it with one collator,
+ // and check if both collators compare adjacent strings the same
+ // (order & strength, down to quaternary); or similar.
+ // Testing equality of collators seems unusual.
+ return true;
}
- // public other methods -------------------------------------------------
-
/**
- * Compares the equality of two RuleBasedCollator objects. RuleBasedCollator objects are equal if they have the same
- * collation rules and the same attributes.
+ * Generates a unique hash code for this RuleBasedCollator.
*
- * @param obj
- * the RuleBasedCollator to be compared to.
- * @return true if this RuleBasedCollator has exactly the same collation behaviour as obj, false otherwise.
+ * @return the unique hash code for this Collator
* @stable ICU 2.8
*/
- public boolean equals(Object obj) {
- if (obj == null) {
- return false; // super does class check
+ @Override
+ public int hashCode() {
+ int h = settings.readOnly().hashCode();
+ if(data.base == null) { return h; } // root collator
+ // Do not rely on the rule string, see comments in operator==().
+ UnicodeSet set = getTailoredSet();
+ UnicodeSetIterator iter = new UnicodeSetIterator(set);
+ while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
+ h ^= data.getCE32(iter.codepoint);
}
- if (this == obj) {
- return true;
- }
- if (getClass() != obj.getClass()) {
- return false;
- }
- RuleBasedCollator other = (RuleBasedCollator) obj;
- // all other non-transient information is also contained in rules.
- if (getStrength() != other.getStrength() || getDecomposition() != other.getDecomposition()
- || other.m_caseFirst_ != m_caseFirst_ || other.m_caseSwitch_ != m_caseSwitch_
- || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_
- || other.m_isCaseLevel_ != m_isCaseLevel_ || other.m_isFrenchCollation_ != m_isFrenchCollation_
- || other.m_isHiragana4_ != m_isHiragana4_) {
- return false;
- }
- if (m_reorderCodes_ != null ^ other.m_reorderCodes_ != null) {
- return false;
- }
- if (m_reorderCodes_ != null) {
- if (m_reorderCodes_.length != other.m_reorderCodes_.length) {
- return false;
- }
- for (int i = 0; i < m_reorderCodes_.length; i++) {
- if (m_reorderCodes_[i] != other.m_reorderCodes_[i]) {
- return false;
- }
- }
- }
- boolean rules = m_rules_ == other.m_rules_;
- if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
- rules = m_rules_.equals(other.m_rules_);
- }
- if (!rules || !ICUDebug.enabled("collation")) {
- return rules;
- }
- if (m_addition3_ != other.m_addition3_ || m_bottom3_ != other.m_bottom3_
- || m_bottomCount3_ != other.m_bottomCount3_ || m_common3_ != other.m_common3_
- || m_isSimple3_ != other.m_isSimple3_ || m_mask3_ != other.m_mask3_
- || m_minContractionEnd_ != other.m_minContractionEnd_ || m_minUnsafe_ != other.m_minUnsafe_
- || m_top3_ != other.m_top3_ || m_topCount3_ != other.m_topCount3_
- || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
- return false;
- }
- if (!m_trie_.equals(other.m_trie_)) {
- // we should use the trie iterator here, but then this part is
- // only used in the test.
- for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) {
- int v = m_trie_.getCodePointValue(i);
- int otherv = other.m_trie_.getCodePointValue(i);
- if (v != otherv) {
- int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
- if (mask == (otherv & 0xff000000)) {
- v &= 0xffffff;
- otherv &= 0xffffff;
- if (mask == 0xf1000000) {
- v -= (m_expansionOffset_ << 4);
- otherv -= (other.m_expansionOffset_ << 4);
- } else if (mask == 0xf2000000) {
- v -= m_contractionOffset_;
- otherv -= other.m_contractionOffset_;
- }
- if (v == otherv) {
- continue;
- }
- }
- return false;
- }
- }
- }
- if (!Arrays.equals(m_contractionCE_, other.m_contractionCE_)
- || !Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
- || !Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
- || !Arrays.equals(m_expansion_, other.m_expansion_)
- || !Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
- return false;
- }
- // not comparing paddings
- for (int i = 0; i < m_expansionEndCE_.length; i++) {
- if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Generates a unique hash code for this RuleBasedCollator.
- *
- * @return the unique hash code for this Collator
- * @stable ICU 2.8
- */
- public int hashCode() {
- String rules = getRules();
- if (rules == null) {
- rules = "";
- }
- return rules.hashCode();
- }
-
- /**
- * Compares the source text String to the target text String according to the collation rules, strength and
- * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero
- * depending on whether the source String is less than, equal to or greater than the target String. See the Collator
- * class description for an example of use. </p>
- * <p>
- * General recommendation: <br>
- * If comparison are to be done to the same String multiple times, it would be more efficient to generate
- * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed
- * performance is critical and object instantiation is to be reduced, further optimization may be achieved by
- * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method
- * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey
- * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key
- * comparisons. If the each Strings are compared to only once, using the method RuleBasedCollator.compare(String,
- * String) will have a better performance.
- * </p>
- *
- * @param source
- * the source text String.
- * @param target
- * the target text String.
- * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source
- * and target are equal, value is greater than zero if source is greater than target.
- * @see CollationKey
- * @see #getCollationKey
- * @stable ICU 2.8
- */
- public int compare(String source, String target) {
- if (source.equals(target)) {
- return 0;
- }
- CollationBuffer buffer = null;
- try {
- buffer = getCollationBuffer();
- return compare(source, target, buffer);
- } finally {
- releaseCollationBuffer(buffer);
- }
- }
-
- private int compare(String source, String target, CollationBuffer buffer) {
- // Find the length of any leading portion that is equal
- int offset = getFirstUnmatchedOffset(source, target);
- // return compareRegular(source, target, offset);
- if (latinOneUse_) {
- if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_)
- || (offset < target.length() && target.charAt(offset) > ENDOFLATINONERANGE_)) {
- // source or target start with non-latin-1
- return compareRegular(source, target, offset, buffer);
- } else {
- return compareUseLatin1(source, target, offset, buffer);
- }
- } else {
- return compareRegular(source, target, offset, buffer);
- }
- }
-
- // package private inner interfaces --------------------------------------
-
- /**
- * Attribute values to be used when setting the Collator options
- */
- static interface AttributeValue {
- /**
- * Indicates that the default attribute value will be used. See individual attribute for details on its default
- * value.
- */
- static final int DEFAULT_ = -1;
- /**
- * Primary collation strength
- */
- static final int PRIMARY_ = Collator.PRIMARY;
- /**
- * Secondary collation strength
- */
- static final int SECONDARY_ = Collator.SECONDARY;
- /**
- * Tertiary collation strength
- */
- static final int TERTIARY_ = Collator.TERTIARY;
- /**
- * Default collation strength
- */
- static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
- /**
- * Internal use for strength checks in Collation elements
- */
- static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
- /**
- * Quaternary collation strength
- */
- static final int QUATERNARY_ = 3;
- /**
- * Identical collation strength
- */
- static final int IDENTICAL_ = Collator.IDENTICAL;
- /**
- * Internal use for strength checks
- */
- static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
- /**
- * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and
- * DECOMPOSITION_MODE
- */
- static final int OFF_ = 16;
- /**
- * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
- */
- static final int ON_ = 17;
- /**
- * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
- */
- static final int SHIFTED_ = 20;
- /**
- * Valid for ALTERNATE_HANDLING. Alternate handling will be non ignorable
- */
- static final int NON_IGNORABLE_ = 21;
- /**
- * Valid for CASE_FIRST - lower case sorts before upper case
- */
- static final int LOWER_FIRST_ = 24;
- /**
- * Upper case sorts before lower case
- */
- static final int UPPER_FIRST_ = 25;
- /**
- * Number of attribute values
- */
- static final int LIMIT_ = 29;
- }
-
- /**
- * Attributes that collation service understands. All the attributes can take DEFAULT value, as well as the values
- * specific to each one.
- */
- static interface Attribute {
- /**
- * Attribute for direction of secondary weights - used in French. Acceptable values are ON, which results in
- * secondary weights being considered backwards and OFF which treats secondary weights in the order they appear.
- */
- static final int FRENCH_COLLATION_ = 0;
- /**
- * Attribute for handling variable elements. Acceptable values are NON_IGNORABLE (default) which treats all the
- * codepoints with non-ignorable primary weights in the same way, and SHIFTED which causes codepoints with
- * primary weights that are equal or below the variable top value to be ignored on primary level and moved to
- * the quaternary level.
- */
- static final int ALTERNATE_HANDLING_ = 1;
- /**
- * Controls the ordering of upper and lower case letters. Acceptable values are OFF (default), which orders
- * upper and lower case letters in accordance to their tertiary weights, UPPER_FIRST which forces upper case
- * letters to sort before lower case letters, and LOWER_FIRST which does the opposite.
- */
- static final int CASE_FIRST_ = 2;
- /**
- * Controls whether an extra case level (positioned before the third level) is generated or not. Acceptable
- * values are OFF (default), when case level is not generated, and ON which causes the case level to be
- * generated. Contents of the case level are affected by the value of CASE_FIRST attribute. A simple way to
- * ignore accent differences in a string is to set the strength to PRIMARY and enable case level.
- */
- static final int CASE_LEVEL_ = 3;
- /**
- * Controls whether the normalization check and necessary normalizations are performed. When set to OFF
- * (default) no normalization check is performed. The correctness of the result is guaranteed only if the input
- * data is in so-called FCD form (see users manual for more info). When set to ON, an incremental check is
- * performed to see whether the input data is in the FCD form. If the data is not in the FCD form, incremental
- * NFD normalization is performed.
- */
- static final int NORMALIZATION_MODE_ = 4;
- /**
- * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. The usual
- * strength for most locales (except Japanese) is tertiary. Quaternary strength is useful when combined with
- * shifted setting for alternate handling attribute and for JIS x 4061 collation, when it is used to distinguish
- * between Katakana and Hiragana (this is achieved by setting the HIRAGANA_QUATERNARY mode to on. Otherwise,
- * quaternary level is affected only by the number of non ignorable code points in the string. Identical
- * strength is rarely useful, as it amounts to codepoints of the NFD form of the string.
- */
- static final int STRENGTH_ = 5;
- /**
- * When turned on, this attribute positions Hiragana before all non-ignorables on quaternary level. This is a
- * sneaky way to produce JIS sort order.
- */
- static final int HIRAGANA_QUATERNARY_MODE_ = 6;
- /**
- * Attribute count
- */
- static final int LIMIT_ = 7;
- }
-
- /**
- * DataManipulate singleton
- */
- static class DataManipulate implements Trie.DataManipulate {
- // public methods ----------------------------------------------------
-
- /**
- * Internal method called to parse a lead surrogate's ce for the offset to the next trail surrogate data.
- *
- * @param ce
- * collation element of the lead surrogate
- * @return data offset or 0 for the next trail surrogate
- * @stable ICU 2.8
- */
- public final int getFoldingOffset(int ce) {
- if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
- return (ce & 0xFFFFFF);
- }
- return 0;
- }
-
- /**
- * Get singleton object
- */
- public static final DataManipulate getInstance() {
- if (m_instance_ == null) {
- m_instance_ = new DataManipulate();
- }
- return m_instance_;
- }
-
- // private data member ----------------------------------------------
-
- /**
- * Singleton instance
- */
- private static DataManipulate m_instance_;
-
- // private constructor ----------------------------------------------
-
- /**
- * private to prevent initialization
- */
- private DataManipulate() {
- }
- }
-
- /**
- * UCAConstants
- */
- static final class UCAConstants {
- int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
- int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
- int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
- int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
- int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
- int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
- int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
- int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
- int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
- int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
- int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
- int FIRST_IMPLICIT_[] = new int[2];
- int LAST_IMPLICIT_[] = new int[2];
- int FIRST_TRAILING_[] = new int[2];
- int LAST_TRAILING_[] = new int[2];
- int PRIMARY_TOP_MIN_;
- int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
- int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
- int PRIMARY_TRAILING_MIN_; // 0xE8000000
- int PRIMARY_TRAILING_MAX_; // 0xF0000000
- int PRIMARY_SPECIAL_MIN_; // 0xE8000000
- int PRIMARY_SPECIAL_MAX_; // 0xF0000000
- }
-
- /**
- * Script to Lead Byte and Lead Byte to Script Data
- *
- */
- static final class LeadByteConstants {
- private static final int DATA_MASK_FOR_INDEX = 0x8000;
- private static final int[] EMPTY_INT_ARRAY = new int[0];
-
- private int serializedSize = 0;
-
- private Map<Integer, Integer> SCRIPT_TO_LEAD_BYTES_INDEX;
- private byte[] SCRIPT_TO_LEAD_BYTES_DATA;
-
- private int[] LEAD_BYTE_TO_SCRIPTS_INDEX;
- private byte[] LEAD_BYTE_TO_SCRIPTS_DATA;
-
- LeadByteConstants() {
- }
-
- void read(DataInputStream dis) throws IOException {
- int readcount = 0;
- int indexCount;
- int dataSize;
-
- // script to lead bytes
- indexCount = dis.readShort();
- readcount += 2;
- dataSize = dis.readShort();
- readcount += 2;
- this.SCRIPT_TO_LEAD_BYTES_INDEX = new HashMap<Integer, Integer>();
- //System.out.println("Script to Lead Bytes Index - Count = " + indexCount);
- for (int index = 0; index < indexCount; index++) {
- int reorderCode = dis.readShort(); // reorder code
- readcount += 2;
- int dataOffset = 0xffff & dis.readShort(); // data offset
- readcount += 2;
- // System.out.println("\t-------------");
- // System.out.println("\toffset = " + Integer.toHexString(readcount - 4));
- // System.out.println("\treorderCode = " + Integer.toHexString(reorderCode));
- // System.out.println("\tdataOffset = " + Integer.toHexString(dataOffset));
- this.SCRIPT_TO_LEAD_BYTES_INDEX.put(reorderCode, dataOffset);
- }
-
- this.SCRIPT_TO_LEAD_BYTES_DATA = new byte[dataSize * 2];
- dis.readFully(this.SCRIPT_TO_LEAD_BYTES_DATA, 0, this.SCRIPT_TO_LEAD_BYTES_DATA.length);
- readcount += this.SCRIPT_TO_LEAD_BYTES_DATA.length;
-
- // lead byte to scripts
- indexCount = dis.readShort();
- readcount += 2;
- dataSize = dis.readShort();
- readcount += 2;
- this.LEAD_BYTE_TO_SCRIPTS_INDEX = new int[indexCount];
- //System.out.println("Lead Byte to Scripts Index - Count = " + indexCount);
- for (int index = 0; index < indexCount; index++) {
- this.LEAD_BYTE_TO_SCRIPTS_INDEX[index] = 0xffff & dis.readShort();
- readcount += 2;
- // System.out.println("\t-------------");
- // System.out.println("\toffset = " + Integer.toHexString(readcount - 2));
- // System.out.println("\tindex = " + Integer.toHexString(index));
- // System.out.println("\tdataOffset = " + Integer.toHexString(this.LEAD_BYTE_TO_SCRIPTS_INDEX[index]));
- }
-
- this.LEAD_BYTE_TO_SCRIPTS_DATA = new byte[dataSize * 2];
- dis.readFully(this.LEAD_BYTE_TO_SCRIPTS_DATA, 0, this.LEAD_BYTE_TO_SCRIPTS_DATA.length);
- readcount += this.LEAD_BYTE_TO_SCRIPTS_DATA.length;
-
- this.serializedSize = readcount;
- }
-
- int getSerializedDataSize() {
- return this.serializedSize;
- }
-
- int[] getReorderCodesForLeadByte(int leadByte) {
- if (leadByte >= this.LEAD_BYTE_TO_SCRIPTS_INDEX.length) {
- return EMPTY_INT_ARRAY;
- }
- int offset = this.LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte];
- if (offset == 0) {
- return EMPTY_INT_ARRAY;
- }
- int[] reorderCodes;
- if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) {
- reorderCodes = new int[1];
- reorderCodes[0] = offset & ~DATA_MASK_FOR_INDEX;
- } else {
- int length = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset);
- offset++;
-
- reorderCodes = new int[length];
- for (int code = 0; code < length; code++, offset++) {
- reorderCodes[code] = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset);
- }
- }
- return reorderCodes;
- }
-
- int[] getLeadBytesForReorderCode(int reorderCode) {
- if (!this.SCRIPT_TO_LEAD_BYTES_INDEX.containsKey(reorderCode)) {
- return EMPTY_INT_ARRAY;
- }
- int offset = this.SCRIPT_TO_LEAD_BYTES_INDEX.get(reorderCode);
-
- if (offset == 0) {
- return EMPTY_INT_ARRAY;
- }
-
- int[] leadBytes;
- if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) {
- leadBytes = new int[1];
- leadBytes[0] = offset & ~DATA_MASK_FOR_INDEX;
- } else {
- int length = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset);
- offset++;
-
- leadBytes = new int[length];
- for (int leadByte = 0; leadByte < length; leadByte++, offset++) {
- leadBytes[leadByte] = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset);
- }
- }
- return leadBytes;
- }
-
- private static int readShort(byte[] data, int offset) {
- return (0xff & data[offset * 2]) << 8 | (data[offset * 2 + 1] & 0xff);
- }
- }
-
- // package private data member -------------------------------------------
-
- static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04;
- static final byte BYTE_COMMON_ = (byte) 0x05;
- static final int COMMON_TOP_2_ = 0x86; // int for unsigness
- static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
- static final int COMMON_BOTTOM_3 = 0x05;
- /**
- * Case strength mask
- */
- static final int CE_CASE_BIT_MASK_ = 0xC0;
- static final int CE_TAG_SHIFT_ = 24;
- static final int CE_TAG_MASK_ = 0x0F000000;
-
- static final int CE_SPECIAL_FLAG_ = 0xF0000000;
- /**
- * Lead surrogate that is tailored and doesn't start a contraction
- */
- static final int CE_SURROGATE_TAG_ = 5;
- /**
- * Mask to get the primary strength of the collation element
- */
- static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
- /**
- * Mask to get the secondary strength of the collation element
- */
- static final int CE_SECONDARY_MASK_ = 0xFF00;
- /**
- * Mask to get the tertiary strength of the collation element
- */
- static final int CE_TERTIARY_MASK_ = 0xFF;
- /**
- * Primary strength shift
- */
- static final int CE_PRIMARY_SHIFT_ = 16;
- /**
- * Secondary strength shift
- */
- static final int CE_SECONDARY_SHIFT_ = 8;
- /**
- * Continuation marker
- */
- static final int CE_CONTINUATION_MARKER_ = 0xC0;
-
- /**
- * Size of collator raw data headers and options before the expansion data. This is used when expansion ces are to
- * be retrieved. ICU4C uses the expansion offset starting from UCollator.UColHeader, hence ICU4J will have to minus
- * that off to get the right expansion ce offset. In number of ints.
- */
- int m_expansionOffset_;
- /**
- * Size of collator raw data headers, options and expansions before contraction data. This is used when contraction
- * ces are to be retrieved. ICU4C uses contraction offset starting from UCollator.UColHeader, hence ICU4J will have
- * to minus that off to get the right contraction ce offset. In number of chars.
- */
- int m_contractionOffset_;
- /**
- * Flag indicator if Jamo is special
- */
- boolean m_isJamoSpecial_;
-
- // Collator options ------------------------------------------------------
-
- int m_defaultVariableTopValue_;
- boolean m_defaultIsFrenchCollation_;
- boolean m_defaultIsAlternateHandlingShifted_;
- int m_defaultCaseFirst_;
- boolean m_defaultIsCaseLevel_;
- int m_defaultDecomposition_;
- int m_defaultStrength_;
- boolean m_defaultIsHiragana4_;
- boolean m_defaultIsNumericCollation_;
- /**
- * Default script order - the one created at initial rule parse time
- */
- int[] m_defaultReorderCodes_;
-
- /**
- * Value of the variable top
- */
- int m_variableTopValue_;
- /**
- * Attribute for special Hiragana
- */
- boolean m_isHiragana4_;
- /**
- * Case sorting customization
- */
- int m_caseFirst_;
- /**
- * Numeric collation option
- */
- boolean m_isNumericCollation_;
- /**
- * Script order
- */
- int[] m_reorderCodes_;
-
- // end Collator options --------------------------------------------------
-
- /**
- * Expansion table
- */
- int m_expansion_[];
- /**
- * Contraction index table
- */
- char m_contractionIndex_[];
- /**
- * Contraction CE table
- */
- int m_contractionCE_[];
- /**
- * Data trie
- */
- IntTrie m_trie_;
- /**
- * Table to store all collation elements that are the last element of an expansion. This is for use in StringSearch.
- */
- int m_expansionEndCE_[];
- /**
- * Table to store the maximum size of any expansions that end with the corresponding collation element in
- * m_expansionEndCE_. For use in StringSearch too
- */
- byte m_expansionEndCEMaxSize_[];
- /**
- * Heuristic table to store information on whether a char character is considered "unsafe". "Unsafe" character are
- * combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is the
- * only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one above,
- * then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
- */
- byte m_unsafe_[];
- /**
- * Table to store information on whether a codepoint can occur as the last character in a contraction
- */
- byte m_contractionEnd_[];
- /**
- * Original collation rules
- */
- String m_rules_;
- /**
- * The smallest "unsafe" codepoint
- */
- char m_minUnsafe_;
- /**
- * The smallest codepoint that could be the end of a contraction
- */
- char m_minContractionEnd_;
- /**
- * General version of the collator
- */
- VersionInfo m_version_;
- /**
- * UCA version
- */
- VersionInfo m_UCA_version_;
- /**
- * UCD version
- */
- VersionInfo m_UCD_version_;
- /**
- * Lead byte and script data
- */
- int m_leadByteToScripts;
- int m_scriptToLeadBytes;
- /**
- * UnicodeData.txt property object
- */
- static final RuleBasedCollator UCA_;
- /**
- * UCA Constants
- */
- static final UCAConstants UCA_CONSTANTS_;
- /**
- * Lead Byte Constants
- */
- static LeadByteConstants LEADBYTE_CONSTANTS_;
- /**
- * Table for UCA and builder use
- */
- static final char UCA_CONTRACTIONS_[];
- static final int MAX_UCA_CONTRACTION_LENGTH;
-
- private static boolean UCA_INIT_COMPLETE;
-
- /**
- * Implicit generator
- */
- static final ImplicitCEGenerator impCEGen_;
-
- static final byte SORT_LEVEL_TERMINATOR_ = 1;
-
- // These are values from UCA required for
- // implicit generation and supressing sort key compression
- // they should regularly be in the UCA, but if one
- // is running without UCA, it could be a problem
- static final int maxRegularPrimary = 0x7A;
- static final int minImplicitPrimary = 0xE0;
- static final int maxImplicitPrimary = 0xE4;
-
- // block to initialise character property database
- static {
- // take pains to let static class init succeed, otherwise the class itself won't exist and
- // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
- // we can't load the UCA data.
-
- RuleBasedCollator iUCA_ = null;
- UCAConstants iUCA_CONSTANTS_ = null;
- LeadByteConstants iLEADBYTE_CONSTANTS = null;
- char iUCA_CONTRACTIONS_[] = null;
- Output<Integer> maxUCAContractionLength = new Output<Integer>();
- ImplicitCEGenerator iimpCEGen_ = null;
- try {
- // !!! note what's going on here...
- // even though the static init of the class is not yet complete, we
- // instantiate an instance of the class. So we'd better be sure that
- // instantiation doesn't rely on the static initialization that's
- // not complete yet!
- iUCA_ = new RuleBasedCollator();
- iUCA_CONSTANTS_ = new UCAConstants();
- iLEADBYTE_CONSTANTS = new LeadByteConstants();
- iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_, iLEADBYTE_CONSTANTS, maxUCAContractionLength);
-
- // called before doing canonical closure for the UCA.
- iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
- // iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_,
- // iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
- iUCA_.init();
- ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
- ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
- iUCA_.m_rules_ = (String) rb.getObject("UCARules");
- } catch (MissingResourceException ex) {
- // throw ex;
- } catch (IOException e) {
- // e.printStackTrace();
- // throw new MissingResourceException(e.getMessage(),"","");
- }
-
- UCA_ = iUCA_;
- UCA_CONSTANTS_ = iUCA_CONSTANTS_;
- LEADBYTE_CONSTANTS_ = iLEADBYTE_CONSTANTS;
- UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
- MAX_UCA_CONTRACTION_LENGTH = maxUCAContractionLength.value;
- impCEGen_ = iimpCEGen_;
-
- UCA_INIT_COMPLETE = true;
- }
-
- private static void checkUCA() throws MissingResourceException {
- if (UCA_INIT_COMPLETE && UCA_ == null) {
- throw new MissingResourceException("Collator UCA data unavailable", "", "");
- }
- }
-
- // package private constructors ------------------------------------------
-
- /**
- * <p>
- * Private contructor for use by subclasses. Public access to creating Collators is handled by the API
- * Collator.getInstance() or RuleBasedCollator(String rules).
- * </p>
- * <p>
- * This constructor constructs the UCA collator internally
- * </p>
- */
- RuleBasedCollator() {
- checkUCA();
- }
-
- /**
- * Constructs a RuleBasedCollator from the argument locale.
- * If no resource bundle is associated with the locale, UCA is used instead.
- *
- * @param locale
- */
- RuleBasedCollator(ULocale locale) {
- checkUCA();
- try {
- ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
- ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
- if (rb != null) {
- ICUResourceBundle elements = null;
-
- // Use keywords, if supplied for lookup
- String collkey = locale.getKeywordValue("collation");
- if (collkey != null) {
- try {
- elements = rb.getWithFallback("collations/" + collkey);
- } catch (MissingResourceException e) {
- // fall through
- }
- }
- if (elements == null) {
- // either collation keyword was not supplied or
- // the keyword was invalid - use default collation for the locale
-
- // collations/default should always give a string back
- // keyword for the real collation data
- collkey = rb.getStringWithFallback("collations/default");
- elements = rb.getWithFallback("collations/" + collkey);
- }
-
- // TODO: Determine actual & valid locale correctly
- ULocale uloc = rb.getULocale();
- setLocale(uloc, uloc);
-
- m_rules_ = elements.getString("Sequence");
- ByteBuffer buf = elements.get("%%CollationBin").getBinary();
- // %%CollationBin
- if (buf != null) {
- // m_rules_ = (String)rules[1][1];
- CollatorReader.initRBC(this, buf);
- /*
- * BufferedInputStream input = new BufferedInputStream( new ByteArrayInputStream(map)); /*
- * CollatorReader reader = new CollatorReader(input, false); if (map.length >
- * MIN_BINARY_DATA_SIZE_) { reader.read(this, null); } else { reader.readHeader(this);
- * reader.readOptions(this); // duplicating UCA_'s data setWithUCATables(); }
- */
- // at this point, we have read in the collator
- // now we need to check whether the binary image has
- // the right UCA and other versions
- if (!m_UCA_version_.equals(UCA_.m_UCA_version_) || !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
- init(m_rules_);
- return;
- }
- init();
- try {
- UResourceBundle reorderRes = elements.get("%%ReorderCodes");
- if (reorderRes != null) {
- int[] reorderCodes = reorderRes.getIntVector();
- setReorderCodes(reorderCodes);
- m_defaultReorderCodes_ = reorderCodes.clone();
- }
- } catch (MissingResourceException e) {
- // ignore
- }
- return;
- } else {
- init(m_rules_);
- return;
- }
- }
- } catch (Exception e) {
- // fallthrough
- }
- setWithUCAData();
- }
-
- // package private methods -----------------------------------------------
-
- /**
- * Sets this collator to use the tables in UCA. Note options not taken care of here.
- */
- final void setWithUCATables() {
- m_contractionOffset_ = UCA_.m_contractionOffset_;
- m_expansionOffset_ = UCA_.m_expansionOffset_;
- m_expansion_ = UCA_.m_expansion_;
- m_contractionIndex_ = UCA_.m_contractionIndex_;
- m_contractionCE_ = UCA_.m_contractionCE_;
- m_trie_ = UCA_.m_trie_;
- m_expansionEndCE_ = UCA_.m_expansionEndCE_;
- m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
- m_unsafe_ = UCA_.m_unsafe_;
- m_contractionEnd_ = UCA_.m_contractionEnd_;
- m_minUnsafe_ = UCA_.m_minUnsafe_;
- m_minContractionEnd_ = UCA_.m_minContractionEnd_;
- }
-
- /**
- * Sets this collator to use the all options and tables in UCA.
- */
- final void setWithUCAData() {
- latinOneFailed_ = true;
-
- m_addition3_ = UCA_.m_addition3_;
- m_bottom3_ = UCA_.m_bottom3_;
- m_bottomCount3_ = UCA_.m_bottomCount3_;
- m_caseFirst_ = UCA_.m_caseFirst_;
- m_caseSwitch_ = UCA_.m_caseSwitch_;
- m_common3_ = UCA_.m_common3_;
- m_contractionOffset_ = UCA_.m_contractionOffset_;
- setDecomposition(UCA_.getDecomposition());
- m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
- m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
- m_defaultIsAlternateHandlingShifted_ = UCA_.m_defaultIsAlternateHandlingShifted_;
- m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
- m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
- m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
- m_defaultStrength_ = UCA_.m_defaultStrength_;
- m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
- m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
- m_expansionOffset_ = UCA_.m_expansionOffset_;
- m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
- m_isCaseLevel_ = UCA_.m_isCaseLevel_;
- m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
- m_isHiragana4_ = UCA_.m_isHiragana4_;
- m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
- m_isSimple3_ = UCA_.m_isSimple3_;
- m_mask3_ = UCA_.m_mask3_;
- m_minContractionEnd_ = UCA_.m_minContractionEnd_;
- m_minUnsafe_ = UCA_.m_minUnsafe_;
- m_rules_ = UCA_.m_rules_;
- setStrength(UCA_.getStrength());
- m_top3_ = UCA_.m_top3_;
- m_topCount3_ = UCA_.m_topCount3_;
- m_variableTopValue_ = UCA_.m_variableTopValue_;
- m_isNumericCollation_ = UCA_.m_isNumericCollation_;
- setWithUCATables();
- latinOneFailed_ = false;
- }
-
- /**
- * Test whether a char character is potentially "unsafe" for use as a collation starting point. "Unsafe" characters
- * are combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is
- * the only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one
- * above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
- *
- * @param ch
- * character to determin
- * @return true if ch is unsafe, false otherwise
- */
- final boolean isUnsafe(char ch) {
- if (ch < m_minUnsafe_) {
- return false;
- }
-
- if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
- if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) {
- // Trail surrogate are always considered unsafe.
- return true;
- }
- ch &= HEURISTIC_OVERFLOW_MASK_;
- ch += HEURISTIC_OVERFLOW_OFFSET_;
- }
- int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
- return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
- }
-
- /**
- * Approximate determination if a char character is at a contraction end. Guaranteed to be true if a character is at
- * the end of a contraction, otherwise it is not deterministic.
- *
- * @param ch
- * character to be determined
- */
- final boolean isContractionEnd(char ch) {
- if (UTF16.isTrailSurrogate(ch)) {
- return true;
- }
-
- if (ch < m_minContractionEnd_) {
- return false;
- }
-
- if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
- ch &= HEURISTIC_OVERFLOW_MASK_;
- ch += HEURISTIC_OVERFLOW_OFFSET_;
- }
- int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
- return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
- }
-
- /**
- * Retrieve the tag of a special ce
- *
- * @param ce
- * ce to test
- * @return tag of ce
- */
- static int getTag(int ce) {
- return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
- }
-
- /**
- * Checking if ce is special
- *
- * @param ce
- * to check
- * @return true if ce is special
- */
- static boolean isSpecial(int ce) {
- return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
- }
-
- /**
- * Checks if the argument ce is a continuation
- *
- * @param ce
- * collation element to test
- * @return true if ce is a continuation
- */
- static final boolean isContinuation(int ce) {
- return ce != CollationElementIterator.NULLORDER && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
- }
-
- // private inner classes ------------------------------------------------
-
- // private variables -----------------------------------------------------
-
    /**
     * The smallest natural unsafe or contraction end char character before tailoring. This is a combining mark.
     */
    private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
    /**
     * Heuristic table size: 1 bit for each Latin-1 char, and some power of two for hashing the
     * rest of the chars. Size in bytes.
     */
    private static final char HEURISTIC_SIZE_ = 1056;
    /**
     * Mask value down to "some power of two" - 1, number of bits, not num of bytes.
     */
    private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
    /**
     * Unsafe character shift: index = ch >> shift, bit = ch & HEURISTIC_MASK_.
     */
    private static final int HEURISTIC_SHIFT_ = 3;
    /**
     * Unsafe character addition for character too large, it has to be folded then incremented.
     */
    private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
    /**
     * Mask value to get offset in heuristic table.
     */
    private static final char HEURISTIC_MASK_ = 7;

    // tertiary-level processing state (set up from the case-first option)
    private int m_caseSwitch_;
    private int m_common3_;
    private int m_mask3_;
    /**
     * When switching case, we need to add or subtract different values.
     */
    private int m_addition3_;
    /**
     * Upper range when compressing tertiary runs.
     */
    private int m_top3_;
    /**
     * Lower range when compressing tertiary runs.
     */
    private int m_bottom3_;
    private int m_topCount3_;
    private int m_bottomCount3_;
    /**
     * Script reordering table (permutes primary lead bytes); null when no reordering is set.
     */
    private byte[] m_leadBytePermutationTable_;
    /**
     * Case first constants
     */
    private static final int CASE_SWITCH_ = 0xC0;
    private static final int NO_CASE_SWITCH_ = 0;
    /**
     * Case level constants
     */
    private static final int CE_REMOVE_CASE_ = 0x3F;
    private static final int CE_KEEP_CASE_ = 0xFF;
    /**
     * Case strength mask
     */
    private static final int CE_CASE_MASK_3_ = 0xFF;
    /**
     * Sortkey size factor. Values can be changed.
     */
    private static final double PROPORTION_2_ = 0.5;
    private static final double PROPORTION_3_ = 0.667;
-
    // These values come from the UCA ----------------------------------------

    /**
     * Magic special byte values from the fractional UCA.
     */
    // private static final byte BYTE_ZERO_ = 0x0;
    // private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
    // private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
    private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03;
    /* private */static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
    // private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
    // TODO: Make the following values dynamic since they change with almost every UCA version.
    static final byte CODAN_PLACEHOLDER = 0x12;
    private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x5B;

    private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF;
    // number of secondary weights between the common bottom and top bytes
    private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
    private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
    private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
    private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
    private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
    private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
    private static final int COMMON_BOTTOM_3_ = 0x05;
    private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
    private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_;
    // split of TOTAL_2_ into top-counted and bottom-counted compression runs
    private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_);
    private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
    private static final int COMMON_2_ = COMMON_BOTTOM_2_;
    private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
    private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
    // private static final int COMMON_4_ = (byte)0xFF;

    /*
     * Minimum size required for the binary collation data in bytes. Size of UCA header + size of options to 4 bytes
     */
    // private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;

    /**
     * If this collator is to generate only simple tertiaries for fast path
     */
    private boolean m_isSimple3_;

    /**
     * French collation sorting flag
     */
    private boolean m_isFrenchCollation_;
    /**
     * Flag indicating if shifted is requested for Quaternary alternate handling. If this is not true, the default for
     * alternate handling will be non-ignorable.
     */
    private boolean m_isAlternateHandlingShifted_;
    /**
     * Extra case level for sorting
     */
    private boolean m_isCaseLevel_;
    /**
     * Frozen state of the collator.
     */
    private Lock frozenLock;


    // initial per-level sort key buffer sizes (buffers grow on demand)
    private static final int SORT_BUFFER_INIT_SIZE_ = 128;
    private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3;
    private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
    private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
    private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2;
    private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;

    // a continuation CE carries both of these tag bits set
    private static final int CE_CONTINUATION_TAG_ = 0xC0;
    private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;

    private static final int LAST_BYTE_MASK_ = 0xFF;

    // private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
    // private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;

    private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80;
    private static final byte SORT_CASE_SHIFT_START_ = (byte) 7;

    /**
     * CE buffer size
     */
    private static final int CE_BUFFER_SIZE_ = 512;

    // variables for Latin-1 processing
    boolean latinOneUse_ = false;
    boolean latinOneRegenTable_ = false;
    boolean latinOneFailed_ = false;

    int latinOneTableLen_ = 0;
    int latinOneCEs_[] = null;
-
- private final class CollationBuffer {
- /**
- * Bunch of utility iterators
- */
- protected StringUCharacterIterator m_srcUtilIter_;
- protected CollationElementIterator m_srcUtilColEIter_;
- protected StringUCharacterIterator m_tgtUtilIter_;
- protected CollationElementIterator m_tgtUtilColEIter_;
-
- /**
- * Utility comparison flags
- */
- protected boolean m_utilCompare0_;
- // private boolean m_utilCompare1_;
- protected boolean m_utilCompare2_;
- protected boolean m_utilCompare3_;
- protected boolean m_utilCompare4_;
- protected boolean m_utilCompare5_;
-
- /**
- * Utility byte buffer
- */
- protected byte m_utilBytes0_[];
- protected byte m_utilBytes1_[];
- protected byte m_utilBytes2_[];
- protected byte m_utilBytes3_[];
- protected byte m_utilBytes4_[];
- // private byte m_utilBytes5_[];
-
- protected RawCollationKey m_utilRawCollationKey_;
-
- protected int m_utilBytesCount0_;
- protected int m_utilBytesCount1_;
- protected int m_utilBytesCount2_;
- protected int m_utilBytesCount3_;
- protected int m_utilBytesCount4_;
- // private int m_utilBytesCount5_;
-
- // private int m_utilCount0_;
- // private int m_utilCount1_;
- protected int m_utilCount2_;
- protected int m_utilCount3_;
- protected int m_utilCount4_;
- // private int m_utilCount5_;
-
- protected int m_utilFrenchStart_;
- protected int m_utilFrenchEnd_;
-
- /**
- * Preparing the CE buffers. will be filled during the primary phase
- */
- protected int m_srcUtilCEBuffer_[];
- protected int m_tgtUtilCEBuffer_[];
- protected int m_srcUtilCEBufferSize_;
- protected int m_tgtUtilCEBufferSize_;
-
- protected int m_srcUtilContOffset_;
- protected int m_tgtUtilContOffset_;
-
- protected int m_srcUtilOffset_;
- protected int m_tgtUtilOffset_;
-
- private CollationBuffer() {
- initBuffers();
- }
-
- /**
- * Initializes utility iterators and byte buffer used by compare
- */
- protected final void initBuffers() {
- resetBuffers();
- m_srcUtilIter_ = new StringUCharacterIterator();
- m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, RuleBasedCollator.this);
- m_tgtUtilIter_ = new StringUCharacterIterator();
- m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, RuleBasedCollator.this);
- m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
- m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
- m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
- m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
- m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
- m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
- m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
- }
-
- protected final void resetBuffers() {
- m_utilCompare0_ = false;
- // private boolean m_utilCompare1_;
- m_utilCompare2_ = false;
- m_utilCompare3_ = false;
- m_utilCompare4_ = false;
- m_utilCompare5_ = false;
-
- m_utilBytesCount0_ = 0;
- m_utilBytesCount1_ = 0;
- m_utilBytesCount2_ = 0;
- m_utilBytesCount3_ = 0;
- m_utilBytesCount4_ = 0;
- // private int m_utilBytesCount5_;
-
- m_utilCount2_ = 0;
- m_utilCount3_ = 0;
- m_utilCount4_ = 0;
-
- m_utilFrenchStart_ = 0;
- m_utilFrenchEnd_ = 0;
-
- m_srcUtilContOffset_ = 0;
- m_tgtUtilContOffset_ = 0;
-
- m_srcUtilOffset_ = 0;
- m_tgtUtilOffset_ = 0;
- }
- }
-
- // private methods -------------------------------------------------------
-
    /**
     * Builds this collator from a rule string: starts from the full UCA data,
     * then applies the tailoring parsed from the rules.
     *
     * @param rules collation rules in the ICU tailoring syntax
     * @throws Exception when the rules cannot be parsed or applied
     */
    private void init(String rules) throws Exception {
        setWithUCAData(); // baseline: full UCA tables and options
        CollationParsedRuleBuilder builder = new CollationParsedRuleBuilder(rules);
        builder.setRules(this); // installs the tailored tables into this collator
        m_rules_ = rules;
        init();
        buildPermutationTable(); // presumably rebuilds the script-reorder table — confirm
    }
-
- private final int compareRegular(String source, String target, int offset, CollationBuffer buffer) {
- buffer.resetBuffers();
-
- int strength = getStrength();
- // setting up the collator parameters
- buffer.m_utilCompare0_ = m_isCaseLevel_;
- // m_utilCompare1_ = true;
- buffer.m_utilCompare2_ = strength >= SECONDARY;
- buffer.m_utilCompare3_ = strength >= TERTIARY;
- buffer.m_utilCompare4_ = strength >= QUATERNARY;
- buffer.m_utilCompare5_ = strength == IDENTICAL;
- boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_;
- boolean doShift4 = m_isAlternateHandlingShifted_ && buffer.m_utilCompare4_;
- boolean doHiragana4 = m_isHiragana4_ && buffer.m_utilCompare4_;
-
- if (doHiragana4 && doShift4) {
- String sourcesub = source.substring(offset);
- String targetsub = target.substring(offset);
- return compareBySortKeys(sourcesub, targetsub, buffer);
- }
-
- // This is the lowest primary value that will not be ignored if shifted
- int lowestpvalue = m_isAlternateHandlingShifted_ ? m_variableTopValue_ << 16 : 0;
- buffer.m_srcUtilCEBufferSize_ = 0;
- buffer.m_tgtUtilCEBufferSize_ = 0;
- int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, target, offset, buffer);
- if (buffer.m_srcUtilCEBufferSize_ == -1 && buffer.m_tgtUtilCEBufferSize_ == -1) {
- // since the cebuffer is cleared when we have determined that
- // either source is greater than target or vice versa, the return
- // result is the comparison result and not the hiragana result
- return result;
- }
-
- int hiraganaresult = result;
-
- if (buffer.m_utilCompare2_) {
- result = doSecondaryCompare(doFrench, buffer);
- if (result != 0) {
- return result;
- }
- }
- // doing the case bit
- if (buffer.m_utilCompare0_) {
- result = doCaseCompare(buffer);
- if (result != 0) {
- return result;
- }
- }
- // Tertiary level
- if (buffer.m_utilCompare3_) {
- result = doTertiaryCompare(buffer);
- if (result != 0) {
- return result;
- }
- }
-
- if (doShift4) { // checkQuad
- result = doQuaternaryCompare(lowestpvalue, buffer);
- if (result != 0) {
- return result;
- }
- } else if (doHiragana4 && hiraganaresult != 0) {
- // If we're fine on quaternaries, we might be different
- // on Hiragana. This, however, might fail us in shifted.
- return hiraganaresult;
- }
-
- // For IDENTICAL comparisons, we use a bitwise character comparison
- // as a tiebreaker if all else is equal.
- // Getting here should be quite rare - strings are not identical -
- // that is checked first, but compared == through all other checks.
- if (buffer.m_utilCompare5_) {
- return doIdenticalCompare(source, target, offset, true);
- }
- return 0;
- }
-
- // Is this primary weight compressible?
- // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
- // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
- static boolean isCompressible(int primary1) {
- return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary;
- }
-
    /**
     * Gets the 2 bytes of primary order and adds it to the primary byte array
     *
     * @param ce
     *            current ce
     * @param notIsContinuation
     *            flag indicating if the current bytes belong to a continuation ce
     * @param doShift
     *            flag indicating if ce is to be shifted
     * @param leadPrimary
     *            lead primary used for compression
     * @param commonBottom4
     *            common byte value for Quaternary
     * @param bottomCount4
     *            smallest byte value for Quaternary
     * @param buffer
     *            collation buffer temporary state
     * @return the new lead primary for compression
     */
    private final int doPrimaryBytes(int ce, boolean notIsContinuation, boolean doShift, int leadPrimary,
            int commonBottom4, int bottomCount4, CollationBuffer buffer) {

        // ce is shifted right in place: p1 is the lead primary byte and p2
        // the second primary byte of the original 32-bit CE
        int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
        int p1 = ce >>> 8; // comparison
        int originalP1 = p1;
        if (notIsContinuation) {
            if (m_leadBytePermutationTable_ != null) {
                // script reordering permutes the lead byte only
                p1 = 0xff & m_leadBytePermutationTable_[p1];
            }
        }

        if (doShift) {
            // Shifted CE: its primary bytes go into the quaternary buffer.
            // First flush any pending run of common quaternary bytes as a
            // counted sequence.
            if (buffer.m_utilCount4_ > 0) {
                while (buffer.m_utilCount4_ > bottomCount4) {
                    buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
                    buffer.m_utilBytesCount4_++;
                    buffer.m_utilCount4_ -= bottomCount4;
                }
                buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1)));
                buffer.m_utilBytesCount4_++;
                buffer.m_utilCount4_ = 0;
            }
            // dealing with a variable and we're treating them as shifted
            // This is a shifted ignorable
            if (p1 != 0) {
                // we need to check this since we could be in continuation
                buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p1);
                buffer.m_utilBytesCount4_++;
            }
            if (p2 != 0) {
                buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p2);
                buffer.m_utilBytesCount4_++;
            }
        } else {
            // Note: This code assumes that the table is well built
            // i.e. not having 0 bytes where they are not supposed to be.
            // Usually, we'll have non-zero primary1 & primary2, except
            // in cases of LatinOne and friends, when primary2 will be
            // regular and simple sortkey calc
            if (p1 != CollationElementIterator.IGNORABLE) {
                if (notIsContinuation) {
                    if (leadPrimary == p1) {
                        // same lead byte as the previous CE: emit only p2
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
                        buffer.m_utilBytesCount1_++;
                    } else {
                        if (leadPrimary != 0) {
                            // terminate the previous compression run with a
                            // byte ordered relative to the new lead byte
                            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                    ((p1 > leadPrimary) ? BYTE_UNSHIFTED_MAX_ : BYTE_UNSHIFTED_MIN_));
                            buffer.m_utilBytesCount1_++;
                        }
                        if (p2 == CollationElementIterator.IGNORABLE) {
                            // one byter, not compressed
                            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
                            buffer.m_utilBytesCount1_++;
                            leadPrimary = 0;
                        } else if (isCompressible(originalP1)) {
                            // compress: remember the lead byte for the next CE
                            leadPrimary = p1;
                            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
                            buffer.m_utilBytesCount1_++;
                            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
                            buffer.m_utilBytesCount1_++;
                        } else {
                            leadPrimary = 0;
                            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
                            buffer.m_utilBytesCount1_++;
                            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
                            buffer.m_utilBytesCount1_++;
                        }
                    }
                } else {
                    // continuation, add primary to the key, no compression
                    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
                    buffer.m_utilBytesCount1_++;
                    if (p2 != CollationElementIterator.IGNORABLE) {
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
                        // second part
                        buffer.m_utilBytesCount1_++;
                    }
                }
            }
        }
        return leadPrimary;
    }
-
    /**
     * Gets the secondary byte and adds it to the secondary byte array
     *
     * @param ce current ce
     * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
     * @param doFrench flag indicator if french sort is to be performed
     * @param buffer collation buffer temporary state
     */
    private final void doSecondaryBytes(int ce, boolean notIsContinuation, boolean doFrench, CollationBuffer buffer) {
        int s = (ce >> 8) & LAST_BYTE_MASK_; // int for comparison
        if (s != 0) {
            if (!doFrench) {
                // This is compression code: runs of the common secondary byte
                // are counted and emitted later as counted bytes.
                if (s == COMMON_2_ && notIsContinuation) {
                    buffer.m_utilCount2_++;
                } else {
                    if (buffer.m_utilCount2_ > 0) {
                        // Flush the pending run: count down from the top when
                        // the terminating byte is larger than COMMON_2_, up
                        // from the bottom otherwise.
                        if (s > COMMON_2_) { // not necessary for 4th level.
                            while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
                                buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                        (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
                                buffer.m_utilBytesCount2_++;
                                buffer.m_utilCount2_ -= TOP_COUNT_2_;
                            }
                            buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                    (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
                            buffer.m_utilBytesCount2_++;
                        } else {
                            while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
                                buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                        (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
                                buffer.m_utilBytesCount2_++;
                                buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
                            }
                            buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                    (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
                            buffer.m_utilBytesCount2_++;
                        }
                        buffer.m_utilCount2_ = 0;
                    }
                    buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
                    buffer.m_utilBytesCount2_++;
                }
            } else {
                buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
                buffer.m_utilBytesCount2_++;
                // Do the special handling for French secondaries
                // We need to get continuation elements and do intermediate
                // restore
                // abc1c2c3de with french secondaries need to be edc1c2c3ba
                // NOT edc3c2c1ba
                if (notIsContinuation) {
                    if (buffer.m_utilFrenchStart_ != -1) {
                        // reverse secondaries from frenchStartPtr up to
                        // frenchEndPtr
                        reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_);
                        buffer.m_utilFrenchStart_ = -1;
                    }
                } else {
                    // inside a continuation: track the run so it is reversed
                    // as a unit once the continuation ends
                    if (buffer.m_utilFrenchStart_ == -1) {
                        buffer.m_utilFrenchStart_ = buffer.m_utilBytesCount2_ - 2;
                    }
                    buffer.m_utilFrenchEnd_ = buffer.m_utilBytesCount2_ - 1;
                }
            }
        }
    }
-
- /**
- * Reverse the argument buffer
- *
- * @param buffer to reverse
- * @param start index in buffer to start from
- * @param end index in buffer to end at
- */
- private static void reverseBuffer(byte buffer[], int start, int end) {
- while (start < end) {
- byte b = buffer[start];
- buffer[start++] = buffer[end];
- buffer[end--] = b;
- }
- }
-
- /**
- * Insert the case shifting byte if required
- *
- * @param caseshift value
- * @return new caseshift value
- */
- private final int doCaseShift(int caseshift, CollationBuffer buffer) {
- if (caseshift == 0) {
- buffer.m_utilBytes0_ = append(buffer.m_utilBytes0_, buffer.m_utilBytesCount0_, SORT_CASE_BYTE_START_);
- buffer.m_utilBytesCount0_++;
- caseshift = SORT_CASE_SHIFT_START_;
- }
- return caseshift;
- }
-
    /**
     * Performs the casing sort: packs case bits from the tertiary weight into
     * the case-level byte buffer.
     *
     * @param tertiary byte in ints for easy comparison
     * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
     * @param caseshift remaining bit capacity of the current case byte
     * @param buffer collation buffer temporary state
     * @return the new value of case shift
     */
    private final int doCaseBytes(int tertiary, boolean notIsContinuation, int caseshift, CollationBuffer buffer) {
        caseshift = doCaseShift(caseshift, buffer);

        if (notIsContinuation && tertiary != 0) {
            // the top two bits of the tertiary carry the case information
            byte casebits = (byte) (tertiary & 0xC0);
            if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
                if (casebits == 0) {
                    buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= (1 << (--caseshift));
                } else {
                    // second bit; ensure both bits fit in the current byte
                    caseshift = doCaseShift(caseshift - 1, buffer);
                    buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift);
                }
            } else {
                if (casebits != 0) {
                    buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= 1 << (--caseshift);
                    // second bit
                    caseshift = doCaseShift(caseshift, buffer);
                    buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift);
                } else {
                    // lowercase consumes a single zero bit
                    caseshift--;
                }
            }
        }

        return caseshift;
    }
-
    /**
     * Gets the tertiary byte and adds it to the tertiary byte array
     *
     * @param tertiary byte in int for easy comparison
     * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
     * @param buffer collation buffer temporary state
     */
    private final void doTertiaryBytes(int tertiary, boolean notIsContinuation, CollationBuffer buffer) {
        if (tertiary != 0) {
            // This is compression code: runs of the common tertiary byte are
            // counted and emitted later as counted bytes.
            // sequence size check is included in the if clause
            if (tertiary == m_common3_ && notIsContinuation) {
                buffer.m_utilCount3_++;
            } else {
                int common3 = m_common3_ & LAST_BYTE_MASK_;
                // move the byte out of the counted common range, direction
                // depending on the case-first setting
                if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
                    tertiary += m_addition3_;
                } else if (tertiary <= common3 && m_common3_ == COMMON_UPPER_FIRST_3_) {
                    tertiary -= m_addition3_;
                }
                if (buffer.m_utilCount3_ > 0) {
                    // flush the pending run: count down from the top when the
                    // terminating byte is larger, up from the bottom otherwise
                    if (tertiary > common3) {
                        while (buffer.m_utilCount3_ > m_topCount3_) {
                            buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
                            buffer.m_utilBytesCount3_++;
                            buffer.m_utilCount3_ -= m_topCount3_;
                        }
                        buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                                (byte) (m_top3_ - (buffer.m_utilCount3_ - 1)));
                        buffer.m_utilBytesCount3_++;
                    } else {
                        while (buffer.m_utilCount3_ > m_bottomCount3_) {
                            buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                                    (byte) (m_bottom3_ + m_bottomCount3_));
                            buffer.m_utilBytesCount3_++;
                            buffer.m_utilCount3_ -= m_bottomCount3_;
                        }
                        buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                                (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
                        buffer.m_utilBytesCount3_++;
                    }
                    buffer.m_utilCount3_ = 0;
                }
                buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) tertiary);
                buffer.m_utilBytesCount3_++;
            }
        }
    }
-
    /**
     * Gets the Quaternary byte and adds it to the Quaternary byte array
     *
     * @param isCodePointHiragana flag indicator if the previous codepoint we dealt with was Hiragana
     * @param commonBottom4 smallest common Quaternary byte
     * @param bottomCount4 smallest Quaternary byte
     * @param hiragana4 hiragana Quaternary byte
     * @param buffer collation buffer temporary state
     */
    private final void doQuaternaryBytes(boolean isCodePointHiragana, int commonBottom4, int bottomCount4,
            byte hiragana4, CollationBuffer buffer) {
        if (isCodePointHiragana) { // This was Hiragana, need to note it
            if (buffer.m_utilCount4_ > 0) { // Close this part
                // flush the pending run of common quaternary bytes as a
                // counted sequence before the hiragana marker
                while (buffer.m_utilCount4_ > bottomCount4) {
                    buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
                    buffer.m_utilBytesCount4_++;
                    buffer.m_utilCount4_ -= bottomCount4;
                }
                buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1)));
                buffer.m_utilBytesCount4_++;
                buffer.m_utilCount4_ = 0;
            }
            buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, hiragana4); // Add the Hiragana
            buffer.m_utilBytesCount4_++;
        } else { // This wasn't Hiragana, so we can continue adding stuff
            buffer.m_utilCount4_++;
        }
    }
-
    /**
     * Iterates through the argument string for all ces. Split the ces into their relevant primaries, secondaries etc.
     *
     * @param source normalized string
     * @param doFrench flag indicator if special handling of French has to be done
     * @param hiragana4 offset for Hiragana quaternary
     * @param commonBottom4 smallest common quaternary byte
     * @param bottomCount4 smallest quaternary byte
     * @param buffer collation buffer temporary state
     */
    private final void getSortKeyBytes(String source, boolean doFrench, byte hiragana4, int commonBottom4,
            int bottomCount4, CollationBuffer buffer)

    {
        int backupDecomposition = getDecomposition();
        // TODO- hack fix around frozen state - stop self-modification
        internalSetDecomposition(NO_DECOMPOSITION); // have to revert to backup later
        buffer.m_srcUtilIter_.setText(source);
        buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_);
        buffer.m_utilFrenchStart_ = -1;
        buffer.m_utilFrenchEnd_ = -1;

        // doShift carries over loop iterations so continuations and trailing
        // ignorables inherit the shifted state of the CE that started them
        boolean doShift = false;
        boolean notIsContinuation = false;

        int leadPrimary = 0; // int for easier comparison
        int caseShift = 0;

        while (true) {
            int ce = buffer.m_srcUtilColEIter_.next();
            if (ce == CollationElementIterator.NULLORDER) {
                break;
            }

            if (ce == CollationElementIterator.IGNORABLE) {
                continue;
            }

            notIsContinuation = !isContinuation(ce);

            boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
            // actually we can just check that the first byte is 0
            // generation stuffs the order left first
            boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_;
            doShift = (m_isAlternateHandlingShifted_
                    && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0
                    || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable));
            if (doShift && isPrimaryByteIgnorable) {
                // amendment to the UCA says that primary ignorables and other
                // ignorables should be removed if following a shifted code
                // point
                // if we were shifted and we got an ignorable code point
                // we should just completely ignore it
                continue;
            }
            leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, bottomCount4, buffer);

            if (doShift) {
                // shifted CEs contribute only to the quaternary level
                continue;
            }
            if (buffer.m_utilCompare2_) {
                doSecondaryBytes(ce, notIsContinuation, doFrench, buffer);
            }

            int t = ce & LAST_BYTE_MASK_;
            if (!notIsContinuation) {
                t = ce & CE_REMOVE_CONTINUATION_MASK_;
            }

            if (buffer.m_utilCompare0_ && (!isPrimaryByteIgnorable || buffer.m_utilCompare2_)) {
                // do the case level if we need to do it. We don't want to calculate
                // case level for primary ignorables if we have only primary strength and case level
                // otherwise we would break well formedness of CEs
                caseShift = doCaseBytes(t, notIsContinuation, caseShift, buffer);
            } else if (notIsContinuation) {
                t ^= m_caseSwitch_;
            }

            t &= m_mask3_;

            if (buffer.m_utilCompare3_) {
                doTertiaryBytes(t, notIsContinuation, buffer);
            }

            if (buffer.m_utilCompare4_ && notIsContinuation) { // compare quad
                doQuaternaryBytes(buffer.m_srcUtilColEIter_.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4, buffer);
            }
        }
        // TODO - hack fix around frozen state - stop self-modification
        internalSetDecomposition(backupDecomposition); // reverts to original
        if (buffer.m_utilFrenchStart_ != -1) {
            // one last round of checks: reverse any still-open French run
            reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_);
        }
    }
-
- /**
- * From the individual strength byte results the final compact sortkey will be calculated.
- *
- * @param source text string
- * @param doFrench flag indicating that special handling of French has to be done
- * @param commonBottom4 smallest common quaternary byte
- * @param bottomCount4 smallest quaternary byte
- * @param key output RawCollationKey to store results, key cannot be null
- * @param buffer collation buffer temporary state
- */
- private final void getSortKey(String source, boolean doFrench, int commonBottom4, int bottomCount4,
- RawCollationKey key, CollationBuffer buffer) {
- // we have done all the CE's, now let's put them together to form
- // a key
- if (buffer.m_utilCompare2_) {
- doSecondary(doFrench, buffer);
- }
- // adding case level should be independent of secondary level
- if (buffer.m_utilCompare0_) {
- doCase(buffer);
- }
- if (buffer.m_utilCompare3_) {
- doTertiary(buffer);
- if (buffer.m_utilCompare4_) {
- doQuaternary(commonBottom4, bottomCount4, buffer);
- if (buffer.m_utilCompare5_) {
- doIdentical(source, buffer);
- }
-
- }
- }
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) 0);
- buffer.m_utilBytesCount1_++;
-
- key.set(buffer.m_utilBytes1_, 0, buffer.m_utilBytesCount1_);
- }
-
- /**
- * Packs the French bytes
- * @param buffer collation buffer temporary state
- */
- private static final void doFrench(CollationBuffer buffer) {
- for (int i = 0; i < buffer.m_utilBytesCount2_; i++) {
- byte s = buffer.m_utilBytes2_[buffer.m_utilBytesCount2_ - i - 1];
- // This is compression code.
- if (s == COMMON_2_) {
- ++buffer.m_utilCount2_;
- } else {
- if (buffer.m_utilCount2_ > 0) {
- // getting the unsigned value
- if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
- // not necessary for 4th level.
- while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
- (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
- buffer.m_utilBytesCount1_++;
- buffer.m_utilCount2_ -= TOP_COUNT_2_;
- }
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
- (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
- buffer.m_utilBytesCount1_++;
- } else {
- while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
- (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
- buffer.m_utilBytesCount1_++;
- buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
- }
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
- (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
- buffer.m_utilBytesCount1_++;
- }
- buffer.m_utilCount2_ = 0;
- }
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, s);
- buffer.m_utilBytesCount1_++;
- }
- }
- if (buffer.m_utilCount2_ > 0) {
- while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
- buffer.m_utilBytesCount1_++;
- buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
- }
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
- buffer.m_utilBytesCount1_++;
- }
- }
-
- /**
- * Compacts the secondary bytes and stores them into the primary array
- *
- * @param doFrench flag indicator that French has to be handled specially
- * @param buffer collation buffer temporary state
- */
- private static final void doSecondary(boolean doFrench, CollationBuffer buffer) {
- if (buffer.m_utilCount2_ > 0) {
- while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
- buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
- buffer.m_utilBytesCount2_++;
- buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
- }
- buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
- buffer.m_utilBytesCount2_++;
- }
-
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
- buffer.m_utilBytesCount1_++;
-
- if (doFrench) { // do the reverse copy
- doFrench(buffer);
- } else {
- if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount2_) {
- buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_);
- }
- System.arraycopy(buffer.m_utilBytes2_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_);
- buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount2_;
- }
- }
-
- /**
- * Increase buffer size
- *
- * @param buffer array of bytes
- * @param size of the byte array
- * @param incrementsize size to increase
- * @return the new buffer
- */
- private static final byte[] increase(byte buffer[], int size, int incrementsize) {
- byte result[] = new byte[buffer.length + incrementsize];
- System.arraycopy(buffer, 0, result, 0, size);
- return result;
- }
-
- /**
- * Increase buffer size
- *
- * @param buffer array of ints
- * @param size of the byte array
- * @param incrementsize size to increase
- * @return the new buffer
- */
- private static final int[] increase(int buffer[], int size, int incrementsize) {
- int result[] = new int[buffer.length + incrementsize];
- System.arraycopy(buffer, 0, result, 0, size);
- return result;
- }
-
- /**
- * Compacts the case bytes and stores them into the primary array
- *
- * @param buffer collation buffer temporary state
- */
- private static final void doCase(CollationBuffer buffer) {
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
- buffer.m_utilBytesCount1_++;
- if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount0_) {
- buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_);
- }
- System.arraycopy(buffer.m_utilBytes0_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_);
- buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount0_;
- }
-
- /**
- * Compacts the tertiary bytes and stores them into the primary array
- *
- * @param buffer collation buffer temporary state
- */
- private final void doTertiary(CollationBuffer buffer) {
- if (buffer.m_utilCount3_ > 0) {
- if (m_common3_ != COMMON_BOTTOM_3_) {
- while (buffer.m_utilCount3_ >= m_topCount3_) {
- buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
- buffer.m_utilBytesCount3_++;
- buffer.m_utilCount3_ -= m_topCount3_;
- }
- buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - buffer.m_utilCount3_));
- buffer.m_utilBytesCount3_++;
- } else {
- while (buffer.m_utilCount3_ > m_bottomCount3_) {
- buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + m_bottomCount3_));
- buffer.m_utilBytesCount3_++;
- buffer.m_utilCount3_ -= m_bottomCount3_;
- }
- buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
- buffer.m_utilBytesCount3_++;
- }
- }
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
- buffer.m_utilBytesCount1_++;
- if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount3_) {
- buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_);
- }
- System.arraycopy(buffer.m_utilBytes3_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_);
- buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount3_;
- }
-
- /**
- * Compacts the quaternary bytes and stores them into the primary array
- *
- * @param buffer collation buffer temporary state
- */
- private final void doQuaternary(int commonbottom4, int bottomcount4, CollationBuffer buffer) {
- if (buffer.m_utilCount4_ > 0) {
- while (buffer.m_utilCount4_ > bottomcount4) {
- buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + bottomcount4));
- buffer.m_utilBytesCount4_++;
- buffer.m_utilCount4_ -= bottomcount4;
- }
- buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + (buffer.m_utilCount4_ - 1)));
- buffer.m_utilBytesCount4_++;
- }
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
- buffer.m_utilBytesCount1_++;
- if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount4_) {
- buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_);
- }
- System.arraycopy(buffer.m_utilBytes4_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_);
- buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount4_;
- }
-
- /**
- * Deals with the identical sort. Appends the BOCSU version of the source string to the ends of the byte buffer.
- *
- * @param source text string
- * @param buffer collation buffer temporary state
- */
- private static final void doIdentical(String source, CollationBuffer buffer) {
- int isize = BOCU.getCompressionLength(source);
- buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
- buffer.m_utilBytesCount1_++;
- if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + isize) {
- buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, 1 + isize);
- }
- buffer.m_utilBytesCount1_ = BOCU.compress(source, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_);
- }
-
- /**
- * Gets the offset of the first unmatched characters in source and target. This method returns the offset of the
- * start of a contraction or a combining sequence, if the first difference is in the middle of such a sequence.
- *
- * @param source
- * string
- * @param target
- * string
- * @return offset of the first unmatched characters in source and target.
- */
- private final int getFirstUnmatchedOffset(String source, String target) {
- int result = 0;
- int slength = source.length();
- int tlength = target.length();
- int minlength = slength;
- if (minlength > tlength) {
- minlength = tlength;
- }
- while (result < minlength && source.charAt(result) == target.charAt(result)) {
- result++;
- }
- if (result > 0) {
- // There is an identical portion at the beginning of the two
- // strings. If the identical portion ends within a contraction or a
- // combining character sequence, back up to the start of that
- // sequence.
- char schar = 0;
- char tchar = 0;
- if (result < minlength) {
- schar = source.charAt(result); // first differing chars
- tchar = target.charAt(result);
- } else {
- schar = source.charAt(minlength - 1);
- if (isUnsafe(schar)) {
- tchar = schar;
- } else if (slength == tlength) {
- return result;
- } else if (slength < tlength) {
- tchar = target.charAt(result);
- } else {
- schar = source.charAt(result);
- }
- }
- if (isUnsafe(schar) || isUnsafe(tchar)) {
- // We are stopped in the middle of a contraction or combining
- // sequence.
- // Look backwards for the part of the string for the start of
- // the sequence
- // It doesn't matter which string we scan, since they are the
- // same in this region.
- do {
- result--;
- } while (result > 0 && isUnsafe(source.charAt(result)));
- }
- }
- return result;
- }
-
- /**
- * Appending an byte to an array of bytes and increases it if we run out of space
- *
- * @param array
- * of byte arrays
- * @param appendindex
- * index in the byte array to append
- * @param value
- * to append
- * @return array if array size can accomodate the new value, otherwise a bigger array will be created and returned
- */
- private static final byte[] append(byte array[], int appendindex, byte value) {
- try {
- array[appendindex] = value;
- } catch (ArrayIndexOutOfBoundsException e) {
- array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
- array[appendindex] = value;
- }
- return array;
- }
-
- /**
- * This is a trick string compare function that goes in and uses sortkeys to compare. It is used when compare gets
- * in trouble and needs to bail out.
- *
- * @param source text string
- * @param target text string
- * @param buffer collation buffer temporary state
- */
- private final int compareBySortKeys(String source, String target, CollationBuffer buffer)
- {
- buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_);
- // this method is very seldom called
- RawCollationKey targetkey = getRawCollationKey(target, null);
- return buffer.m_utilRawCollationKey_.compareTo(targetkey);
- }
-
- /**
- * Performs the primary comparisons, and fills up the CE buffer at the same time. The return value toggles between
- * the comparison result and the hiragana result. If either the source is greater than target or vice versa, the
- * return result is the comparison result, ie 1 or -1, furthermore the cebuffers will be cleared when that happens.
- * If the primary comparisons are equal, we'll have to continue with secondary comparison. In this case the cebuffer
- * will not be cleared and the return result will be the hiragana result.
- *
- * @param doHiragana4 flag indicator that Hiragana Quaternary has to be observed
- * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
- * @param source text string
- * @param target text string
- * @param textoffset offset in text to start the comparison
- * @param buffer collation buffer temporary state
- * @return comparion result if a primary difference is found, otherwise hiragana result
- */
- private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, String source, String target,
- int textoffset, CollationBuffer buffer)
-
- {
- // Preparing the context objects for iterating over strings
- buffer.m_srcUtilIter_.setText(source);
- buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_, textoffset);
- buffer.m_tgtUtilIter_.setText(target);
- buffer.m_tgtUtilColEIter_.setText(buffer.m_tgtUtilIter_, textoffset);
-
- // Non shifted primary processing is quite simple
- if (!m_isAlternateHandlingShifted_) {
- int hiraganaresult = 0;
- while (true) {
- int sorder = 0;
- int sPrimary;
- // We fetch CEs until we hit a non ignorable primary or end.
- do {
- sorder = buffer.m_srcUtilColEIter_.next();
- buffer.m_srcUtilCEBuffer_ = append(buffer.m_srcUtilCEBuffer_, buffer.m_srcUtilCEBufferSize_, sorder);
- buffer.m_srcUtilCEBufferSize_++;
- sPrimary = sorder & CE_PRIMARY_MASK_;
- } while (sPrimary == CollationElementIterator.IGNORABLE);
-
- int torder = 0;
- int tPrimary;
- do {
- torder = buffer.m_tgtUtilColEIter_.next();
- buffer.m_tgtUtilCEBuffer_ = append(buffer.m_tgtUtilCEBuffer_, buffer.m_tgtUtilCEBufferSize_, torder);
- buffer.m_tgtUtilCEBufferSize_++;
- tPrimary = torder & CE_PRIMARY_MASK_;
- } while (tPrimary == CollationElementIterator.IGNORABLE);
-
- // if both primaries are the same
- if (sPrimary == tPrimary) {
- // and there are no more CEs, we advance to the next level
- // see if we are at the end of either string
- if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
- if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] != CollationElementIterator.NULLORDER) {
- return -1;
- }
- break;
- } else if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- if (doHiragana4 && hiraganaresult == 0
- && buffer.m_srcUtilColEIter_.m_isCodePointHiragana_ != buffer.m_tgtUtilColEIter_.m_isCodePointHiragana_) {
- if (buffer.m_srcUtilColEIter_.m_isCodePointHiragana_) {
- hiraganaresult = -1;
- } else {
- hiraganaresult = 1;
- }
- }
- } else {
- if (!isContinuation(sorder) && m_leadBytePermutationTable_ != null) {
- sPrimary = (m_leadBytePermutationTable_[sPrimary >>> 24] << 24) | (sPrimary & 0x00FFFFFF);
- tPrimary = (m_leadBytePermutationTable_[tPrimary >>> 24] << 24) | (tPrimary & 0x00FFFFFF);
- }
- // if two primaries are different, we are done
- return endPrimaryCompare(sPrimary, tPrimary, buffer);
- }
- }
- // no primary difference... do the rest from the buffers
- return hiraganaresult;
- } else { // shifted - do a slightly more complicated processing :)
- while (true) {
- int sorder = getPrimaryShiftedCompareCE(buffer.m_srcUtilColEIter_, lowestpvalue, true, buffer);
- int torder = getPrimaryShiftedCompareCE(buffer.m_tgtUtilColEIter_, lowestpvalue, false, buffer);
- if (sorder == torder) {
- if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
- break;
- } else {
- continue;
- }
- } else {
- return endPrimaryCompare(sorder, torder, buffer);
- }
- } // no primary difference... do the rest from the buffers
- }
- return 0;
- }
-
- /**
- * This is used only for primary strength when we know that sorder is already different from torder. Compares sorder
- * and torder, returns -1 if sorder is less than torder. Clears the cebuffer at the same time.
- *
- * @param sorder source strength order
- * @param torder target strength order
- * @param buffer collation buffer temporary state
- * @return the comparison result of sorder and torder
- */
- private static final int endPrimaryCompare(int sorder, int torder, CollationBuffer buffer) {
- // if we reach here, the ce offset accessed is the last ce
- // appended to the buffer
- boolean isSourceNullOrder = (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
- boolean isTargetNullOrder = (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
- buffer.m_srcUtilCEBufferSize_ = -1;
- buffer.m_tgtUtilCEBufferSize_ = -1;
- if (isSourceNullOrder) {
- return -1;
- }
- if (isTargetNullOrder) {
- return 1;
- }
- // getting rid of the sign
- sorder >>>= CE_PRIMARY_SHIFT_;
- torder >>>= CE_PRIMARY_SHIFT_;
- if (sorder < torder) {
- return -1;
- }
- return 1;
- }
-
- /**
- * Calculates the next primary shifted value and fills up cebuffer with the next non-ignorable ce.
- *
- * @param coleiter collation element iterator
- * @param doHiragana4 flag indicator if hiragana quaternary is to be handled
- * @param lowestpvalue lowest primary shifted value that will not be ignored
- * @param buffer collation buffer temporary state
- * @return result next modified ce
- */
- private static final int getPrimaryShiftedCompareCE(CollationElementIterator coleiter, int lowestpvalue, boolean isSrc, CollationBuffer buffer)
- {
- boolean shifted = false;
- int result = CollationElementIterator.IGNORABLE;
- int cebuffer[] = buffer.m_srcUtilCEBuffer_;
- int cebuffersize = buffer.m_srcUtilCEBufferSize_;
- if (!isSrc) {
- cebuffer = buffer.m_tgtUtilCEBuffer_;
- cebuffersize = buffer.m_tgtUtilCEBufferSize_;
- }
- while (true) {
- result = coleiter.next();
- if (result == CollationElementIterator.NULLORDER) {
- cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize++;
- break;
- } else if (result == CollationElementIterator.IGNORABLE
- || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) {
- // UCA amendment - ignore ignorables that follow shifted code
- // points
- continue;
- } else if (isContinuation(result)) {
- if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) {
- // There is primary value
- if (shifted) {
- result = (result & CE_PRIMARY_MASK_) | CE_CONTINUATION_MARKER_;
- // preserve interesting continuation
- cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize++;
- continue;
- } else {
- cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize++;
- break;
- }
- } else { // Just lower level values
- if (!shifted) {
- cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize++;
- }
- }
- } else { // regular
- if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, lowestpvalue) > 0) {
- cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize++;
- break;
- } else {
- if ((result & CE_PRIMARY_MASK_) != 0) {
- shifted = true;
- result &= CE_PRIMARY_MASK_;
- cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize++;
- continue;
- } else {
- cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize++;
- shifted = false;
- continue;
- }
- }
- }
- }
- if (isSrc) {
- buffer.m_srcUtilCEBuffer_ = cebuffer;
- buffer.m_srcUtilCEBufferSize_ = cebuffersize;
- } else {
- buffer.m_tgtUtilCEBuffer_ = cebuffer;
- buffer.m_tgtUtilCEBufferSize_ = cebuffersize;
- }
- result &= CE_PRIMARY_MASK_;
- return result;
- }
-
- /**
- * Appending an int to an array of ints and increases it if we run out of space
- *
- * @param array
- * of int arrays
- * @param appendindex
- * index at which value will be appended
- * @param value
- * to append
- * @return array if size is not increased, otherwise a new array will be returned
- */
- private static final int[] append(int array[], int appendindex, int value) {
- if (appendindex + 1 >= array.length) {
- array = increase(array, appendindex, CE_BUFFER_SIZE_);
- }
- array[appendindex] = value;
- return array;
- }
-
- /**
- * Does secondary strength comparison based on the collected ces.
- *
- * @param doFrench flag indicates if French ordering is to be done
- * @param buffer collation buffer temporary state
- * @return the secondary strength comparison result
- */
- private static final int doSecondaryCompare(boolean doFrench, CollationBuffer buffer) {
- // now, we're gonna reexamine collected CEs
- if (!doFrench) { // normal
- int soffset = 0;
- int toffset = 0;
- while (true) {
- int sorder = CollationElementIterator.IGNORABLE;
- while (sorder == CollationElementIterator.IGNORABLE) {
- sorder = buffer.m_srcUtilCEBuffer_[soffset++] & CE_SECONDARY_MASK_;
- }
- int torder = CollationElementIterator.IGNORABLE;
- while (torder == CollationElementIterator.IGNORABLE) {
- torder = buffer.m_tgtUtilCEBuffer_[toffset++] & CE_SECONDARY_MASK_;
- }
-
- if (sorder == torder) {
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
- return -1;
- }
- break;
- } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- } else {
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- return -1;
- }
- if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- return (sorder < torder) ? -1 : 1;
- }
- }
- } else { // do the French
- buffer.m_srcUtilContOffset_ = 0;
- buffer.m_tgtUtilContOffset_ = 0;
- buffer.m_srcUtilOffset_ = buffer.m_srcUtilCEBufferSize_ - 2;
- buffer.m_tgtUtilOffset_ = buffer.m_tgtUtilCEBufferSize_ - 2;
- while (true) {
- int sorder = getSecondaryFrenchCE(true, buffer);
- int torder = getSecondaryFrenchCE(false, buffer);
- if (sorder == torder) {
- if ((buffer.m_srcUtilOffset_ < 0 && buffer.m_tgtUtilOffset_ < 0)
- || (buffer.m_srcUtilOffset_ >= 0 && buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) {
- break;
- }
- } else {
- return (sorder < torder) ? -1 : 1;
- }
- }
- }
- return 0;
- }
-
- /**
- * Calculates the next secondary french CE.
- *
- * @param isSrc flag indicator if we are calculating the src ces
- * @param buffer collation buffer temporary state
- * @return result next modified ce
- */
- private static final int getSecondaryFrenchCE(boolean isSrc, CollationBuffer buffer) {
- int result = CollationElementIterator.IGNORABLE;
- int offset = buffer.m_srcUtilOffset_;
- int continuationoffset = buffer.m_srcUtilContOffset_;
- int cebuffer[] = buffer.m_srcUtilCEBuffer_;
- if (!isSrc) {
- offset = buffer.m_tgtUtilOffset_;
- continuationoffset = buffer.m_tgtUtilContOffset_;
- cebuffer = buffer.m_tgtUtilCEBuffer_;
- }
-
- while (result == CollationElementIterator.IGNORABLE && offset >= 0) {
- if (continuationoffset == 0) {
- result = cebuffer[offset];
- while (isContinuation(cebuffer[offset--])) {
- }
- // after this, sorder is at the start of continuation,
- // and offset points before that
- if (isContinuation(cebuffer[offset + 1])) {
- // save offset for later
- continuationoffset = offset;
- offset += 2;
- }
- } else {
- result = cebuffer[offset++];
- if (!isContinuation(result)) {
- // we have finished with this continuation
- offset = continuationoffset;
- // reset the pointer to before continuation
- continuationoffset = 0;
- continue;
- }
- }
- result &= CE_SECONDARY_MASK_; // remove continuation bit
- }
- if (isSrc) {
- buffer.m_srcUtilOffset_ = offset;
- buffer.m_srcUtilContOffset_ = continuationoffset;
- } else {
- buffer.m_tgtUtilOffset_ = offset;
- buffer.m_tgtUtilContOffset_ = continuationoffset;
- }
- return result;
- }
-
- /**
- * Does case strength comparison based on the collected ces.
- *
- * @param buffer collation buffer temporary state
- * @return the case strength comparison result
- */
- private final int doCaseCompare(CollationBuffer buffer) {
- int soffset = 0;
- int toffset = 0;
- while (true) {
- int sorder = CollationElementIterator.IGNORABLE;
- int torder = CollationElementIterator.IGNORABLE;
- while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
- sorder = buffer.m_srcUtilCEBuffer_[soffset++];
- if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) {
- // primary ignorables should not be considered on the case level when the strength is primary
- // otherwise, the CEs stop being well-formed
- sorder &= CE_CASE_MASK_3_;
- sorder ^= m_caseSwitch_;
- } else {
- sorder = CollationElementIterator.IGNORABLE;
- }
- }
-
- while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
- torder = buffer.m_tgtUtilCEBuffer_[toffset++];
- if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) {
- // primary ignorables should not be considered on the case level when the strength is primary
- // otherwise, the CEs stop being well-formed
- torder &= CE_CASE_MASK_3_;
- torder ^= m_caseSwitch_;
- } else {
- torder = CollationElementIterator.IGNORABLE;
- }
- }
-
- sorder &= CE_CASE_BIT_MASK_;
- torder &= CE_CASE_BIT_MASK_;
- if (sorder == torder) {
- // checking end of strings
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
- return -1;
- }
- break;
- } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- } else {
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- return -1;
- }
- if (buffer.m_tgtUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- return (sorder < torder) ? -1 : 1;
- }
- }
- return 0;
- }
-
- /**
- * Does tertiary strength comparison based on the collected ces.
- *
- * @param buffer collation buffer temporary state
- * @return the tertiary strength comparison result
- */
- private final int doTertiaryCompare(CollationBuffer buffer) {
- int soffset = 0;
- int toffset = 0;
- while (true) {
- int sorder = CollationElementIterator.IGNORABLE;
- int torder = CollationElementIterator.IGNORABLE;
- while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
- sorder = buffer.m_srcUtilCEBuffer_[soffset++];
- if (!isContinuation(sorder)) {
- sorder = (sorder & m_mask3_) ^ m_caseSwitch_;
- } else {
- sorder = (sorder & m_mask3_) & CE_REMOVE_CASE_;
- }
- }
-
- while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
- torder = buffer.m_tgtUtilCEBuffer_[toffset++];
- if (!isContinuation(torder)) {
- torder = (torder & m_mask3_) ^ m_caseSwitch_;
- } else {
- torder = (torder & m_mask3_) & CE_REMOVE_CASE_;
- }
- }
-
- if (sorder == torder) {
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
- return -1;
- }
- break;
- } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- } else {
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- return -1;
- }
- if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- return (sorder < torder) ? -1 : 1;
- }
- }
- return 0;
- }
-
- /**
- * Does quaternary strength comparison based on the collected ces.
- *
- * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
- * @param buffer collation buffer temporary state
- * @return the quaternary strength comparison result
- */
- private final int doQuaternaryCompare(int lowestpvalue, CollationBuffer buffer) {
- boolean sShifted = true;
- boolean tShifted = true;
- int soffset = 0;
- int toffset = 0;
- while (true) {
- int sorder = CollationElementIterator.IGNORABLE;
- int torder = CollationElementIterator.IGNORABLE;
- while (sorder == CollationElementIterator.IGNORABLE || (isContinuation(sorder) && !sShifted)) {
- sorder = buffer.m_srcUtilCEBuffer_[soffset++];
- if (isContinuation(sorder)) {
- if (!sShifted) {
- continue;
- }
- } else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
- || (sorder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
- // non continuation
- sorder = CE_PRIMARY_MASK_;
- sShifted = false;
- } else {
- sShifted = true;
- }
- }
- sorder >>>= CE_PRIMARY_SHIFT_;
- while (torder == CollationElementIterator.IGNORABLE || (isContinuation(torder) && !tShifted)) {
- torder = buffer.m_tgtUtilCEBuffer_[toffset++];
- if (isContinuation(torder)) {
- if (!tShifted) {
- continue;
- }
- } else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
- || (torder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
- // non continuation
- torder = CE_PRIMARY_MASK_;
- tShifted = false;
- } else {
- tShifted = true;
- }
- }
- torder >>>= CE_PRIMARY_SHIFT_;
-
- if (sorder == torder) {
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
- return -1;
- }
- break;
- } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- } else {
- if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
- return -1;
- }
- if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
- return 1;
- }
- return (sorder < torder) ? -1 : 1;
- }
- }
- return 0;
+ return h;
}
/**
- * Internal function. Does byte level string compare. Used by strcoll if strength == identical and strings are
- * otherwise equal. This is a rare case. Comparison must be done on NFD normalized strings. FCD is not good enough.
+ * Compares the source text String to the target text String according to the collation rules, strength and
+ * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero
+ * depending on whether the source String is less than, equal to or greater than the target String. See the Collator
+ * class description for an example of use. </p>
+ * <p>
+ * General recommendation: <br>
+ * If comparison are to be done to the same String multiple times, it would be more efficient to generate
+ * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed
+ * performance is critical and object instantiation is to be reduced, further optimization may be achieved by
+ * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method
+ * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey
+ * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key
+ * comparisons. If each String is compared only once, using the method RuleBasedCollator.compare(String,
+ * String) will give better performance.
+ * </p>
*
* @param source
- * text
+ * the source text String.
* @param target
- * text
- * @param offset
- * of the first difference in the text strings
- * @param normalize
- * flag indicating if we are to normalize the text before comparison
- * @return 1 if source is greater than target, -1 less than and 0 if equals
+ * the target text String.
+ * @return an integer value. Value is less than zero if source is less than target, value is zero if source
+ * and target are equal, value is greater than zero if source is greater than target.
+ * @see CollationKey
+ * @see #getCollationKey
+ * @stable ICU 2.8
*/
- private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize)
-
- {
- if (normalize) {
- if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
- source = Normalizer.decompose(source, false);
- }
-
- if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) {
- target = Normalizer.decompose(target, false);
- }
- offset = 0;
- }
-
- return doStringCompare(source, target, offset);
+ @Override
+ public int compare(String source, String target) {
+ return doCompare(source, target);
}
/**
- * Compares string for their codepoint order. This comparison handles surrogate characters and place them after the
- * all non surrogate characters.
- *
- * @param source
- * text
- * @param target
- * text
- * @param offset
- * start offset for comparison
- * @return 1 if source is greater than target, -1 less than and 0 if equals
- */
- private static final int doStringCompare(String source, String target, int offset) {
- // compare identical prefixes - they do not need to be fixed up
- char schar = 0;
- char tchar = 0;
- int slength = source.length();
- int tlength = target.length();
- int minlength = Math.min(slength, tlength);
- while (offset < minlength) {
- schar = source.charAt(offset);
- tchar = target.charAt(offset++);
- if (schar != tchar) {
- break;
- }
+ * Abstract iterator for identical-level string comparisons.
+ * Returns FCD code points and handles temporary switching to NFD.
+ *
+ * <p>As with CollationIterator,
+ * Java NFDIterator instances are partially constructed and cached,
+ * and completed when reset for use.
+ * C++ NFDIterator instances are stack-allocated.
+ */
+ private static abstract class NFDIterator {
+ /**
+ * Partial constructor, must call reset().
+ */
+ NFDIterator() {}
+ final void reset() {
+ index = -1;
}
- if (schar == tchar && offset == minlength) {
- if (slength > minlength) {
- return 1;
- }
- if (tlength > minlength) {
- return -1;
+ /**
+ * Returns the next code point from the internal normalization buffer,
+ * or else the next text code point.
+ * Returns -1 at the end of the text.
+ */
+ final int nextCodePoint() {
+ if(index >= 0) {
+ if(index == decomp.length()) {
+ index = -1;
+ } else {
+ int c = Character.codePointAt(decomp, index);
+ index += Character.charCount(c);
+ return c;
+ }
}
- return 0;
+ return nextRawCodePoint();
}
-
- // if both values are in or above the surrogate range, Fix them up.
- if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
- schar = fixupUTF16(schar);
- tchar = fixupUTF16(tchar);
+ /**
+ * @param nfcImpl the Normalizer2Impl used to look up canonical decompositions
+ * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
+ * @return the first code point in c's decomposition,
+ * or c itself if it was decomposed already or if it does not decompose
+ */
+ final int nextDecomposedCodePoint(Normalizer2Impl nfcImpl, int c) {
+ if(index >= 0) { return c; }
+ decomp = nfcImpl.getDecomposition(c);
+ if(decomp == null) { return c; }
+ c = Character.codePointAt(decomp, 0);
+ index = Character.charCount(c);
+ return c;
}
- // now c1 and c2 are in UTF-32-compatible order
- return (schar < tchar) ? -1 : 1; // schar and tchar has to be different
- }
+ /**
+ * Returns the next text code point in FCD order.
+ * Returns -1 at the end of the text.
+ */
+ protected abstract int nextRawCodePoint();
- /**
- * Rotate surrogates to the top to get code point order
- */
- private static final char fixupUTF16(char ch) {
- if (ch >= 0xe000) {
- ch -= 0x800;
- } else {
- ch += 0x2000;
- }
- return ch;
+ private String decomp;
+ private int index;
}
- private static final int UCOL_REORDER_CODE_IGNORE = ReorderCodes.LIMIT + 1;
- /**
- * Builds the lead byte permuatation table
- */
- private void buildPermutationTable() {
- if (m_reorderCodes_ == null || m_reorderCodes_.length == 0 || (m_reorderCodes_.length == 1 && m_reorderCodes_[0] == ReorderCodes.NONE)) {
- m_leadBytePermutationTable_ = null;
- return;
- }
-
- if (m_reorderCodes_[0] == ReorderCodes.DEFAULT) {
- if (m_reorderCodes_.length != 1) {
- throw new IllegalArgumentException("Illegal collation reorder codes - default reorder code must be the only code in the list.");
- }
- // swap the reorder codes for those at build of the rules
- if (m_defaultReorderCodes_ == null || m_defaultReorderCodes_.length == 0) {
- m_leadBytePermutationTable_ = null;
- return;
- }
- m_reorderCodes_ = m_defaultReorderCodes_.clone();
- }
-
- // TODO - these need to be read in from the UCA data file
- // The lowest byte that hasn't been assigned a mapping
- int toBottom = 0x03;
- // The highest byte that hasn't been assigned a mapping
- int toTop = 0xe4;
-
- // filled slots in the output m_scriptOrder_
- boolean[] permutationSlotFilled = new boolean[256];
-
- // used lead bytes
- boolean[] newLeadByteUsed = new boolean[256];
-
- if (m_leadBytePermutationTable_ == null) {
- m_leadBytePermutationTable_ = new byte[256];
+ private static class UTF16NFDIterator extends NFDIterator {
+ UTF16NFDIterator() {}
+ void setText(CharSequence seq, int start) {
+ reset();
+ s = seq;
+ pos = start;
}
- // prefill the reordering codes with the leading entries
- int[] internalReorderCodes = new int[m_reorderCodes_.length + (ReorderCodes.LIMIT - ReorderCodes.FIRST)];
- for (int codeIndex = 0; codeIndex < ReorderCodes.LIMIT - ReorderCodes.FIRST; codeIndex++) {
- internalReorderCodes[codeIndex] = ReorderCodes.FIRST + codeIndex;
- }
- for (int codeIndex = 0; codeIndex < m_reorderCodes_.length; codeIndex++) {
- internalReorderCodes[codeIndex + (ReorderCodes.LIMIT - ReorderCodes.FIRST)] = m_reorderCodes_[codeIndex];
- if (m_reorderCodes_[codeIndex] >= ReorderCodes.FIRST && m_reorderCodes_[codeIndex] < ReorderCodes.LIMIT) {
- internalReorderCodes[m_reorderCodes_[codeIndex] - ReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE;
- }
+ @Override
+ protected int nextRawCodePoint() {
+ if(pos == s.length()) { return Collation.SENTINEL_CP; }
+ int c = Character.codePointAt(s, pos);
+ pos += Character.charCount(c);
+ return c;
}
- /*
- * Start from the front of the list and place each script we encounter at the earliest possible locatation
- * in the permutation table. If we encounter UNKNOWN, start processing from the back, and place each script
- * in the last possible location. At each step, we also need to make sure that any scripts that need to not
- * be moved are copied to their same location in the final table.
- */
- boolean fromTheBottom = true;
- int reorderCodesIndex = -1;
- for (int reorderCodesCount = 0; reorderCodesCount < internalReorderCodes.length; reorderCodesCount++) {
- reorderCodesIndex += fromTheBottom ? 1 : -1;
- int next = internalReorderCodes[reorderCodesIndex];
- if (next == UCOL_REORDER_CODE_IGNORE) {
- continue;
- }
- if (next == UScript.UNKNOWN) {
- if (fromTheBottom == false) {
- // double turnaround
- m_leadBytePermutationTable_ = null;
- throw new IllegalArgumentException("Illegal collation reorder codes - two \"from the end\" markers.");
- }
- fromTheBottom = false;
- reorderCodesIndex = internalReorderCodes.length;
- continue;
- }
+ protected CharSequence s;
+ protected int pos;
+ }
- int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(next);
- if (fromTheBottom) {
- for (int leadByte : leadBytes) {
- // don't place a lead byte twice in the permutation table
- if (permutationSlotFilled[leadByte]) {
- // lead byte already used
- m_leadBytePermutationTable_ = null;
- throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte.");
- }
- m_leadBytePermutationTable_[leadByte] = (byte) toBottom;
- newLeadByteUsed[toBottom] = true;
- permutationSlotFilled[leadByte] = true;
- toBottom++;
- }
+ private static final class FCDUTF16NFDIterator extends UTF16NFDIterator {
+ FCDUTF16NFDIterator() {}
+ void setText(Normalizer2Impl nfcImpl, CharSequence seq, int start) {
+ reset();
+ int spanLimit = nfcImpl.makeFCD(seq, start, seq.length(), null);
+ if(spanLimit == seq.length()) {
+ s = seq;
+ pos = start;
} else {
- for (int leadByteIndex = leadBytes.length - 1; leadByteIndex >= 0; leadByteIndex--) {
- int leadByte = leadBytes[leadByteIndex];
- // don't place a lead byte twice in the permutation table
- if (permutationSlotFilled[leadByte]) {
- // lead byte already used
- m_leadBytePermutationTable_ = null;
- throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte.");
- }
-
- m_leadBytePermutationTable_[leadByte] = (byte) toTop;
- newLeadByteUsed[toTop] = true;
- permutationSlotFilled[leadByte] = true;
- toTop--;
+ if(str == null) {
+ str = new StringBuilder();
+ } else {
+ str.setLength(0);
}
+ str.append(seq, start, spanLimit);
+ ReorderingBuffer buffer = new ReorderingBuffer(nfcImpl, str, seq.length() - start);
+ nfcImpl.makeFCD(seq, spanLimit, seq.length(), buffer);
+ s = str;
+ pos = 0;
}
}
- /* Copy everything that's left over */
- int reorderCode = 0;
- for (int i = 0; i < 256; i++) {
- if (!permutationSlotFilled[i]) {
- while (newLeadByteUsed[reorderCode]) {
- if (reorderCode > 255) {
- throw new IllegalArgumentException("Unable to fill collation reordering table slots - no available reordering code.");
- }
- reorderCode++;
- }
- m_leadBytePermutationTable_[i] = (byte) reorderCode;
- permutationSlotFilled[i] = true;
- newLeadByteUsed[reorderCode] = true;
- }
- }
-
- // for (int i = 0; i < 256; i++){
- // System.out.println(Integer.toString(i, 16) + " -> " + Integer.toString(m_scriptReorderTable_[i], 16));
- // }
- latinOneRegenTable_ = true;
- updateInternalState();
+ private StringBuilder str;
}
- /**
- * Resets the internal case data members and compression values.
- */
- private void updateInternalState() {
- if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
- m_caseSwitch_ = CASE_SWITCH_;
- } else {
- m_caseSwitch_ = NO_CASE_SWITCH_;
- }
-
- if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
- m_mask3_ = CE_REMOVE_CASE_;
- m_common3_ = COMMON_NORMAL_3_;
- m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
- m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
- m_bottom3_ = COMMON_BOTTOM_3_;
- } else {
- m_mask3_ = CE_KEEP_CASE_;
- m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
- if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
- m_common3_ = COMMON_UPPER_FIRST_3_;
- m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
- m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
+ private static final int compareNFDIter(Normalizer2Impl nfcImpl, NFDIterator left, NFDIterator right) {
+ for(;;) {
+ // Fetch the next FCD code point from each string.
+ int leftCp = left.nextCodePoint();
+ int rightCp = right.nextCodePoint();
+ if(leftCp == rightCp) {
+ if(leftCp < 0) { break; }
+ continue;
+ }
+ // If they are different, then decompose each and compare again.
+ if(leftCp < 0) {
+ leftCp = -2; // end of string
+ } else if(leftCp == 0xfffe) {
+ leftCp = -1; // U+FFFE: merge separator
} else {
- m_common3_ = COMMON_NORMAL_3_;
- m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
- m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
+ leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
}
- }
-
- // Set the compression values
- int total3 = m_top3_ - m_bottom3_ - 1;
- // we multilply double with int, but need only int
- m_topCount3_ = (int) (PROPORTION_3_ * total3);
- m_bottomCount3_ = total3 - m_topCount3_;
-
- if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ && !m_isFrenchCollation_
- && !m_isAlternateHandlingShifted_) {
- m_isSimple3_ = true;
- } else {
- m_isSimple3_ = false;
- }
- if (!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
- && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
- if (latinOneCEs_ == null || latinOneRegenTable_) {
- if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
- latinOneUse_ = true;
- } else {
- latinOneUse_ = false;
- latinOneFailed_ = true;
- }
- latinOneRegenTable_ = false;
- } else { // latin1Table exists and it doesn't need to be regenerated, just use it
- latinOneUse_ = true;
+ if(rightCp < 0) {
+ rightCp = -2; // end of string
+ } else if(rightCp == 0xfffe) {
+ rightCp = -1; // U+FFFE: merge separator
+ } else {
+ rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
}
- } else {
- latinOneUse_ = false;
+ if(leftCp < rightCp) { return Collation.LESS; }
+ if(leftCp > rightCp) { return Collation.GREATER; }
}
-
+ return Collation.EQUAL;
}
/**
- * Initializes the RuleBasedCollator
+ * Compares two CharSequences.
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- private final void init() {
- for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) {
- // Find the smallest unsafe char.
- if (isUnsafe(m_minUnsafe_)) {
- break;
- }
+ @Override
+ protected int doCompare(CharSequence left, CharSequence right) {
+ if(left == right) {
+ return Collation.EQUAL;
}
- for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) {
- // Find the smallest contraction-ending char.
- if (isContractionEnd(m_minContractionEnd_)) {
+ // Identical-prefix test.
+ int equalPrefixLength = 0;
+ for(;;) {
+ if(equalPrefixLength == left.length()) {
+ if(equalPrefixLength == right.length()) { return Collation.EQUAL; }
+ break;
+ } else if(equalPrefixLength == right.length() ||
+ left.charAt(equalPrefixLength) != right.charAt(equalPrefixLength)) {
break;
}
- }
- latinOneFailed_ = true;
- setStrength(m_defaultStrength_);
- setDecomposition(m_defaultDecomposition_);
- m_variableTopValue_ = m_defaultVariableTopValue_;
- m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
- m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
- m_isCaseLevel_ = m_defaultIsCaseLevel_;
- m_caseFirst_ = m_defaultCaseFirst_;
- m_isHiragana4_ = m_defaultIsHiragana4_;
- m_isNumericCollation_ = m_defaultIsNumericCollation_;
- latinOneFailed_ = false;
- if (m_defaultReorderCodes_ != null) {
- m_reorderCodes_ = m_defaultReorderCodes_.clone();
- } else {
- m_reorderCodes_ = null;
- }
- updateInternalState();
- }
-
- // Consts for Latin-1 special processing
- private static final int ENDOFLATINONERANGE_ = 0xFF;
- private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50);
- private static final int BAIL_OUT_CE_ = 0xFF000000;
-
- /**
- * Generate latin-1 tables
- */
-
- private static class shiftValues {
- int primShift = 24;
- int secShift = 24;
- int terShift = 24;
- }
-
- private final void addLatinOneEntry(char ch, int CE, shiftValues sh) {
- int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
- boolean continuation = isContinuation(CE);
- boolean reverseSecondary = false;
- if (!continuation) {
- tertiary = ((CE & m_mask3_));
- tertiary ^= m_caseSwitch_;
- reverseSecondary = true;
+ ++equalPrefixLength;
+ }
+
+ CollationSettings roSettings = settings.readOnly();
+ boolean numeric = roSettings.isNumeric();
+ if(equalPrefixLength > 0) {
+ if((equalPrefixLength != left.length() &&
+ data.isUnsafeBackward(left.charAt(equalPrefixLength), numeric)) ||
+ (equalPrefixLength != right.length() &&
+ data.isUnsafeBackward(right.charAt(equalPrefixLength), numeric))) {
+ // Identical prefix: Back up to the start of a contraction or reordering sequence.
+ while(--equalPrefixLength > 0 &&
+ data.isUnsafeBackward(left.charAt(equalPrefixLength), numeric)) {}
+ }
+ // Notes:
+ // - A longer string can compare equal to a prefix of it if only ignorables follow.
+ // - With a backward level, a longer string can compare less-than a prefix of it.
+
+ // Pass the actual start of each string into the CollationIterators,
+ // plus the equalPrefixLength position,
+ // so that prefix matches back into the equal prefix work.
+ }
+
+ int result;
+ int fastLatinOptions = roSettings.fastLatinOptions;
+ if(fastLatinOptions >= 0 &&
+ (equalPrefixLength == left.length() ||
+ left.charAt(equalPrefixLength) <= CollationFastLatin.LATIN_MAX) &&
+ (equalPrefixLength == right.length() ||
+ right.charAt(equalPrefixLength) <= CollationFastLatin.LATIN_MAX)) {
+ result = CollationFastLatin.compareUTF16(data.fastLatinTable,
+ roSettings.fastLatinPrimaries,
+ fastLatinOptions,
+ left, right, equalPrefixLength);
} else {
- tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_));
- tertiary &= CE_REMOVE_CASE_;
- reverseSecondary = false;
- }
-
- secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
- primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
- primary1 = (CE >>> 8);
-
- if (primary1 != 0) {
- if (m_leadBytePermutationTable_ != null && !continuation) {
- primary1 = m_leadBytePermutationTable_[primary1];
- }
- latinOneCEs_[ch] |= (primary1 << sh.primShift);
- sh.primShift -= 8;
- }
- if (primary2 != 0) {
- if (sh.primShift < 0) {
- latinOneCEs_[ch] = BAIL_OUT_CE_;
- latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
- latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
- return;
- }
- latinOneCEs_[ch] |= (primary2 << sh.primShift);
- sh.primShift -= 8;
- }
- if (secondary != 0) {
- if (reverseSecondary && m_isFrenchCollation_) { // reverse secondary
- latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary
- latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24);
- } else { // normal case
- latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift);
+ result = CollationFastLatin.BAIL_OUT_RESULT;
+ }
+
+ if(result == CollationFastLatin.BAIL_OUT_RESULT) {
+ CollationBuffer buffer = null;
+ try {
+ buffer = getCollationBuffer();
+ if(roSettings.dontCheckFCD()) {
+ buffer.leftUTF16CollIter.setText(numeric, left, equalPrefixLength);
+ buffer.rightUTF16CollIter.setText(numeric, right, equalPrefixLength);
+ result = CollationCompare.compareUpToQuaternary(
+ buffer.leftUTF16CollIter, buffer.rightUTF16CollIter, roSettings);
+ } else {
+ buffer.leftFCDUTF16Iter.setText(numeric, left, equalPrefixLength);
+ buffer.rightFCDUTF16Iter.setText(numeric, right, equalPrefixLength);
+ result = CollationCompare.compareUpToQuaternary(
+ buffer.leftFCDUTF16Iter, buffer.rightFCDUTF16Iter, roSettings);
+ }
+ } finally {
+ releaseCollationBuffer(buffer);
}
- sh.secShift -= 8;
- }
- if (tertiary != 0) {
- latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift);
- sh.terShift -= 8;
- }
- }
-
- private final void resizeLatinOneTable(int newSize) {
- int newTable[] = new int[3 * newSize];
- int sizeToCopy = ((newSize < latinOneTableLen_) ? newSize : latinOneTableLen_);
- // uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
- System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
- System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
- System.arraycopy(latinOneCEs_, 2 * latinOneTableLen_, newTable, 2 * newSize, sizeToCopy);
- latinOneTableLen_ = newSize;
- latinOneCEs_ = newTable;
- }
-
- private final boolean setUpLatinOne() {
- if (latinOneCEs_ == null || m_reallocLatinOneCEs_) {
- latinOneCEs_ = new int[3 * LATINONETABLELEN_];
- latinOneTableLen_ = LATINONETABLELEN_;
- m_reallocLatinOneCEs_ = false;
- } else {
- Arrays.fill(latinOneCEs_, 0);
}
- if (m_ContInfo_ == null) {
- m_ContInfo_ = new ContractionInfo();
+ if(result != Collation.EQUAL || roSettings.getStrength() < Collator.IDENTICAL) {
+ return result;
}
- char ch = 0;
- // StringBuffer sCh = new StringBuffer();
- // CollationElementIterator it = getCollationElementIterator(sCh.toString());
- CollationElementIterator it = getCollationElementIterator("");
- shiftValues s = new shiftValues();
- int CE = 0;
- char contractionOffset = ENDOFLATINONERANGE_ + 1;
-
- for (ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
- s.primShift = 24;
- s.secShift = 24;
- s.terShift = 24;
- if (ch < 0x100) {
- CE = m_trie_.getLatin1LinearValue(ch);
- } else {
- CE = m_trie_.getLeadValue(ch);
- if (CE == CollationElementIterator.CE_NOT_FOUND_) {
- CE = UCA_.m_trie_.getLeadValue(ch);
- }
- }
- if (!isSpecial(CE)) {
- addLatinOneEntry(ch, CE, s);
+ CollationBuffer buffer = null;
+ try {
+ buffer = getCollationBuffer();
+ // Compare identical level.
+ Normalizer2Impl nfcImpl = data.nfcImpl;
+ if(roSettings.dontCheckFCD()) {
+ buffer.leftUTF16NFDIter.setText(left, equalPrefixLength);
+ buffer.rightUTF16NFDIter.setText(right, equalPrefixLength);
+ return compareNFDIter(nfcImpl, buffer.leftUTF16NFDIter, buffer.rightUTF16NFDIter);
} else {
- switch (RuleBasedCollator.getTag(CE)) {
- case CollationElementIterator.CE_EXPANSION_TAG_:
- case CollationElementIterator.CE_DIGIT_TAG_:
- // sCh.delete(0, sCh.length());
- // sCh.append(ch);
- // it.setText(sCh.toString());
- it.setText(UCharacter.toString(ch));
- while ((CE = it.next()) != CollationElementIterator.NULLORDER) {
- if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
- latinOneCEs_[ch] = BAIL_OUT_CE_;
- latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
- latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
- break;
- }
- addLatinOneEntry(ch, CE, s);
- }
- break;
- case CollationElementIterator.CE_CONTRACTION_TAG_:
- // here is the trick
- // F2 is contraction. We do something very similar to contractions
- // but have two indices, one in the real contraction table and the
- // other to where we stuffed things. This hopes that we don't have
- // many contractions (this should work for latin-1 tables).
- {
- if ((CE & 0x00FFF000) != 0) {
- latinOneFailed_ = true;
- return false;
- }
-
- int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; // getContractionOffset(CE)]
-
- CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
-
- latinOneCEs_[ch] = CE;
- latinOneCEs_[latinOneTableLen_ + ch] = CE;
- latinOneCEs_[2 * latinOneTableLen_ + ch] = CE;
-
- // We're going to jump into contraction table, pick the elements
- // and use them
- do {
- // CE = *(contractionCEs + (UCharOffset - contractionIndex));
- CE = m_contractionCE_[UCharOffset];
- if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
- int i; /* general counter */
- // uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to
- // expansion table */
- int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; // it.getExpansionOffset(this,
- // CE);
- int size = CE & 0xF; // getExpansionCount(CE);
- // CE = *CEOffset++;
- if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
- for (i = 0; i < size; i++) {
- if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
- latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
- latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
- latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
- break;
- }
- addLatinOneEntry(contractionOffset, m_expansion_[offset + i], s);
- }
- } else { /* else, we do */
- while (m_expansion_[offset] != 0) {
- if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
- latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
- latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
- latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
- break;
- }
- addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
- }
- }
- contractionOffset++;
- } else if (!isSpecial(CE)) {
- addLatinOneEntry(contractionOffset++, CE, s);
- } else {
- latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
- latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
- latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
- contractionOffset++;
- }
- UCharOffset++;
- s.primShift = 24;
- s.secShift = 24;
- s.terShift = 24;
- if (contractionOffset == latinOneTableLen_) { // we need to reallocate
- resizeLatinOneTable(2 * latinOneTableLen_);
- }
- } while (m_contractionIndex_[UCharOffset] != 0xFFFF);
- }
- break;
- case CollationElementIterator.CE_SPEC_PROC_TAG_: {
- // 0xB7 is a precontext character defined in UCA5.1, a special
- // handle is implemeted in order to save LatinOne table for
- // most locales.
- if (ch == 0xb7) {
- addLatinOneEntry(ch, CE, s);
- } else {
- latinOneFailed_ = true;
- return false;
- }
- }
- break;
- default:
- latinOneFailed_ = true;
- return false;
- }
+ buffer.leftFCDUTF16NFDIter.setText(nfcImpl, left, equalPrefixLength);
+ buffer.rightFCDUTF16NFDIter.setText(nfcImpl, right, equalPrefixLength);
+ return compareNFDIter(nfcImpl, buffer.leftFCDUTF16NFDIter, buffer.rightFCDUTF16NFDIter);
}
+ } finally {
+ releaseCollationBuffer(buffer);
}
- // compact table
- if (contractionOffset < latinOneTableLen_) {
- resizeLatinOneTable(contractionOffset);
- }
- return true;
- }
-
- private static class ContractionInfo {
- int index;
}
- ContractionInfo m_ContInfo_;
-
- private int getLatinOneContraction(int strength, int CE, String s) {
- // int strength, int CE, String s, Integer ind) {
- int len = s.length();
- // const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
- int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
- int offset = 1;
- int latinOneOffset = (CE & 0x00FFF000) >>> 12;
- char schar = 0, tchar = 0;
+ // package private constructors ------------------------------------------
- for (;;) {
- /*
- * if(len == -1) { if(s[*index] == 0) { // end of string
- * return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } else { schar = s[*index]; }
- * } else {
- */
- if (m_ContInfo_.index == len) {
- return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
- } else {
- schar = s.charAt(m_ContInfo_.index);
- }
- // }
+ RuleBasedCollator(CollationTailoring t, ULocale vl) {
+ data = t.data;
+ settings = t.settings.clone();
+ tailoring = t;
+ validLocale = vl;
+ actualLocaleIsSameAsValid = false;
+ }
- while (schar > (tchar = m_contractionIndex_[UCharOffset + offset]/** (UCharOffset+offset) */
- )) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
- offset++;
- }
+ private void adoptTailoring(CollationTailoring t) {
+ assert(settings == null && data == null && tailoring == null);
+ data = t.data;
+ settings = t.settings.clone();
+ tailoring = t;
+ validLocale = t.actualLocale;
+ actualLocaleIsSameAsValid = false;
+ }
- if (schar == tchar) {
- m_ContInfo_.index++;
- return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset + offset]);
- } else {
- if (schar > ENDOFLATINONERANGE_ /* & 0xFF00 */) {
- return BAIL_OUT_CE_;
- }
- // skip completely ignorables
- int isZeroCE = m_trie_.getLeadValue(schar); // UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
- if (isZeroCE == 0) { // we have to ignore completely ignorables
- m_ContInfo_.index++;
- continue;
- }
+ // package private methods -----------------------------------------------
- return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
- }
- }
+ /**
+ * Tests whether a character is "unsafe" for use as a collation starting point.
+ *
+ * @param c code point or code unit
+ * @return true if c is unsafe
+ * @see CollationElementIterator#setOffset(int)
+ */
+ final boolean isUnsafe(int c) {
+ return data.isUnsafeBackward(c, settings.readOnly().isNumeric());
}
/**
- * This is a fast strcoll, geared towards text in Latin-1. It supports contractions of size two, French secondaries
- * and case switching. You can use it with strengths primary to tertiary. It does not support shifted and case
- * level. It relies on the table build by setupLatin1Table. If it doesn't understand something, it will go to the
- * regular strcoll.
- * @param buffer collation buffer temporary state
+ * Frozen state of the collator.
*/
- private final int compareUseLatin1(String source, String target, int startOffset, CollationBuffer buffer) {
- int sLen = source.length();
- int tLen = target.length();
-
- int strength = getStrength();
-
- int sIndex = startOffset, tIndex = startOffset;
- char sChar = 0, tChar = 0;
- int sOrder = 0, tOrder = 0;
-
- boolean endOfSource = false;
-
- // uint32_t *elements = coll->latinOneCEs;
-
- boolean haveContractions = false; // if we have contractions in our string
- // we cannot do French secondary
-
- int offset = latinOneTableLen_;
-
- // Do the primary level
- primLoop:
- for (;;) {
- while (sOrder == 0) { // this loop skips primary ignorables
- // sOrder=getNextlatinOneCE(source);
- if (sIndex == sLen) {
- endOfSource = true;
- break;
- }
- sChar = source.charAt(sIndex++); // [sIndex++];
- // }
- if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
- // fprintf(stderr, "R");
- return compareRegular(source, target, startOffset, buffer);
- }
- sOrder = latinOneCEs_[sChar];
- if (isSpecial(sOrder)) { // if we got a special
- // specials can basically be either contractions or bail-out signs. If we get anything
- // else, we'll bail out anywasy
- if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
- m_ContInfo_.index = sIndex;
- sOrder = getLatinOneContraction(0, sOrder, source);
- sIndex = m_ContInfo_.index;
- haveContractions = true; // if there are contractions, we cannot do French secondary
- // However, if there are contractions in the table, but we always use just one char,
- // we might be able to do French. This should be checked out.
- }
- if (isSpecial(sOrder) /* == UCOL_BAIL_OUT_CE */) {
- // fprintf(stderr, "S");
- return compareRegular(source, target, startOffset, buffer);
- }
- }
- }
-
- while (tOrder == 0) { // this loop skips primary ignorables
- // tOrder=getNextlatinOneCE(target);
- if (tIndex == tLen) {
- if (endOfSource) {
- break primLoop;
- } else {
- return 1;
- }
- }
- tChar = target.charAt(tIndex++); // [tIndex++];
- if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
- // fprintf(stderr, "R");
- return compareRegular(source, target, startOffset, buffer);
- }
- tOrder = latinOneCEs_[tChar];
- if (isSpecial(tOrder)) {
- // Handling specials, see the comments for source
- if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
- m_ContInfo_.index = tIndex;
- tOrder = getLatinOneContraction(0, tOrder, target);
- tIndex = m_ContInfo_.index;
- haveContractions = true;
- }
- if (isSpecial(tOrder)/* == UCOL_BAIL_OUT_CE */) {
- // fprintf(stderr, "S");
- return compareRegular(source, target, startOffset, buffer);
- }
- }
- }
- if (endOfSource) { // source is finished, but target is not, say the result.
- return -1;
- }
-
- if (sOrder == tOrder) { // if we have same CEs, we continue the loop
- sOrder = 0;
- tOrder = 0;
- continue;
- } else {
- // compare current top bytes
- if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
- // top bytes differ, return difference
- if (sOrder >>> 8 < tOrder >>> 8) {
- return -1;
- } else {
- return 1;
- }
- // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
- // since we must return enum value
- }
-
- // top bytes match, continue with following bytes
- sOrder <<= 8;
- tOrder <<= 8;
- }
- }
-
- // after primary loop, we definitely know the sizes of strings,
- // so we set it and use simpler loop for secondaries and tertiaries
- // sLen = sIndex; tLen = tIndex;
- if (strength >= SECONDARY) {
- // adjust the table beggining
- // latinOneCEs_ += coll->latinOneTableLen;
- endOfSource = false;
-
- if (!m_isFrenchCollation_) { // non French
- // This loop is a simplified copy of primary loop
- // at this point we know that whole strings are latin-1, so we don't
- // check for that. We also know that we only have contractions as
- // specials.
- // sIndex = 0; tIndex = 0;
- sIndex = startOffset;
- tIndex = startOffset;
- secLoop: for (;;) {
- while (sOrder == 0) {
- if (sIndex == sLen) {
- endOfSource = true;
- break;
- }
- sChar = source.charAt(sIndex++); // [sIndex++];
- sOrder = latinOneCEs_[offset + sChar];
- if (isSpecial(sOrder)) {
- m_ContInfo_.index = sIndex;
- sOrder = getLatinOneContraction(1, sOrder, source);
- sIndex = m_ContInfo_.index;
- }
- }
-
- while (tOrder == 0) {
- if (tIndex == tLen) {
- if (endOfSource) {
- break secLoop;
- } else {
- return 1;
- }
- }
- tChar = target.charAt(tIndex++); // [tIndex++];
- tOrder = latinOneCEs_[offset + tChar];
- if (isSpecial(tOrder)) {
- m_ContInfo_.index = tIndex;
- tOrder = getLatinOneContraction(1, tOrder, target);
- tIndex = m_ContInfo_.index;
- }
- }
- if (endOfSource) {
- return -1;
- }
+ private Lock frozenLock;
- if (sOrder == tOrder) {
- sOrder = 0;
- tOrder = 0;
- continue;
- } else {
- // see primary loop for comments on this
- if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
- if (sOrder >>> 8 < tOrder >>> 8) {
- return -1;
- } else {
- return 1;
- }
- }
- sOrder <<= 8;
- tOrder <<= 8;
- }
- }
- } else { // French
- if (haveContractions) { // if we have contractions, we have to bail out
- // since we don't really know how to handle them here
- return compareRegular(source, target, startOffset, buffer);
- }
- // For French, we go backwards
- sIndex = sLen;
- tIndex = tLen;
- secFLoop: for (;;) {
- while (sOrder == 0) {
- if (sIndex == startOffset) {
- endOfSource = true;
- break;
- }
- sChar = source.charAt(--sIndex); // [--sIndex];
- sOrder = latinOneCEs_[offset + sChar];
- // don't even look for contractions
- }
+ private static final class CollationBuffer {
+ private CollationBuffer(CollationData data) {
+ // Eagerly create the reusable left/right iterator pairs; the buffer
+ // itself is cached and handed out by getCollationBuffer(), so these
+ // allocations happen once per collator rather than per comparison.
+ leftUTF16CollIter = new UTF16CollationIterator(data);
+ rightUTF16CollIter = new UTF16CollationIterator(data);
+ leftFCDUTF16Iter = new FCDUTF16CollationIterator(data);
+ rightFCDUTF16Iter = new FCDUTF16CollationIterator(data);
+ leftUTF16NFDIter = new UTF16NFDIterator();
+ rightUTF16NFDIter = new UTF16NFDIterator();
+ leftFCDUTF16NFDIter = new FCDUTF16NFDIterator();
+ rightFCDUTF16NFDIter = new FCDUTF16NFDIterator();
+ }
- while (tOrder == 0) {
- if (tIndex == startOffset) {
- if (endOfSource) {
- break secFLoop;
- } else {
- return 1;
- }
- }
- tChar = target.charAt(--tIndex); // [--tIndex];
- tOrder = latinOneCEs_[offset + tChar];
- // don't even look for contractions
- }
- if (endOfSource) {
- return -1;
- }
+ UTF16CollationIterator leftUTF16CollIter;
+ UTF16CollationIterator rightUTF16CollIter;
+ FCDUTF16CollationIterator leftFCDUTF16Iter;
+ FCDUTF16CollationIterator rightFCDUTF16Iter;
- if (sOrder == tOrder) {
- sOrder = 0;
- tOrder = 0;
- continue;
- } else {
- // see the primary loop for comments
- if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
- if (sOrder >>> 8 < tOrder >>> 8) {
- return -1;
- } else {
- return 1;
- }
- }
- sOrder <<= 8;
- tOrder <<= 8;
- }
- }
- }
- }
+ UTF16NFDIterator leftUTF16NFDIter;
+ UTF16NFDIterator rightUTF16NFDIter;
+ FCDUTF16NFDIterator leftFCDUTF16NFDIter;
+ FCDUTF16NFDIterator rightFCDUTF16NFDIter;
- if (strength >= TERTIARY) {
- // tertiary loop is the same as secondary (except no French)
- offset += latinOneTableLen_;
- // sIndex = 0; tIndex = 0;
- sIndex = startOffset;
- tIndex = startOffset;
- endOfSource = false;
- for (;;) {
- while (sOrder == 0) {
- if (sIndex == sLen) {
- endOfSource = true;
- break;
- }
- sChar = source.charAt(sIndex++); // [sIndex++];
- sOrder = latinOneCEs_[offset + sChar];
- if (isSpecial(sOrder)) {
- m_ContInfo_.index = sIndex;
- sOrder = getLatinOneContraction(2, sOrder, source);
- sIndex = m_ContInfo_.index;
- }
- }
- while (tOrder == 0) {
- if (tIndex == tLen) {
- if (endOfSource) {
- return 0; // if both strings are at the end, they are equal
- } else {
- return 1;
- }
- }
- tChar = target.charAt(tIndex++); // [tIndex++];
- tOrder = latinOneCEs_[offset + tChar];
- if (isSpecial(tOrder)) {
- m_ContInfo_.index = tIndex;
- tOrder = getLatinOneContraction(2, tOrder, target);
- tIndex = m_ContInfo_.index;
- }
- }
- if (endOfSource) {
- return -1;
- }
- if (sOrder == tOrder) {
- sOrder = 0;
- tOrder = 0;
- continue;
- } else {
- if (((sOrder ^ tOrder) & 0xff000000) != 0) {
- if (sOrder >>> 8 < tOrder >>> 8) {
- return -1;
- } else {
- return 1;
- }
- }
- sOrder <<= 8;
- tOrder <<= 8;
- }
- }
- }
- return 0;
+ RawCollationKey rawCollationKey;
}
/**
* @return the version object associated with this collator
* @stable ICU 2.8
*/
+ @Override
public VersionInfo getVersion() {
- /* RunTime version */
+ VersionInfo version = tailoring.version;
int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
- /* Builder version */
- int bdVersion = m_version_.getMajor();
-
- /*
- * Charset Version. Need to get the version from cnv files makeconv should populate cnv files with version and
- * an api has to be provided in ucnv.h to obtain this version
- */
- int csVersion = 0;
-
- /* combine the version info */
- int cmbVersion = ((rtVersion << 11) | (bdVersion << 6) | (csVersion)) & 0xFFFF;
-
- /* Tailoring rules */
- return VersionInfo.getInstance(cmbVersion >> 8, cmbVersion & 0xFF, m_version_.getMinor(),
- UCA_.m_UCA_version_.getMajor());
-
- // versionInfo[0] = (uint8_t)(cmbVersion>>8);
- // versionInfo[1] = (uint8_t)cmbVersion;
- // versionInfo[2] = coll->image->version[1];
- // versionInfo[3] = coll->UCA->image->UCAVersion[0];
+ // Fold the collation runtime version into the major field of the
+ // tailoring's version, so the reported version changes when either the
+ // tailoring data or the collation runtime implementation changes.
+ return VersionInfo.getInstance(
+ version.getMajor() + (rtVersion << 4) + (rtVersion >> 4),
+ version.getMinor(), version.getMilli(), version.getMicro());
}
/**
* @return the version of the Unicode Collation Algorithm (UCA) data
* on which this collator is based
* @stable ICU 2.8
*/
+ @Override
public VersionInfo getUCAVersion() {
- return UCA_.m_UCA_version_;
+ VersionInfo v = getVersion();
+ // Note: This is tied to how the current implementation encodes the UCA version
+ // in the overall getVersion().
+ // Alternatively, we could load the root collator and get at lower-level data from there.
+ // Either way, it will reflect the input collator's UCA version only
+ // if it is a known implementation.
+ // (C++ comment) It would be cleaner to make this a virtual Collator method.
+ // (In Java, it is virtual.)
+ return VersionInfo.getInstance(v.getMinor() >> 3, v.getMinor() & 7, v.getMilli() >> 6, 0);
}
- private transient boolean m_reallocLatinOneCEs_;
-
private CollationBuffer collationBuffer;
private final CollationBuffer getCollationBuffer() {
if (isFrozen()) {
frozenLock.lock();
- }
- if (collationBuffer == null) {
- collationBuffer = new CollationBuffer();
- } else {
- collationBuffer.resetBuffers();
+ } else if (collationBuffer == null) {
+ collationBuffer = new CollationBuffer(data);
}
return collationBuffer;
}
frozenLock.unlock();
}
}
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public ULocale getLocale(ULocale.Type type) {
+ if (type == ULocale.ACTUAL_LOCALE) {
+ // If setLocale() found the actual locale to differ from the tailoring's
+ // (and thus equal the valid locale), report the valid locale; otherwise
+ // fall back to the tailoring's own actual locale.
+ return actualLocaleIsSameAsValid ? validLocale : tailoring.actualLocale;
+ } else if(type == ULocale.VALID_LOCALE) {
+ return validLocale;
+ } else {
+ throw new IllegalArgumentException("unknown ULocale.Type " + type);
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ void setLocale(ULocale valid, ULocale actual) {
+ // This method is called
+ // by other protected functions that check and make sure that
+ // valid and actual are not null before passing them in
+ assert (valid == null) == (actual == null);
+ // Another check we could do is that the actual locale is at
+ // the same level or less specific than the valid locale.
+ // TODO: Starting with Java 7, use Objects.equals(a, b).
+ if(Utility.objectEquals(actual, tailoring.actualLocale)) {
+ // actual matches the tailoring's locale: keep reading it from the tailoring.
+ actualLocaleIsSameAsValid = false;
+ } else {
+ // Otherwise the actual locale must have been adjusted to the valid locale.
+ assert(Utility.objectEquals(actual, valid));
+ actualLocaleIsSameAsValid = true;
+ }
+ // Do not modify tailoring.actualLocale:
+ // We cannot be sure that that would be thread-safe.
+ validLocale = valid;
+ }
+
+ CollationData data;
+ SharedObject.Reference<CollationSettings> settings; // reference-counted
+ CollationTailoring tailoring; // C++: reference-counted
+ private ULocale validLocale;
+ // Note: No need in Java to track which attributes have been set explicitly.
+ // int or EnumSet explicitlySetAttributes;
+
+ private boolean actualLocaleIsSameAsValid;
}
/*
*******************************************************************************
- * Copyright (C) 1996-2010, International Business Machines Corporation and *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
*/
public abstract class SearchIterator
{
-
+ /**
+ * The BreakIterator to define the boundaries of a logical match.
+ * This value can be a null.
+ * See class documentation for more information.
+ * @see #setBreakIterator(BreakIterator)
+ * @see #getBreakIterator
+ * @see BreakIterator
+ * @stable ICU 2.0
+ */
+ protected BreakIterator breakIterator;
+
+ /**
+ * Target text for searching.
+ * @see #setTarget(CharacterIterator)
+ * @see #getTarget
+ * @stable ICU 2.0
+ */
+ protected CharacterIterator targetText;
+ /**
+ * Length of the most current match in target text.
+ * Value 0 is the default value.
+ * @see #setMatchLength
+ * @see #getMatchLength
+ * @stable ICU 2.0
+ */
+ protected int matchLength;
+
+ /**
+ * Java port of ICU4C struct USearch (usrchimp.h)
+ *
+ * Note:
+ *
+ * ICU4J already exposed some protected members such as
+ * targetText, breakIterator and matchLength as a part of stable
+ * APIs. In ICU4C, they are exposed through USearch struct,
+ * although USearch struct itself is internal API.
+ *
+ * This class was created for making ICU4J code in parallel to
+ * ICU4C implementation. ICU4J implementation accesses member
+ * fields like C struct (e.g. search_.isOverlap_) mostly, except
+ * fields already exposed as protected member (e.g. search_.text()).
+ *
+ */
+ final class Search {
+
+ CharacterIterator text() {
+ return SearchIterator.this.targetText;
+ }
+
+ void setTarget(CharacterIterator text) {
+ SearchIterator.this.targetText = text;
+ }
+
+ /** Flag to indicate if overlapping search is to be done.
+ E.g. looking for "aa" in "aaa" will yield matches at offset 0 and 1. */
+ boolean isOverlap_;
+
+ /** Flag to indicate if canonical matching is to be done. */
+ boolean isCanonicalMatch_;
+
+ /** Option controlling how collation elements are compared; see ElementComparisonType. */
+ ElementComparisonType elementComparisonType_;
+
+ // NOTE(review): presumably used when the caller has not supplied a
+ // BreakIterator — confirm against the subclasses that create it.
+ BreakIterator internalBreakIter_;
+
+ BreakIterator breakIter() {
+ return SearchIterator.this.breakIterator;
+ }
+
+ void setBreakIter(BreakIterator breakIter) {
+ SearchIterator.this.breakIterator = breakIter;
+ }
+
+ /** Index of the most recent match in the target text, or DONE if there is none. */
+ int matchedIndex_;
+
+ int matchedLength() {
+ return SearchIterator.this.matchLength;
+ }
+
+ void setMatchedLength(int matchedLength) {
+ SearchIterator.this.matchLength = matchedLength;
+ }
+
+ /** Flag indicates if we are doing a forwards search */
+ boolean isForwardSearching_;
+
+ /** Flag indicates if we are at the start of a string search.
+ This indicates that we are in forward search and at the start of m_text. */
+ boolean reset_;
+
+ // Convenient methods for accessing begin/end index of the
+ // target text. These are ICU4J only and are not data fields.
+ int beginIndex() {
+ if (targetText == null) {
+ return 0;
+ }
+ return targetText.getBeginIndex();
+ }
+
+ int endIndex() {
+ if (targetText == null) {
+ return 0;
+ }
+ return targetText.getEndIndex();
+ }
+ }
+
+ Search search_ = new Search();
+
// public data members -------------------------------------------------
/**
* @stable ICU 2.8
*/
public void setIndex(int position) {
- if (position < targetText.getBeginIndex()
- || position > targetText.getEndIndex()) {
+ if (position < search_.beginIndex()
+ || position > search_.endIndex()) {
throw new IndexOutOfBoundsException(
"setIndex(int) expected position to be between " +
- targetText.getBeginIndex() + " and " + targetText.getEndIndex());
+ search_.beginIndex() + " and " + search_.endIndex());
}
- m_setOffset_ = position;
- m_reset_ = false;
- matchLength = 0;
+ search_.reset_ = false;
+ search_.setMatchedLength(0);
+ search_.matchedIndex_ = DONE;
}
/**
*/
public void setOverlapping(boolean allowOverlap)
{
- m_isOverlap_ = allowOverlap;
+ search_.isOverlap_ = allowOverlap;
}
/**
*/
public void setBreakIterator(BreakIterator breakiter)
{
- breakIterator = breakiter;
- if (breakIterator != null) {
- breakIterator.setText(targetText);
+ search_.setBreakIter(breakiter);
+ if (search_.breakIter() != null) {
+ // Create a clone of CharacterIterator, so it won't
+ // affect the position currently held by search_.text()
+ if (search_.text() != null) {
+ search_.breakIter().setText((CharacterIterator)search_.text().clone());
+ }
}
}
-
+
/**
* Set the target text to be searched. Text iteration will then begin at
* the start of the text string. This method is useful if you want to
if (text == null || text.getEndIndex() == text.getIndex()) {
throw new IllegalArgumentException("Illegal null or empty text");
}
-
- targetText = text;
- targetText.setIndex(targetText.getBeginIndex());
- matchLength = 0;
- m_reset_ = true;
- m_isForwardSearching_ = true;
- if (breakIterator != null) {
- breakIterator.setText(targetText);
+
+ text.setIndex(text.getBeginIndex());
+ search_.setTarget(text);
+ search_.matchedIndex_ = DONE;
+ search_.setMatchedLength(0);
+ search_.reset_ = true;
+ search_.isForwardSearching_ = true;
+ if (search_.breakIter() != null) {
+ // Create a clone of CharacterIterator, so it won't
+ // affect the position currently held by search_.text()
+ search_.breakIter().setText((CharacterIterator)text.clone());
+ }
+ if (search_.internalBreakIter_ != null) {
+ search_.internalBreakIter_.setText((CharacterIterator)text.clone());
}
}
+ //TODO: We should add APIs below to match ICU4C APIs
+ // setCanonicalMatch
+ // setElementComparison
+
// public getters ----------------------------------------------------
/**
*/
public int getMatchStart()
{
- return m_lastMatchStart_;
+ return search_.matchedIndex_;
}
/**
*/
public int getMatchLength()
{
- return matchLength;
+ return search_.matchedLength();
}
/**
*/
public BreakIterator getBreakIterator()
{
- return breakIterator;
+ return search_.breakIter();
}
/**
*/
public CharacterIterator getTarget()
{
- return targetText;
+ return search_.text();
}
/**
*/
public String getMatchedText()
{
- if (matchLength > 0) {
- int limit = m_lastMatchStart_ + matchLength;
- StringBuilder result = new StringBuilder(matchLength);
- result.append(targetText.current());
- targetText.next();
- while (targetText.getIndex() < limit) {
- result.append(targetText.current());
- targetText.next();
+ if (search_.matchedLength() > 0) {
+ int limit = search_.matchedIndex_ + search_.matchedLength();
+ StringBuilder result = new StringBuilder(search_.matchedLength());
+ CharacterIterator it = search_.text();
+ it.setIndex(search_.matchedIndex_);
+ while (it.getIndex() < limit) {
+ result.append(it.current());
+ it.next();
}
- targetText.setIndex(m_lastMatchStart_);
+ it.setIndex(search_.matchedIndex_);
return result.toString();
}
return null;
*/
public int next()
{
- int start = targetText.getIndex();
- if (m_setOffset_ != DONE) {
- start = m_setOffset_;
- m_setOffset_ = DONE;
- }
- if (m_isForwardSearching_) {
- if (!m_reset_ &&
- start + matchLength >= targetText.getEndIndex()) {
- // not enough characters to match
- matchLength = 0;
- targetText.setIndex(targetText.getEndIndex());
- m_lastMatchStart_ = DONE;
- return DONE;
+ int index = getIndex(); // offset = getOffset() in ICU4C
+ int matchindex = search_.matchedIndex_;
+ int matchlength = search_.matchedLength();
+ search_.reset_ = false;
+ if (search_.isForwardSearching_) {
+ int endIdx = search_.endIndex();
+ if (index == endIdx || matchindex == endIdx ||
+ (matchindex != DONE &&
+ matchindex + matchlength >= endIdx)) {
+ setMatchNotFound();
+ return DONE;
}
- m_reset_ = false;
- }
- else {
- // switching direction.
- // if matchedIndex == USEARCH_DONE, it means that either a
- // setIndex has been called or that previous ran off the text
+ } else {
+ // switching direction.
+ // if matchedIndex == DONE, it means that either a
+ // setIndex (setOffset in C) has been called or that previous ran off the text
// string. the iterator would have been set to offset 0 if a
// match is not found.
- m_isForwardSearching_ = true;
- if (start != DONE) {
+ search_.isForwardSearching_ = true;
+ if (search_.matchedIndex_ != DONE) {
// there's no need to set the collation element iterator
// the next call to next will set the offset.
- return start;
+ return matchindex;
}
}
-
- if (start == DONE) {
- start = targetText.getBeginIndex();
- }
- if (matchLength > 0) {
- // if match length is 0 we are at the start of the iteration
- if (m_isOverlap_) {
- start ++;
- }
- else {
- start += matchLength;
+
+ if (matchlength > 0) {
+ // if matchlength is 0 we are at the start of the iteration
+ if (search_.isOverlap_) {
+ index++;
+ } else {
+ index += matchlength;
}
}
- m_lastMatchStart_ = handleNext(start);
- return m_lastMatchStart_;
+
+ return handleNext(index);
}
/**
*/
public int previous()
{
- int start = targetText.getIndex();
- if (m_setOffset_ != DONE) {
- start = m_setOffset_;
- m_setOffset_ = DONE;
- }
- if (m_reset_) {
- m_isForwardSearching_ = false;
- m_reset_ = false;
- start = targetText.getEndIndex();
+ int index; // offset in ICU4C
+ if (search_.reset_) {
+ index = search_.endIndex(); // m_search_->textLength in ICU4C
+ search_.isForwardSearching_ = false;
+ search_.reset_ = false;
+ setIndex(index);
+ } else {
+ index = getIndex();
}
-
- if (m_isForwardSearching_ == true) {
+
+ int matchindex = search_.matchedIndex_;
+ if (search_.isForwardSearching_) {
// switching direction.
- // if matchedIndex == USEARCH_DONE, it means that either a
- // setIndex has been called or that next ran off the text
+ // if matchedIndex == DONE, it means that either a
+ // setIndex (setOffset in C) has been called or that next ran off the text
// string. the iterator would have been set to offset textLength if
// a match is not found.
- m_isForwardSearching_ = false;
- if (start != targetText.getEndIndex()) {
- return start;
+ search_.isForwardSearching_ = false;
+ if (matchindex != DONE) {
+ return matchindex;
}
- }
- else {
- if (start == targetText.getBeginIndex()) {
+ } else {
+ int startIdx = search_.beginIndex();
+ if (index == startIdx || matchindex == startIdx) {
// not enough characters to match
- matchLength = 0;
- targetText.setIndex(targetText.getBeginIndex());
- m_lastMatchStart_ = DONE;
+ setMatchNotFound();
return DONE;
}
}
- m_lastMatchStart_ = handlePrevious(start);
- return m_lastMatchStart_;
+ if (matchindex != DONE) {
+ if (search_.isOverlap_) {
+ matchindex += search_.matchedLength() - 2;
+ }
+
+ return handlePrevious(matchindex);
+ }
+
+ return handlePrevious(index);
}
/**
*/
public boolean isOverlapping()
{
- return m_isOverlap_;
+ return search_.isOverlap_;
}
-
+
+ //TODO: We should add APIs below to match ICU4C APIs
+ // isCanonicalMatch
+ // getElementComparison
+
/**
* <p>
* Resets the search iteration. All properties will be reset to their
*/
public void reset()
{
- // reset is setting the attributes that are already in string search
- matchLength = 0;
- setIndex(targetText.getBeginIndex());
- m_isOverlap_ = false;
- m_isForwardSearching_ = true;
- m_reset_ = true;
- m_setOffset_ = DONE;
+ setMatchNotFound();
+ setIndex(search_.beginIndex());
+ search_.isOverlap_ = false;
+ search_.isCanonicalMatch_ = false;
+ search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
+ search_.isForwardSearching_ = true;
+ search_.reset_ = true;
}
/**
*/
public final int first()
{
- m_isForwardSearching_ = true;
- setIndex(targetText.getBeginIndex());
- return next();
+ int startIdx = search_.beginIndex();
+ setIndex(startIdx);
+ return handleNext(startIdx);
}
/**
*/
public final int following(int position)
{
- m_isForwardSearching_ = true;
- // position checked in usearch_setOffset
setIndex(position);
- return next();
+ return handleNext(position);
}
/**
*/
public final int last()
{
- m_isForwardSearching_ = false;
- setIndex(targetText.getEndIndex());
- return previous();
+ int endIdx = search_.endIndex();
+ setIndex(endIdx);
+ return handlePrevious(endIdx);
}
/**
*/
public final int preceding(int position)
{
- m_isForwardSearching_ = false;
- // position checked in usearch_setOffset
setIndex(position);
- return previous();
+ return handlePrevious(position);
}
-
- // protected data member ----------------------------------------------
-
- /**
- * The BreakIterator to define the boundaries of a logical match.
- * This value can be a null.
- * See class documentation for more information.
- * @see #setBreakIterator(BreakIterator)
- * @see #getBreakIterator
- * @see BreakIterator
- * @stable ICU 2.0
- */
- protected BreakIterator breakIterator;
- /**
- * Target text for searching.
- * @see #setTarget(CharacterIterator)
- * @see #getTarget
- * @stable ICU 2.0
- */
- protected CharacterIterator targetText;
- /**
- * Length of the most current match in target text.
- * Value 0 is the default value.
- * @see #setMatchLength
- * @see #getMatchLength
- * @stable ICU 2.0
- */
- protected int matchLength;
-
// protected constructor ----------------------------------------------
/**
"Illegal argument target. " +
" Argument can not be null or of length 0");
}
- targetText = target;
- breakIterator = breaker;
- if (breakIterator != null) {
- breakIterator.setText(target);
+
+ search_.setTarget(target);
+ search_.setBreakIter(breaker);
+ if (search_.breakIter() != null) {
+ search_.breakIter().setText((CharacterIterator)target.clone());
}
- matchLength = 0;
- m_lastMatchStart_ = DONE;
- m_isOverlap_ = false;
- m_isForwardSearching_ = true;
- m_reset_ = true;
- m_setOffset_ = DONE;
+ search_.isOverlap_ = false;
+ search_.isCanonicalMatch_ = false;
+ search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
+ search_.isForwardSearching_ = true;
+ search_.reset_ = true;
+ search_.matchedIndex_ = DONE;
+ search_.setMatchedLength(0);
}
-
+
// protected methods --------------------------------------------------
*/
protected void setMatchLength(int length)
{
- matchLength = length;
+ search_.setMatchedLength(length);
}
/**
* @stable ICU 2.0
*/
protected abstract int handlePrevious(int startAt);
-
- // private data members ------------------------------------------------
-
+
/**
- * Flag indicates if we are doing a forwards search
+ * Resets the match state to "no match": match index DONE and length 0.
+ *
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- private boolean m_isForwardSearching_;
+ //TODO: This protected method is @stable 2.0 in ICU4C
+ protected void setMatchNotFound() {
+ search_.matchedIndex_ = DONE;
+ search_.setMatchedLength(0);
+ }
+
/**
- * Flag to indicate if overlapping search is to be done.
- * E.g. looking for "aa" in "aaa" will yield matches at offset 0 and 1.
+ * Option to control how collation elements are compared.
+ * The default value will be {@link #STANDARD_ELEMENT_COMPARISON}.
+ *
+ * @see #setElementComparisonType(ElementComparisonType)
+ * @see #getElementComparisonType()
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
*/
- private boolean m_isOverlap_;
- /**
- * Flag indicates if we are at the start of a string search.
- * This indicates that we are in forward search and at the start of m_text.
- */
- private boolean m_reset_;
+ public enum ElementComparisonType {
+ /**
+ * Standard collation element comparison at the specified collator strength.
+ *
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ STANDARD_ELEMENT_COMPARISON,
+ /**
+ * <p>Collation element comparison is modified to effectively provide behavior
+ * between the specified strength and strength - 1.</p>
+ *
+ * <p>Collation elements in the pattern that have the base weight for the specified
+ * strength are treated as "wildcards" that match an element with any other
+ * weight at that collation level in the searched text. For example, with a
+ * secondary-strength English collator, a plain 'e' in the pattern will match
+ * a plain e or an e with any diacritic in the searched text, but an e with
+ * diacritic in the pattern will only match an e with the same diacritic in
+ * the searched text.</p>
+ *
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ PATTERN_BASE_WEIGHT_IS_WILDCARD,
+
+ /**
+ * <p>Collation element comparison is modified to effectively provide behavior
+ * between the specified strength and strength - 1.</p>
+ *
+ * <p>Collation elements in either the pattern or the searched text that have the
+ * base weight for the specified strength are treated as "wildcards" that match
+ * an element with any other weight at that collation level. For example, with
+ * a secondary-strength English collator, a plain 'e' in the pattern will match
+ * a plain e or an e with any diacritic in the searched text, but an e with
+ * diacritic in the pattern will only match an e with the same diacritic or a
+ * plain e in the searched text.</p>
+ *
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ ANY_BASE_WEIGHT_IS_WILDCARD
+ }
+
/**
- * Data member to store user defined position in setIndex().
- * If setIndex() is not called, this value will be DONE.
- */
- private int m_setOffset_;
+ * <p>Sets the collation element comparison type.</p>
+ *
+ * <p>The default comparison type is {@link ElementComparisonType#STANDARD_ELEMENT_COMPARISON}.</p>
+ *
+ * @param type the element comparison type to use for subsequent searches
+ * @see ElementComparisonType
+ * @see #getElementComparisonType()
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
+ */
+ public void setElementComparisonType(ElementComparisonType type) {
+ search_.elementComparisonType_ = type;
+ }
+
/**
- * Offset of the beginning of the last match
+ * <p>Returns the collation element comparison type.</p>
+ *
+ * @return the element comparison type currently in effect
+ * @see ElementComparisonType
+ * @see #setElementComparisonType(ElementComparisonType)
+ * @draft ICU 53
+ * @provisional This API might change or be removed in a future release.
*/
- private int m_lastMatchStart_;
+ public ElementComparisonType getElementComparisonType() {
+ return search_.elementComparisonType_;
+ }
}
/*
*******************************************************************************
- * Copyright (C) 1996-2011, International Business Machines Corporation and
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
-
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Locale;
-import com.ibm.icu.impl.CharacterIteratorWrapper;
-import com.ibm.icu.impl.Norm2AllModes;
-import com.ibm.icu.impl.Normalizer2Impl;
-import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;
+// Java porting note:
+// ICU4C implementation contains dead code in many places.
+// While porting the ICU4C linear search implementation, this dead code
+// was not fully ported. The code blocks tagged with "// *** Boyer-Moore ***"
+// are that dead code, still available in ICU4C.
+
+//TODO: ICU4C implementation does not seem to handle a UCharacterIterator pointing
+// to a fragment of text properly. ICU4J uses CharacterIterator to navigate through
+// the input text. We need to carefully review the code ported from ICU4C
+// that assumes the start index is 0.
+
+//TODO: ICU4C implementation initializes pattern.CE and pattern.PCE. It looks
+// like CE is no longer used, except in a few places checking CELength. This looks
+// like a leftover from the already disabled Boyer-Moore search code. This Java
+// implementation preserves the code, but we should clean it up later.
+
+//TODO: We need to update document to remove the term "Boyer-Moore search".
+
/**
* <p>
* <code>StringSearch</code> is the concrete subclass of
// internal notes: all methods do not guarantee the correct status of the
// characteriterator. the caller has to maintain the original index position
// if necessary. methods could change the index position as it deems fit
-public final class StringSearch extends SearchIterator
-{
-
- // public constructors --------------------------------------------------
+public final class StringSearch extends SearchIterator {
+ /**
+ * DONE is returned by previous() and next() after all valid matches have
+ * been returned, and by first() and last() if there are no matches at all.
+ * @see #previous
+ * @see #next
+ * @stable ICU 2.0
+ */
+ public static final int DONE = -1;
+
+ private Pattern pattern_;
+ private RuleBasedCollator collator_;
+
+ // the position within the collation element iterator is used to determine
+ // if we are at the start of the text.
+ private CollationElementIterator textIter_;
+ private CollationPCE textProcessedIter_;
+
+ // utility collation element, used throughout program for temporary
+ // iteration.
+ private CollationElementIterator utilIter_;
+
+ private int strength_;
+ int ceMask_;
+ int variableTop_;
+
+ private boolean toShift_;
+
+ // *** Boyer-Moore ***
+ // private char[] canonicalPrefixAccents_;
+ // private char[] canonicalSuffixAccents_;
+
/**
* Initializes the iterator to use the language-specific rules defined in
* the argument collator to search for argument pattern in the argument
* @see SearchIterator
* @stable ICU 2.0
*/
- public StringSearch(String pattern, CharacterIterator target,
- RuleBasedCollator collator, BreakIterator breakiter)
- {
+ public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator,
+ BreakIterator breakiter) {
+
+ // This implementation is ported from ICU4C usearch_open()
+
super(target, breakiter);
- m_textBeginOffset_ = targetText.getBeginIndex();
- m_textLimitOffset_ = targetText.getEndIndex();
- m_collator_ = collator;
- m_colEIter_ = m_collator_.getCollationElementIterator(target);
- m_utilColEIter_ = collator.getCollationElementIterator("");
- m_ceMask_ = getMask(m_collator_.getStrength());
- m_isCanonicalMatch_ = false;
- m_pattern_ = new Pattern(pattern);
- m_matchedIndex_ = DONE;
- m_charBreakIter_ = BreakIterator.getCharacterInstance(/*m_collator_.getLocale(ULocale.ACTUAL_LOCALE)*/);
- m_charBreakIter_.setText(target);
+
+ // string search does not really work when numeric collation is turned on
+ if (collator.getNumericCollation()) {
+ throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch");
+ }
+
+ collator_ = collator;
+ strength_ = collator.getStrength();
+ ceMask_ = getMask(strength_);
+ toShift_ = collator.isAlternateHandlingShifted();
+ variableTop_ = collator.getVariableTop();
+
+ pattern_ = new Pattern(pattern);
+
+ search_.setMatchedLength(0);
+ search_.matchedIndex_ = DONE;
+
+ utilIter_ = null;
+ textIter_ = new CollationElementIterator(target, collator);
+
+ textProcessedIter_ = null;
+
+ // This is done by super class constructor
+ /*
+ search_.isOverlap_ = false;
+ search_.isCanonicalMatch_ = false;
+ search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
+ search_.isForwardSearching_ = true;
+ search_.reset_ = true;
+ */
+ ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
+ search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
+ search_.internalBreakIter_.setText((CharacterIterator)target.clone()); // We need to create a clone
+
initialize();
}
* @see SearchIterator
* @stable ICU 2.0
*/
- public StringSearch(String pattern, CharacterIterator target,
- RuleBasedCollator collator)
- {
- this(pattern, target, collator, null/*BreakIterator.getCharacterInstance()*/);
+ public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) {
+ this(pattern, target, collator, null);
}
/**
* @see SearchIterator
* @stable ICU 2.0
*/
- public StringSearch(String pattern, CharacterIterator target, Locale locale)
- {
+ public StringSearch(String pattern, CharacterIterator target, Locale locale) {
this(pattern, target, ULocale.forLocale(locale));
}
* @see SearchIterator
* @stable ICU 3.2
*/
- public StringSearch(String pattern, CharacterIterator target, ULocale locale)
- {
- this(pattern, target, (RuleBasedCollator)Collator.getInstance(locale),
- null/*BreakIterator.getCharacterInstance(locale)*/);
+ public StringSearch(String pattern, CharacterIterator target, ULocale locale) {
+ this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null);
}
/**
* @see SearchIterator
* @stable ICU 2.0
*/
- public StringSearch(String pattern, String target)
- {
+ public StringSearch(String pattern, String target) {
this(pattern, new StringCharacterIterator(target),
- (RuleBasedCollator)Collator.getInstance(),
- null/*BreakIterator.getCharacterInstance()*/);
+ (RuleBasedCollator) Collator.getInstance(), null);
}
- // public getters -----------------------------------------------------
-
/**
* <p>
* Gets the RuleBasedCollator used for the language rules.
* @see #setCollator
* @stable ICU 2.0
*/
- public RuleBasedCollator getCollator()
- {
- return m_collator_;
- }
-
- /**
- * Returns the pattern for which StringSearch is searching for.
- * @return the pattern searched for
- * @stable ICU 2.0
- */
- public String getPattern()
- {
- return m_pattern_.targetText;
- }
-
- /**
- * Return the index in the target text where the iterator is currently
- * positioned at.
- * If the iteration has gone past the end of the target text or past
- * the beginning for a backwards search, {@link #DONE} is returned.
- * @return index in the target text where the iterator is currently
- * positioned at
- * @stable ICU 2.8
- */
- public int getIndex()
- {
- int result = m_colEIter_.getOffset();
- if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_, result)) {
- return DONE;
- }
- return result;
- }
-
- /**
- * Determines whether canonical matches (option 1, as described in the
- * class documentation) is set.
- * See setCanonical(boolean) for more information.
- * @see #setCanonical
- * @return true if canonical matches is set, false otherwise
- * @stable ICU 2.8
- */
- public boolean isCanonical()
- {
- return m_isCanonicalMatch_;
+ public RuleBasedCollator getCollator() {
+ return collator_;
}
-
- // public setters -----------------------------------------------------
-
+
/**
* <p>
* Sets the RuleBasedCollator to be used for language-specific searching.
* @see #getCollator
* @stable ICU 2.0
*/
- public void setCollator(RuleBasedCollator collator)
- {
+ public void setCollator(RuleBasedCollator collator) {
if (collator == null) {
throw new IllegalArgumentException("Collator can not be null");
}
- m_collator_ = collator;
- m_ceMask_ = getMask(m_collator_.getStrength());
- // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
+ collator_ = collator;
+ ceMask_ = getMask(collator_.getStrength());
+
+ ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
+ search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
+ search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone()); // We need to create a clone
+
+ toShift_ = collator.isAlternateHandlingShifted();
+ variableTop_ = collator.getVariableTop();
+ textIter_ = new CollationElementIterator(pattern_.text_, collator);
+ utilIter_ = new CollationElementIterator(pattern_.text_, collator);
+
+ // initialize() _after_ setting the iterators for the new collator.
initialize();
- m_colEIter_.setCollator(m_collator_);
- m_utilColEIter_.setCollator(m_collator_);
- m_charBreakIter_ = BreakIterator.getCharacterInstance(/*collator.getLocale(ULocale.VALID_LOCALE)*/);
- m_charBreakIter_.setText(targetText);
}
-
+
+ /**
+ * Returns the pattern for which StringSearch is searching for.
+ * @return the pattern searched for
+ * @stable ICU 2.0
+ */
+ public String getPattern() {
+ return pattern_.text_;
+ }
+
/**
* <p>
* Set the pattern to search for.
* length 0
* @stable ICU 2.0
*/
- public void setPattern(String pattern)
- {
+ public void setPattern(String pattern) {
if (pattern == null || pattern.length() <= 0) {
throw new IllegalArgumentException(
"Pattern to search for can not be null or of length 0");
}
- m_pattern_.targetText = pattern;
+ pattern_.text_ = pattern;
initialize();
}
-
+
+ /**
+ * Determines whether canonical matches (option 1, as described in the
+ * class documentation) is set.
+ * See setCanonical(boolean) for more information.
+ * @see #setCanonical
+ * @return true if canonical matches is set, false otherwise
+ * @stable ICU 2.8
+ */
+ //TODO: hoist this to SearchIterator
+ public boolean isCanonical() {
+ return search_.isCanonicalMatch_;
+ }
+
+ /**
+ * <p>
+ * Set the canonical match mode. See class documentation for details.
+ * The default setting for this property is false.
+ * </p>
+ * @param allowCanonical flag indicator if canonical matches are allowed
+ * @see #isCanonical
+ * @stable ICU 2.8
+ */
+ //TODO: hoist this to SearchIterator
+ public void setCanonical(boolean allowCanonical) {
+ search_.isCanonicalMatch_ = allowCanonical;
+ }
+
/**
- * Set the target text to be searched. Text iteration will hence begin at
+ * Set the target text to be searched. Text iteration will hence begin at
* the start of the text string. This method is useful if you want to
* re-use an iterator to search within a different body of text.
* @param text new text iterator to look for match,
* @see #getTarget
* @stable ICU 2.8
*/
- public void setTarget(CharacterIterator text)
- {
+ @Override
+ public void setTarget(CharacterIterator text) {
super.setTarget(text);
- m_textBeginOffset_ = targetText.getBeginIndex();
- m_textLimitOffset_ = targetText.getEndIndex();
- m_colEIter_.setText(targetText);
- m_charBreakIter_.setText(targetText);
+ textIter_.setText(text);
}
-
+
+ /**
+ * Return the index in the target text where the iterator is currently
+ * positioned at.
+ * If the iteration has gone past the end of the target text or past
+ * the beginning for a backwards search, {@link #DONE} is returned.
+ * @return index in the target text where the iterator is currently
+ * positioned at
+ * @stable ICU 2.8
+ */
+ @Override
+ public int getIndex() {
+ int result = textIter_.getOffset();
+ if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) {
+ return DONE;
+ }
+ return result;
+ }
+
/**
* <p>
* Sets the position in the target text which the next search will start
* @see #getIndex
* @stable ICU 2.8
*/
- public void setIndex(int position)
- {
+ @Override
+ public void setIndex(int position) {
+ // Java porting note: This method is equivalent to setOffset() in ICU4C.
+ // ICU4C SearchIterator::setOffset() is a pure virtual method, while
+ // ICU4J SearchIterator.setIndex() is not abstract method.
+
super.setIndex(position);
- m_matchedIndex_ = DONE;
- m_colEIter_.setExactOffset(position);
- }
-
- /**
- * <p>
- * Set the canonical match mode. See class documentation for details.
- * The default setting for this property is false.
- * </p>
- * @param allowCanonical flag indicator if canonical matches are allowed
- * @see #isCanonical
- * @stable ICU 2.8
- */
- public void setCanonical(boolean allowCanonical)
- {
- m_isCanonicalMatch_ = allowCanonical;
- if (m_isCanonicalMatch_ == true) {
- if (m_canonicalPrefixAccents_ == null) {
- m_canonicalPrefixAccents_ = new StringBuilder();
- }
- else {
- m_canonicalPrefixAccents_.delete(0,
- m_canonicalPrefixAccents_.length());
- }
- if (m_canonicalSuffixAccents_ == null) {
- m_canonicalSuffixAccents_ = new StringBuilder();
- }
- else {
- m_canonicalSuffixAccents_.delete(0,
- m_canonicalSuffixAccents_.length());
- }
- }
+ textIter_.setOffset(position);
}
-
- // public miscellaneous methods -----------------------------------------
-
+
/**
* <p>
* Resets the search iteration. All properties will be reset to the
* </p>
* @stable ICU 2.8
*/
- public void reset()
- {
- // reset is setting the attributes that are already in string search,
- // hence all attributes in the collator should be retrieved without any
- // problems
- super.reset();
- m_isCanonicalMatch_ = false;
- m_ceMask_ = getMask(m_collator_.getStrength());
- // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
- initialize();
- m_colEIter_.setCollator(m_collator_);
- m_colEIter_.reset();
- m_utilColEIter_.setCollator(m_collator_);
+ @Override
+ public void reset() {
+ // reset is setting the attributes that are already in
+ // string search, hence all attributes in the collator should
+ // be retrieved without any problems
+
+ boolean sameCollAttribute = true;
+ int ceMask;
+ boolean shift;
+ int varTop;
+
+ // **** hack to deal w/ how processed CEs encode quaternary ****
+ int newStrength = collator_.getStrength();
+ if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY)
+ || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) {
+ sameCollAttribute = false;
+ }
+
+ strength_ = collator_.getStrength();
+ ceMask = getMask(strength_);
+ if (ceMask_ != ceMask) {
+ ceMask_ = ceMask;
+ sameCollAttribute = false;
+ }
+
+ shift = collator_.isAlternateHandlingShifted();
+ if (toShift_ != shift) {
+ toShift_ = shift;
+ sameCollAttribute = false;
+ }
+
+ varTop = collator_.getVariableTop();
+ if (variableTop_ != varTop) {
+ variableTop_ = varTop;
+ sameCollAttribute = false;
+ }
+
+ if (!sameCollAttribute) {
+ initialize();
+ }
+
+ textIter_.setText(search_.text());
+
+ search_.setMatchedLength(0);
+ search_.matchedIndex_ = DONE;
+ search_.isOverlap_ = false;
+ search_.isCanonicalMatch_ = false;
+ search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
+ search_.isForwardSearching_ = true;
+ search_.reset_ = true;
}
- // protected methods -----------------------------------------------------
-
/**
* <p>
* Concrete method to provide the mechanism
* for finding the next <b>forwards</b> match in the target text.
* See super class documentation for its use.
* </p>
- * @param start index in the target text at which the forwards search
+ * @param position index in the target text at which the forwards search
* should begin.
* @return the starting index of the next forwards match if found, DONE
* otherwise
* @see #DONE
* @stable ICU 2.8
*/
- protected int handleNext(int start)
- {
- if (m_pattern_.m_CELength_ == 0) {
- matchLength = 0;
- if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) {
- m_matchedIndex_ = start;
- return m_matchedIndex_;
- }
-
- targetText.setIndex(start);
- char ch = targetText.current();
- // ch can never be done, it is handled by next()
- char ch2 = targetText.next();
- if (ch2 == CharacterIterator.DONE) {
- m_matchedIndex_ = DONE;
- }
- else {
- m_matchedIndex_ = targetText.getIndex();
- }
- if (UTF16.isLeadSurrogate(ch) && UTF16.isTrailSurrogate(ch2)) {
- targetText.next();
- m_matchedIndex_ = targetText.getIndex();
+ @Override
+ protected int handleNext(int position) {
+ if (pattern_.CELength_ == 0) {
+ search_.matchedIndex_ = search_.matchedIndex_ == DONE ?
+ getIndex() : search_.matchedIndex_ + 1;
+ search_.setMatchedLength(0);
+ textIter_.setOffset(search_.matchedIndex_);
+ if (search_.matchedIndex_ == search_.endIndex()) {
+ search_.matchedIndex_ = DONE;
}
- }
- else {
- if (matchLength <= 0) {
- // we must have reversed direction after we reached the start
- // of the target text
- // see SearchIterator next(), it checks the bounds and returns
- // if it exceeds the range. It does not allow setting of
- // m_matchedIndex
- if (start == m_textBeginOffset_) {
- m_matchedIndex_ = DONE;
- }
- else {
- // for boundary check purposes. this will ensure that the
- // next match will not preceed the current offset
- // note search->matchedIndex will always be set to something
- // in the code
- m_matchedIndex_ = start - 1;
- }
+ } else {
+ if (search_.matchedLength() <= 0) {
+ // the flipping direction issue has already been handled
+ // in next()
+ // for boundary check purposes. this will ensure that the
+ // next match will not precede the current offset
+ // note search_.matchedIndex_ will always be set to something
+ // in the code
+ search_.matchedIndex_ = position - 1;
}
-
- // status checked below
- if (m_isCanonicalMatch_) {
- // can't use exact here since extra accents are allowed.
- handleNextCanonical(start);
+
+ textIter_.setOffset(position);
+
+ // ICU4C comment:
+ // if strsrch_->breakIter is always the same as m_breakiterator_
+ // then we don't need to check the match boundaries here because
+ // usearch_handleNextXXX will already have done it.
+ if (search_.isCanonicalMatch_) {
+ // *could* actually use exact here 'cause no extra accents allowed...
+ handleNextCanonical();
+ } else {
+ handleNextExact();
}
- else {
- handleNextExact(start);
+
+ if (search_.matchedIndex_ == DONE) {
+ textIter_.setOffset(search_.endIndex());
+ } else {
+ textIter_.setOffset(search_.matchedIndex_);
}
+
+ return search_.matchedIndex_;
}
- if (m_matchedIndex_ == DONE) {
- targetText.setIndex(m_textLimitOffset_);
- }
- else {
- targetText.setIndex(m_matchedIndex_);
- }
- return m_matchedIndex_;
+
+ return DONE;
}
-
+
/**
* <p>
* Concrete method to provide the mechanism
* for finding the next <b>backwards</b> match in the target text.
* See super class documentation for its use.
* </p>
- * @param start index in the target text at which the backwards search
+ * @param position index in the target text at which the backwards search
* should begin.
* @return the starting index of the next backwards match if found, DONE
* otherwise
* @see #DONE
* @stable ICU 2.8
*/
- protected int handlePrevious(int start)
- {
- if (m_pattern_.m_CELength_ == 0) {
- matchLength = 0;
- // start can never be DONE or 0, it is handled in previous
- targetText.setIndex(start);
- char ch = targetText.previous();
- if (ch == CharacterIterator.DONE) {
- m_matchedIndex_ = DONE;
- }
- else {
- m_matchedIndex_ = targetText.getIndex();
- if (UTF16.isTrailSurrogate(ch)) {
- if (UTF16.isLeadSurrogate(targetText.previous())) {
- m_matchedIndex_ = targetText.getIndex();
- }
- }
- }
- }
- else {
- if (matchLength == 0) {
- // we must have reversed direction after we reached the end
- // of the target text
- // see SearchIterator next(), it checks the bounds and returns
- // if it exceeds the range. It does not allow setting of
- // m_matchedIndex
- m_matchedIndex_ = DONE;
- }
- if (m_isCanonicalMatch_) {
- // can't use exact here since extra accents are allowed.
- handlePreviousCanonical(start);
- }
- else {
- handlePreviousExact(start);
+ @Override
+ protected int handlePrevious(int position) {
+ if (pattern_.CELength_ == 0) {
+ search_.matchedIndex_ =
+ search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_;
+ if (search_.matchedIndex_ == search_.beginIndex()) {
+ setMatchNotFound();
+ } else {
+ search_.matchedIndex_--;
+ textIter_.setOffset(search_.matchedIndex_);
+ search_.setMatchedLength(0);
}
- }
+ } else {
+ textIter_.setOffset(position);
- if (m_matchedIndex_ == DONE) {
- targetText.setIndex(m_textBeginOffset_);
- }
- else {
- targetText.setIndex(m_matchedIndex_);
+ if (search_.isCanonicalMatch_) {
+ // *could* use exact match here since extra accents *not* allowed!
+ handlePreviousCanonical();
+ } else {
+ handlePreviousExact();
+ }
}
- return m_matchedIndex_;
- }
- // private static inner classes ----------------------------------------
-
- private static class Pattern
- {
- // protected methods -----------------------------------------------
-
- /**
- * Pattern string
- */
- protected String targetText;
- /**
- * Array containing the collation elements of targetText
- */
- protected int m_CE_[];
- /**
- * Number of collation elements in m_CE_
- */
- protected int m_CELength_;
- /**
- * Flag indicator if targetText starts with an accent
- */
- protected boolean m_hasPrefixAccents_;
- /**
- * Flag indicator if targetText ends with an accent
- */
- protected boolean m_hasSuffixAccents_;
- /**
- * Default number of characters to shift for Boyer Moore
- */
- protected int m_defaultShiftSize_;
- /**
- * Number of characters to shift for Boyer Moore, depending on the
- * source text to search
- */
- protected char m_shift_[];
- /**
- * Number of characters to shift backwards for Boyer Moore, depending
- * on the source text to search
- */
- protected char m_backShift_[];
-
- // protected constructors ------------------------------------------
-
- /**
- * Empty constructor
- */
- protected Pattern(String pattern)
- {
- targetText = pattern;
- m_CE_ = new int[INITIAL_ARRAY_SIZE_];
- m_CELength_ = 0;
- m_hasPrefixAccents_ = false;
- m_hasSuffixAccents_ = false;
- m_defaultShiftSize_ = 1;
- m_shift_ = new char[MAX_TABLE_SIZE_];
- m_backShift_ = new char[MAX_TABLE_SIZE_];
- }
+ return search_.matchedIndex_;
}
+ // ------------------ Internal implementation code ---------------------------
- // private data members ------------------------------------------------
-
- /**
- * target text begin offset. Each targetText has a valid contiguous region
- * to iterate and this data member is the offset to the first such
- * character in the region.
- */
- private int m_textBeginOffset_;
- /**
- * target text limit offset. Each targetText has a valid contiguous region
- * to iterate and this data member is the offset to 1 after the last such
- * character in the region.
- */
- private int m_textLimitOffset_;
- /**
- * Upon completion of a search, m_matchIndex_ will store starting offset in
- * m_text for the match. The Value DONE is the default value.
- * If we are not at the start of the text or the end of the text and
- * m_matchedIndex_ is DONE it means that we can find any more matches in
- * that particular direction
- */
- private int m_matchedIndex_;
- /**
- * Current pattern to search for
- */
- private Pattern m_pattern_;
- /**
- * Collator whose rules are used to perform the search
- */
- private RuleBasedCollator m_collator_;
- /**
- * The collation element iterator for the text source.
- */
- private CollationElementIterator m_colEIter_;
- /**
- * Utility collation element, used throughout program for temporary
- * iteration.
- */
- private CollationElementIterator m_utilColEIter_;
- /**
- * The mask used on the collation elements to retrieve the valid strength
- * weight
- */
- private int m_ceMask_;
- /**
- * Buffer storing accents during a canonical search
- */
- private StringBuilder m_canonicalPrefixAccents_;
- /**
- * Buffer storing accents during a canonical search
- */
- private StringBuilder m_canonicalSuffixAccents_;
- /**
- * Flag to indicate if canonical search is to be done.
- * E.g looking for "a\u0300" in "a\u0318\u0300" will yield the match at 0.
- */
- private boolean m_isCanonicalMatch_;
- /**
- * Character break iterator for boundary checking.
- */
- private BreakIterator m_charBreakIter_;
- private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
- /**
- * Size of the shift tables
- */
- private static final int MAX_TABLE_SIZE_ = 257;
- /**
- * Initial array size
- */
private static final int INITIAL_ARRAY_SIZE_ = 256;
- /**
- * Utility mask
- */
- private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
- /**
- * Utility mask
- */
- private static final int LAST_BYTE_MASK_ = 0xff;
- /**
- * Utility buffer for return values and temporary storage
- */
- private int m_utilBuffer_[] = new int[2];
- /**
- * Unsigned 32-Bit Integer Mask
- */
- private static final long UNSIGNED_32BIT_MASK = 0xffffffffL;
- // private methods -------------------------------------------------------
+ // *** Boyer-Moore ***
+ // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
+ // private static final int LAST_BYTE_MASK_ = 0xff;
+ // private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
+
+ private static final int PRIMARYORDERMASK = 0xffff0000;
+ private static final int SECONDARYORDERMASK = 0x0000ff00;
+ private static final int TERTIARYORDERMASK = 0x000000ff;
/**
- * Hash a collation element from its full size (32 bits) down into a
- * value that can be used as an index into the shift tables. Right
- * now we do a modulus by the size of the hash table.
- * @param ce collation element
- * @return collapsed version of the collation element
+ * Getting the mask for collation strength
+ * @param strength collation strength
+ * @return collation element mask
*/
- private static final int hash(int ce)
- {
- // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
- // well with the new collation where most of the latin 1 characters
- // are of the value xx000xxx. their hashes will most of the time be 0
- // to be discussed on the hash algo.
- return CollationElementIterator.primaryOrder(ce) % MAX_TABLE_SIZE_;
+ private static int getMask(int strength) {
+ switch (strength) {
+ case Collator.PRIMARY:
+ return PRIMARYORDERMASK;
+ case Collator.SECONDARY:
+ return SECONDARYORDERMASK | PRIMARYORDERMASK;
+ default:
+ return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK;
+ }
}
- private final char getFCD(int c) {
- return (char)m_nfcImpl_.getFCD16(c);
- }
- /**
- * Gets the fcd value for a character at the argument index.
- * This method takes into accounts of the supplementary characters.
- * Note this method changes the offset in the character iterator.
- * @param str UTF16 string where character for fcd retrieval resides
- * @param offset position of the character whose fcd is to be retrieved
- * @return fcd value
- */
- private final char getFCD(CharacterIterator str, int offset)
- {
- char ch = str.setIndex(offset);
+
+ // *** Boyer-Moore ***
+ /*
+ private final char getFCD(String str, int offset) {
+ char ch = str.charAt(offset);
if (ch < 0x180) {
- return (char)m_nfcImpl_.getFCD16FromBelow180(ch);
- } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
+ return (char) nfcImpl_.getFCD16FromBelow180(ch);
+ } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
if (!Character.isHighSurrogate(ch)) {
- return (char)m_nfcImpl_.getFCD16FromNormData(ch);
+ return (char) nfcImpl_.getFCD16FromNormData(ch);
} else {
- char c2 = str.next();
- if (Character.isLowSurrogate(c2)) {
- return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2));
+ char c2;
+ if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
+ return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2));
}
}
}
return 0;
}
+
+ private final char getFCD(int c) {
+ return (char)nfcImpl_.getFCD16(c);
+ }
+ */
+
/**
- * Gets the FCD value for the code point before the input offset.
- * Modifies the iterator's index.
- * @param iter text iterator
- * @param offset index after the character to test
- * @return FCD value for the character before offset
+ * Getting the modified collation elements taking into account the collation
+ * attributes.
+ *
+ * @param sourcece
+ * @return the modified collation element
*/
- private final int getFCDBefore(CharacterIterator iter, int offset) {
- iter.setIndex(offset);
- char c = iter.previous();
- if (c < 0x180) {
- return (char)m_nfcImpl_.getFCD16FromBelow180(c);
- } else if (!Character.isLowSurrogate(c)) {
- if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
- return (char)m_nfcImpl_.getFCD16FromNormData(c);
- }
- } else {
- char lead = iter.previous();
- if (Character.isHighSurrogate(lead)) {
- return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(lead, c));
- }
- }
- return 0;
- }
- /**
- * Gets the fcd value for a character at the argument index.
- * This method takes into accounts of the supplementary characters.
- * @param str UTF16 string where character for fcd retrieval resides
- * @param offset position of the character whose fcd is to be retrieved
- * @return fcd value
- */
- private final char getFCD(String str, int offset)
- {
- char ch = str.charAt(offset);
- if (ch < 0x180) {
- return (char)m_nfcImpl_.getFCD16FromBelow180(ch);
- } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
- if (!Character.isHighSurrogate(ch)) {
- return (char)m_nfcImpl_.getFCD16FromNormData(ch);
- } else {
- char c2;
- if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
- return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2));
- }
- }
- }
- return 0;
- }
-
- /**
- * Getting the modified collation elements taking into account the collation
- * attributes
- * @param ce
- * @return the modified collation element
- */
- private final int getCE(int ce)
- {
+ private int getCE(int sourcece) {
// note for tertiary we can't use the collator->tertiaryMask, that
// is a preprocessed mask that takes into account case options. since
// we are only concerned with exact matches, we don't need that.
- ce &= m_ceMask_;
-
- if (m_collator_.isAlternateHandlingShifted()) {
- // alternate handling here, since only the 16 most significant
- // digits is only used, we can safely do a compare without masking
+ sourcece &= ceMask_;
+
+ if (toShift_) {
+ // alternate handling here, since only the 16 most significant digits
+ // are used, we can safely do a compare without masking
// if the ce is a variable, we mask and get only the primary values
// no shifting to quartenary is required since all primary values
// less than variabletop will need to be masked off anyway.
- if (((m_collator_.m_variableTopValue_ << 16) & UNSIGNED_32BIT_MASK) > (ce & UNSIGNED_32BIT_MASK)) {
- if (m_collator_.getStrength() == Collator.QUATERNARY) {
- ce = CollationElementIterator.primaryOrder(ce);
- }
- else {
- ce = CollationElementIterator.IGNORABLE;
+ if (variableTop_ > sourcece) {
+ if (strength_ >= Collator.QUATERNARY) {
+ sourcece &= PRIMARYORDERMASK;
+ } else {
+ sourcece = CollationElementIterator.IGNORABLE;
}
}
+ } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) {
+ sourcece = 0xFFFF;
}
-
- return ce;
+
+ return sourcece;
}
-
+
/**
- * Appends a int to a int array, increasing the size of the array when
- * we are out of space.
- * @param offset in array to append to
- * @param value to append
- * @param array to append to
- * @return the array appended to, this could be a new and bigger array
- */
- private static final int[] append(int offset, int value, int array[])
- {
- if (offset >= array.length) {
- int temp[] = new int[offset + INITIAL_ARRAY_SIZE_];
- System.arraycopy(array, 0, temp, 0, array.length);
- array = temp;
- }
- array[offset] = value;
- return array;
+ * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp.
+ * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
+ * implement this in Pattern class.
+ *
+ * @param destination target array
+ * @param offset destination offset to add value
+ * @param destinationlength target array size
+ * @param value to be added
+ * @param increments incremental size expected
+ * @return new destination array, destination if there was no new allocation
+ */
+ private static int[] addToIntArray(int[] destination, int offset, int destinationlength,
+ int value, int increments) {
+ int newlength = destinationlength;
+ if (offset + 1 == newlength) {
+ newlength += increments;
+ int temp[] = new int[newlength];
+ System.arraycopy(destination, 0, temp, 0, offset);
+ destination = temp;
+ }
+ destination[offset] = value;
+ return destination;
}
-
+
/**
- * Initializing the ce table for a pattern. Stores non-ignorable collation
- * keys. Table size will be estimated by the size of the pattern text.
- * Table expansion will be perform as we go along. Adding 1 to ensure that
- * the table size definitely increases.
- * Internal method, status assumed to be a success.
- * @return total number of expansions
- */
- private final int initializePatternCETable()
- {
- m_utilColEIter_.setText(m_pattern_.targetText);
-
+ * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp.
+ * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
+ * implement this in Pattern class.
+ *
+ * @param destination target array
+ * @param offset destination offset to add value
+ * @param destinationlength target array size
+ * @param value to be added
+ * @param increments incremental size expected
+ * @return new destination array, destination if there was no new allocation
+ */
+ private static long[] addToLongArray(long[] destination, int offset, int destinationlength,
+ long value, int increments) {
+ int newlength = destinationlength;
+ if (offset + 1 == newlength) {
+ newlength += increments;
+ long temp[] = new long[newlength];
+ System.arraycopy(destination, 0, temp, 0, offset);
+ destination = temp;
+ }
+ destination[offset] = value;
+ return destination;
+ }
+
+ /**
+ * Initializing the ce table for a pattern.
+ * Stores non-ignorable collation keys.
+ * Table size will be estimated by the size of the pattern text. Table
+ * expansion will be performed as we go along. Adding 1 to ensure that the table
+ * size definitely increases.
+ * @return total number of expansions
+ */
+ // TODO: We probably do not need Pattern CE table.
+ private int initializePatternCETable() {
+ int[] cetable = new int[INITIAL_ARRAY_SIZE_];
+ int cetablesize = cetable.length;
+ int patternlength = pattern_.text_.length();
+ CollationElementIterator coleiter = utilIter_;
+
+ if (coleiter == null) {
+ coleiter = new CollationElementIterator(pattern_.text_, collator_);
+ utilIter_ = coleiter;
+ } else {
+ coleiter.setText(pattern_.text_);
+ }
+
int offset = 0;
int result = 0;
- int ce = m_utilColEIter_.next();
-
- while (ce != CollationElementIterator.NULLORDER) {
+ int ce;
+
+ while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) {
int newce = getCE(ce);
- if (newce != CollationElementIterator.IGNORABLE) {
- m_pattern_.m_CE_ = append(offset, newce, m_pattern_.m_CE_);
- offset ++;
+ if (newce != CollationElementIterator.IGNORABLE /* 0 */) {
+ int[] temp = addToIntArray(cetable, offset, cetablesize, newce,
+ patternlength - coleiter.getOffset() + 1);
+ offset++;
+ cetable = temp;
}
- result += m_utilColEIter_.getMaxExpansion(ce) - 1;
- ce = m_utilColEIter_.next();
+ result += (coleiter.getMaxExpansion(ce) - 1);
}
-
- m_pattern_.m_CE_ = append(offset, 0, m_pattern_.m_CE_);
- m_pattern_.m_CELength_ = offset;
-
+
+ cetable[offset] = 0;
+ pattern_.CE_ = cetable;
+ pattern_.CELength_ = offset;
+
return result;
}
-
+
/**
- * Initializes the pattern struct.
- * Internal method, status assumed to be success.
- * @return expansionsize the total expansion size of the pattern
- */
- private final int initializePattern()
- {
- if (m_collator_.getStrength() == Collator.PRIMARY) {
- m_pattern_.m_hasPrefixAccents_ = false;
- m_pattern_.m_hasSuffixAccents_ = false;
+ * Initializing the pce table for a pattern.
+ * Stores non-ignorable collation keys.
+ * Table size will be estimated by the size of the pattern text. Table
+ * expansion will be performed as we go along. Adding 1 to ensure that the table
+ * size definitely increases.
+ * @return total number of expansions
+ */
+ private int initializePatternPCETable() {
+ long[] pcetable = new long[INITIAL_ARRAY_SIZE_];
+ int pcetablesize = pcetable.length;
+ int patternlength = pattern_.text_.length();
+ CollationElementIterator coleiter = utilIter_;
+
+ if (coleiter == null) {
+ coleiter = new CollationElementIterator(pattern_.text_, collator_);
+ utilIter_ = coleiter;
} else {
- m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText, 0)
- >> SECOND_LAST_BYTE_SHIFT_) != 0;
- m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText.codePointBefore(
- m_pattern_.targetText.length()))
- & LAST_BYTE_MASK_) != 0;
- }
- // since intializePattern is an internal method status is a success.
- return initializePatternCETable();
- }
-
- /**
- * Initializing shift tables, with the default values.
- * If a corresponding default value is 0, the shift table is not set.
- * @param shift table for forwards shift
- * @param backshift table for backwards shift
- * @param cetable table containing pattern ce
- * @param cesize size of the pattern ces
- * @param expansionsize total size of the expansions
- * @param defaultforward the default forward value
- * @param defaultbackward the default backward value
- */
- private final void setShiftTable(char shift[],
- char backshift[],
- int cetable[], int cesize,
- int expansionsize,
- char defaultforward,
- char defaultbackward)
- {
- // estimate the value to shift. to do that we estimate the smallest
- // number of characters to give the relevant ces, ie approximately
- // the number of ces minus their expansion, since expansions can come
- // from a character.
- for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
- shift[count] = defaultforward;
- }
- cesize --; // down to the last index
- for (int count = 0; count < cesize; count ++) {
- // number of ces from right of array to the count
- int temp = defaultforward - count - 1;
- shift[hash(cetable[count])] = temp > 1 ? ((char)temp) : 1;
- }
- shift[hash(cetable[cesize])] = 1;
- // for ignorables we just shift by one. see test examples.
- shift[hash(0)] = 1;
-
- for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
- backshift[count] = defaultbackward;
- }
- for (int count = cesize; count > 0; count --) {
- // the original value count does not seem to work
- backshift[hash(cetable[count])] = (char)(count > expansionsize ?
- count - expansionsize : 1);
- }
- backshift[hash(cetable[0])] = 1;
- backshift[hash(0)] = 1;
- }
-
- /**
- * <p>Building of the pattern collation element list and the Boyer Moore
- * StringSearch table.</p>
- * <p>The canonical match will only be performed after the default match
- * fails.</p>
- * <p>For both cases we need to remember the size of the composed and
- * decomposed versions of the string. Since the Boyer-Moore shift
- * calculations shifts by a number of characters in the text and tries to
- * match the pattern from that offset, the shift value can not be too large
- * in case we miss some characters. To choose a right shift size, we
- * estimate the NFC form of the and use its size as a shift guide. The NFC
- * form should be the small possible representation of the pattern. Anyways,
- * we'll err on the smaller shift size. Hence the calculation for
- * minlength. Canonical match will be performed slightly differently. We'll
- * split the pattern into 3 parts, the prefix accents (PA), the middle
- * string bounded by the first and last base character (MS), the ending
- * accents (EA). Matches will be done on MS first, and only when we match
- * MS then some processing will be required for the prefix and end accents
- * in order to determine if they match PA and EA. Hence the default shift
- * values for the canonical match will take the size of either end's accent
- * into consideration. Forwards search will take the end accents into
- * consideration for the default shift values and the backwards search will
- * take the prefix accents into consideration.</p>
- * <p>If pattern has no non-ignorable ce, we return a illegal argument
- * error.</p>
- */
- private final void initialize()
- {
- int expandlength = initializePattern();
- if (m_pattern_.m_CELength_ > 0) {
- char minlength = (char)(m_pattern_.m_CELength_ > expandlength
- ? m_pattern_.m_CELength_ - expandlength : 1);
- m_pattern_.m_defaultShiftSize_ = minlength;
- setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_,
- m_pattern_.m_CE_, m_pattern_.m_CELength_,
- expandlength, minlength, minlength);
- }
- else {
- m_pattern_.m_defaultShiftSize_ = 0;
- }
- }
-
- /**
- * Determine whether the search text bounded by the offset start and end is
- * one or more whole units of text as determined by the breakiterator in
- * StringSearch.
- * @param start target text start offset
- * @param end target text end offset
- */
- private final boolean isBreakUnit(int start, int end)
- {
- if (breakIterator != null) {
- int startindex = breakIterator.first();
- int endindex = breakIterator.last();
-
- // out-of-range indexes are never boundary positions
- if (start < startindex || start > endindex || end < startindex
- || end > endindex) {
- return false;
- }
- // otherwise, we can use following() on the position before the
- // specified one and return true of the position we get back is the
- // one the user specified
- boolean result = (start == startindex
- || breakIterator.following(start - 1) == start)
- && (end == endindex
- || breakIterator.following(end - 1) == end);
- if (result) {
- // iterates the individual ces
- m_utilColEIter_.setText(
- new CharacterIteratorWrapper(targetText), start);
- for (int count = 0; count < m_pattern_.m_CELength_;
- count ++) {
- int ce = getCE(m_utilColEIter_.next());
- if (ce == CollationElementIterator.IGNORABLE) {
- count --;
- continue;
- }
- if (ce != m_pattern_.m_CE_[count]) {
- return false;
- }
- }
- int nextce = m_utilColEIter_.next();
- while (m_utilColEIter_.getOffset() == end
- && getCE(nextce) == CollationElementIterator.IGNORABLE) {
- nextce = m_utilColEIter_.next();
- }
- if (nextce != CollationElementIterator.NULLORDER
- && m_utilColEIter_.getOffset() == end) {
- // extra collation elements at the end of the match
- return false;
- }
- }
- return result;
+ coleiter.setText(pattern_.text_);
}
- return true;
- }
- /**
- * Getting the next base character offset if current offset is an accent,
- * or the current offset if the current character contains a base character.
- * accents the following base character will be returned
- * @param text string
- * @param textoffset current offset
- * @param textlength length of text string
- * @return the next base character or the current offset
- * if the current character is contains a base character.
- */
- private final int getNextBaseOffset(CharacterIterator text, int textoffset)
- {
- if (textoffset >= text.getEndIndex()) {
- return textoffset;
- }
- // iteration ends with reading CharacterIterator.DONE which has fcd==0
- char c = text.setIndex(textoffset);
- for (;;) {
- if (c < Normalizer2Impl.MIN_CCC_LCCC_CP || !m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
- return textoffset;
- }
- char next = text.next();
- if (Character.isSurrogatePair(c, next)) {
- int fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(c, next));
- if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
- return textoffset;
- }
- next = text.next();
- textoffset += 2;
- } else {
- int fcd = m_nfcImpl_.getFCD16FromNormData(c);
- if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
- return textoffset;
- }
- ++textoffset;
- }
- c = next;
+ int offset = 0;
+ int result = 0;
+ long pce;
+
+ CollationPCE iter = new CollationPCE(coleiter);
+
+ // ** Should processed CEs be signed or unsigned?
+ // ** (the rest of the code in this file seems to play fast-and-loose with
+ // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
+ while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) {
+ long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1);
+ offset++;
+ pcetable = temp;
}
+
+ pcetable[offset] = 0;
+ pattern_.PCE_ = pcetable;
+ pattern_.PCELength_ = offset;
+
+ return result;
}
- /**
- * Gets the next base character offset depending on the string search
- * pattern data
- * @param textoffset one offset away from the last character
- * to search for.
- * @return start index of the next base character or the current offset
- * if the current character is contains a base character.
- */
- private final int getNextBaseOffset(int textoffset)
- {
- if (m_pattern_.m_hasSuffixAccents_ && textoffset < m_textLimitOffset_) {
- if ((getFCDBefore(targetText, textoffset) & LAST_BYTE_MASK_) != 0) {
- return getNextBaseOffset(targetText, textoffset);
- }
+ // TODO: This method only triggers initializePatternCETable(), which is probably no
+ // longer needed.
+ private int initializePattern() {
+ // Since the strength is primary, accents are ignored in the pattern.
+
+ // *** Boyer-Moore ***
+ /*
+ if (strength_ == Collator.PRIMARY) {
+ pattern_.hasPrefixAccents_ = false;
+ pattern_.hasSuffixAccents_ = false;
+ } else {
+ pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0;
+ pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0;
}
- return textoffset;
+ */
+
+ pattern_.PCE_ = null;
+
+ // since initializePattern is an internal method, status is a success.
+ return initializePatternCETable();
}
- /**
- * Shifting the collation element iterator position forward to prepare for
- * a following match. If the last character is a unsafe character, we'll
- * only shift by 1 to capture contractions, normalization etc.
- * Internal method, status assumed to be success.
- * @param textoffset start text position to do search
- * @param ce the text ce which failed the match.
- * @param patternceindex index of the ce within the pattern ce buffer which
- * failed the match
- * @return final offset
+ // *** Boyer-Moore ***
+ /*
+ private final void setShiftTable(char shift[],
+ char backshift[],
+ int cetable[], int cesize,
+ int expansionsize,
+ int defaultforward,
+ int defaultbackward) {
+ // No implementation
+ }
*/
- private int shiftForward(int textoffset, int ce, int patternceindex)
-
- {
- if (ce != CollationElementIterator.NULLORDER) {
- int shift = m_pattern_.m_shift_[hash(ce)];
- // this is to adjust for characters in the middle of the
- // substring for matching that failed.
- int adjust = m_pattern_.m_CELength_ - patternceindex;
- if (adjust > 1 && shift >= adjust) {
- shift -= adjust - 1;
- }
- textoffset += shift;
- }
- else {
- textoffset += m_pattern_.m_defaultShiftSize_;
- }
-
- textoffset = getNextBaseOffset(textoffset);
- // check for unsafe characters
- // * if it is the start or middle of a contraction: to be done after
- // a initial match is found
- // * thai or lao base consonant character: similar to contraction
- // * high surrogate character: similar to contraction
- // * next character is a accent: shift to the next base character
- return textoffset;
+
+ // TODO: This method only triggers initializePattern(), which is probably no
+ // longer needed.
+ private void initialize() {
+ /* int expandlength = */ initializePattern();
+
+ // *** Boyer-Moore ***
+ /*
+ if (pattern_.CELength_ > 0) {
+ int cesize = pattern_.CELength_;
+ int minlength = cesize > expandlength ? cesize - expandlength : 1;
+ pattern_.defaultShiftSize_ = minlength;
+ setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize,
+ expandlength, minlength, minlength);
+ return;
+ }
+ return pattern_.defaultShiftSize_;
+ */
}
-
+
/**
- * Gets the offset to the next safe point in text.
- * ie. not the middle of a contraction, swappable characters or
- * supplementary characters.
- * @param textoffset offset in string
- * @param end offset in string
- * @return offset to the next safe character
- */
- private final int getNextSafeOffset(int textoffset, int end)
- {
- int result = textoffset; // first contraction character
- targetText.setIndex(result);
- while (result != end &&
- m_collator_.isUnsafe(targetText.current())) {
- result ++;
- targetText.setIndex(result);
- }
- return result;
- }
-
- /**
- * This checks for accents in the potential match started with a composite
- * character.
- * This is really painful... we have to check that composite character do
- * not have any extra accents. We have to normalize the potential match and
- * find the immediate decomposed character before the match.
- * The first composite character would have been taken care of by the fcd
- * checks in checkForwardExactMatch.
- * This is the slow path after the fcd of the first character and
- * the last character has been checked by checkForwardExactMatch and we
- * determine that the potential match has extra non-ignorable preceding
- * ces.
- * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
- * checkExtraMatchAccent should fail since there is a middle ring in
- * \u01FA Note here that accents checking are slow and cautioned in the API
- * docs.
- * Internal method, status assumed to be a success, caller should check
- * status before calling this method
- * @param start index of the potential unfriendly composite character
- * @param end index of the potential unfriendly composite character
- * @return true if there is non-ignorable accents before at the beginning
- * of the match, false otherwise.
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- private final boolean checkExtraMatchAccents(int start, int end)
- {
- boolean result = false;
- if (m_pattern_.m_hasPrefixAccents_) {
- targetText.setIndex(start);
-
- if (UTF16.isLeadSurrogate(targetText.next())) {
- if (!UTF16.isTrailSurrogate(targetText.next())) {
- targetText.previous();
- }
- }
- // we are only concerned with the first composite character
- String str = getString(targetText, start, end);
- if (Normalizer.quickCheck(str, Normalizer.NFD,0)
- == Normalizer.NO) {
- int safeoffset = getNextSafeOffset(start, end);
- if (safeoffset != end) {
- safeoffset ++;
- }
- String decomp = Normalizer.decompose(
- str.substring(0, safeoffset - start), false);
- m_utilColEIter_.setText(decomp);
- int firstce = m_pattern_.m_CE_[0];
- boolean ignorable = true;
- int ce = CollationElementIterator.IGNORABLE;
- int offset = 0;
- while (ce != firstce) {
- offset = m_utilColEIter_.getOffset();
- if (ce != firstce
- && ce != CollationElementIterator.IGNORABLE) {
- ignorable = false;
- }
- ce = m_utilColEIter_.next();
- }
- m_utilColEIter_.setExactOffset(offset); // back up 1 to the
- m_utilColEIter_.previous(); // right offset
- offset = m_utilColEIter_.getOffset();
- result = !ignorable && (UCharacter.getCombiningClass(
- UTF16.charAt(decomp, offset)) != 0);
- }
- }
-
- return result;
- }
-
- /**
- * Used by exact matches, checks if there are accents before the match.
- * This is really painful... we have to check that composite characters at
- * the start of the matches have to not have any extra accents.
- * We check the FCD of the character first, if it starts with an accent and
- * the first pattern ce does not match the first ce of the character, we
- * bail.
- * Otherwise we try normalizing the first composite
- * character and find the immediate decomposed character before the match to
- * see if it is an non-ignorable accent.
- * Now normalizing the first composite character is enough because we ensure
- * that when the match is passed in here with extra beginning ces, the
- * first or last ce that match has to occur within the first character.
- * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
- * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
- * Note here that accents checking are slow and cautioned in the API docs.
- * @param start offset
- * @param end offset
- * @return true if there are accents on either side of the match,
- * false otherwise
- */
- private final boolean hasAccentsBeforeMatch(int start, int end)
- {
- if (m_pattern_.m_hasPrefixAccents_) {
- // we have been iterating forwards previously
- boolean ignorable = true;
- int firstce = m_pattern_.m_CE_[0];
- m_colEIter_.setExactOffset(start);
- int ce = getCE(m_colEIter_.next());
- while (ce != firstce) {
- if (ce != CollationElementIterator.IGNORABLE) {
- ignorable = false;
- }
- ce = getCE(m_colEIter_.next());
- }
- if (!ignorable && m_colEIter_.isInBuffer()) {
- // within normalization buffer, discontiguous handled here
- return true;
- }
-
- // within text
- boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_)
- != 0;
- if (!accent) {
- return checkExtraMatchAccents(start, end);
- }
- if (!ignorable) {
- return true;
- }
- if (start > m_textBeginOffset_) {
- targetText.setIndex(start);
- targetText.previous();
- if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_)
- != 0) {
- m_colEIter_.setExactOffset(start);
- ce = m_colEIter_.previous();
- if (ce != CollationElementIterator.NULLORDER
- && ce != CollationElementIterator.IGNORABLE) {
- return true;
- }
- }
- }
+ protected void setMatchNotFound() {
+ super.setMatchNotFound();
+ // SearchIterator#setMatchNotFound() does following:
+ // search_.matchedIndex_ = DONE;
+ // search_.setMatchedLength(0);
+ if (search_.isForwardSearching_) {
+ textIter_.setOffset(search_.text().getEndIndex());
+ } else {
+ textIter_.setOffset(0);
}
-
- return false;
}
-
+
/**
- * Used by exact matches, checks if there are accents bounding the match.
- * Note this is the initial boundary check. If the potential match
- * starts or ends with composite characters, the accents in those
- * characters will be determined later.
- * Not doing backwards iteration here, since discontiguos contraction for
- * backwards collation element iterator, use up too many characters.
- * E.g. looking for \u030A ring in \u01FA A ring above and acute,
- * should fail since there is a acute at the end of \u01FA
- * Note here that accents checking are slow and cautioned in the API docs.
- * @param start offset of match
- * @param end end offset of the match
- * @return true if there are accents on either side of the match,
- * false otherwise
+ * Checks if the offset runs out of the text string range
+ * @param textstart offset of the first character in the range
+ * @param textlimit limit offset of the text string range
+ * @param offset to test
+ * @return true if offset is out of bounds, false otherwise
*/
- private final boolean hasAccentsAfterMatch(int start, int end)
- {
- if (m_pattern_.m_hasSuffixAccents_) {
- targetText.setIndex(end);
- if (end > m_textBeginOffset_
- && UTF16.isTrailSurrogate(targetText.previous())) {
- if (targetText.getIndex() > m_textBeginOffset_ &&
- !UTF16.isLeadSurrogate(targetText.previous())) {
- targetText.next();
- }
- }
- if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
- int firstce = m_pattern_.m_CE_[0];
- m_colEIter_.setExactOffset(start);
- while (getCE(m_colEIter_.next()) != firstce) {
- }
- int count = 1;
- while (count < m_pattern_.m_CELength_) {
- if (getCE(m_colEIter_.next())
- == CollationElementIterator.IGNORABLE) {
- count --;
- }
- count ++;
- }
- //int ce = getCE(m_colEIter_.next());
- int ce = m_colEIter_.next();
- if (ce != CollationElementIterator.NULLORDER
- && ce != CollationElementIterator.IGNORABLE) {
- ce = getCE(ce);
- }
- if (ce != CollationElementIterator.NULLORDER
- && ce != CollationElementIterator.IGNORABLE) {
- if (m_colEIter_.getOffset() <= end) {
- return true;
- }
- if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_)
- != 0) {
- return true;
- }
- }
- }
- }
- return false;
- }
-
- /**
- * Checks if the offset runs out of the text string range
- * @param textstart offset of the first character in the range
- * @param textlimit limit offset of the text string range
- * @param offset to test
- * @return true if offset is out of bounds, false otherwise
- */
- private static final boolean isOutOfBounds(int textstart, int textlimit,
- int offset)
- {
+ private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) {
return offset < textstart || offset > textlimit;
}
-
+
/**
* Checks for identical match
- * @param strsrch string search data
* @param start offset of possible match
* @param end offset of possible match
- * @return true if identical match is found
+ * @return TRUE if identical match is found
*/
- private final boolean checkIdentical(int start, int end)
- {
- if (m_collator_.getStrength() != Collator.IDENTICAL) {
+ private boolean checkIdentical(int start, int end) {
+ if (strength_ != Collator.IDENTICAL) {
return true;
}
-
+ // Note: We could use Normalizer::compare() or similar, but for short strings
+ // which may not be in FCD it might be faster to just NFD them.
String textstr = getString(targetText, start, end - start);
- if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
- == Normalizer.NO) {
+ if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) {
textstr = Normalizer.decompose(textstr, false);
}
- String patternstr = m_pattern_.targetText;
- if (Normalizer.quickCheck(patternstr, Normalizer.NFD,0)
- == Normalizer.NO) {
+ String patternstr = pattern_.text_;
+ if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) {
patternstr = Normalizer.decompose(patternstr, false);
}
return textstr.equals(patternstr);
}
-
- /**
- * Checks to see if the match is repeated
- * @param start new match start index
- * @param limit new match limit index
- * @return true if the the match is repeated, false otherwise
- */
- private final boolean checkRepeatedMatch(int start, int limit)
- {
- if (m_matchedIndex_ == DONE) {
- return false;
- }
- int end = limit - 1; // last character in the match
- int lastmatchend = m_matchedIndex_ + matchLength - 1;
- if (!isOverlapping()) {
- return (start >= m_matchedIndex_ && start <= lastmatchend)
- || (end >= m_matchedIndex_ && end <= lastmatchend)
- || (start <= m_matchedIndex_ && end >= lastmatchend);
-
+
+ private boolean initTextProcessedIter() {
+ if (textProcessedIter_ == null) {
+ textProcessedIter_ = new CollationPCE(textIter_);
+ } else {
+ textProcessedIter_.init(textIter_);
}
- return start <= m_matchedIndex_ && end >= lastmatchend;
- }
-
- /**
- * Checks match for contraction.
- * If the match ends with a partial contraction we fail.
- * If the match starts too far off (because of backwards iteration) we try
- * to chip off the extra characters depending on whether a breakiterator
- * has been used.
- * Temporary utility buffer used to return modified start and end.
- * @param start offset of potential match, to be modified if necessary
- * @param end offset of potential match, to be modified if necessary
- * @return true if match passes the contraction test, false otherwise.
- */
- private final boolean checkNextExactContractionMatch(int start, int end)
- {
- // This part checks if either ends of the match contains potential
- // contraction. If so we'll have to iterate through them
- char endchar = 0;
- if (end < m_textLimitOffset_) {
- targetText.setIndex(end);
- endchar = targetText.current();
- }
- char poststartchar = 0;
- if (start + 1 < m_textLimitOffset_) {
- targetText.setIndex(start + 1);
- poststartchar = targetText.current();
- }
- if (m_collator_.isUnsafe(endchar)
- || m_collator_.isUnsafe(poststartchar)) {
- // expansion prefix, what's left to iterate
- int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_;
- boolean hasBufferedCE = bufferedCEOffset > 0;
- m_colEIter_.setExactOffset(start);
- int temp = start;
- while (bufferedCEOffset > 0) {
- // getting rid of the redundant ce, caused by setOffset.
- // since backward contraction/expansion may have extra ces if
- // we are in the normalization buffer, hasAccentsBeforeMatch
- // would have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but
- // if we are only looking for acute and ring \u030A and \u0301,
- // we'll have to skip the first ce in the expansion buffer.
- m_colEIter_.next();
- if (m_colEIter_.getOffset() != temp) {
- start = temp;
- temp = m_colEIter_.getOffset();
- }
- bufferedCEOffset --;
- }
-
- int count = 0;
- while (count < m_pattern_.m_CELength_) {
- int ce = getCE(m_colEIter_.next());
- if (ce == CollationElementIterator.IGNORABLE) {
- continue;
- }
- if (hasBufferedCE && count == 0
- && m_colEIter_.getOffset() != temp) {
- start = temp;
- temp = m_colEIter_.getOffset();
- }
- if (ce != m_pattern_.m_CE_[count]) {
- end ++;
- end = getNextBaseOffset(end);
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
- return false;
- }
- count ++;
- }
- }
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
return true;
}
-
-
- /**
- * Checks and sets the match information if found.
- * Checks
- * <ul>
- * <li> the potential match does not repeat the previous match
- * <li> boundaries are correct
- * <li> exact matches has no extra accents
- * <li> identical matchesb
- * <li> potential match does not end in the middle of a contraction
- * </ul>
- * Otherwise the offset will be shifted to the next character.
- * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
- * more fitting result value.
- * Uses the temporary utility buffer for storing the modified textoffset.
- * @param textoffset offset in the collation element text.
- * @return true if the match is valid, false otherwise
+
+ /*
+ * Find the next break boundary after startIndex. If the UStringSearch object
+ * has an external break iterator, use that. Otherwise use the internal character
+ * break iterator.
*/
- private final boolean checkNextExactMatch(int textoffset)
- {
- int start = m_colEIter_.getOffset();
- if (!checkNextExactContractionMatch(start, textoffset)) {
- // returns the modified textoffset
- m_utilBuffer_[0] = m_utilBuffer_[1];
- return false;
- }
-
- start = m_utilBuffer_[0];
- textoffset = m_utilBuffer_[1];
- // this totally matches, however we need to check if it is repeating
- if (!isBreakUnit(start, textoffset)
- || checkRepeatedMatch(start, textoffset)
- || hasAccentsBeforeMatch(start, textoffset)
- || !checkIdentical(start, textoffset)
- || hasAccentsAfterMatch(start, textoffset)) {
- textoffset ++;
- textoffset = getNextBaseOffset(textoffset);
- m_utilBuffer_[0] = textoffset;
- return false;
- }
-
- if (m_collator_.getStrength() == Collator.PRIMARY) {
- textoffset = checkBreakBoundary(textoffset);
- }
-
- // totally match, we will get rid of the ending ignorables.
- m_matchedIndex_ = start;
- matchLength = textoffset - start;
- return true;
- }
-
- /**
- * Getting the previous base character offset, or the current offset if the
- * current character is a base character
- * @param text the source text to work on
- * @param textoffset one offset after the current character
- * @return the offset of the next character after the base character or the
- * first composed character with accents
- */
- private final int getPreviousBaseOffset(CharacterIterator text,
- int textoffset)
- {
- if (textoffset > m_textBeginOffset_) {
- while (true) {
- int result = textoffset;
- text.setIndex(result);
- if (UTF16.isTrailSurrogate(text.previous())) {
- if (text.getIndex() != text.getBeginIndex() &&
- !UTF16.isLeadSurrogate(text.previous())) {
- text.next();
- }
- }
- textoffset = text.getIndex();
- char fcd = getFCD(text, textoffset);
- if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
- if ((fcd & LAST_BYTE_MASK_) != 0) {
- return textoffset;
- }
- return result;
- }
- if (textoffset == m_textBeginOffset_) {
- return m_textBeginOffset_;
- }
- }
+ private int nextBoundaryAfter(int startIndex) {
+ BreakIterator breakiterator = search_.breakIter();
+
+ if (breakiterator == null) {
+ breakiterator = search_.internalBreakIter_;
}
- return textoffset;
- }
-
- /**
- * Getting the indexes of the accents that are not blocked in the argument
- * accent array
- * @param accents accents in nfd.
- * @param accentsindex array to store the indexes of accents in accents that
- * are not blocked
- * @return the length of populated accentsindex
- */
- private int getUnblockedAccentIndex(StringBuilder accents,
- int accentsindex[])
- {
- int index = 0;
- int length = accents.length();
- int cclass = 0;
- int result = 0;
- while (index < length) {
- int codepoint = UTF16.charAt(accents, index);
- int tempclass = UCharacter.getCombiningClass(codepoint);
- if (tempclass != cclass) {
- cclass = tempclass;
- accentsindex[result] = index;
- result ++;
- }
- if (UCharacter.isSupplementary(codepoint)) {
- index += 2;
- }
- else {
- index ++;
- }
+
+ if (breakiterator != null) {
+ return breakiterator.following(startIndex);
}
- accentsindex[result] = length;
- return result;
+
+ return startIndex;
}
- /**
- * Appends 3 StringBuilder/CharacterIterator together into a destination
- * string buffer.
- * @param source1 string buffer
- * @param source2 character iterator
- * @param start2 start of the character iterator to merge
- * @param end2 end of the character iterator to merge
- * @param source3 string buffer
- * @return appended string buffer
+ /*
+ * Returns TRUE if index is on a break boundary. If the UStringSearch
+ * has an external break iterator, test using that, otherwise test
+ * using the internal character break iterator.
*/
- private static final StringBuilder merge(StringBuilder source1,
- CharacterIterator source2,
- int start2, int end2,
- StringBuilder source3)
- {
- StringBuilder result = new StringBuilder();
- if (source1 != null && source1.length() != 0) {
- result.append(source1);
- }
- source2.setIndex(start2);
- while (source2.getIndex() < end2) {
- result.append(source2.current());
- source2.next();
- }
- if (source3 != null && source3.length() != 0) {
- result.append(source3);
+ private boolean isBreakBoundary(int index) {
+ BreakIterator breakiterator = search_.breakIter();
+
+ if (breakiterator == null) {
+ breakiterator = search_.internalBreakIter_;
}
- return result;
+
+ return (breakiterator != null && breakiterator.isBoundary(index));
}
-
- /**
- * Running through a collation element iterator to see if the contents
- * matches pattern in string search data
- * @param coleiter collation element iterator to test
- * @return true if a match if found, false otherwise
- */
- private final boolean checkCollationMatch(CollationElementIterator coleiter)
- {
- int patternceindex = m_pattern_.m_CELength_;
- int offset = 0;
- while (patternceindex > 0) {
- int ce = getCE(coleiter.next());
- if (ce == CollationElementIterator.IGNORABLE) {
- continue;
- }
- if (ce != m_pattern_.m_CE_[offset]) {
- return false;
- }
- offset ++;
- patternceindex --;
+
+
+ // Java porting note: the following constants correspond to the UCompareCEsResult enum
+ private static final int CE_MATCH = -1;
+ private static final int CE_NO_MATCH = 0;
+ private static final int CE_SKIP_TARG = 1;
+ private static final int CE_SKIP_PATN = 2;
+
+ private static int CE_LEVEL2_BASE = 0x00000005;
+ private static int CE_LEVEL3_BASE = 0x00050000;
+
+ private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) {
+ if (targCE == patCE) {
+ return CE_MATCH;
}
- return true;
- }
-
- /**
- * Rearranges the front accents to try matching.
- * Prefix accents in the text will be grouped according to their combining
- * class and the groups will be mixed and matched to try find the perfect
- * match with the pattern.
- * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
- * step 1: split "\u030A\u0301" into 6 other type of potential accent
- * substrings "\u030A", "\u0301", "\u0325", "\u030A\u0301",
- * "\u030A\u0325", "\u0301\u0325".
- * step 2: check if any of the generated substrings matches the pattern.
- * Internal method, status is assumed to be success, caller has to check
- * status before calling this method.
- * @param start first offset of the accents to start searching
- * @param end start of the last accent set
- * @return DONE if a match is not found, otherwise return the starting
- * offset of the match. Note this start includes all preceding
- * accents.
- */
- private int doNextCanonicalPrefixMatch(int start, int end)
- {
- if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) {
- // die... failed at a base character
- return DONE;
+ if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
+ return CE_NO_MATCH;
}
-
- start = targetText.getIndex(); // index changed by fcd
- int offset = getNextBaseOffset(targetText, start);
- start = getPreviousBaseOffset(start);
-
- StringBuilder accents = new StringBuilder();
- String accentstr = getString(targetText, start, offset - start);
- // normalizing the offensive string
- if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
- == Normalizer.NO) {
- accentstr = Normalizer.decompose(accentstr, false);
- }
- accents.append(accentstr);
-
- int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
- int accentsize = getUnblockedAccentIndex(accents, accentsindex);
- int count = (2 << (accentsize - 1)) - 1;
- while (count > 0) {
- // copy the base characters
- m_canonicalPrefixAccents_.delete(0,
- m_canonicalPrefixAccents_.length());
- int k = 0;
- for (; k < accentsindex[0]; k ++) {
- m_canonicalPrefixAccents_.append(accents.charAt(k));
+
+ long targCEshifted = targCE >>> 32;
+ long patCEshifted = patCE >>> 32;
+ long mask;
+
+ mask = 0xFFFF0000L;
+ int targLev1 = (int)(targCEshifted & mask);
+ int patLev1 = (int)(patCEshifted & mask);
+ if (targLev1 != patLev1) {
+ if (targLev1 == 0) {
+ return CE_SKIP_TARG;
}
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= accentsize - 1; i ++) {
- int mask = 1 << (accentsize - i - 1);
- if ((count & mask) != 0) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1];
- j ++) {
- m_canonicalPrefixAccents_.append(accents.charAt(j));
- }
- }
+ if (patLev1 == 0
+ && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
+ return CE_SKIP_PATN;
}
- StringBuilder match = merge(m_canonicalPrefixAccents_,
- targetText, offset, end,
- m_canonicalSuffixAccents_);
-
- // if status is a failure, ucol_setText does nothing.
- // run the collator iterator through this match
- m_utilColEIter_.setText(match.toString());
- if (checkCollationMatch(m_utilColEIter_)) {
- return start;
+ return CE_NO_MATCH;
+ }
+
+ mask = 0x0000FFFFL;
+ int targLev2 = (int)(targCEshifted & mask);
+ int patLev2 = (int)(patCEshifted & mask);
+ if (targLev2 != patLev2) {
+ if (targLev2 == 0) {
+ return CE_SKIP_TARG;
}
- count --;
+ if (patLev2 == 0
+ && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
+ return CE_SKIP_PATN;
+ }
+ return (patLev2 == CE_LEVEL2_BASE ||
+ (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
+ targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH;
}
- return DONE;
- }
- /**
- * Gets the offset to the safe point in text before textoffset.
- * ie. not the middle of a contraction, swappable characters or
- * supplementary characters.
- * @param start offset in string
- * @param textoffset offset in string
- * @return offset to the previous safe character
- */
- private final int getPreviousSafeOffset(int start, int textoffset)
- {
- int result = textoffset; // first contraction character
- targetText.setIndex(textoffset);
- while (result >= start && m_collator_.isUnsafe(targetText.previous())) {
- result = targetText.getIndex();
- }
- if (result != start) {
- // the first contraction character is consider unsafe here
- result = targetText.getIndex(); // originally result --;
- }
- return result;
+ mask = 0xFFFF0000L;
+ int targLev3 = (int)(targCE & mask);
+ int patLev3 = (int)(patCE & mask);
+ if (targLev3 != patLev3) {
+ return (patLev3 == CE_LEVEL3_BASE ||
+ (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
+ targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH;
+ }
+
+ return CE_MATCH;
}
/**
- * Take the rearranged end accents and tries matching. If match failed at
- * a seperate preceding set of accents (seperated from the rearranged on by
- * at least a base character) then we rearrange the preceding accents and
- * tries matching again.
- * We allow skipping of the ends of the accent set if the ces do not match.
- * However if the failure is found before the accent set, it fails.
- * Internal method, status assumed to be success, caller has to check
- * status before calling this method.
- * @param textoffset of the start of the rearranged accent
- * @return DONE if a match is not found, otherwise return the starting
- * offset of the match. Note this start includes all preceding
- * accents.
+ * An object used for receiving matched index in search() and
+ * searchBackwards().
*/
- private int doNextCanonicalSuffixMatch(int textoffset)
- {
- int safelength = 0;
- StringBuilder safetext;
- int safeoffset = m_textBeginOffset_;
-
- if (textoffset != m_textBeginOffset_
- && m_canonicalSuffixAccents_.length() > 0
- && m_collator_.isUnsafe(m_canonicalSuffixAccents_.charAt(0))) {
- safeoffset = getPreviousSafeOffset(m_textBeginOffset_,
- textoffset);
- safelength = textoffset - safeoffset;
- safetext = merge(null, targetText, safeoffset, textoffset,
- m_canonicalSuffixAccents_);
- }
- else {
- safetext = m_canonicalSuffixAccents_;
+ private static class Match {
+ int start_ = -1;
+ int limit_ = -1;
+ }
+
+ private boolean search(int startIdx, Match m) {
+ // Input parameter sanity check.
+ if (pattern_.CELength_ == 0
+ || startIdx < search_.beginIndex()
+ || startIdx > search_.endIndex()) {
+ throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " +
+ search_.beginIndex() + " and " + search_.endIndex());
}
-
- // if status is a failure, ucol_setText does nothing
- CollationElementIterator coleiter = m_utilColEIter_;
- coleiter.setText(safetext.toString());
- // status checked in loop below
-
- int ceindex = m_pattern_.m_CELength_ - 1;
- boolean isSafe = true; // indication flag for position in safe zone
-
- while (ceindex >= 0) {
- int textce = coleiter.previous();
- if (textce == CollationElementIterator.NULLORDER) {
- // check if we have passed the safe buffer
- if (coleiter == m_colEIter_) {
- return DONE;
- }
- coleiter = m_colEIter_;
- if (safetext != m_canonicalSuffixAccents_) {
- safetext.delete(0, safetext.length());
- }
- coleiter.setExactOffset(safeoffset);
- // status checked at the start of the loop
- isSafe = false;
- continue;
+
+ if (pattern_.PCE_ == null) {
+ initializePatternPCETable();
+ }
+
+ textIter_.setOffset(startIdx);
+ CEBuffer ceb = new CEBuffer(this);
+
+ int targetIx = 0;
+ CEI targetCEI = null;
+ int patIx;
+ boolean found;
+
+ int mStart = -1;
+ int mLimit = -1;
+ int minLimit;
+ int maxLimit;
+
+ // Outer loop moves over match starting positions in the
+ // target CE space.
+ // Here we see the target as a sequence of collation elements, resulting from the following:
+ // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
+ // (for example, digraphs such as IJ may be broken into two characters).
+ // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
+ // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
+ // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
+ // CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary),
+ // then the CE is deleted, so the following code sees only CEs that are relevant.
+ // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
+ // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
+ // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
+ for (targetIx = 0; ; targetIx++) {
+ found = true;
+ // Inner loop checks for a match beginning at each
+ // position from the outer loop.
+ int targetIxOffset = 0;
+ long patCE = 0;
+ // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
+ // (compared to the last CE fetched for the previous targetIx value) as we need to go
+ // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
+ CEI firstCEI = ceb.get(targetIx);
+ if (firstCEI == null) {
+ throw new RuntimeException("CEBuffer.get(" + targetIx + ") returned null.");
}
- textce = getCE(textce);
- if (textce != CollationElementIterator.IGNORABLE
- && textce != m_pattern_.m_CE_[ceindex]) {
- // do the beginning stuff
- int failedoffset = coleiter.getOffset();
- if (isSafe && failedoffset >= safelength) {
- // alas... no hope. failed at rearranged accent set
- return DONE;
- }
- else {
- if (isSafe) {
- failedoffset += safeoffset;
- }
-
- // try rearranging the front accents
- int result = doNextCanonicalPrefixMatch(failedoffset,
- textoffset);
- if (result != DONE) {
- // if status is a failure, ucol_setOffset does nothing
- m_colEIter_.setExactOffset(result);
+
+ for (patIx = 0; patIx < pattern_.PCELength_; patIx++) {
+ patCE = pattern_.PCE_[patIx];
+ targetCEI = ceb.get(targetIx + patIx + targetIxOffset);
+ // Compare CE from target string with CE from the pattern.
+ // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
+ // which will fail the compare, below.
+ int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
+ if (ceMatch == CE_NO_MATCH) {
+ found = false;
+ break;
+ } else if (ceMatch > CE_NO_MATCH) {
+ if (ceMatch == CE_SKIP_TARG) {
+ // redo with same patCE, next targCE
+ patIx--;
+ targetIxOffset++;
+ } else { // ceMatch == CE_SKIP_PATN
+ // redo with same targCE, next patCE
+ targetIxOffset--;
}
- return result;
}
}
- if (textce == m_pattern_.m_CE_[ceindex]) {
- ceindex --;
- }
- }
- // set offset here
- if (isSafe) {
- int result = coleiter.getOffset();
- // sets the text iterator with the correct expansion and offset
- int leftoverces = coleiter.m_CEBufferOffset_;
- if (result >= safelength) {
- result = textoffset;
- }
- else {
- result += safeoffset;
+ targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far
+
+ if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
+ // No match at this targetIx. Try again at the next.
+ continue;
}
- m_colEIter_.setExactOffset(result);
- m_colEIter_.m_CEBufferOffset_ = leftoverces;
- return result;
- }
-
- return coleiter.getOffset();
- }
-
- /**
- * Trying out the substring and sees if it can be a canonical match.
- * This will try normalizing the end accents and arranging them into
- * canonical equivalents and check their corresponding ces with the pattern
- * ce.
- * Suffix accents in the text will be grouped according to their combining
- * class and the groups will be mixed and matched to try find the perfect
- * match with the pattern.
- * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
- * step 1: split "\u030A\u0301" into 6 other type of potential accent
- * substrings
- * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
- * "\u0301\u0325".
- * step 2: check if any of the generated substrings matches the pattern.
- * @param textoffset end offset in the collation element text that ends with
- * the accents to be rearranged
- * @return true if the match is valid, false otherwise
- */
- private boolean doNextCanonicalMatch(int textoffset)
- {
- int offset = m_colEIter_.getOffset();
- targetText.setIndex(textoffset);
- if (UTF16.isTrailSurrogate(targetText.previous())
- && targetText.getIndex() > m_textBeginOffset_) {
- if (!UTF16.isLeadSurrogate(targetText.previous())) {
- targetText.next();
+
+ if (!found) {
+ // No match at all, we have run off the end of the target text.
+ break;
}
- }
- if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
- if (m_pattern_.m_hasPrefixAccents_) {
- offset = doNextCanonicalPrefixMatch(offset, textoffset);
- if (offset != DONE) {
- m_colEIter_.setExactOffset(offset);
- return true;
+
+ // We have found a match in CE space.
+ // Now determine the bounds in string index space.
+ // There still is a chance of match failure if the CE range does not correspond to
+ // an acceptable character range.
+ //
+ CEI lastCEI = ceb.get(targetIx + targetIxOffset -1);
+
+ mStart = firstCEI.lowIndex_;
+ minLimit = lastCEI.lowIndex_;
+
+ // Look at the CE following the match. If it is UCOL_NULLORDER the match
+ // extended to the end of input, and the match is good.
+
+ // Look at the high and low indices of the CE following the match. If
+ // they are the same it means one of two things:
+ // 1. The match extended to the last CE from the target text, which is OK, or
+ // 2. The last CE that was part of the match is in an expansion that extends
+ // to the first CE after the match. In this case, we reject the match.
+ CEI nextCEI = null;
+ if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
+ nextCEI = ceb.get(targetIx + targetIxOffset);
+ maxLimit = nextCEI.lowIndex_;
+ if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
+ found = false;
+ }
+ } else {
+ for (;; ++targetIxOffset) {
+ nextCEI = ceb.get(targetIx + targetIxOffset);
+ maxLimit = nextCEI.lowIndex_;
+ // If we are at the end of the target too, match succeeds
+ if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) {
+ break;
+ }
+ // As long as the next CE has primary weight of 0,
+ // it is part of the last target element matched by the pattern;
+ // make sure it can be part of a match with the last patCE
+ if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) {
+ int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_);
+ if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) {
+ found = false;
+ break;
+ }
+ // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
+ // target element, but it has non-zero primary weight => match fails
+ } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) {
+ found = false;
+ break;
+ // Else the target CE is not part of an expansion of the last matched element, match succeeds
+ } else {
+ break;
+ }
}
}
- return false;
- }
-
- if (!m_pattern_.m_hasSuffixAccents_) {
- return false;
- }
-
- StringBuilder accents = new StringBuilder();
- // offset to the last base character in substring to search
- int baseoffset = getPreviousBaseOffset(targetText, textoffset);
- // normalizing the offensive string
- String accentstr = getString(targetText, baseoffset,
- textoffset - baseoffset);
- if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
- == Normalizer.NO) {
- accentstr = Normalizer.decompose(accentstr, false);
- }
- accents.append(accentstr);
- // status checked in loop below
-
- int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
- int size = getUnblockedAccentIndex(accents, accentsindex);
-
- // 2 power n - 1 plus the full set of accents
- int count = (2 << (size - 1)) - 1;
- while (count > 0) {
- m_canonicalSuffixAccents_.delete(0,
- m_canonicalSuffixAccents_.length());
- // copy the base characters
- for (int k = 0; k < accentsindex[0]; k ++) {
- m_canonicalSuffixAccents_.append(accents.charAt(k));
+
+ // Check for the start of the match being within a combining sequence.
+ // This can happen if the pattern itself begins with a combining char, and
+ // the match found combining marks in the target text that were attached
+ // to something else.
+ // This type of match should be rejected for not completely consuming a
+ // combining sequence.
+ if (!isBreakBoundary(mStart)) {
+ found = false;
+ }
+
+ // Check for the start of the match being within an Collation Element Expansion,
+ // meaning that the first char of the match is only partially matched.
+ // With expansions, the first CE will report the index of the source
+ // character, and all subsequent (expansions) CEs will report the source index of the
+ // _following_ character.
+ int secondIx = firstCEI.highIndex_;
+ if (mStart == secondIx) {
+ found = false;
}
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= size - 1; i ++) {
- int mask = 1 << (size - i - 1);
- if ((count & mask) != 0) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1];
- j ++) {
- m_canonicalSuffixAccents_.append(accents.charAt(j));
+
+ // Advance the match end position to the first acceptable match boundary.
+ // This advances the index over any combining characters.
+ mLimit = maxLimit;
+ if (minLimit < maxLimit) {
+ // When the last CE's low index is the same as its high index, the CE is likely
+ // part of an expansion. In this case, the index is located just after the
+ // character corresponding to the CEs compared above. If the index is right
+ // at the break boundary, moving the position to the next boundary would result
+ // in an incorrect match length when ignorable characters exist between
+ // the position and the next character that produces CE(s). See ticket#8482.
+ if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) {
+ mLimit = minLimit;
+ } else {
+ int nba = nextBoundaryAfter(minLimit);
+ if (nba >= lastCEI.highIndex_) {
+ mLimit = nba;
}
}
}
- offset = doNextCanonicalSuffixMatch(baseoffset);
- if (offset != DONE) {
- return true; // match found
+
+ // If advancing to the end of a combining sequence in character indexing space
+ // advanced us beyond the end of the match in CE space, reject this match.
+ if (mLimit > maxLimit) {
+ found = false;
}
- count --;
- }
- return false;
- }
-
- /**
- * Gets the previous base character offset depending on the string search
- * pattern data
- * @param strsrch string search data
- * @param textoffset current offset, current character
- * @return the offset of the next character after this base character or
- * itself if it is a composed character with accents
- */
- private final int getPreviousBaseOffset(int textoffset)
- {
- if (m_pattern_.m_hasPrefixAccents_ && textoffset > m_textBeginOffset_) {
- int offset = textoffset;
- if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
- return getPreviousBaseOffset(targetText, textoffset);
+
+ if (!isBreakBoundary(mLimit)) {
+ found = false;
+ }
+
+ if (!checkIdentical(mStart, mLimit)) {
+ found = false;
+ }
+
+ if (found) {
+ break;
}
}
- return textoffset;
+
+ // All Done. Store back the match bounds to the caller.
+ //
+ if (found == false) {
+ mLimit = -1;
+ mStart = -1;
+ }
+
+ if (m != null) {
+ m.start_ = mStart;
+ m.limit_ = mLimit;
+ }
+
+ return found;
}
-
- /**
- * Checks match for contraction.
- * If the match ends with a partial contraction we fail.
- * If the match starts too far off (because of backwards iteration) we try
- * to chip off the extra characters.
- * Uses the temporary util buffer for return values of the modified start
- * and end.
- * @param start offset of potential match, to be modified if necessary
- * @param end offset of potential match, to be modified if necessary
- * @return true if match passes the contraction test, false otherwise.
- */
- private boolean checkNextCanonicalContractionMatch(int start, int end)
- {
- // This part checks if either ends of the match contains potential
- // contraction. If so we'll have to iterate through them
- char schar = 0;
- char echar = 0;
- if (end < m_textLimitOffset_) {
- targetText.setIndex(end);
- echar = targetText.current();
- }
- if (start < m_textLimitOffset_) {
- targetText.setIndex(start + 1);
- schar = targetText.current();
- }
- if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
- int expansion = m_colEIter_.m_CEBufferOffset_;
- boolean hasExpansion = expansion > 0;
- m_colEIter_.setExactOffset(start);
- int temp = start;
- while (expansion > 0) {
- // getting rid of the redundant ce, caused by setOffset.
- // since backward contraction/expansion may have extra ces if
- // we are in the normalization buffer, hasAccentsBeforeMatch
- // would have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but
- // if we are only looking for acute and ring \u030A and \u0301,
- // we'll have to skip the first ce in the expansion buffer.
- m_colEIter_.next();
- if (m_colEIter_.getOffset() != temp) {
- start = temp;
- temp = m_colEIter_.getOffset();
+
+ private boolean searchBackwards(int startIdx, Match m) {
+ //ICU4C_TODO comment: reject search patterns beginning with a combining char.
+
+ // Input parameter sanity check.
+ if (pattern_.CELength_ == 0
+ || startIdx < search_.beginIndex()
+ || startIdx > search_.endIndex()) {
+ throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " +
+ search_.beginIndex() + " and " + search_.endIndex());
+ }
+
+ if (pattern_.PCE_ == null) {
+ initializePatternPCETable();
+ }
+
+ CEBuffer ceb = new CEBuffer(this);
+ int targetIx = 0;
+
+ /*
+ * Pre-load the buffer with the CE's for the grapheme
+ * after our starting position so that we're sure that
+ * we can look at the CE following the match when we
+ * check the match boundaries.
+ *
+ * This will also pre-fetch the first CE that we'll
+ * consider for the match.
+ */
+ if (startIdx < search_.endIndex()) {
+ BreakIterator bi = search_.internalBreakIter_;
+ int next = bi.following(startIdx);
+
+ textIter_.setOffset(next);
+
+ for (targetIx = 0; ; targetIx++) {
+ if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) {
+ break;
}
- expansion --;
}
-
- int count = 0;
- while (count < m_pattern_.m_CELength_) {
- int ce = getCE(m_colEIter_.next());
- // status checked below, note that if status is a failure
- // ucol_next returns UCOL_NULLORDER
- if (ce == CollationElementIterator.IGNORABLE) {
- continue;
+ } else {
+ textIter_.setOffset(startIdx);
+ }
+
+ CEI targetCEI = null;
+ int patIx;
+ boolean found;
+
+ int limitIx = targetIx;
+ int mStart = -1;
+ int mLimit = -1;
+ int minLimit;
+ int maxLimit;
+
+ // Outer loop moves over match starting positions in the
+ // target CE space.
+ // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
+ // But patIx is 0 at the beginning of the pattern and increases toward the end.
+ // So this loop performs a comparison starting with the end of the pattern, and proceeds toward the beginning of the pattern
+ // and the beginning of the base text.
+ for (targetIx = limitIx; ; targetIx++) {
+ found = true;
+ // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
+ // (compared to the last CE fetched for the previous targetIx value) as we need to go
+ // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
+ CEI lastCEI = ceb.getPrevious(targetIx);
+ if (lastCEI == null) {
+ throw new RuntimeException("CEBuffer.getPrevious(" + targetIx + ") returned null.");
+ }
+ // Inner loop checks for a match beginning at each
+ // position from the outer loop.
+ int targetIxOffset = 0;
+ for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) {
+ long patCE = pattern_.PCE_[patIx];
+
+ targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset);
+ // Compare CE from target string with CE from the pattern.
+ // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
+ // which will fail the compare, below.
+ int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
+ if (ceMatch == CE_NO_MATCH) {
+ found = false;
+ break;
+ } else if (ceMatch > CE_NO_MATCH) {
+ if (ceMatch == CE_SKIP_TARG) {
+ // redo with same patCE, next targCE
+ patIx++;
+ targetIxOffset++;
+ } else { // ceMatch == CE_SKIP_PATN
+ // redo with same targCE, next patCE
+ targetIxOffset--;
+ }
}
- if (hasExpansion && count == 0
- && m_colEIter_.getOffset() != temp) {
- start = temp;
- temp = m_colEIter_.getOffset();
+ }
+
+ if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
+ // No match at this targetIx. Try again at the next.
+ continue;
+ }
+
+ if (!found) {
+ // No match at all, we have run off the end of the target text.
+ break;
+ }
+
+ // We have found a match in CE space.
+ // Now determine the bounds in string index space.
+ // There still is a chance of match failure if the CE range does not correspond to
+ // an acceptable character range.
+ //
+ CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset);
+ mStart = firstCEI.lowIndex_;
+
+ // Check for the start of the match being within a combining sequence.
+ // This can happen if the pattern itself begins with a combining char, and
+ // the match found combining marks in the target text that were attached
+ // to something else.
+ // This type of match should be rejected for not completely consuming a
+ // combining sequence.
+ if (!isBreakBoundary(mStart)) {
+ found = false;
+ }
+
+ // Look at the high index of the first CE in the match. If it's the same as the
+ // low index, the first CE in the match is in the middle of an expansion.
+ if (mStart == firstCEI.highIndex_) {
+ found = false;
+ }
+
+ minLimit = lastCEI.lowIndex_;
+
+ if (targetIx > 0) {
+ // Look at the CE following the match. If it is UCOL_NULLORDER the match
+ // extended to the end of input, and the match is good.
+
+ // Look at the high and low indices of the CE following the match. If
+ // they are the same it means one of two things:
+ // 1. The match extended to the last CE from the target text, which is OK, or
+ // 2. The last CE that was part of the match is in an expansion that extends
+ // to the first CE after the match. In this case, we reject the match.
+ CEI nextCEI = ceb.getPrevious(targetIx - 1);
+
+ if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
+ found = false;
}
-
- if (count == 0 && ce != m_pattern_.m_CE_[0]) {
- // accents may have extra starting ces, this occurs when a
- // pure accent pattern is matched without rearrangement
- // text \u0325\u0300 and looking for \u0300
- int expected = m_pattern_.m_CE_[0];
- if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) {
- ce = getCE(m_colEIter_.next());
- while (ce != expected
- && ce != CollationElementIterator.NULLORDER
- && m_colEIter_.getOffset() <= end) {
- ce = getCE(m_colEIter_.next());
- }
+
+ mLimit = maxLimit = nextCEI.lowIndex_;
+
+ // Advance the match end position to the first acceptable match boundary.
+ // This advances the index over any combining characters.
+ if (minLimit < maxLimit) {
+ int nba = nextBoundaryAfter(minLimit);
+
+ if (nba >= lastCEI.highIndex_) {
+ mLimit = nba;
}
}
- if (ce != m_pattern_.m_CE_[count]) {
- end ++;
- end = getNextBaseOffset(end);
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
- return false;
+
+ // If advancing to the end of a combining sequence in character indexing space
+ // advanced us beyond the end of the match in CE space, reject this match.
+ if (mLimit > maxLimit) {
+ found = false;
}
- count ++;
+
+ // Make sure the end of the match is on a break boundary
+ if (!isBreakBoundary(mLimit)) {
+ found = false;
+ }
+
+ } else {
+ // No non-ignorable CEs after this point.
+ // The maximum position is detected by boundary after
+ // the last non-ignorable CE. Combining sequence
+ // across the start index will be truncated.
+ int nba = nextBoundaryAfter(minLimit);
+ mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
}
- }
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
- return true;
- }
- /**
- * Checks and sets the match information if found.
- * Checks
- * <ul>
- * <li> the potential match does not repeat the previous match
- * <li> boundaries are correct
- * <li> potential match does not end in the middle of a contraction
- * <li> identical matches
- * </ul>
- * Otherwise the offset will be shifted to the next character.
- * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
- * more fitting result value.
- * Uses the temporary utility buffer for storing the modified textoffset.
- * @param textoffset offset in the collation element text.
- * @return true if the match is valid, false otherwise
- */
- private boolean checkNextCanonicalMatch(int textoffset)
- {
- // to ensure that the start and ends are not composite characters
- // if we have a canonical accent match
- if ((m_pattern_.m_hasSuffixAccents_
- && m_canonicalSuffixAccents_.length() != 0) ||
- (m_pattern_.m_hasPrefixAccents_
- && m_canonicalPrefixAccents_.length() != 0)) {
- m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_.getOffset());
- matchLength = textoffset - m_matchedIndex_;
- return true;
+ if (!checkIdentical(mStart, mLimit)) {
+ found = false;
+ }
+
+ if (found) {
+ break;
+ }
}
-
- int start = m_colEIter_.getOffset();
- if (!checkNextCanonicalContractionMatch(start, textoffset)) {
- // return the modified textoffset
- m_utilBuffer_[0] = m_utilBuffer_[1];
- return false;
+
+ // All Done. Store back the match bounds to the caller.
+ //
+ if (found == false) {
+ mLimit = -1;
+ mStart = -1;
}
- start = m_utilBuffer_[0];
- textoffset = m_utilBuffer_[1];
- start = getPreviousBaseOffset(start);
- // this totally matches, however we need to check if it is repeating
- if (checkRepeatedMatch(start, textoffset)
- || !isBreakUnit(start, textoffset)
- || !checkIdentical(start, textoffset)) {
- textoffset ++;
- textoffset = getNextBaseOffset(targetText, textoffset);
- m_utilBuffer_[0] = textoffset;
- return false;
+
+ if (m != null) {
+ m.start_ = mStart;
+ m.limit_ = mLimit;
}
-
- m_matchedIndex_ = start;
- matchLength = textoffset - start;
- return true;
+
+ return found;
}
-
- /**
- * Shifting the collation element iterator position forward to prepare for
- * a preceding match. If the first character is a unsafe character, we'll
- * only shift by 1 to capture contractions, normalization etc.
- * @param textoffset start text position to do search
- * @param ce the text ce which failed the match.
- * @param patternceindex index of the ce within the pattern ce buffer which
- * failed the match
- * @return final offset
- */
- private int reverseShift(int textoffset, int ce, int patternceindex)
- {
- if (isOverlapping()) {
- if (textoffset != m_textLimitOffset_) {
- textoffset --;
- }
- else {
- textoffset -= m_pattern_.m_defaultShiftSize_;
- }
+
+ // Java porting note:
+ //
+ // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical()
+ // for the linear search implementation. The differences are addressed in search().
+ //
+ private boolean handleNextExact() {
+ return handleNextCommonImpl();
+ }
+
+ private boolean handleNextCanonical() {
+ return handleNextCommonImpl();
+ }
+
+ private boolean handleNextCommonImpl() {
+ int textOffset = textIter_.getOffset();
+ Match match = new Match();
+
+ if (search(textOffset, match)) {
+ search_.matchedIndex_ = match.start_;
+ search_.setMatchedLength(match.limit_ - match.start_);
+ return true;
+ } else {
+ setMatchNotFound();
+ return false;
}
- else {
- if (ce != CollationElementIterator.NULLORDER) {
- int shift = m_pattern_.m_backShift_[hash(ce)];
-
- // this is to adjust for characters in the middle of the substring
- // for matching that failed.
- int adjust = patternceindex;
- if (adjust > 1 && shift > adjust) {
- shift -= adjust - 1;
- }
- textoffset -= shift;
- }
- else {
- textoffset -= m_pattern_.m_defaultShiftSize_;
- }
- }
-
- textoffset = getPreviousBaseOffset(textoffset);
- return textoffset;
}
- /**
- * Checks match for contraction.
- * If the match starts with a partial contraction we fail.
- * Uses the temporary utility buffer to return the modified start and end.
- * @param start offset of potential match, to be modified if necessary
- * @param end offset of potential match, to be modified if necessary
- * @return true if match passes the contraction test, false otherwise.
- */
- private boolean checkPreviousExactContractionMatch(int start, int end)
- {
- // This part checks if either ends of the match contains potential
- // contraction. If so we'll have to iterate through them
- char echar = 0;
- if (end < m_textLimitOffset_) {
- targetText.setIndex(end);
- echar = targetText.current();
- }
- char schar = 0;
- if (start + 1 < m_textLimitOffset_) {
- targetText.setIndex(start + 1);
- schar = targetText.current();
- }
- if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
- // expansion suffix, what's left to iterate
- int expansion = m_colEIter_.m_CEBufferSize_
- - m_colEIter_.m_CEBufferOffset_;
- boolean hasExpansion = expansion > 0;
- m_colEIter_.setExactOffset(end);
- int temp = end;
- while (expansion > 0) {
- // getting rid of the redundant ce
- // since forward contraction/expansion may have extra ces
- // if we are in the normalization buffer, hasAccentsBeforeMatch
- // would have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but if
- // we are only looking for A ring A\u030A, we'll have to skip the
- // last ce in the expansion buffer
- m_colEIter_.previous();
- if (m_colEIter_.getOffset() != temp) {
- end = temp;
- temp = m_colEIter_.getOffset();
- }
- expansion --;
- }
-
- int count = m_pattern_.m_CELength_;
- while (count > 0) {
- int ce = getCE(m_colEIter_.previous());
- // status checked below, note that if status is a failure
- // ucol_previous returns UCOL_NULLORDER
- if (ce == CollationElementIterator.IGNORABLE) {
- continue;
- }
- if (hasExpansion && count == 0
- && m_colEIter_.getOffset() != temp) {
- end = temp;
- temp = m_colEIter_.getOffset();
- }
- if (ce != m_pattern_.m_CE_[count - 1]) {
- start --;
- start = getPreviousBaseOffset(targetText, start);
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
+ // Java porting note:
+ //
+ // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical()
+ // for the linear search implementation. The differences are addressed in searchBackwards().
+ //
+ private boolean handlePreviousExact() {
+ return handlePreviousCommonImpl();
+ }
+
+ private boolean handlePreviousCanonical() {
+ return handlePreviousCommonImpl();
+ }
+
+ private boolean handlePreviousCommonImpl() {
+ int textOffset;
+
+ if (search_.isOverlap_) {
+ if (search_.matchedIndex_ != DONE) {
+ textOffset = search_.matchedIndex_ + search_.matchedLength() - 1;
+ } else {
+ // move the start position at the end of possible match
+ initializePatternPCETable();
+ if (!initTextProcessedIter()) {
+ setMatchNotFound();
return false;
}
- count --;
+ for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) {
+ long pce = textProcessedIter_.nextProcessed(null);
+ if (pce == CollationPCE.PROCESSED_NULLORDER) {
+ // at the end of the text
+ break;
+ }
+ }
+ textOffset = textIter_.getOffset();
}
- }
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
- return true;
- }
-
- /**
- * Checks and sets the match information if found.
- * Checks
- * <ul>
- * <li> the current match does not repeat the last match
- * <li> boundaries are correct
- * <li> exact matches has no extra accents
- * <li> identical matches
- * </ul>
- * Otherwise the offset will be shifted to the preceding character.
- * Uses the temporary utility buffer to store the modified textoffset.
- * @param textoffset offset in the collation element text. the returned value
- * will be the truncated start offset of the match or the new start
- * search offset.
- * @return true if the match is valid, false otherwise
- */
- private final boolean checkPreviousExactMatch(int textoffset)
- {
- // to ensure that the start and ends are not composite characters
- int end = m_colEIter_.getOffset();
- if (!checkPreviousExactContractionMatch(textoffset, end)) {
- return false;
+ } else {
+ textOffset = textIter_.getOffset();
}
- textoffset = m_utilBuffer_[0];
- end = m_utilBuffer_[1];
-
- // this totally matches, however we need to check if it is repeating
- // the old match
- if (checkRepeatedMatch(textoffset, end)
- || !isBreakUnit(textoffset, end)
- || hasAccentsBeforeMatch(textoffset, end)
- || !checkIdentical(textoffset, end)
- || hasAccentsAfterMatch(textoffset, end)) {
- textoffset --;
- textoffset = getPreviousBaseOffset(targetText, textoffset);
- m_utilBuffer_[0] = textoffset;
+
+ Match match = new Match();
+ if (searchBackwards(textOffset, match)) {
+ search_.matchedIndex_ = match.start_;
+ search_.setMatchedLength(match.limit_ - match.start_);
+ return true;
+ } else {
+ setMatchNotFound();
return false;
}
-
- if (m_collator_.getStrength() == Collator.PRIMARY) {
- end = checkBreakBoundary(end);
- }
-
- m_matchedIndex_ = textoffset;
- matchLength = end - textoffset;
- return true;
}
/**
- * Rearranges the end accents to try matching.
- * Suffix accents in the text will be grouped according to their combining
- * class and the groups will be mixed and matched to try find the perfect
- * match with the pattern.
- * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
- * step 1: split "\u030A\u0301" into 6 other type of potential accent
- * substrings
- * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
- * "\u0301\u0325".
- * step 2: check if any of the generated substrings matches the pattern.
- * @param start offset of the first base character
- * @param end start of the last accent set
- * @return DONE if a match is not found, otherwise return the ending
- * offset of the match. Note this start includes all following
- * accents.
+ * Gets a substring out of a CharacterIterator
+ *
+ * Java porting note: Not available in ICU4C
+ *
+ * @param text CharacterIterator
+ * @param start start offset
+ * @param length of substring
+ * @return substring from text starting at start and length length
*/
- private int doPreviousCanonicalSuffixMatch(int start, int end)
- {
- targetText.setIndex(end);
- if (UTF16.isTrailSurrogate(targetText.previous())
- && targetText.getIndex() > m_textBeginOffset_) {
- if (!UTF16.isLeadSurrogate(targetText.previous())) {
- targetText.next();
- }
- }
- if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
- // die... failed at a base character
- return DONE;
+ private static final String getString(CharacterIterator text, int start, int length) {
+ StringBuilder result = new StringBuilder(length);
+ int offset = text.getIndex();
+ text.setIndex(start);
+ for (int i = 0; i < length; i++) {
+ result.append(text.current());
+ text.next();
}
- end = getNextBaseOffset(targetText, end);
-
- StringBuilder accents = new StringBuilder();
- int offset = getPreviousBaseOffset(targetText, end);
- // normalizing the offensive string
- String accentstr = getString(targetText, offset, end - offset);
- if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
- == Normalizer.NO) {
- accentstr = Normalizer.decompose(accentstr, false);
- }
- accents.append(accentstr);
-
- int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
- int accentsize = getUnblockedAccentIndex(accents, accentsindex);
- int count = (2 << (accentsize - 1)) - 1;
- while (count > 0) {
- m_canonicalSuffixAccents_.delete(0,
- m_canonicalSuffixAccents_.length());
- // copy the base characters
- for (int k = 0; k < accentsindex[0]; k ++) {
- m_canonicalSuffixAccents_.append(accents.charAt(k));
- }
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= accentsize - 1; i ++) {
- int mask = 1 << (accentsize - i - 1);
- if ((count & mask) != 0) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1];
- j ++) {
- m_canonicalSuffixAccents_.append(accents.charAt(j));
- }
- }
- }
- StringBuilder match = merge(m_canonicalPrefixAccents_, targetText,
- start, offset,
- m_canonicalSuffixAccents_);
- // run the collator iterator through this match
- // if status is a failure ucol_setText does nothing
- m_utilColEIter_.setText(match.toString());
- if (checkCollationMatch(m_utilColEIter_)) {
- return end;
- }
- count --;
+ text.setIndex(offset);
+ return result.toString();
+ }
+
+ /**
+ * Java port of ICU4C struct UPattern (usrchimp.h)
+ */
+ private static final class Pattern {
+ /** Pattern string */
+ String text_;
+
+ long[] PCE_;
+ int PCELength_ = 0;
+
+ // TODO: We probably do not need CE_ / CELength_
+ @SuppressWarnings("unused")
+ int[] CE_;
+ int CELength_ = 0;
+
+ // *** Boyer-Moore ***
+ // boolean hasPrefixAccents_ = false;
+ // boolean hasSuffixAccents_ = false;
+ // int defaultShiftSize_;
+ // char[] shift_;
+ // char[] backShift_;
+
+ protected Pattern(String pattern) {
+ text_ = pattern;
}
- return DONE;
}
-
+
/**
- * Take the rearranged start accents and tries matching. If match failed at
- * a seperate following set of accents (seperated from the rearranged on by
- * at least a base character) then we rearrange the preceding accents and
- * tries matching again.
- * We allow skipping of the ends of the accent set if the ces do not match.
- * However if the failure is found before the accent set, it fails.
- * Internal method, status assumed to be success, caller has to check
- * status before calling this method.
- * @param textoffset of the ends of the rearranged accent
- * @return DONE if a match is not found, otherwise return the ending offset
- * of the match. Note this start includes all following accents.
+ * Java port of ICU4C UCollationPCE (usrchimp.h)
*/
- private int doPreviousCanonicalPrefixMatch(int textoffset)
- {
- // int safelength = 0;
- StringBuilder safetext;
- int safeoffset = textoffset;
-
- if (textoffset > m_textBeginOffset_
- && m_collator_.isUnsafe(m_canonicalPrefixAccents_.charAt(
- m_canonicalPrefixAccents_.length() - 1))) {
- safeoffset = getNextSafeOffset(textoffset, m_textLimitOffset_);
- //safelength = safeoffset - textoffset;
- safetext = merge(m_canonicalPrefixAccents_, targetText, textoffset,
- safeoffset, null);
+ private static class CollationPCE {
+ public static final long PROCESSED_NULLORDER = -1;
+
+ private static final int DEFAULT_BUFFER_SIZE = 16;
+ private static final int BUFFER_GROW = 8;
+
+ // Note: PRIMARYORDERMASK is also duplicated in StringSearch class
+ private static final int PRIMARYORDERMASK = 0xffff0000;
+ private static final int CONTINUATION_MARKER = 0xc0;
+
+ private PCEBuffer pceBuffer_ = new PCEBuffer();
+ private CollationElementIterator cei_;
+ private int strength_;
+ private boolean toShift_;
+ private boolean isShifted_;
+ private int variableTop_;
+
+ public CollationPCE(CollationElementIterator iter) {
+ init(iter);
}
- else {
- safetext = m_canonicalPrefixAccents_;
+
+ public void init(CollationElementIterator iter) {
+ cei_ = iter;
+ init(iter.getRuleBasedCollator());
}
-
- // if status is a failure, ucol_setText does nothing
- CollationElementIterator coleiter = m_utilColEIter_;
- coleiter.setText(safetext.toString());
- // status checked in loop below
-
- int ceindex = 0;
- boolean isSafe = true; // safe zone indication flag for position
- int prefixlength = m_canonicalPrefixAccents_.length();
-
- while (ceindex < m_pattern_.m_CELength_) {
- int textce = coleiter.next();
- if (textce == CollationElementIterator.NULLORDER) {
- // check if we have passed the safe buffer
- if (coleiter == m_colEIter_) {
- return DONE;
+
+ private void init(RuleBasedCollator coll) {
+ strength_ = coll.getStrength();
+ toShift_ = coll.isAlternateHandlingShifted();
+ isShifted_ = false;
+ variableTop_ = coll.getVariableTop();
+ }
+
+ @SuppressWarnings("fallthrough")
+ private long processCE(int ce) {
+ long primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
+
+ // This is clean, but somewhat slow...
+ // We could apply the mask to ce and then
+ // just get all three orders...
+ switch (strength_) {
+ default:
+ tertiary = CollationElementIterator.tertiaryOrder(ce);
+ /* note fall-through */
+
+ case Collator.SECONDARY:
+ secondary = CollationElementIterator.secondaryOrder(ce);
+ /* note fall-through */
+
+ case Collator.PRIMARY:
+ primary = CollationElementIterator.primaryOrder(ce);
+ }
+
+ // **** This should probably handle continuations too. ****
+ // **** That means that we need 24 bits for the primary ****
+ // **** instead of the 16 that we're currently using. ****
+ // **** So we can lay out the 64 bits as: 24.12.12.16. ****
+ // **** Another complication with continuations is that ****
+ // **** the *second* CE is marked as a continuation, so ****
+ // **** we always have to peek ahead to know how long ****
+ // **** the primary is... ****
+ if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) {
+
+ if (primary == 0) {
+ return CollationElementIterator.IGNORABLE;
}
- if (safetext != m_canonicalPrefixAccents_) {
- safetext.delete(0, safetext.length());
+
+ if (strength_ >= Collator.QUATERNARY) {
+ quaternary = primary;
}
- coleiter = m_colEIter_;
- coleiter.setExactOffset(safeoffset);
- // status checked at the start of the loop
- isSafe = false;
- continue;
+
+ primary = secondary = tertiary = 0;
+ isShifted_ = true;
+ } else {
+ if (strength_ >= Collator.QUATERNARY) {
+ quaternary = 0xFFFF;
+ }
+
+ isShifted_ = false;
}
- textce = getCE(textce);
- if (textce != CollationElementIterator.IGNORABLE
- && textce != m_pattern_.m_CE_[ceindex]) {
- // do the beginning stuff
- int failedoffset = coleiter.getOffset();
- if (isSafe && failedoffset <= prefixlength) {
- // alas... no hope. failed at rearranged accent set
- return DONE;
+
+ return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
+ }
+
+ /**
+ * Get the processed ordering priority of the next collation element in the text.
+ * A single character may contain more than one collation element.
+ *
+ * Note: This is equivalent to
+ * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
+ *
+ * @param range receiving the iterator index before/after fetching the CE.
+     * @return The next collation element's ordering, otherwise returns PROCESSED_NULLORDER
+ * if an error has occurred or if the end of string has been reached
+ */
+ public long nextProcessed(Range range) {
+ long result = CollationElementIterator.IGNORABLE;
+ int low = 0, high = 0;
+
+ pceBuffer_.reset();
+
+ do {
+ low = cei_.getOffset();
+ int ce = cei_.next();
+ high = cei_.getOffset();
+
+ if (ce == CollationElementIterator.NULLORDER) {
+ result = PROCESSED_NULLORDER;
+ break;
}
- else {
- if (isSafe) {
- failedoffset = safeoffset - failedoffset;
- if (safetext != m_canonicalPrefixAccents_) {
- safetext.delete(0, safetext.length());
+
+ result = processCE(ce);
+ } while (result == CollationElementIterator.IGNORABLE);
+
+ if (range != null) {
+ range.ixLow_ = low;
+ range.ixHigh_ = high;
+ }
+
+ return result;
+ }
+
+ /**
+ * Get the processed ordering priority of the previous collation element in the text.
+ * A single character may contain more than one collation element.
+ *
+ * Note: This is equivalent to
+ * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
+ *
+ * @param range receiving the iterator index before/after fetching the CE.
+     * @return The previous collation element's ordering, otherwise returns
+ * PROCESSED_NULLORDER if an error has occurred or if the start of
+ * string has been reached.
+ */
+ public long previousProcessed(Range range) {
+ long result = CollationElementIterator.IGNORABLE;
+ int low = 0, high = 0;
+
+ // pceBuffer_.reset();
+
+ while (pceBuffer_.empty()) {
+ // buffer raw CEs up to non-ignorable primary
+ RCEBuffer rceb = new RCEBuffer();
+ int ce;
+
+ boolean finish = false;
+
+ // **** do we need to reset rceb, or will it always be empty at this point ****
+ do {
+ high = cei_.getOffset();
+ ce = cei_.previous();
+ low = cei_.getOffset();
+
+ if (ce == CollationElementIterator.NULLORDER) {
+ if (!rceb.empty()) {
+ break;
}
+
+ finish = true;
+ break;
}
-
- // try rearranging the end accents
- int result = doPreviousCanonicalSuffixMatch(textoffset,
- failedoffset);
- if (result != DONE) {
- // if status is a failure, ucol_setOffset does nothing
- m_colEIter_.setExactOffset(result);
+
+ rceb.put(ce, low, high);
+ } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce));
+
+ if (finish) {
+ break;
+ }
+
+ // process the raw CEs
+ while (!rceb.empty()) {
+ RCEI rcei = rceb.get();
+
+ result = processCE(rcei.ce_);
+
+ if (result != CollationElementIterator.IGNORABLE) {
+ pceBuffer_.put(result, rcei.low_, rcei.high_);
}
- return result;
}
}
- if (textce == m_pattern_.m_CE_[ceindex]) {
- ceindex ++;
- }
- }
- // set offset here
- if (isSafe) {
- int result = coleiter.getOffset();
- // sets the text iterator here with the correct expansion and offset
- int leftoverces = coleiter.m_CEBufferSize_
- - coleiter.m_CEBufferOffset_;
- if (result <= prefixlength) {
- result = textoffset;
+
+ if (pceBuffer_.empty()) {
+ // **** Is -1 the right value for ixLow, ixHigh? ****
+ if (range != null) {
+ range.ixLow_ = -1;
+ range.ixHigh_ = -1;
+ }
+ return CollationElementIterator.NULLORDER;
}
- else {
- result = textoffset + (safeoffset - result);
+
+ PCEI pcei = pceBuffer_.get();
+
+ if (range != null) {
+ range.ixLow_ = pcei.low_;
+ range.ixHigh_ = pcei.high_;
}
- m_colEIter_.setExactOffset(result);
- m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_
- - leftoverces;
- return result;
+
+ return pcei.ce_;
}
-
- return coleiter.getOffset();
- }
-
- /**
- * Trying out the substring and sees if it can be a canonical match.
- * This will try normalizing the starting accents and arranging them into
- * canonical equivalents and check their corresponding ces with the pattern
- * ce.
- * Prefix accents in the text will be grouped according to their combining
- * class and the groups will be mixed and matched to try find the perfect
- * match with the pattern.
- * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
- * step 1: split "\u030A\u0301" into 6 other type of potential accent
- * substrings
- * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
- * "\u0301\u0325".
- * step 2: check if any of the generated substrings matches the pattern.
- * @param textoffset start offset in the collation element text that starts
- * with the accents to be rearranged
- * @return true if the match is valid, false otherwise
- */
- private boolean doPreviousCanonicalMatch(int textoffset)
- {
- int offset = m_colEIter_.getOffset();
- if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
- if (m_pattern_.m_hasSuffixAccents_) {
- offset = doPreviousCanonicalSuffixMatch(textoffset, offset);
- if (offset != DONE) {
- m_colEIter_.setExactOffset(offset);
- return true;
- }
- }
- return false;
+
+ private static boolean isContinuation(int ce) {
+ return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER);
}
-
- if (!m_pattern_.m_hasPrefixAccents_) {
- return false;
+
+ public static final class Range {
+ int ixLow_;
+ int ixHigh_;
}
-
- StringBuilder accents = new StringBuilder();
- // offset to the last base character in substring to search
- int baseoffset = getNextBaseOffset(targetText, textoffset);
- // normalizing the offensive string
- String textstr = getString(targetText, textoffset,
- baseoffset - textoffset);
- if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
- == Normalizer.NO) {
- textstr = Normalizer.decompose(textstr, false);
+
+ /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */
+ private static final class PCEI {
+ long ce_;
+ int low_;
+ int high_;
}
- accents.append(textstr);
- // status checked in loop
-
- int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
- int size = getUnblockedAccentIndex(accents, accentsindex);
-
- // 2 power n - 1 plus the full set of accents
- int count = (2 << (size - 1)) - 1;
- while (count > 0) {
- m_canonicalPrefixAccents_.delete(0,
- m_canonicalPrefixAccents_.length());
- // copy the base characters
- for (int k = 0; k < accentsindex[0]; k ++) {
- m_canonicalPrefixAccents_.append(accents.charAt(k));
- }
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= size - 1; i ++) {
- int mask = 1 << (size - i - 1);
- if ((count & mask) != 0) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1];
- j ++) {
- m_canonicalPrefixAccents_.append(accents.charAt(j));
- }
- }
+
+ private static final class PCEBuffer {
+ private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE];
+ private int bufferIndex_ = 0;
+
+ void reset() {
+ bufferIndex_ = 0;
}
- offset = doPreviousCanonicalPrefixMatch(baseoffset);
- if (offset != DONE) {
- return true; // match found
+
+ boolean empty() {
+ return bufferIndex_ <= 0;
}
- count --;
- }
- return false;
- }
-
- /**
- * Checks match for contraction.
- * If the match starts with a partial contraction we fail.
- * Uses the temporary utility buffer to return the modified start and end.
- * @param start offset of potential match, to be modified if necessary
- * @param end offset of potential match, to be modified if necessary
- * @return true if match passes the contraction test, false otherwise.
- */
- private boolean checkPreviousCanonicalContractionMatch(int start, int end)
- {
- int temp = end;
- // This part checks if either ends of the match contains potential
- // contraction. If so we'll have to iterate through them
- char echar = 0;
- char schar = 0;
- if (end < m_textLimitOffset_) {
- targetText.setIndex(end);
- echar = targetText.current();
- }
- if (start + 1 < m_textLimitOffset_) {
- targetText.setIndex(start + 1);
- schar = targetText.current();
- }
- if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
- int expansion = m_colEIter_.m_CEBufferSize_
- - m_colEIter_.m_CEBufferOffset_;
- boolean hasExpansion = expansion > 0;
- m_colEIter_.setExactOffset(end);
- while (expansion > 0) {
- // getting rid of the redundant ce
- // since forward contraction/expansion may have extra ces
- // if we are in the normalization buffer, hasAccentsBeforeMatch
- // would have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but
- // if we are only looking for A ring A\u030A, we'll have to
- // skip the last ce in the expansion buffer
- m_colEIter_.previous();
- if (m_colEIter_.getOffset() != temp) {
- end = temp;
- temp = m_colEIter_.getOffset();
- }
- expansion --;
+
+ void put(long ce, int ixLow, int ixHigh)
+ {
+ if (bufferIndex_ >= buffer_.length) {
+ PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW];
+ System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
+ buffer_ = newBuffer;
+ }
+ buffer_[bufferIndex_] = new PCEI();
+ buffer_[bufferIndex_].ce_ = ce;
+ buffer_[bufferIndex_].low_ = ixLow;
+ buffer_[bufferIndex_].high_ = ixHigh;
+
+ bufferIndex_ += 1;
}
-
- int count = m_pattern_.m_CELength_;
- while (count > 0) {
- int ce = getCE(m_colEIter_.previous());
- // status checked below, note that if status is a failure
- // previous() returns NULLORDER
- if (ce == CollationElementIterator.IGNORABLE) {
- continue;
- }
- if (hasExpansion && count == 0
- && m_colEIter_.getOffset() != temp) {
- end = temp;
- temp = m_colEIter_.getOffset();
- }
- if (count == m_pattern_.m_CELength_
- && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) {
- // accents may have extra starting ces, this occurs when a
- // pure accent pattern is matched without rearrangement
- int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1];
- targetText.setIndex(end);
- if (UTF16.isTrailSurrogate(targetText.previous())) {
- if (targetText.getIndex() > m_textBeginOffset_ &&
- !UTF16.isLeadSurrogate(targetText.previous())) {
- targetText.next();
- }
- }
- end = targetText.getIndex();
- if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) {
- ce = getCE(m_colEIter_.previous());
- while (ce != expected
- && ce != CollationElementIterator.NULLORDER
- && m_colEIter_.getOffset() <= start) {
- ce = getCE(m_colEIter_.previous());
- }
- }
- }
- if (ce != m_pattern_.m_CE_[count - 1]) {
- start --;
- start = getPreviousBaseOffset(start);
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
- return false;
+
+ PCEI get() {
+ if (bufferIndex_ > 0) {
+ return buffer_[--bufferIndex_];
}
- count --;
+ return null;
}
- }
- m_utilBuffer_[0] = start;
- m_utilBuffer_[1] = end;
- return true;
- }
-
- /**
- * Checks and sets the match information if found.
- * Checks
- * <ul>
- * <li> the potential match does not repeat the previous match
- * <li> boundaries are correct
- * <li> potential match does not end in the middle of a contraction
- * <li> identical matches
- * </ul>
- * Otherwise the offset will be shifted to the next character.
- * Uses the temporary utility buffer for storing the modified textoffset.
- * @param textoffset offset in the collation element text. the returned
- * value will be the truncated start offset of the match or the
- * new start search offset.
- * @return true if the match is valid, false otherwise
- */
- private boolean checkPreviousCanonicalMatch(int textoffset)
- {
- // to ensure that the start and ends are not composite characters
- // if we have a canonical accent match
- if (m_pattern_.m_hasSuffixAccents_
- && m_canonicalSuffixAccents_.length() != 0
- || m_pattern_.m_hasPrefixAccents_
- && m_canonicalPrefixAccents_.length() != 0) {
- m_matchedIndex_ = textoffset;
- matchLength = getNextBaseOffset(m_colEIter_.getOffset())
- - textoffset;
- return true;
}
-
- int end = m_colEIter_.getOffset();
- if (!checkPreviousCanonicalContractionMatch(textoffset, end)) {
- // storing the modified textoffset
- return false;
- }
- textoffset = m_utilBuffer_[0];
- end = m_utilBuffer_[1];
- end = getNextBaseOffset(end);
- // this totally matches, however we need to check if it is repeating
- if (checkRepeatedMatch(textoffset, end)
- || !isBreakUnit(textoffset, end)
- || !checkIdentical(textoffset, end)) {
- textoffset --;
- textoffset = getPreviousBaseOffset(textoffset);
- m_utilBuffer_[0] = textoffset;
- return false;
+
+ /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */
+ private static final class RCEI {
+ int ce_;
+ int low_;
+ int high_;
}
-
- m_matchedIndex_ = textoffset;
- matchLength = end - textoffset;
- return true;
- }
-
- /**
- * Method that does the next exact match
- * @param start the offset to start shifting from and performing the
- * next exact match
- */
- private void handleNextExact(int start)
- {
- int textoffset = shiftForward(start,
- CollationElementIterator.NULLORDER,
- m_pattern_.m_CELength_);
- int targetce = CollationElementIterator.IGNORABLE;
- while (textoffset <= m_textLimitOffset_) {
- m_colEIter_.setExactOffset(textoffset);
- int patternceindex = m_pattern_.m_CELength_ - 1;
- boolean found = false;
- int lastce = CollationElementIterator.NULLORDER;
-
- while (true) {
- // finding the last pattern ce match, imagine composite
- // characters. for example: search for pattern A in text \u00C0
- // we'll have to skip \u0300 the grave first before we get to A
- targetce = m_colEIter_.previous();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (targetce == CollationElementIterator.IGNORABLE &&
- m_colEIter_.isInBuffer()) {
- // this is for the text \u0315\u0300 that requires
- // normalization and pattern \u0300, where \u0315 is ignorable
- continue;
- }
- if (lastce == CollationElementIterator.NULLORDER
- || lastce == CollationElementIterator.IGNORABLE) {
- lastce = targetce;
- }
- if (targetce == m_pattern_.m_CE_[patternceindex]) {
- // the first ce can be a contraction
- found = true;
- break;
- }
- if (m_colEIter_.m_CEBufferOffset_ <= 0) {
- found = false;
- break;
- }
- }
-
- while (found && patternceindex > 0) {
- lastce = targetce;
- targetce = m_colEIter_.previous();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (targetce == CollationElementIterator.IGNORABLE) {
- continue;
- }
-
- patternceindex --;
- found = found && targetce == m_pattern_.m_CE_[patternceindex];
+
+ private static final class RCEBuffer {
+ private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE];
+ private int bufferIndex_ = 0;
+
+ boolean empty() {
+ return bufferIndex_ <= 0;
}
-
- targetce = lastce;
-
- if (!found) {
- textoffset = shiftForward(textoffset, lastce, patternceindex);
- // status checked at loop.
- patternceindex = m_pattern_.m_CELength_;
- continue;
+
+ void put(int ce, int ixLow, int ixHigh) {
+ if (bufferIndex_ >= buffer_.length) {
+ RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW];
+ System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
+ buffer_ = newBuffer;
+ }
+ buffer_[bufferIndex_] = new RCEI();
+ buffer_[bufferIndex_].ce_ = ce;
+ buffer_[bufferIndex_].low_ = ixLow;
+ buffer_[bufferIndex_].high_ = ixHigh;
+
+ bufferIndex_ += 1;
}
-
- if (checkNextExactMatch(textoffset)) {
- // status checked in ucol_setOffset
- return;
+
+ RCEI get() {
+ if (bufferIndex_ > 0) {
+ return buffer_[--bufferIndex_];
+ }
+ return null;
}
- textoffset = m_utilBuffer_[0];
}
- setMatchNotFound();
}
/**
- * Method that does the next canonical match
- * @param start the offset to start shifting from and performing the
- * next canonical match
+ * Java port of ICU4C CEI (usearch.cpp)
+ *
+ * CEI Collation Element + source text index.
+ * These structs are kept in the circular buffer.
*/
- private void handleNextCanonical(int start)
- {
- boolean hasPatternAccents =
- m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
-
- // shifting it check for setting offset
- // if setOffset is called previously or there was no previous match, we
- // leave the offset as it is.
- int textoffset = shiftForward(start, CollationElementIterator.NULLORDER,
- m_pattern_.m_CELength_);
- m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
- m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
- int targetce = CollationElementIterator.IGNORABLE;
-
- while (textoffset <= m_textLimitOffset_)
- {
- m_colEIter_.setExactOffset(textoffset);
- int patternceindex = m_pattern_.m_CELength_ - 1;
- boolean found = false;
- int lastce = CollationElementIterator.NULLORDER;
-
- while (true) {
- // finding the last pattern ce match, imagine composite characters
- // for example: search for pattern A in text \u00C0
- // we'll have to skip \u0300 the grave first before we get to A
- targetce = m_colEIter_.previous();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (lastce == CollationElementIterator.NULLORDER
- || lastce == CollationElementIterator.IGNORABLE) {
- lastce = targetce;
- }
- if (targetce == m_pattern_.m_CE_[patternceindex]) {
- // the first ce can be a contraction
- found = true;
- break;
- }
- if (m_colEIter_.m_CEBufferOffset_ <= 0) {
- found = false;
- break;
- }
- }
-
- while (found && patternceindex > 0) {
- targetce = m_colEIter_.previous();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (targetce == CollationElementIterator.IGNORABLE) {
- continue;
+ private static class CEI {
+ long ce_;
+ int lowIndex_;
+ int highIndex_;
+ }
+
+ /**
+ * CEBuffer A circular buffer of CEs from the text being searched
+ */
+ private static class CEBuffer {
+ // Java porting note: ICU4C uses the size for stack buffer
+ // static final int DEFAULT_CEBUFFER_SIZE = 96;
+
+ static final int CEBUFFER_EXTRA = 32;
+ static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8;
+ static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3;
+
+ CEI[] buf_;
+ int bufSize_;
+ int firstIx_;
+ int limitIx_;
+
+ // Java porting note: No references in ICU4C implementation
+ // CollationElementIterator ceIter_;
+
+ StringSearch strSearch_;
+
+ CEBuffer(StringSearch ss) {
+ strSearch_ = ss;
+ bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA;
+ if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
+ String patText = ss.pattern_.text_;
+ if (patText != null) {
+ for (int i = 0; i < patText.length(); i++) {
+ char c = patText.charAt(i);
+ if (MIGHT_BE_JAMO_L(c)) {
+ bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
+ } else {
+ // No check for surrogates, we might allocate slightly more buffer than necessary.
+ bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
+ }
+ }
}
-
- patternceindex --;
- found = found && targetce == m_pattern_.m_CE_[patternceindex];
- }
-
- // initializing the rearranged accent array
- if (hasPatternAccents && !found) {
- found = doNextCanonicalMatch(textoffset);
- }
-
- if (!found) {
- textoffset = shiftForward(textoffset, lastce, patternceindex);
- // status checked at loop
- patternceindex = m_pattern_.m_CELength_;
- continue;
}
-
- if (checkNextCanonicalMatch(textoffset)) {
+
+ // Not used - see above
+ // ceIter_ = ss.textIter_;
+
+ firstIx_ = 0;
+ limitIx_ = 0;
+
+ if (!ss.initTextProcessedIter()) {
return;
}
- textoffset = m_utilBuffer_[0];
+
+ buf_ = new CEI[bufSize_];
}
- setMatchNotFound();
- }
-
- /**
- * Method that does the previous exact match
- * @param start the offset to start shifting from and performing the
- * previous exact match
- */
- private void handlePreviousExact(int start)
- {
- int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,
- m_pattern_.m_CELength_);
- while (textoffset >= m_textBeginOffset_)
- {
- m_colEIter_.setExactOffset(textoffset);
- int patternceindex = 1;
- int targetce = CollationElementIterator.IGNORABLE;
- boolean found = false;
- int firstce = CollationElementIterator.NULLORDER;
-
- while (true) {
- // finding the first pattern ce match, imagine composite
- // characters. for example: search for pattern \u0300 in text
- // \u00C0, we'll have to skip A first before we get to
- // \u0300 the grave accent
- targetce = m_colEIter_.next();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (firstce == CollationElementIterator.NULLORDER
- || firstce == CollationElementIterator.IGNORABLE) {
- firstce = targetce;
- }
- if (targetce == CollationElementIterator.IGNORABLE && m_collator_.getStrength() != Collator.PRIMARY) {
- continue;
- }
- if (targetce == m_pattern_.m_CE_[0]) {
- found = true;
- break;
- }
- if (m_colEIter_.m_CEBufferOffset_ == -1
- || m_colEIter_.m_CEBufferOffset_
- == m_colEIter_.m_CEBufferSize_) {
- // checking for accents in composite character
- found = false;
- break;
- }
+
+ // Get the CE with the specified index.
+ // Index must be in the range
+ // n-history_size < index < n+1
+ // where n is the largest index to have been fetched by some previous call to this function.
+ // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
+ //
+ CEI get(int index) {
+ int i = index % bufSize_;
+
+ if (index >= firstIx_ && index < limitIx_) {
+ // The request was for an entry already in our buffer.
+ // Just return it.
+ return buf_[i];
}
-
- //targetce = firstce;
-
- while (found && patternceindex < m_pattern_.m_CELength_) {
- firstce = targetce;
- targetce = m_colEIter_.next();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (targetce == CollationElementIterator.IGNORABLE) {
- continue;
- }
-
- found = found && targetce == m_pattern_.m_CE_[patternceindex];
- patternceindex ++;
+
+ // Caller is requesting a new, never accessed before, CE.
+ // Verify that it is the next one in sequence, which is all
+ // that is allowed.
+ if (index != limitIx_) {
+ assert(false);
+ return null;
}
-
- targetce = firstce;
-
- if (!found) {
- textoffset = reverseShift(textoffset, targetce, patternceindex);
- patternceindex = 0;
- continue;
+
+ // Manage the circular CE buffer indexing
+ limitIx_++;
+
+ if (limitIx_ - firstIx_ >= bufSize_) {
+ // The buffer is full, knock out the lowest-indexed entry.
+ firstIx_++;
}
-
- if (checkPreviousExactMatch(textoffset)) {
- return;
+
+ CollationPCE.Range range = new CollationPCE.Range();
+ if (buf_[i] == null) {
+ buf_[i] = new CEI();
}
- textoffset = m_utilBuffer_[0];
+ buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range);
+ buf_[i].lowIndex_ = range.ixLow_;
+ buf_[i].highIndex_ = range.ixHigh_;
+
+ return buf_[i];
}
- setMatchNotFound();
- }
-
- /**
- * Method that does the previous canonical match
- * @param start the offset to start shifting from and performing the
- * previous canonical match
- */
- private void handlePreviousCanonical(int start)
- {
- boolean hasPatternAccents =
- m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
-
- // shifting it check for setting offset
- // if setOffset is called previously or there was no previous match, we
- // leave the offset as it is.
- int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,
- m_pattern_.m_CELength_);
- m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
- m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
-
- while (textoffset >= m_textBeginOffset_)
- {
- m_colEIter_.setExactOffset(textoffset);
- int patternceindex = 1;
- int targetce = CollationElementIterator.IGNORABLE;
- boolean found = false;
- int firstce = CollationElementIterator.NULLORDER;
-
- while (true) {
- // finding the first pattern ce match, imagine composite
- // characters. for example: search for pattern \u0300 in text
- // \u00C0, we'll have to skip A first before we get to
- // \u0300 the grave accent
- targetce = m_colEIter_.next();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (firstce == CollationElementIterator.NULLORDER
- || firstce == CollationElementIterator.IGNORABLE) {
- firstce = targetce;
- }
-
- if (targetce == m_pattern_.m_CE_[0]) {
- // the first ce can be a contraction
- found = true;
- break;
- }
- if (m_colEIter_.m_CEBufferOffset_ == -1
- || m_colEIter_.m_CEBufferOffset_
- == m_colEIter_.m_CEBufferSize_) {
- // checking for accents in composite character
- found = false;
- break;
- }
- }
-
- targetce = firstce;
-
- while (found && patternceindex < m_pattern_.m_CELength_) {
- targetce = m_colEIter_.next();
- if (targetce == CollationElementIterator.NULLORDER) {
- found = false;
- break;
- }
- targetce = getCE(targetce);
- if (targetce == CollationElementIterator.IGNORABLE) {
- continue;
- }
-
- found = found && targetce == m_pattern_.m_CE_[patternceindex];
- patternceindex ++;
+
+ // Get the CE with the specified index.
+ // Index must be in the range
+ // n-history_size < index < n+1
+ // where n is the largest index to have been fetched by some previous call to this function.
+ // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
+ //
+ CEI getPrevious(int index) {
+ int i = index % bufSize_;
+
+ if (index >= firstIx_ && index < limitIx_) {
+ // The request was for an entry already in our buffer.
+ // Just return it.
+ return buf_[i];
}
-
- // initializing the rearranged accent array
- if (hasPatternAccents && !found) {
- found = doPreviousCanonicalMatch(textoffset);
+
+ // Caller is requesting a new, never accessed before, CE.
+ // Verify that it is the next one in sequence, which is all
+ // that is allowed.
+ if (index != limitIx_) {
+ assert(false);
+ return null;
}
-
- if (!found) {
- textoffset = reverseShift(textoffset, targetce, patternceindex);
- patternceindex = 0;
- continue;
+
+ // Manage the circular CE buffer indexing
+ limitIx_++;
+
+ if (limitIx_ - firstIx_ >= bufSize_) {
+ // The buffer is full, knock out the lowest-indexed entry.
+ firstIx_++;
}
-
- if (checkPreviousCanonicalMatch(textoffset)) {
- return;
+
+ CollationPCE.Range range = new CollationPCE.Range();
+ if (buf_[i] == null) {
+ buf_[i] = new CEI();
}
- textoffset = m_utilBuffer_[0];
- }
- setMatchNotFound();
- }
-
- /**
- * Gets a substring out of a CharacterIterator
- * @param text CharacterIterator
- * @param start start offset
- * @param length of substring
- * @return substring from text starting at start and length length
- */
- private static final String getString(CharacterIterator text, int start,
- int length)
- {
- StringBuilder result = new StringBuilder(length);
- int offset = text.getIndex();
- text.setIndex(start);
- for (int i = 0; i < length; i ++) {
- result.append(text.current());
- text.next();
- }
- text.setIndex(offset);
- return result.toString();
- }
-
- /**
- * Getting the mask for collation strength
- * @param strength collation strength
- * @return collation element mask
- */
- private static final int getMask(int strength)
- {
- switch (strength)
- {
- case Collator.PRIMARY:
- return RuleBasedCollator.CE_PRIMARY_MASK_;
- case Collator.SECONDARY:
- return RuleBasedCollator.CE_SECONDARY_MASK_
- | RuleBasedCollator.CE_PRIMARY_MASK_;
- default:
- return RuleBasedCollator.CE_TERTIARY_MASK_
- | RuleBasedCollator.CE_SECONDARY_MASK_
- | RuleBasedCollator.CE_PRIMARY_MASK_;
+ buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range);
+ buf_[i].lowIndex_ = range.ixLow_;
+ buf_[i].highIndex_ = range.ixHigh_;
+
+ return buf_[i];
}
- }
-
- /**
- * Sets match not found
- */
- private void setMatchNotFound()
- {
- // this method resets the match result regardless of the error status.
- m_matchedIndex_ = DONE;
- setMatchLength(0);
- }
-
- /**
- * Check the boundaries of the match.
- */
- private int checkBreakBoundary(int end) {
- if (!m_charBreakIter_.isBoundary(end)) {
- end = m_charBreakIter_.following(end);
+
+ static boolean MIGHT_BE_JAMO_L(char c) {
+ return (c >= 0x1100 && c <= 0x115E)
+ || (c >= 0x3131 && c <= 0x314E)
+ || (c >= 0x3165 && c <= 0x3186);
}
- return end;
}
}
/*
*******************************************************************************
-* Copyright (C) 2009-2012, International Business Machines
+* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
public static final class Hangul {
/* Korean Hangul and Jamo constants */
public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
+ public static final int JAMO_L_END=0x1112;
public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
+ public static final int JAMO_V_END=0x1175;
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
+ public static final int JAMO_T_END=0x11c2;
public static final int HANGUL_BASE=0xac00;
+ public static final int HANGUL_END=0xd7a3;
public static final int JAMO_L_COUNT=19;
public static final int JAMO_V_COUNT=21;
return load(ICUData.getRequiredStream(name));
}
+ private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
+ if(isAlgorithmicNoNo(norm16)) {
+ // Range of code points with same-norm16-value algorithmic decompositions.
+ // They might have different non-zero FCD16 values.
+ do {
+ int fcd16=getFCD16(start);
+ if(fcd16>0xff) { set.add(start); }
+ } while(++start<=end);
+ } else {
+ int fcd16=getFCD16(start);
+ if(fcd16>0xff) { set.add(start, end); }
+ }
+ }
+
+ private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
+ /* add the start code point to the USet */
+ set.add(start);
+ if(start!=end && isAlgorithmicNoNo(value)) {
+ // Range of code points with same-norm16-value algorithmic decompositions.
+ // They might have different non-zero FCD16 values.
+ int prevFCD16=getFCD16(start);
+ while(++start<=end) {
+ int fcd16=getFCD16(start);
+ if(fcd16!=prevFCD16) {
+ set.add(start);
+ prevFCD16=fcd16;
+ }
+ }
+ }
+ }
+
+ public void addLcccChars(UnicodeSet set) {
+ /* add the start code point of each same-value range of each trie */
+ Iterator<Trie2.Range> trieIterator=normTrie.iterator();
+ Trie2.Range range;
+ while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
+ enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
+ }
+ }
+
public void addPropertyStarts(UnicodeSet set) {
/* add the start code point of each same-value range of each trie */
Iterator<Trie2.Range> trieIterator=normTrie.iterator();
Trie2.Range range;
while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
- /* add the start code point to the USet */
- set.add(range.startCodePoint);
+ enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
}
/* add Hangul LV syllables and LV+1 because of skippables */
return 0; // no
}
}
+ public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
// higher-level functionality ------------------------------------------ ***
+ // NFD without an NFD Normalizer2 instance.
+ public Appendable decompose(CharSequence s, StringBuilder dest) {
+ decompose(s, 0, s.length(), dest, s.length());
+ return dest;
+ }
+ /**
+ * Decomposes s[src, limit[ and writes the result to dest.
+ * limit can be NULL if src is NUL-terminated.
+ * destLengthEstimate is the initial dest buffer capacity and can be -1.
+ */
+ public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
+ int destLengthEstimate) {
+ if(destLengthEstimate<0) {
+ destLengthEstimate=limit-src;
+ }
+ dest.setLength(0);
+ ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
+ decompose(s, src, limit, buffer);
+ }
+
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
ICU_DATA_VERSION = getInstance(53, 0, 1, 0);
UNICODE_VERSION = UNICODE_6_3;
- UCOL_RUNTIME_VERSION = getInstance(7);
- UCOL_BUILDER_VERSION = getInstance(8);
+ UCOL_RUNTIME_VERSION = getInstance(8);
+ UCOL_BUILDER_VERSION = getInstance(9);
UCOL_TAILORINGS_VERSION = getInstance(1);
}
version https://git-lfs.github.com/spec/v1
-oid sha256:c2759d4ee4ebccae6cd0995f8ae6442228829ac6f3818b206dbebfa16e864895
-size 11002389
+oid sha256:1abc0174ae76c79801fe369dac52cd4c42a09d6c2b92919b1f7736d46ea10e1d
+size 10855096
<!--
*******************************************************************************
-* Copyright (C) 2009-2011, International Business Machines Corporation and *
+* Copyright (C) 2009-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
-->
</unjar>
<copy todir="${bin.dir}">
<fileset dir="${icu4j.collate-tests.dir}/src">
- <include name="com/ibm/icu/dev/data/riwords.txt"/>
+ <include name="com/ibm/icu/dev/data/collationtest.txt"/>
<include name="com/ibm/icu/dev/data/CollationTest_*.txt"/>
- <include name="com/ibm/icu/dev/data/resources/*"/>
+ <include name="com/ibm/icu/dev/data/riwords.txt"/>
</fileset>
</copy>
</target>
# File: CollationTest_CLDR_NON_IGNORABLE_SHORT.txt
-# UCA Version: 6.2.0
-# UCD Version: 6.2.0
-# Generated: 2012-08-15, 21:43:27 GMT [MD]
+# UCA Version: 6.3.0
+# UCD Version: 6.3.0
+# Generated: 2013-09-03 [MS]
# For a description of the format and usage, see CollationAuxiliary.html
0338 0334
20E5 0334
20EA 0334
20EB 0334
+0335 0334
0334 0591
0591 0334
0334 0592
20EF 0334
0334 10A0D
10A0D 0334
+0334 3099
+3099 0334
+0334 309A
+309A 0334
0305 0334
0334 0305
0309 0334
302E 0334
0334 302F
302F 0334
-0334 3099
-3099 0334
-0334 309A
-309A 0334
0334 20D0
20D0 0334
0334 20D1
20E9 0334
0334 101FD
101FD 0334
-0335 0334
10A39 0334
20D2 0334
20D3 0334
0085 0061
0085 0041
0085 0062
-180E 0021
-180E 003F
-180E 0061
-180E 0041
-180E 0062
2028 0021
2028 003F
2028 0061
0618 0021
0619 0021
061A 0021
+061C 0021
0640 0021
06D6 0021
06D7 0021
180B 0021
180C 0021
180D 0021
+180E 0021
1A7F 0021
1B6B 0021
1B6C 0021
2062 0021
2063 0021
2064 0021
+2066 0021
+2067 0021
+2068 0021
+2069 0021
206A 0021
206B 0021
206C 0021
20E2 0021
20E3 0021
20E4 0021
+3099 0021
+FF9E 0021
+309A 0021
+FF9F 0021
+0335 0021
0305 0021
0309 0021
030F 0021
0330 0021
0331 0021
0334 0021
-0335 0021
0339 0021
0345 0021
0358 0021
302D 0021
302E 0021
302F 0021
-3099 0021
-FF9E 0021
-309A 0021
-FF9F 0021
20D0 0021
20D1 0021
20D2 0021
0618 003F
0619 003F
061A 003F
+061C 003F
0640 003F
06D6 003F
06D7 003F
180B 003F
180C 003F
180D 003F
+180E 003F
1A7F 003F
1B6B 003F
1B6C 003F
2062 003F
2063 003F
2064 003F
+2066 003F
+2067 003F
+2068 003F
+2069 003F
206A 003F
206B 003F
206C 003F
20E2 003F
20E3 003F
20E4 003F
+3099 003F
+FF9E 003F
+309A 003F
+FF9F 003F
+0335 003F
0305 003F
0309 003F
030F 003F
0330 003F
0331 003F
0334 003F
-0335 003F
0339 003F
0345 003F
0358 003F
302D 003F
302E 003F
302F 003F
-3099 003F
-FF9E 003F
-309A 003F
-FF9F 003F
20D0 003F
20D1 003F
20D2 003F
2046 0061
2046 0041
2046 0062
+2308 0021
+2308 003F
+2308 0061
+2308 0041
+2308 0062
+2309 0021
+2309 003F
+2309 0061
+2309 0041
+2309 0062
+230A 0021
+230A 003F
+230A 0061
+230A 0041
+230A 0062
+230B 0021
+230B 003F
+230B 0061
+230B 0041
+230B 0062
29FC 0021
29FC 003F
29FC 0061
2307 0061
2307 0041
2307 0062
-2308 0021
-2308 003F
-2308 0061
-2308 0041
-2308 0062
-2309 0021
-2309 003F
-2309 0061
-2309 0041
-2309 0062
-230A 0021
-230A 003F
-230A 0061
-230A 0041
-230A 0062
-230B 0021
-230B 003F
-230B 0061
-230B 0041
-230B 0062
230C 0021
230C 003F
230C 0061
FFFC 0061
FFFC 0041
FFFC 0062
-FFFD 0021
-FFFD 003F
-FFFD 0061
-FFFD 0041
-FFFD 0062
02D0 0021
02D0 003F
02D0 0061
12433 0061
12433 0041
12433 0062
-12456 0021
-12456 003F
-12456 0061
-12456 0041
-12456 0062
-12457 0021
-12457 003F
-12457 0061
-12457 0041
-12457 0062
1245A 0021
1245A 003F
1245A 0061
1F101 0041
1F101 0062
0030 0021
-FF10 0021
-1D7CE 0021
-1D7D8 0021
-1D7E2 0021
-1D7EC 0021
-1D7F6 0021
-24EA 0021
-24FF 0021
-2070 0021
-2080 0021
0660 0021
06F0 0021
07C0 0021
-104A0 0021
0966 0021
09E6 0021
0A66 0021
0C78 0021
0CE6 0021
0D66 0021
-ABF0 0021
-A8D0 0021
-1946 0021
-19D0 0021
-1A80 0021
-1A90 0021
0E50 0021
0ED0 0021
0F20 0021
-0F33 0021
-1C40 0021
-A900 0021
1040 0021
1090 0021
-11136 0021
17E0 0021
17F0 0021
-AA50 0021
+1810 0021
+1946 0021
+19D0 0021
+1A80 0021
+1A90 0021
1B50 0021
-A9D0 0021
1BB0 0021
-1810 0021
+1C40 0021
1C50 0021
-A620 0021
-110F0 0021
3007 0021
+A620 0021
+A8D0 0021
+A900 0021
+A9D0 0021
+AA50 0021
+ABF0 0021
1018A 0021
+104A0 0021
+11066 0021
+110F0 0021
+11136 0021
111D0 0021
116C0 0021
-11066 0021
+FF10 0021
+0F33 0021
+1D7CE 0021
+1D7D8 0021
+1D7E2 0021
+1D7EC 0021
+1D7F6 0021
+24EA 0021
+24FF 0021
+2070 0021
+2080 0021
0030 003F
-FF10 003F
-1D7CE 003F
-1D7D8 003F
-1D7E2 003F
-1D7EC 003F
-1D7F6 003F
-24EA 003F
-24FF 003F
-2070 003F
-2080 003F
0660 003F
06F0 003F
07C0 003F
-104A0 003F
0966 003F
09E6 003F
0A66 003F
0C78 003F
0CE6 003F
0D66 003F
-ABF0 003F
-A8D0 003F
-1946 003F
-19D0 003F
-1A80 003F
-1A90 003F
0E50 003F
0ED0 003F
0F20 003F
-0F33 003F
-1C40 003F
-A900 003F
1040 003F
1090 003F
-11136 003F
17E0 003F
17F0 003F
-AA50 003F
+1810 003F
+1946 003F
+19D0 003F
+1A80 003F
+1A90 003F
1B50 003F
-A9D0 003F
1BB0 003F
-1810 003F
+1C40 003F
1C50 003F
-A620 003F
-110F0 003F
3007 003F
+A620 003F
+A8D0 003F
+A900 003F
+A9D0 003F
+AA50 003F
+ABF0 003F
1018A 003F
+104A0 003F
+11066 003F
+110F0 003F
+11136 003F
111D0 003F
116C0 003F
-11066 003F
+FF10 003F
+0F33 003F
+1D7CE 003F
+1D7D8 003F
+1D7E2 003F
+1D7EC 003F
+1D7F6 003F
+24EA 003F
+24FF 003F
+2070 003F
+2080 003F
1F100 0021
1F100 003F
1F100 0061
2189 0041
2189 0062
0030 0061
-0030 0041
-FF10 0061
-FF10 0041
-1D7CE 0061
-1D7D8 0061
-1D7E2 0061
-1D7EC 0061
-1D7F6 0061
-1D7CE 0041
-1D7D8 0041
-1D7E2 0041
-1D7EC 0041
-1D7F6 0041
-24EA 0061
-24FF 0061
-24EA 0041
-24FF 0041
-2070 0061
-2070 0041
-2080 0061
-2080 0041
0660 0061
-0660 0041
06F0 0061
-06F0 0041
07C0 0061
-07C0 0041
-104A0 0061
-104A0 0041
0966 0061
-0966 0041
09E6 0061
-09E6 0041
0A66 0061
-0A66 0041
0AE6 0061
-0AE6 0041
0B66 0061
-0B66 0041
0BE6 0061
-0BE6 0041
0C66 0061
0C78 0061
-0C66 0041
-0C78 0041
0CE6 0061
-0CE6 0041
0D66 0061
-0D66 0041
-ABF0 0061
-ABF0 0041
-A8D0 0061
-A8D0 0041
+0E50 0061
+0ED0 0061
+0F20 0061
+1040 0061
+1090 0061
+17E0 0061
+17F0 0061
+1810 0061
1946 0061
-1946 0041
19D0 0061
-19D0 0041
1A80 0061
-1A80 0041
1A90 0061
-1A90 0041
-0E50 0061
+1B50 0061
+1BB0 0061
+1C40 0061
+1C50 0061
+3007 0061
+A620 0061
+A8D0 0061
+A900 0061
+A9D0 0061
+AA50 0061
+ABF0 0061
+1018A 0061
+104A0 0061
+11066 0061
+110F0 0061
+11136 0061
+111D0 0061
+116C0 0061
+0030 0041
+0660 0041
+06F0 0041
+07C0 0041
+0966 0041
+09E6 0041
+0A66 0041
+0AE6 0041
+0B66 0041
+0BE6 0041
+0C66 0041
+0C78 0041
+0CE6 0041
+0D66 0041
0E50 0041
-0ED0 0061
0ED0 0041
-0F20 0061
0F20 0041
-0F33 0061
-0F33 0041
-1C40 0061
-1C40 0041
-A900 0061
-A900 0041
-1040 0061
1040 0041
-1090 0061
1090 0041
-11136 0061
-11136 0041
-17E0 0061
17E0 0041
-17F0 0061
17F0 0041
-AA50 0061
-AA50 0041
-1B50 0061
+1810 0041
+1946 0041
+19D0 0041
+1A80 0041
+1A90 0041
1B50 0041
-A9D0 0061
-A9D0 0041
-1BB0 0061
1BB0 0041
-1810 0061
-1810 0041
-1C50 0061
+1C40 0041
1C50 0041
-A620 0061
-A620 0041
-110F0 0061
-110F0 0041
-3007 0061
3007 0041
-1018A 0061
+A620 0041
+A8D0 0041
+A900 0041
+A9D0 0041
+AA50 0041
+ABF0 0041
1018A 0041
-111D0 0061
+104A0 0041
+11066 0041
+110F0 0041
+11136 0041
111D0 0041
-116C0 0061
116C0 0041
-11066 0061
-11066 0041
+FF10 0061
+FF10 0041
+0F33 0061
+0F33 0041
+1D7CE 0061
+1D7D8 0061
+1D7E2 0061
+1D7EC 0061
+1D7F6 0061
+1D7CE 0041
+1D7D8 0041
+1D7E2 0041
+1D7EC 0041
+1D7F6 0041
+24EA 0061
+24FF 0061
+24EA 0041
+24FF 0041
+2070 0061
+2070 0041
+2080 0061
+2080 0041
0030 0062
-FF10 0062
-1D7CE 0062
-1D7D8 0062
-1D7E2 0062
-1D7EC 0062
-1D7F6 0062
-24EA 0062
-24FF 0062
-2070 0062
-2080 0062
0660 0062
06F0 0062
07C0 0062
-104A0 0062
0966 0062
09E6 0062
0A66 0062
0C78 0062
0CE6 0062
0D66 0062
-ABF0 0062
-A8D0 0062
-1946 0062
-19D0 0062
-1A80 0062
-1A90 0062
0E50 0062
0ED0 0062
0F20 0062
-0F33 0062
-1C40 0062
-A900 0062
1040 0062
1090 0062
-11136 0062
17E0 0062
17F0 0062
-AA50 0062
+1810 0062
+1946 0062
+19D0 0062
+1A80 0062
+1A90 0062
1B50 0062
-A9D0 0062
1BB0 0062
-1810 0062
+1C40 0062
1C50 0062
-A620 0062
-110F0 0062
3007 0062
+A620 0062
+A8D0 0062
+A900 0062
+A9D0 0062
+AA50 0062
+ABF0 0062
1018A 0062
+104A0 0062
+11066 0062
+110F0 0062
+11136 0062
111D0 0062
116C0 0062
-11066 0062
+FF10 0062
+0F33 0062
+1D7CE 0062
+1D7D8 0062
+1D7E2 0062
+1D7EC 0062
+1D7F6 0062
+24EA 0062
+24FF 0062
+2070 0062
+2080 0062
3358 0021
3358 003F
3358 0061
1F102 0041
1F102 0062
0031 0021
-FF11 0021
-1D7CF 0021
-1D7D9 0021
-1D7E3 0021
-1D7ED 0021
-1D7F7 0021
-2460 0021
-24F5 0021
-2776 0021
-2780 0021
-278A 0021
-00B9 0021
-2081 0021
0661 0021
06F1 0021
-10E60 0021
07C1 0021
-1369 0021
-104A1 0021
0967 0021
09E7 0021
0A67 0021
0C7C 0021
0CE7 0021
0D67 0021
-ABF1 0021
-A8D1 0021
-1947 0021
-19D1 0021
-19DA 0021
-1A81 0021
-1A91 0021
0E51 0021
0ED1 0021
0F21 0021
-0F2A 0021
-1C41 0021
-A901 0021
1041 0021
1091 0021
-11137 0021
+1369 0021
17E1 0021
17F1 0021
-AA51 0021
+1811 0021
+1947 0021
+19D1 0021
+19DA 0021
+1A81 0021
+1A91 0021
1B51 0021
-A9D1 0021
1BB1 0021
-1811 0021
+1C41 0021
1C51 0021
-A621 0021
-110F1 0021
3021 0021
+A621 0021
+A8D1 0021
+A901 0021
+A9D1 0021
+AA51 0021
+ABF1 0021
10107 0021
10142 0021
10158 0021
1015A 0021
10320 0021
103D1 0021
+104A1 0021
+10858 0021
+10916 0021
+10A40 0021
+10A7D 0021
+10B58 0021
+10B78 0021
+10E60 0021
+11052 0021
+11067 0021
+110F1 0021
+11137 0021
+111D1 0021
+116C1 0021
12415 0021
1241E 0021
1242C 0021
12434 0021
1244F 0021
12458 0021
-10A7D 0021
-10916 0021
-10858 0021
-10B58 0021
-10B78 0021
-111D1 0021
-116C1 0021
-11067 0021
-11052 0021
-10A40 0021
1D360 0021
+FF11 0021
+0F2A 0021
+1D7CF 0021
+1D7D9 0021
+1D7E3 0021
+1D7ED 0021
+1D7F7 0021
+2460 0021
+24F5 0021
+2776 0021
+2780 0021
+278A 0021
+00B9 0021
+2081 0021
0031 003F
-FF11 003F
-1D7CF 003F
-1D7D9 003F
-1D7E3 003F
-1D7ED 003F
-1D7F7 003F
-2460 003F
-24F5 003F
-2776 003F
-2780 003F
-278A 003F
-00B9 003F
-2081 003F
0661 003F
06F1 003F
-10E60 003F
07C1 003F
-1369 003F
-104A1 003F
0967 003F
09E7 003F
0A67 003F
0C7C 003F
0CE7 003F
0D67 003F
-ABF1 003F
-A8D1 003F
-1947 003F
-19D1 003F
-19DA 003F
-1A81 003F
-1A91 003F
0E51 003F
0ED1 003F
0F21 003F
-0F2A 003F
-1C41 003F
-A901 003F
1041 003F
1091 003F
-11137 003F
+1369 003F
17E1 003F
17F1 003F
-AA51 003F
+1811 003F
+1947 003F
+19D1 003F
+19DA 003F
+1A81 003F
+1A91 003F
1B51 003F
-A9D1 003F
1BB1 003F
-1811 003F
+1C41 003F
1C51 003F
-A621 003F
-110F1 003F
3021 003F
+A621 003F
+A8D1 003F
+A901 003F
+A9D1 003F
+AA51 003F
+ABF1 003F
10107 003F
10142 003F
10158 003F
1015A 003F
10320 003F
103D1 003F
+104A1 003F
+10858 003F
+10916 003F
+10A40 003F
+10A7D 003F
+10B58 003F
+10B78 003F
+10E60 003F
+11052 003F
+11067 003F
+110F1 003F
+11137 003F
+111D1 003F
+116C1 003F
12415 003F
1241E 003F
1242C 003F
12434 003F
1244F 003F
12458 003F
-10A7D 003F
-10916 003F
-10858 003F
-10B58 003F
-10B78 003F
-111D1 003F
-116C1 003F
-11067 003F
-11052 003F
-10A40 003F
1D360 003F
+FF11 003F
+0F2A 003F
+1D7CF 003F
+1D7D9 003F
+1D7E3 003F
+1D7ED 003F
+1D7F7 003F
+2460 003F
+24F5 003F
+2776 003F
+2780 003F
+278A 003F
+00B9 003F
+2081 003F
2488 0021
2488 003F
2488 0061
336B 0041
336B 0062
0031 0061
-0031 0041
-FF11 0061
-FF11 0041
-1D7CF 0061
-1D7D9 0061
-1D7E3 0061
-1D7ED 0061
-1D7F7 0061
-1D7CF 0041
-1D7D9 0041
-1D7E3 0041
-1D7ED 0041
-1D7F7 0041
-2460 0061
-24F5 0061
-2776 0061
-2780 0061
-278A 0061
-2460 0041
-24F5 0041
-2776 0041
-2780 0041
-278A 0041
-00B9 0061
-00B9 0041
-2081 0061
-2081 0041
0661 0061
-0661 0041
06F1 0061
-06F1 0041
-10E60 0061
-10E60 0041
07C1 0061
-07C1 0041
-1369 0061
-1369 0041
-104A1 0061
-104A1 0041
0967 0061
-0967 0041
09E7 0061
-09E7 0041
0A67 0061
-0A67 0041
0AE7 0061
-0AE7 0041
0B67 0061
-0B67 0041
0BE7 0061
-0BE7 0041
0C67 0061
0C79 0061
0C7C 0061
-0C67 0041
-0C79 0041
-0C7C 0041
0CE7 0061
-0CE7 0041
0D67 0061
-0D67 0041
-ABF1 0061
-ABF1 0041
-A8D1 0061
-A8D1 0041
-1947 0061
-1947 0041
-19D1 0061
-19DA 0061
-19D1 0041
-19DA 0041
-1A81 0061
-1A81 0041
-1A91 0061
-1A91 0041
0E51 0061
-0E51 0041
0ED1 0061
-0ED1 0041
0F21 0061
-0F21 0041
-0F2A 0061
-0F2A 0041
-1C41 0061
-1C41 0041
-A901 0061
-A901 0041
1041 0061
-1041 0041
1091 0061
-1091 0041
-11137 0061
-11137 0041
+1369 0061
17E1 0061
-17E1 0041
17F1 0061
-17F1 0041
-AA51 0061
-AA51 0041
+1811 0061
+1947 0061
+19D1 0061
+19DA 0061
+1A81 0061
+1A91 0061
1B51 0061
-1B51 0041
-A9D1 0061
-A9D1 0041
1BB1 0061
-1BB1 0041
-1811 0061
-1811 0041
+1C41 0061
1C51 0061
-1C51 0041
-A621 0061
-A621 0041
-110F1 0061
-110F1 0041
3021 0061
-3021 0041
+A621 0061
+A8D1 0061
+A901 0061
+A9D1 0061
+AA51 0061
+ABF1 0061
10107 0061
-10107 0041
10142 0061
10158 0061
10159 0061
1015A 0061
-10142 0041
-10158 0041
-10159 0041
-1015A 0041
10320 0061
-10320 0041
103D1 0061
-103D1 0041
+104A1 0061
+10858 0061
+10916 0061
+10A40 0061
+10A7D 0061
+10B58 0061
+10B78 0061
+10E60 0061
+11052 0061
+11067 0061
+110F1 0061
+11137 0061
+111D1 0061
+116C1 0061
12415 0061
1241E 0061
1242C 0061
12434 0061
1244F 0061
12458 0061
+1D360 0061
+0031 0041
+0661 0041
+06F1 0041
+07C1 0041
+0967 0041
+09E7 0041
+0A67 0041
+0AE7 0041
+0B67 0041
+0BE7 0041
+0C67 0041
+0C79 0041
+0C7C 0041
+0CE7 0041
+0D67 0041
+0E51 0041
+0ED1 0041
+0F21 0041
+1041 0041
+1091 0041
+1369 0041
+17E1 0041
+17F1 0041
+1811 0041
+1947 0041
+19D1 0041
+19DA 0041
+1A81 0041
+1A91 0041
+1B51 0041
+1BB1 0041
+1C41 0041
+1C51 0041
+3021 0041
+A621 0041
+A8D1 0041
+A901 0041
+A9D1 0041
+AA51 0041
+ABF1 0041
+10107 0041
+10142 0041
+10158 0041
+10159 0041
+1015A 0041
+10320 0041
+103D1 0041
+104A1 0041
+10858 0041
+10916 0041
+10A40 0041
+10A7D 0041
+10B58 0041
+10B78 0041
+10E60 0041
+11052 0041
+11067 0041
+110F1 0041
+11137 0041
+111D1 0041
+116C1 0041
12415 0041
1241E 0041
1242C 0041
12434 0041
1244F 0041
12458 0041
-10A7D 0061
-10A7D 0041
-10916 0061
-10916 0041
-10858 0061
-10858 0041
-10B58 0061
-10B58 0041
-10B78 0061
-10B78 0041
-111D1 0061
-111D1 0041
-116C1 0061
-116C1 0041
-11067 0061
-11067 0041
-11052 0061
-11052 0041
-10A40 0061
-10A40 0041
-1D360 0061
1D360 0041
+FF11 0061
+FF11 0041
+0F2A 0061
+0F2A 0041
+1D7CF 0061
+1D7D9 0061
+1D7E3 0061
+1D7ED 0061
+1D7F7 0061
+1D7CF 0041
+1D7D9 0041
+1D7E3 0041
+1D7ED 0041
+1D7F7 0041
+2460 0061
+24F5 0061
+2776 0061
+2780 0061
+278A 0061
+2460 0041
+24F5 0041
+2776 0041
+2780 0041
+278A 0041
+00B9 0061
+00B9 0041
+2081 0061
+2081 0041
0031 0062
-FF11 0062
-1D7CF 0062
-1D7D9 0062
-1D7E3 0062
-1D7ED 0062
-1D7F7 0062
-2460 0062
-24F5 0062
-2776 0062
-2780 0062
-278A 0062
-00B9 0062
-2081 0062
0661 0062
06F1 0062
-10E60 0062
07C1 0062
-1369 0062
-104A1 0062
0967 0062
09E7 0062
0A67 0062
0C7C 0062
0CE7 0062
0D67 0062
-ABF1 0062
-A8D1 0062
-1947 0062
-19D1 0062
-19DA 0062
-1A81 0062
-1A91 0062
0E51 0062
0ED1 0062
0F21 0062
-0F2A 0062
-1C41 0062
-A901 0062
1041 0062
1091 0062
-11137 0062
+1369 0062
17E1 0062
17F1 0062
-AA51 0062
+1811 0062
+1947 0062
+19D1 0062
+19DA 0062
+1A81 0062
+1A91 0062
1B51 0062
-A9D1 0062
1BB1 0062
-1811 0062
+1C41 0062
1C51 0062
-A621 0062
-110F1 0062
3021 0062
+A621 0062
+A8D1 0062
+A901 0062
+A9D1 0062
+AA51 0062
+ABF1 0062
10107 0062
10142 0062
10158 0062
1015A 0062
10320 0062
103D1 0062
+104A1 0062
+10858 0062
+10916 0062
+10A40 0062
+10A7D 0062
+10B58 0062
+10B78 0062
+10E60 0062
+11052 0062
+11067 0062
+110F1 0062
+11137 0062
+111D1 0062
+116C1 0062
12415 0062
1241E 0062
1242C 0062
12434 0062
1244F 0062
12458 0062
-10A7D 0062
-10916 0062
-10858 0062
-10B58 0062
-10B78 0062
-111D1 0062
-116C1 0062
-11067 0062
-11052 0062
-10A40 0062
1D360 0062
+FF11 0062
+0F2A 0062
+1D7CF 0062
+1D7D9 0062
+1D7E3 0062
+1D7ED 0062
+1D7F7 0062
+2460 0062
+24F5 0062
+2776 0062
+2780 0062
+278A 0062
+00B9 0062
+2081 0062
33E0 0021
33E0 003F
33E0 0061
1F103 0041
1F103 0062
0032 0021
-FF12 0021
-1D7D0 0021
-1D7DA 0021
-1D7E4 0021
-1D7EE 0021
-1D7F8 0021
-2461 0021
-24F6 0021
-2777 0021
-2781 0021
-278B 0021
-00B2 0021
-2082 0021
0662 0021
06F2 0021
-10E61 0021
07C2 0021
-136A 0021
-104A2 0021
0968 0021
09E8 0021
0A68 0021
0C7D 0021
0CE8 0021
0D68 0021
-ABF2 0021
-A8D2 0021
-1948 0021
-19D2 0021
-1A82 0021
-1A92 0021
0E52 0021
0ED2 0021
0F22 0021
-0F2B 0021
-1C42 0021
-A902 0021
1042 0021
1092 0021
-11138 0021
+136A 0021
17E2 0021
17F2 0021
-AA52 0021
+1812 0021
+1948 0021
+19D2 0021
+1A82 0021
+1A92 0021
1B52 0021
-A9D2 0021
1BB2 0021
-1812 0021
+1C42 0021
1C52 0021
-A622 0021
-110F2 0021
3022 0021
+A622 0021
+A8D2 0021
+A902 0021
+A9D2 0021
+AA52 0021
+ABF2 0021
10108 0021
1015B 0021
1015C 0021
1015D 0021
1015E 0021
103D2 0021
+104A2 0021
+10859 0021
+1091A 0021
+10A41 0021
+10B59 0021
+10B79 0021
+10E61 0021
+11053 0021
+11068 0021
+110F2 0021
+11138 0021
+111D2 0021
+116C2 0021
12400 0021
12416 0021
1241F 0021
12435 0021
1244A 0021
12450 0021
+12456 0021
12459 0021
-1091A 0021
-10859 0021
-10B59 0021
-10B79 0021
-111D2 0021
-116C2 0021
-11068 0021
-11053 0021
-10A41 0021
1D361 0021
+FF12 0021
+0F2B 0021
+1D7D0 0021
+1D7DA 0021
+1D7E4 0021
+1D7EE 0021
+1D7F8 0021
+2461 0021
+24F6 0021
+2777 0021
+2781 0021
+278B 0021
+00B2 0021
+2082 0021
0032 003F
-FF12 003F
-1D7D0 003F
-1D7DA 003F
-1D7E4 003F
-1D7EE 003F
-1D7F8 003F
-2461 003F
-24F6 003F
-2777 003F
-2781 003F
-278B 003F
-00B2 003F
-2082 003F
0662 003F
06F2 003F
-10E61 003F
07C2 003F
-136A 003F
-104A2 003F
0968 003F
09E8 003F
0A68 003F
0C7D 003F
0CE8 003F
0D68 003F
-ABF2 003F
-A8D2 003F
-1948 003F
-19D2 003F
-1A82 003F
-1A92 003F
0E52 003F
0ED2 003F
0F22 003F
-0F2B 003F
-1C42 003F
-A902 003F
1042 003F
1092 003F
-11138 003F
+136A 003F
17E2 003F
17F2 003F
-AA52 003F
+1812 003F
+1948 003F
+19D2 003F
+1A82 003F
+1A92 003F
1B52 003F
-A9D2 003F
1BB2 003F
-1812 003F
+1C42 003F
1C52 003F
-A622 003F
-110F2 003F
3022 003F
+A622 003F
+A8D2 003F
+A902 003F
+A9D2 003F
+AA52 003F
+ABF2 003F
10108 003F
1015B 003F
1015C 003F
1015D 003F
1015E 003F
103D2 003F
+104A2 003F
+10859 003F
+1091A 003F
+10A41 003F
+10B59 003F
+10B79 003F
+10E61 003F
+11053 003F
+11068 003F
+110F2 003F
+11138 003F
+111D2 003F
+116C2 003F
12400 003F
12416 003F
1241F 003F
12435 003F
1244A 003F
12450 003F
+12456 003F
12459 003F
-1091A 003F
-10859 003F
-10B59 003F
-10B79 003F
-111D2 003F
-116C2 003F
-11068 003F
-11053 003F
-10A41 003F
1D361 003F
+FF12 003F
+0F2B 003F
+1D7D0 003F
+1D7DA 003F
+1D7E4 003F
+1D7EE 003F
+1D7F8 003F
+2461 003F
+24F6 003F
+2777 003F
+2781 003F
+278B 003F
+00B2 003F
+2082 003F
2489 0021
2489 003F
2489 0061
33FC 0041
33FC 0062
0032 0061
-0032 0041
-FF12 0061
-FF12 0041
-1D7D0 0061
-1D7DA 0061
-1D7E4 0061
-1D7EE 0061
-1D7F8 0061
-1D7D0 0041
-1D7DA 0041
-1D7E4 0041
-1D7EE 0041
-1D7F8 0041
-2461 0061
-24F6 0061
-2777 0061
-2781 0061
-278B 0061
-2461 0041
-24F6 0041
-2777 0041
-2781 0041
-278B 0041
-00B2 0061
-00B2 0041
-2082 0061
-2082 0041
0662 0061
-0662 0041
06F2 0061
-06F2 0041
-10E61 0061
-10E61 0041
07C2 0061
-07C2 0041
-136A 0061
-136A 0041
-104A2 0061
-104A2 0041
0968 0061
-0968 0041
09E8 0061
-09E8 0041
0A68 0061
-0A68 0041
0AE8 0061
-0AE8 0041
0B68 0061
-0B68 0041
0BE8 0061
-0BE8 0041
0C68 0061
0C7A 0061
0C7D 0061
-0C68 0041
-0C7A 0041
-0C7D 0041
0CE8 0061
-0CE8 0041
0D68 0061
-0D68 0041
-ABF2 0061
-ABF2 0041
-A8D2 0061
-A8D2 0041
-1948 0061
-1948 0041
-19D2 0061
-19D2 0041
-1A82 0061
-1A82 0041
-1A92 0061
-1A92 0041
0E52 0061
-0E52 0041
0ED2 0061
-0ED2 0041
0F22 0061
-0F22 0041
-0F2B 0061
-0F2B 0041
-1C42 0061
-1C42 0041
-A902 0061
-A902 0041
1042 0061
-1042 0041
1092 0061
-1092 0041
-11138 0061
-11138 0041
+136A 0061
17E2 0061
-17E2 0041
17F2 0061
-17F2 0041
-AA52 0061
-AA52 0041
+1812 0061
+1948 0061
+19D2 0061
+1A82 0061
+1A92 0061
1B52 0061
-1B52 0041
-A9D2 0061
-A9D2 0041
1BB2 0061
-1BB2 0041
-1812 0061
-1812 0041
+1C42 0061
1C52 0061
-1C52 0041
-A622 0061
-A622 0041
-110F2 0061
-110F2 0041
3022 0061
-3022 0041
+A622 0061
+A8D2 0061
+A902 0061
+A9D2 0061
+AA52 0061
+ABF2 0061
10108 0061
-10108 0041
1015B 0061
1015C 0061
1015D 0061
1015E 0061
-1015B 0041
-1015C 0041
-1015D 0041
-1015E 0041
103D2 0061
-103D2 0041
+104A2 0061
+10859 0061
+1091A 0061
+10A41 0061
+10B59 0061
+10B79 0061
+10E61 0061
+11053 0061
+11068 0061
+110F2 0061
+11138 0061
+111D2 0061
+116C2 0061
12400 0061
12416 0061
1241F 0061
12435 0061
1244A 0061
12450 0061
+12456 0061
12459 0061
+1D361 0061
+0032 0041
+0662 0041
+06F2 0041
+07C2 0041
+0968 0041
+09E8 0041
+0A68 0041
+0AE8 0041
+0B68 0041
+0BE8 0041
+0C68 0041
+0C7A 0041
+0C7D 0041
+0CE8 0041
+0D68 0041
+0E52 0041
+0ED2 0041
+0F22 0041
+1042 0041
+1092 0041
+136A 0041
+17E2 0041
+17F2 0041
+1812 0041
+1948 0041
+19D2 0041
+1A82 0041
+1A92 0041
+1B52 0041
+1BB2 0041
+1C42 0041
+1C52 0041
+3022 0041
+A622 0041
+A8D2 0041
+A902 0041
+A9D2 0041
+AA52 0041
+ABF2 0041
+10108 0041
+1015B 0041
+1015C 0041
+1015D 0041
+1015E 0041
+103D2 0041
+104A2 0041
+10859 0041
+1091A 0041
+10A41 0041
+10B59 0041
+10B79 0041
+10E61 0041
+11053 0041
+11068 0041
+110F2 0041
+11138 0041
+111D2 0041
+116C2 0041
12400 0041
12416 0041
1241F 0041
12435 0041
1244A 0041
12450 0041
+12456 0041
12459 0041
-1091A 0061
-1091A 0041
-10859 0061
-10859 0041
-10B59 0061
-10B59 0041
-10B79 0061
-10B79 0041
-111D2 0061
-111D2 0041
-116C2 0061
-116C2 0041
-11068 0061
-11068 0041
-11053 0061
-11053 0041
-10A41 0061
-10A41 0041
-1D361 0061
1D361 0041
+FF12 0061
+FF12 0041
+0F2B 0061
+0F2B 0041
+1D7D0 0061
+1D7DA 0061
+1D7E4 0061
+1D7EE 0061
+1D7F8 0061
+1D7D0 0041
+1D7DA 0041
+1D7E4 0041
+1D7EE 0041
+1D7F8 0041
+2461 0061
+24F6 0061
+2777 0061
+2781 0061
+278B 0061
+2461 0041
+24F6 0041
+2777 0041
+2781 0041
+278B 0041
+00B2 0061
+00B2 0041
+2082 0061
+2082 0041
0032 0062
-FF12 0062
-1D7D0 0062
-1D7DA 0062
-1D7E4 0062
-1D7EE 0062
-1D7F8 0062
-2461 0062
-24F6 0062
-2777 0062
-2781 0062
-278B 0062
-00B2 0062
-2082 0062
0662 0062
06F2 0062
-10E61 0062
07C2 0062
-136A 0062
-104A2 0062
0968 0062
09E8 0062
0A68 0062
0C7D 0062
0CE8 0062
0D68 0062
-ABF2 0062
-A8D2 0062
-1948 0062
-19D2 0062
-1A82 0062
-1A92 0062
0E52 0062
0ED2 0062
0F22 0062
-0F2B 0062
-1C42 0062
-A902 0062
1042 0062
1092 0062
-11138 0062
+136A 0062
17E2 0062
17F2 0062
-AA52 0062
+1812 0062
+1948 0062
+19D2 0062
+1A82 0062
+1A92 0062
1B52 0062
-A9D2 0062
1BB2 0062
-1812 0062
+1C42 0062
1C52 0062
-A622 0062
-110F2 0062
3022 0062
+A622 0062
+A8D2 0062
+A902 0062
+A9D2 0062
+AA52 0062
+ABF2 0062
10108 0062
1015B 0062
1015C 0062
1015D 0062
1015E 0062
103D2 0062
+104A2 0062
+10859 0062
+1091A 0062
+10A41 0062
+10B59 0062
+10B79 0062
+10E61 0062
+11053 0062
+11068 0062
+110F2 0062
+11138 0062
+111D2 0062
+116C2 0062
12400 0062
12416 0062
1241F 0062
12435 0062
1244A 0062
12450 0062
+12456 0062
12459 0062
-1091A 0062
-10859 0062
-10B59 0062
-10B79 0062
-111D2 0062
-116C2 0062
-11068 0062
-11053 0062
-10A41 0062
1D361 0062
+FF12 0062
+0F2B 0062
+1D7D0 0062
+1D7DA 0062
+1D7E4 0062
+1D7EE 0062
+1D7F8 0062
+2461 0062
+24F6 0062
+2777 0062
+2781 0062
+278B 0062
+00B2 0062
+2082 0062
33E1 0021
33E1 003F
33E1 0061
1F104 0041
1F104 0062
0033 0021
-FF13 0021
-1D7D1 0021
-1D7DB 0021
-1D7E5 0021
-1D7EF 0021
-1D7F9 0021
-2462 0021
-24F7 0021
-2778 0021
-2782 0021
-278C 0021
-00B3 0021
-2083 0021
0663 0021
06F3 0021
-10E62 0021
07C3 0021
-136B 0021
-104A3 0021
0969 0021
09E9 0021
0A69 0021
0C7E 0021
0CE9 0021
0D69 0021
-ABF3 0021
-A8D3 0021
-1949 0021
-19D3 0021
-1A83 0021
-1A93 0021
0E53 0021
0ED3 0021
0F23 0021
-0F2C 0021
-1C43 0021
-A903 0021
1043 0021
1093 0021
-11139 0021
+136B 0021
17E3 0021
17F3 0021
-AA53 0021
+1813 0021
+1949 0021
+19D3 0021
+1A83 0021
+1A93 0021
1B53 0021
-A9D3 0021
1BB3 0021
-1813 0021
+1C43 0021
1C53 0021
-A623 0021
-110F3 0021
3023 0021
+A623 0021
+A8D3 0021
+A903 0021
+A9D3 0021
+AA53 0021
+ABF3 0021
10109 0021
+104A3 0021
+1085A 0021
+1091B 0021
+10A42 0021
+10B5A 0021
+10B7A 0021
+10E62 0021
+11054 0021
+11069 0021
+110F3 0021
+11139 0021
+111D3 0021
+116C3 0021
12401 0021
12408 0021
12417 0021
1243B 0021
1244B 0021
12451 0021
-1091B 0021
-1085A 0021
-10B5A 0021
-10B7A 0021
-111D3 0021
-116C3 0021
-11069 0021
-11054 0021
-10A42 0021
+12457 0021
1D362 0021
+FF13 0021
+0F2C 0021
+1D7D1 0021
+1D7DB 0021
+1D7E5 0021
+1D7EF 0021
+1D7F9 0021
+2462 0021
+24F7 0021
+2778 0021
+2782 0021
+278C 0021
+00B3 0021
+2083 0021
0033 003F
-FF13 003F
-1D7D1 003F
-1D7DB 003F
-1D7E5 003F
-1D7EF 003F
-1D7F9 003F
-2462 003F
-24F7 003F
-2778 003F
-2782 003F
-278C 003F
-00B3 003F
-2083 003F
0663 003F
06F3 003F
-10E62 003F
07C3 003F
-136B 003F
-104A3 003F
0969 003F
09E9 003F
0A69 003F
0C7E 003F
0CE9 003F
0D69 003F
-ABF3 003F
-A8D3 003F
-1949 003F
-19D3 003F
-1A83 003F
-1A93 003F
0E53 003F
0ED3 003F
0F23 003F
-0F2C 003F
-1C43 003F
-A903 003F
1043 003F
1093 003F
-11139 003F
+136B 003F
17E3 003F
17F3 003F
-AA53 003F
+1813 003F
+1949 003F
+19D3 003F
+1A83 003F
+1A93 003F
1B53 003F
-A9D3 003F
1BB3 003F
-1813 003F
+1C43 003F
1C53 003F
-A623 003F
-110F3 003F
3023 003F
+A623 003F
+A8D3 003F
+A903 003F
+A9D3 003F
+AA53 003F
+ABF3 003F
10109 003F
+104A3 003F
+1085A 003F
+1091B 003F
+10A42 003F
+10B5A 003F
+10B7A 003F
+10E62 003F
+11054 003F
+11069 003F
+110F3 003F
+11139 003F
+111D3 003F
+116C3 003F
12401 003F
12408 003F
12417 003F
1243B 003F
1244B 003F
12451 003F
-1091B 003F
-1085A 003F
-10B5A 003F
-10B7A 003F
-111D3 003F
-116C3 003F
-11069 003F
-11054 003F
-10A42 003F
+12457 003F
1D362 003F
+FF13 003F
+0F2C 003F
+1D7D1 003F
+1D7DB 003F
+1D7E5 003F
+1D7EF 003F
+1D7F9 003F
+2462 003F
+24F7 003F
+2778 003F
+2782 003F
+278C 003F
+00B3 003F
+2083 003F
248A 0021
248A 003F
248A 0061
32B4 0041
32B4 0062
0033 0061
-0033 0041
-FF13 0061
-FF13 0041
-1D7D1 0061
-1D7DB 0061
-1D7E5 0061
-1D7EF 0061
-1D7F9 0061
-1D7D1 0041
-1D7DB 0041
-1D7E5 0041
-1D7EF 0041
-1D7F9 0041
-2462 0061
-24F7 0061
-2778 0061
-2782 0061
-278C 0061
-2462 0041
-24F7 0041
-2778 0041
-2782 0041
-278C 0041
-00B3 0061
-00B3 0041
-2083 0061
-2083 0041
0663 0061
-0663 0041
06F3 0061
-06F3 0041
-10E62 0061
-10E62 0041
07C3 0061
-07C3 0041
-136B 0061
-136B 0041
-104A3 0061
-104A3 0041
0969 0061
-0969 0041
09E9 0061
-09E9 0041
0A69 0061
-0A69 0041
0AE9 0061
-0AE9 0041
0B69 0061
-0B69 0041
0BE9 0061
-0BE9 0041
0C69 0061
0C7B 0061
0C7E 0061
-0C69 0041
-0C7B 0041
-0C7E 0041
0CE9 0061
-0CE9 0041
0D69 0061
-0D69 0041
-ABF3 0061
-ABF3 0041
-A8D3 0061
-A8D3 0041
-1949 0061
-1949 0041
-19D3 0061
-19D3 0041
-1A83 0061
-1A83 0041
-1A93 0061
-1A93 0041
0E53 0061
-0E53 0041
0ED3 0061
-0ED3 0041
0F23 0061
-0F23 0041
-0F2C 0061
-0F2C 0041
-1C43 0061
-1C43 0041
-A903 0061
-A903 0041
1043 0061
-1043 0041
1093 0061
-1093 0041
-11139 0061
-11139 0041
+136B 0061
17E3 0061
-17E3 0041
17F3 0061
-17F3 0041
-AA53 0061
-AA53 0041
+1813 0061
+1949 0061
+19D3 0061
+1A83 0061
+1A93 0061
1B53 0061
-1B53 0041
-A9D3 0061
-A9D3 0041
1BB3 0061
-1BB3 0041
-1813 0061
-1813 0041
+1C43 0061
1C53 0061
-1C53 0041
-A623 0061
-A623 0041
-110F3 0061
-110F3 0041
3023 0061
-3023 0041
+A623 0061
+A8D3 0061
+A903 0061
+A9D3 0061
+AA53 0061
+ABF3 0061
10109 0061
-10109 0041
+104A3 0061
+1085A 0061
+1091B 0061
+10A42 0061
+10B5A 0061
+10B7A 0061
+10E62 0061
+11054 0061
+11069 0061
+110F3 0061
+11139 0061
+111D3 0061
+116C3 0061
12401 0061
12408 0061
12417 0061
1243B 0061
1244B 0061
12451 0061
+12457 0061
+1D362 0061
+0033 0041
+0663 0041
+06F3 0041
+07C3 0041
+0969 0041
+09E9 0041
+0A69 0041
+0AE9 0041
+0B69 0041
+0BE9 0041
+0C69 0041
+0C7B 0041
+0C7E 0041
+0CE9 0041
+0D69 0041
+0E53 0041
+0ED3 0041
+0F23 0041
+1043 0041
+1093 0041
+136B 0041
+17E3 0041
+17F3 0041
+1813 0041
+1949 0041
+19D3 0041
+1A83 0041
+1A93 0041
+1B53 0041
+1BB3 0041
+1C43 0041
+1C53 0041
+3023 0041
+A623 0041
+A8D3 0041
+A903 0041
+A9D3 0041
+AA53 0041
+ABF3 0041
+10109 0041
+104A3 0041
+1085A 0041
+1091B 0041
+10A42 0041
+10B5A 0041
+10B7A 0041
+10E62 0041
+11054 0041
+11069 0041
+110F3 0041
+11139 0041
+111D3 0041
+116C3 0041
12401 0041
12408 0041
12417 0041
1243B 0041
1244B 0041
12451 0041
-1091B 0061
-1091B 0041
-1085A 0061
-1085A 0041
-10B5A 0061
-10B5A 0041
-10B7A 0061
-10B7A 0041
-111D3 0061
-111D3 0041
-116C3 0061
-116C3 0041
-11069 0061
-11069 0041
-11054 0061
-11054 0041
-10A42 0061
-10A42 0041
-1D362 0061
+12457 0041
1D362 0041
+FF13 0061
+FF13 0041
+0F2C 0061
+0F2C 0041
+1D7D1 0061
+1D7DB 0061
+1D7E5 0061
+1D7EF 0061
+1D7F9 0061
+1D7D1 0041
+1D7DB 0041
+1D7E5 0041
+1D7EF 0041
+1D7F9 0041
+2462 0061
+24F7 0061
+2778 0061
+2782 0061
+278C 0061
+2462 0041
+24F7 0041
+2778 0041
+2782 0041
+278C 0041
+00B3 0061
+00B3 0041
+2083 0061
+2083 0041
0033 0062
-FF13 0062
-1D7D1 0062
-1D7DB 0062
-1D7E5 0062
-1D7EF 0062
-1D7F9 0062
-2462 0062
-24F7 0062
-2778 0062
-2782 0062
-278C 0062
-00B3 0062
-2083 0062
0663 0062
06F3 0062
-10E62 0062
07C3 0062
-136B 0062
-104A3 0062
0969 0062
09E9 0062
0A69 0062
0C7E 0062
0CE9 0062
0D69 0062
-ABF3 0062
-A8D3 0062
-1949 0062
-19D3 0062
-1A83 0062
-1A93 0062
0E53 0062
0ED3 0062
0F23 0062
-0F2C 0062
-1C43 0062
-A903 0062
1043 0062
1093 0062
-11139 0062
+136B 0062
17E3 0062
17F3 0062
-AA53 0062
+1813 0062
+1949 0062
+19D3 0062
+1A83 0062
+1A93 0062
1B53 0062
-A9D3 0062
1BB3 0062
-1813 0062
+1C43 0062
1C53 0062
-A623 0062
-110F3 0062
3023 0062
+A623 0062
+A8D3 0062
+A903 0062
+A9D3 0062
+AA53 0062
+ABF3 0062
10109 0062
+104A3 0062
+1085A 0062
+1091B 0062
+10A42 0062
+10B5A 0062
+10B7A 0062
+10E62 0062
+11054 0062
+11069 0062
+110F3 0062
+11139 0062
+111D3 0062
+116C3 0062
12401 0062
12408 0062
12417 0062
1243B 0062
1244B 0062
12451 0062
-1091B 0062
-1085A 0062
-10B5A 0062
-10B7A 0062
-111D3 0062
-116C3 0062
-11069 0062
-11054 0062
-10A42 0062
+12457 0062
1D362 0062
+FF13 0062
+0F2C 0062
+1D7D1 0062
+1D7DB 0062
+1D7E5 0062
+1D7EF 0062
+1D7F9 0062
+2462 0062
+24F7 0062
+2778 0062
+2782 0062
+278C 0062
+00B3 0062
+2083 0062
33E2 0021
33E2 003F
33E2 0061
1F105 0041
1F105 0062
0034 0021
-FF14 0021
-1D7D2 0021
-1D7DC 0021
-1D7E6 0021
-1D7F0 0021
-1D7FA 0021
-2463 0021
-24F8 0021
-2779 0021
-2783 0021
-278D 0021
-2074 0021
-2084 0021
0664 0021
06F4 0021
-10E63 0021
07C4 0021
-136C 0021
-104A4 0021
096A 0021
09EA 0021
0A6A 0021
0C6A 0021
0CEA 0021
0D6A 0021
-ABF4 0021
-A8D4 0021
-194A 0021
-19D4 0021
-1A84 0021
-1A94 0021
0E54 0021
0ED4 0021
0F24 0021
-0F2D 0021
-1C44 0021
-A904 0021
1044 0021
1094 0021
-1113A 0021
+136C 0021
17E4 0021
17F4 0021
-AA54 0021
+1814 0021
+194A 0021
+19D4 0021
+1A84 0021
+1A94 0021
1B54 0021
-A9D4 0021
1BB4 0021
-1814 0021
+1C44 0021
1C54 0021
-A624 0021
-110F4 0021
3024 0021
+A624 0021
+A8D4 0021
+A904 0021
+A9D4 0021
+AA54 0021
+ABF4 0021
1010A 0021
+104A4 0021
+10A43 0021
+10B5B 0021
+10B7B 0021
+10E63 0021
+11055 0021
+1106A 0021
+110F4 0021
+1113A 0021
+111D4 0021
+116C4 0021
12402 0021
12409 0021
1240F 0021
1244C 0021
12452 0021
12453 0021
-10B5B 0021
-10B7B 0021
-111D4 0021
-116C4 0021
-1106A 0021
-11055 0021
-10A43 0021
1D363 0021
+FF14 0021
+0F2D 0021
+1D7D2 0021
+1D7DC 0021
+1D7E6 0021
+1D7F0 0021
+1D7FA 0021
+2463 0021
+24F8 0021
+2779 0021
+2783 0021
+278D 0021
+2074 0021
+2084 0021
0034 003F
-FF14 003F
-1D7D2 003F
-1D7DC 003F
-1D7E6 003F
-1D7F0 003F
-1D7FA 003F
-2463 003F
-24F8 003F
-2779 003F
-2783 003F
-278D 003F
-2074 003F
-2084 003F
0664 003F
06F4 003F
-10E63 003F
07C4 003F
-136C 003F
-104A4 003F
096A 003F
09EA 003F
0A6A 003F
0C6A 003F
0CEA 003F
0D6A 003F
-ABF4 003F
-A8D4 003F
-194A 003F
-19D4 003F
-1A84 003F
-1A94 003F
0E54 003F
0ED4 003F
0F24 003F
-0F2D 003F
-1C44 003F
-A904 003F
1044 003F
1094 003F
-1113A 003F
+136C 003F
17E4 003F
17F4 003F
-AA54 003F
+1814 003F
+194A 003F
+19D4 003F
+1A84 003F
+1A94 003F
1B54 003F
-A9D4 003F
1BB4 003F
-1814 003F
+1C44 003F
1C54 003F
-A624 003F
-110F4 003F
3024 003F
+A624 003F
+A8D4 003F
+A904 003F
+A9D4 003F
+AA54 003F
+ABF4 003F
1010A 003F
+104A4 003F
+10A43 003F
+10B5B 003F
+10B7B 003F
+10E63 003F
+11055 003F
+1106A 003F
+110F4 003F
+1113A 003F
+111D4 003F
+116C4 003F
12402 003F
12409 003F
1240F 003F
1244C 003F
12452 003F
12453 003F
-10B5B 003F
-10B7B 003F
-111D4 003F
-116C4 003F
-1106A 003F
-11055 003F
-10A43 003F
1D363 003F
+FF14 003F
+0F2D 003F
+1D7D2 003F
+1D7DC 003F
+1D7E6 003F
+1D7F0 003F
+1D7FA 003F
+2463 003F
+24F8 003F
+2779 003F
+2783 003F
+278D 003F
+2074 003F
+2084 003F
248B 0021
248B 003F
248B 0061
32BE 0041
32BE 0062
0034 0061
-0034 0041
-FF14 0061
-FF14 0041
-1D7D2 0061
-1D7DC 0061
-1D7E6 0061
-1D7F0 0061
-1D7FA 0061
-1D7D2 0041
-1D7DC 0041
-1D7E6 0041
-1D7F0 0041
-1D7FA 0041
-2463 0061
-24F8 0061
-2779 0061
-2783 0061
-278D 0061
-2463 0041
-24F8 0041
-2779 0041
-2783 0041
-278D 0041
-2074 0061
-2074 0041
-2084 0061
-2084 0041
0664 0061
-0664 0041
06F4 0061
-06F4 0041
-10E63 0061
-10E63 0041
07C4 0061
-07C4 0041
-136C 0061
-136C 0041
-104A4 0061
-104A4 0041
096A 0061
-096A 0041
09EA 0061
-09EA 0041
0A6A 0061
-0A6A 0041
0AEA 0061
-0AEA 0041
0B6A 0061
-0B6A 0041
0BEA 0061
-0BEA 0041
0C6A 0061
-0C6A 0041
0CEA 0061
-0CEA 0041
0D6A 0061
-0D6A 0041
-ABF4 0061
-ABF4 0041
-A8D4 0061
-A8D4 0041
-194A 0061
-194A 0041
-19D4 0061
-19D4 0041
-1A84 0061
-1A84 0041
-1A94 0061
-1A94 0041
0E54 0061
-0E54 0041
0ED4 0061
-0ED4 0041
0F24 0061
-0F24 0041
-0F2D 0061
-0F2D 0041
-1C44 0061
-1C44 0041
-A904 0061
-A904 0041
1044 0061
-1044 0041
1094 0061
-1094 0041
-1113A 0061
-1113A 0041
+136C 0061
17E4 0061
-17E4 0041
17F4 0061
-17F4 0041
-AA54 0061
-AA54 0041
+1814 0061
+194A 0061
+19D4 0061
+1A84 0061
+1A94 0061
1B54 0061
-1B54 0041
-A9D4 0061
-A9D4 0041
1BB4 0061
-1BB4 0041
-1814 0061
-1814 0041
+1C44 0061
1C54 0061
-1C54 0041
-A624 0061
-A624 0041
-110F4 0061
-110F4 0041
3024 0061
-3024 0041
+A624 0061
+A8D4 0061
+A904 0061
+A9D4 0061
+AA54 0061
+ABF4 0061
1010A 0061
-1010A 0041
+104A4 0061
+10A43 0061
+10B5B 0061
+10B7B 0061
+10E63 0061
+11055 0061
+1106A 0061
+110F4 0061
+1113A 0061
+111D4 0061
+116C4 0061
12402 0061
12409 0061
1240F 0061
1244C 0061
12452 0061
12453 0061
+1D363 0061
+0034 0041
+0664 0041
+06F4 0041
+07C4 0041
+096A 0041
+09EA 0041
+0A6A 0041
+0AEA 0041
+0B6A 0041
+0BEA 0041
+0C6A 0041
+0CEA 0041
+0D6A 0041
+0E54 0041
+0ED4 0041
+0F24 0041
+1044 0041
+1094 0041
+136C 0041
+17E4 0041
+17F4 0041
+1814 0041
+194A 0041
+19D4 0041
+1A84 0041
+1A94 0041
+1B54 0041
+1BB4 0041
+1C44 0041
+1C54 0041
+3024 0041
+A624 0041
+A8D4 0041
+A904 0041
+A9D4 0041
+AA54 0041
+ABF4 0041
+1010A 0041
+104A4 0041
+10A43 0041
+10B5B 0041
+10B7B 0041
+10E63 0041
+11055 0041
+1106A 0041
+110F4 0041
+1113A 0041
+111D4 0041
+116C4 0041
12402 0041
12409 0041
1240F 0041
1244C 0041
12452 0041
12453 0041
-10B5B 0061
-10B5B 0041
-10B7B 0061
-10B7B 0041
-111D4 0061
-111D4 0041
-116C4 0061
-116C4 0041
-1106A 0061
-1106A 0041
-11055 0061
-11055 0041
-10A43 0061
-10A43 0041
-1D363 0061
1D363 0041
+FF14 0061
+FF14 0041
+0F2D 0061
+0F2D 0041
+1D7D2 0061
+1D7DC 0061
+1D7E6 0061
+1D7F0 0061
+1D7FA 0061
+1D7D2 0041
+1D7DC 0041
+1D7E6 0041
+1D7F0 0041
+1D7FA 0041
+2463 0061
+24F8 0061
+2779 0061
+2783 0061
+278D 0061
+2463 0041
+24F8 0041
+2779 0041
+2783 0041
+278D 0041
+2074 0061
+2074 0041
+2084 0061
+2084 0041
0034 0062
-FF14 0062
-1D7D2 0062
-1D7DC 0062
-1D7E6 0062
-1D7F0 0062
-1D7FA 0062
-2463 0062
-24F8 0062
-2779 0062
-2783 0062
-278D 0062
-2074 0062
-2084 0062
0664 0062
06F4 0062
-10E63 0062
07C4 0062
-136C 0062
-104A4 0062
096A 0062
09EA 0062
0A6A 0062
0C6A 0062
0CEA 0062
0D6A 0062
-ABF4 0062
-A8D4 0062
-194A 0062
-19D4 0062
-1A84 0062
-1A94 0062
0E54 0062
0ED4 0062
0F24 0062
-0F2D 0062
-1C44 0062
-A904 0062
1044 0062
1094 0062
-1113A 0062
+136C 0062
17E4 0062
17F4 0062
-AA54 0062
+1814 0062
+194A 0062
+19D4 0062
+1A84 0062
+1A94 0062
1B54 0062
-A9D4 0062
1BB4 0062
-1814 0062
+1C44 0062
1C54 0062
-A624 0062
-110F4 0062
3024 0062
+A624 0062
+A8D4 0062
+A904 0062
+A9D4 0062
+AA54 0062
+ABF4 0062
1010A 0062
+104A4 0062
+10A43 0062
+10B5B 0062
+10B7B 0062
+10E63 0062
+11055 0062
+1106A 0062
+110F4 0062
+1113A 0062
+111D4 0062
+116C4 0062
12402 0062
12409 0062
1240F 0062
1244C 0062
12452 0062
12453 0062
-10B5B 0062
-10B7B 0062
-111D4 0062
-116C4 0062
-1106A 0062
-11055 0062
-10A43 0062
1D363 0062
+FF14 0062
+0F2D 0062
+1D7D2 0062
+1D7DC 0062
+1D7E6 0062
+1D7F0 0062
+1D7FA 0062
+2463 0062
+24F8 0062
+2779 0062
+2783 0062
+278D 0062
+2074 0062
+2084 0062
33E3 0021
33E3 003F
33E3 0061
1F106 0041
1F106 0062
0035 0021
-FF15 0021
-1D7D3 0021
-1D7DD 0021
-1D7E7 0021
-1D7F1 0021
-1D7FB 0021
-2464 0021
-24F9 0021
-277A 0021
-2784 0021
-278E 0021
-2075 0021
-2085 0021
0665 0021
06F5 0021
-10E64 0021
07C5 0021
-136D 0021
-104A5 0021
096B 0021
09EB 0021
0A6B 0021
0C6B 0021
0CEB 0021
0D6B 0021
-ABF5 0021
-A8D5 0021
-194B 0021
-19D5 0021
-1A85 0021
-1A95 0021
0E55 0021
0ED5 0021
0F25 0021
-0F2E 0021
-1C45 0021
-A905 0021
1045 0021
1095 0021
-1113B 0021
+136D 0021
17E5 0021
17F5 0021
-AA55 0021
+1815 0021
+194B 0021
+19D5 0021
+1A85 0021
+1A95 0021
1B55 0021
-A9D5 0021
1BB5 0021
-1815 0021
+1C45 0021
1C55 0021
-A625 0021
-110F5 0021
3025 0021
+A625 0021
+A8D5 0021
+A905 0021
+A9D5 0021
+AA55 0021
+ABF5 0021
1010B 0021
10143 0021
10148 0021
1015F 0021
10173 0021
10321 0021
+104A5 0021
+10E64 0021
+11056 0021
+1106B 0021
+110F5 0021
+1113B 0021
+111D5 0021
+116C5 0021
12403 0021
1240A 0021
12410 0021
1244D 0021
12454 0021
12455 0021
-111D5 0021
-116C5 0021
-1106B 0021
-11056 0021
1D364 0021
+FF15 0021
+0F2E 0021
+1D7D3 0021
+1D7DD 0021
+1D7E7 0021
+1D7F1 0021
+1D7FB 0021
+2464 0021
+24F9 0021
+277A 0021
+2784 0021
+278E 0021
+2075 0021
+2085 0021
0035 003F
-FF15 003F
-1D7D3 003F
-1D7DD 003F
-1D7E7 003F
-1D7F1 003F
-1D7FB 003F
-2464 003F
-24F9 003F
-277A 003F
-2784 003F
-278E 003F
-2075 003F
-2085 003F
0665 003F
06F5 003F
-10E64 003F
07C5 003F
-136D 003F
-104A5 003F
096B 003F
09EB 003F
0A6B 003F
0C6B 003F
0CEB 003F
0D6B 003F
-ABF5 003F
-A8D5 003F
-194B 003F
-19D5 003F
-1A85 003F
-1A95 003F
0E55 003F
0ED5 003F
0F25 003F
-0F2E 003F
-1C45 003F
-A905 003F
1045 003F
1095 003F
-1113B 003F
+136D 003F
17E5 003F
17F5 003F
-AA55 003F
+1815 003F
+194B 003F
+19D5 003F
+1A85 003F
+1A95 003F
1B55 003F
-A9D5 003F
1BB5 003F
-1815 003F
+1C45 003F
1C55 003F
-A625 003F
-110F5 003F
3025 003F
+A625 003F
+A8D5 003F
+A905 003F
+A9D5 003F
+AA55 003F
+ABF5 003F
1010B 003F
10143 003F
10148 003F
1015F 003F
10173 003F
10321 003F
+104A5 003F
+10E64 003F
+11056 003F
+1106B 003F
+110F5 003F
+1113B 003F
+111D5 003F
+116C5 003F
12403 003F
1240A 003F
12410 003F
1244D 003F
12454 003F
12455 003F
-111D5 003F
-116C5 003F
-1106B 003F
-11056 003F
1D364 003F
+FF15 003F
+0F2E 003F
+1D7D3 003F
+1D7DD 003F
+1D7E7 003F
+1D7F1 003F
+1D7FB 003F
+2464 003F
+24F9 003F
+277A 003F
+2784 003F
+278E 003F
+2075 003F
+2085 003F
248C 0021
248C 003F
248C 0061
324C 0062
32BF 0062
0035 0061
-0035 0041
-FF15 0061
-FF15 0041
-1D7D3 0061
-1D7DD 0061
-1D7E7 0061
-1D7F1 0061
-1D7FB 0061
-1D7D3 0041
-1D7DD 0041
-1D7E7 0041
-1D7F1 0041
-1D7FB 0041
-2464 0061
-24F9 0061
-277A 0061
-2784 0061
-278E 0061
-2464 0041
-24F9 0041
-277A 0041
-2784 0041
-278E 0041
-2075 0061
-2075 0041
-2085 0061
-2085 0041
0665 0061
-0665 0041
06F5 0061
-06F5 0041
-10E64 0061
-10E64 0041
07C5 0061
-07C5 0041
-136D 0061
-136D 0041
-104A5 0061
-104A5 0041
096B 0061
-096B 0041
09EB 0061
-09EB 0041
0A6B 0061
-0A6B 0041
0AEB 0061
-0AEB 0041
0B6B 0061
-0B6B 0041
0BEB 0061
-0BEB 0041
0C6B 0061
-0C6B 0041
0CEB 0061
-0CEB 0041
0D6B 0061
-0D6B 0041
-ABF5 0061
-ABF5 0041
-A8D5 0061
-A8D5 0041
-194B 0061
-194B 0041
-19D5 0061
-19D5 0041
-1A85 0061
-1A85 0041
-1A95 0061
-1A95 0041
0E55 0061
-0E55 0041
0ED5 0061
-0ED5 0041
0F25 0061
-0F25 0041
-0F2E 0061
-0F2E 0041
-1C45 0061
-1C45 0041
-A905 0061
-A905 0041
1045 0061
-1045 0041
1095 0061
-1095 0041
-1113B 0061
-1113B 0041
+136D 0061
17E5 0061
-17E5 0041
17F5 0061
-17F5 0041
-AA55 0061
-AA55 0041
+1815 0061
+194B 0061
+19D5 0061
+1A85 0061
+1A95 0061
1B55 0061
-1B55 0041
-A9D5 0061
-A9D5 0041
1BB5 0061
-1BB5 0041
-1815 0061
-1815 0041
+1C45 0061
1C55 0061
-1C55 0041
-A625 0061
-A625 0041
-110F5 0061
-110F5 0041
3025 0061
-3025 0041
+A625 0061
+A8D5 0061
+A905 0061
+A9D5 0061
+AA55 0061
+ABF5 0061
1010B 0061
-1010B 0041
10143 0061
10148 0061
1014F 0061
1015F 0061
10173 0061
-10143 0041
-10148 0041
-1014F 0041
-1015F 0041
-10173 0041
10321 0061
-10321 0041
+104A5 0061
+10E64 0061
+11056 0061
+1106B 0061
+110F5 0061
+1113B 0061
+111D5 0061
+116C5 0061
12403 0061
1240A 0061
12410 0061
1244D 0061
12454 0061
12455 0061
+1D364 0061
+0035 0041
+0665 0041
+06F5 0041
+07C5 0041
+096B 0041
+09EB 0041
+0A6B 0041
+0AEB 0041
+0B6B 0041
+0BEB 0041
+0C6B 0041
+0CEB 0041
+0D6B 0041
+0E55 0041
+0ED5 0041
+0F25 0041
+1045 0041
+1095 0041
+136D 0041
+17E5 0041
+17F5 0041
+1815 0041
+194B 0041
+19D5 0041
+1A85 0041
+1A95 0041
+1B55 0041
+1BB5 0041
+1C45 0041
+1C55 0041
+3025 0041
+A625 0041
+A8D5 0041
+A905 0041
+A9D5 0041
+AA55 0041
+ABF5 0041
+1010B 0041
+10143 0041
+10148 0041
+1014F 0041
+1015F 0041
+10173 0041
+10321 0041
+104A5 0041
+10E64 0041
+11056 0041
+1106B 0041
+110F5 0041
+1113B 0041
+111D5 0041
+116C5 0041
12403 0041
1240A 0041
12410 0041
1244D 0041
12454 0041
12455 0041
-111D5 0061
-111D5 0041
-116C5 0061
-116C5 0041
-1106B 0061
-1106B 0041
-11056 0061
-11056 0041
-1D364 0061
1D364 0041
+FF15 0061
+FF15 0041
+0F2E 0061
+0F2E 0041
+1D7D3 0061
+1D7DD 0061
+1D7E7 0061
+1D7F1 0061
+1D7FB 0061
+1D7D3 0041
+1D7DD 0041
+1D7E7 0041
+1D7F1 0041
+1D7FB 0041
+2464 0061
+24F9 0061
+277A 0061
+2784 0061
+278E 0061
+2464 0041
+24F9 0041
+277A 0041
+2784 0041
+278E 0041
+2075 0061
+2075 0041
+2085 0061
+2085 0041
0035 0062
-FF15 0062
-1D7D3 0062
-1D7DD 0062
-1D7E7 0062
-1D7F1 0062
-1D7FB 0062
-2464 0062
-24F9 0062
-277A 0062
-2784 0062
-278E 0062
-2075 0062
-2085 0062
0665 0062
06F5 0062
-10E64 0062
07C5 0062
-136D 0062
-104A5 0062
096B 0062
09EB 0062
0A6B 0062
0C6B 0062
0CEB 0062
0D6B 0062
-ABF5 0062
-A8D5 0062
-194B 0062
-19D5 0062
-1A85 0062
-1A95 0062
0E55 0062
0ED5 0062
0F25 0062
-0F2E 0062
-1C45 0062
-A905 0062
1045 0062
1095 0062
-1113B 0062
+136D 0062
17E5 0062
17F5 0062
-AA55 0062
+1815 0062
+194B 0062
+19D5 0062
+1A85 0062
+1A95 0062
1B55 0062
-A9D5 0062
1BB5 0062
-1815 0062
+1C45 0062
1C55 0062
-A625 0062
-110F5 0062
3025 0062
+A625 0062
+A8D5 0062
+A905 0062
+A9D5 0062
+AA55 0062
+ABF5 0062
1010B 0062
10143 0062
10148 0062
1015F 0062
10173 0062
10321 0062
+104A5 0062
+10E64 0062
+11056 0062
+1106B 0062
+110F5 0062
+1113B 0062
+111D5 0062
+116C5 0062
12403 0062
1240A 0062
12410 0062
1244D 0062
12454 0062
12455 0062
-111D5 0062
-116C5 0062
-1106B 0062
-11056 0062
1D364 0062
+FF15 0062
+0F2E 0062
+1D7D3 0062
+1D7DD 0062
+1D7E7 0062
+1D7F1 0062
+1D7FB 0062
+2464 0062
+24F9 0062
+277A 0062
+2784 0062
+278E 0062
+2075 0062
+2085 0062
33E4 0021
33E4 003F
33E4 0061
1F107 0041
1F107 0062
0036 0021
-FF16 0021
-1D7D4 0021
-1D7DE 0021
-1D7E8 0021
-1D7F2 0021
-1D7FC 0021
-2465 0021
-24FA 0021
-277B 0021
-2785 0021
-278F 0021
-2076 0021
-2086 0021
0666 0021
06F6 0021
-10E65 0021
07C6 0021
-136E 0021
-104A6 0021
096C 0021
09EC 0021
0A6C 0021
0C6C 0021
0CEC 0021
0D6C 0021
-ABF6 0021
-A8D6 0021
-194C 0021
-19D6 0021
-1A86 0021
-1A96 0021
0E56 0021
0ED6 0021
0F26 0021
-0F2F 0021
-1C46 0021
-A906 0021
1046 0021
1096 0021
-1113C 0021
+136E 0021
17E6 0021
17F6 0021
-AA56 0021
+1816 0021
+194C 0021
+19D6 0021
+1A86 0021
+1A96 0021
1B56 0021
-A9D6 0021
1BB6 0021
-1816 0021
+1C46 0021
1C56 0021
-A626 0021
-110F6 0021
+2185 0021
3026 0021
+A626 0021
+A8D6 0021
+A906 0021
+A9D6 0021
+AA56 0021
+ABF6 0021
1010C 0021
-2185 0021
+104A6 0021
+10E65 0021
+11057 0021
+1106C 0021
+110F6 0021
+1113C 0021
+111D6 0021
+116C6 0021
12404 0021
1240B 0021
12411 0021
12428 0021
12440 0021
1244E 0021
-111D6 0021
-116C6 0021
-1106C 0021
-11057 0021
1D365 0021
+FF16 0021
+0F2F 0021
+1D7D4 0021
+1D7DE 0021
+1D7E8 0021
+1D7F2 0021
+1D7FC 0021
+2465 0021
+24FA 0021
+277B 0021
+2785 0021
+278F 0021
+2076 0021
+2086 0021
0036 003F
-FF16 003F
-1D7D4 003F
-1D7DE 003F
-1D7E8 003F
-1D7F2 003F
-1D7FC 003F
-2465 003F
-24FA 003F
-277B 003F
-2785 003F
-278F 003F
-2076 003F
-2086 003F
0666 003F
06F6 003F
-10E65 003F
07C6 003F
-136E 003F
-104A6 003F
096C 003F
09EC 003F
0A6C 003F
0C6C 003F
0CEC 003F
0D6C 003F
-ABF6 003F
-A8D6 003F
-194C 003F
-19D6 003F
-1A86 003F
-1A96 003F
0E56 003F
0ED6 003F
0F26 003F
-0F2F 003F
-1C46 003F
-A906 003F
1046 003F
1096 003F
-1113C 003F
+136E 003F
17E6 003F
17F6 003F
-AA56 003F
+1816 003F
+194C 003F
+19D6 003F
+1A86 003F
+1A96 003F
1B56 003F
-A9D6 003F
1BB6 003F
-1816 003F
+1C46 003F
1C56 003F
-A626 003F
-110F6 003F
+2185 003F
3026 003F
+A626 003F
+A8D6 003F
+A906 003F
+A9D6 003F
+AA56 003F
+ABF6 003F
1010C 003F
-2185 003F
+104A6 003F
+10E65 003F
+11057 003F
+1106C 003F
+110F6 003F
+1113C 003F
+111D6 003F
+116C6 003F
12404 003F
1240B 003F
12411 003F
12428 003F
12440 003F
1244E 003F
-111D6 003F
-116C6 003F
-1106C 003F
-11057 003F
1D365 003F
+FF16 003F
+0F2F 003F
+1D7D4 003F
+1D7DE 003F
+1D7E8 003F
+1D7F2 003F
+1D7FC 003F
+2465 003F
+24FA 003F
+277B 003F
+2785 003F
+278F 003F
+2076 003F
+2086 003F
248D 0021
248D 003F
248D 0061
324D 0041
324D 0062
0036 0061
-0036 0041
-FF16 0061
-FF16 0041
-1D7D4 0061
-1D7DE 0061
-1D7E8 0061
-1D7F2 0061
-1D7FC 0061
-1D7D4 0041
-1D7DE 0041
-1D7E8 0041
-1D7F2 0041
-1D7FC 0041
-2465 0061
-24FA 0061
-277B 0061
-2785 0061
-278F 0061
-2465 0041
-24FA 0041
-277B 0041
-2785 0041
-278F 0041
-2076 0061
-2076 0041
-2086 0061
-2086 0041
0666 0061
-0666 0041
06F6 0061
-06F6 0041
-10E65 0061
-10E65 0041
07C6 0061
-07C6 0041
-136E 0061
-136E 0041
-104A6 0061
-104A6 0041
096C 0061
-096C 0041
09EC 0061
-09EC 0041
0A6C 0061
-0A6C 0041
0AEC 0061
-0AEC 0041
0B6C 0061
-0B6C 0041
0BEC 0061
-0BEC 0041
0C6C 0061
-0C6C 0041
0CEC 0061
-0CEC 0041
0D6C 0061
-0D6C 0041
-ABF6 0061
-ABF6 0041
-A8D6 0061
-A8D6 0041
-194C 0061
-194C 0041
-19D6 0061
-19D6 0041
-1A86 0061
-1A86 0041
-1A96 0061
-1A96 0041
0E56 0061
-0E56 0041
0ED6 0061
-0ED6 0041
0F26 0061
-0F26 0041
-0F2F 0061
-0F2F 0041
-1C46 0061
-1C46 0041
-A906 0061
-A906 0041
1046 0061
-1046 0041
1096 0061
-1096 0041
-1113C 0061
-1113C 0041
+136E 0061
17E6 0061
-17E6 0041
17F6 0061
-17F6 0041
-AA56 0061
-AA56 0041
+1816 0061
+194C 0061
+19D6 0061
+1A86 0061
+1A96 0061
1B56 0061
-1B56 0041
-A9D6 0061
-A9D6 0041
1BB6 0061
-1BB6 0041
-1816 0061
-1816 0041
+1C46 0061
1C56 0061
-1C56 0041
-A626 0061
-A626 0041
-110F6 0061
-110F6 0041
+2185 0061
3026 0061
-3026 0041
+A626 0061
+A8D6 0061
+A906 0061
+A9D6 0061
+AA56 0061
+ABF6 0061
1010C 0061
-1010C 0041
-2185 0061
-2185 0041
+104A6 0061
+10E65 0061
+11057 0061
+1106C 0061
+110F6 0061
+1113C 0061
+111D6 0061
+116C6 0061
12404 0061
1240B 0061
12411 0061
12428 0061
12440 0061
1244E 0061
+1D365 0061
+0036 0041
+0666 0041
+06F6 0041
+07C6 0041
+096C 0041
+09EC 0041
+0A6C 0041
+0AEC 0041
+0B6C 0041
+0BEC 0041
+0C6C 0041
+0CEC 0041
+0D6C 0041
+0E56 0041
+0ED6 0041
+0F26 0041
+1046 0041
+1096 0041
+136E 0041
+17E6 0041
+17F6 0041
+1816 0041
+194C 0041
+19D6 0041
+1A86 0041
+1A96 0041
+1B56 0041
+1BB6 0041
+1C46 0041
+1C56 0041
+2185 0041
+3026 0041
+A626 0041
+A8D6 0041
+A906 0041
+A9D6 0041
+AA56 0041
+ABF6 0041
+1010C 0041
+104A6 0041
+10E65 0041
+11057 0041
+1106C 0041
+110F6 0041
+1113C 0041
+111D6 0041
+116C6 0041
12404 0041
1240B 0041
12411 0041
12428 0041
12440 0041
1244E 0041
-111D6 0061
-111D6 0041
-116C6 0061
-116C6 0041
-1106C 0061
-1106C 0041
-11057 0061
-11057 0041
-1D365 0061
1D365 0041
+FF16 0061
+FF16 0041
+0F2F 0061
+0F2F 0041
+1D7D4 0061
+1D7DE 0061
+1D7E8 0061
+1D7F2 0061
+1D7FC 0061
+1D7D4 0041
+1D7DE 0041
+1D7E8 0041
+1D7F2 0041
+1D7FC 0041
+2465 0061
+24FA 0061
+277B 0061
+2785 0061
+278F 0061
+2465 0041
+24FA 0041
+277B 0041
+2785 0041
+278F 0041
+2076 0061
+2076 0041
+2086 0061
+2086 0041
0036 0062
-FF16 0062
-1D7D4 0062
-1D7DE 0062
-1D7E8 0062
-1D7F2 0062
-1D7FC 0062
-2465 0062
-24FA 0062
-277B 0062
-2785 0062
-278F 0062
-2076 0062
-2086 0062
0666 0062
06F6 0062
-10E65 0062
07C6 0062
-136E 0062
-104A6 0062
096C 0062
09EC 0062
0A6C 0062
0C6C 0062
0CEC 0062
0D6C 0062
-ABF6 0062
-A8D6 0062
-194C 0062
-19D6 0062
-1A86 0062
-1A96 0062
0E56 0062
0ED6 0062
0F26 0062
-0F2F 0062
-1C46 0062
-A906 0062
1046 0062
1096 0062
-1113C 0062
+136E 0062
17E6 0062
17F6 0062
-AA56 0062
+1816 0062
+194C 0062
+19D6 0062
+1A86 0062
+1A96 0062
1B56 0062
-A9D6 0062
1BB6 0062
-1816 0062
+1C46 0062
1C56 0062
-A626 0062
-110F6 0062
+2185 0062
3026 0062
+A626 0062
+A8D6 0062
+A906 0062
+A9D6 0062
+AA56 0062
+ABF6 0062
1010C 0062
-2185 0062
+104A6 0062
+10E65 0062
+11057 0062
+1106C 0062
+110F6 0062
+1113C 0062
+111D6 0062
+116C6 0062
12404 0062
1240B 0062
12411 0062
12428 0062
12440 0062
1244E 0062
-111D6 0062
-116C6 0062
-1106C 0062
-11057 0062
1D365 0062
+FF16 0062
+0F2F 0062
+1D7D4 0062
+1D7DE 0062
+1D7E8 0062
+1D7F2 0062
+1D7FC 0062
+2465 0062
+24FA 0062
+277B 0062
+2785 0062
+278F 0062
+2076 0062
+2086 0062
33E5 0021
33E5 003F
33E5 0061
1F108 0041
1F108 0062
0037 0021
-FF17 0021
-1D7D5 0021
-1D7DF 0021
-1D7E9 0021
-1D7F3 0021
-1D7FD 0021
-2466 0021
-24FB 0021
-277C 0021
-2786 0021
-2790 0021
-2077 0021
-2087 0021
0667 0021
06F7 0021
-10E66 0021
07C7 0021
-136F 0021
-104A7 0021
096D 0021
09ED 0021
0A6D 0021
0C6D 0021
0CED 0021
0D6D 0021
-ABF7 0021
-A8D7 0021
-194D 0021
-19D7 0021
-1A87 0021
-1A97 0021
0E57 0021
0ED7 0021
0F27 0021
-0F30 0021
-1C47 0021
-A907 0021
1047 0021
1097 0021
-1113D 0021
+136F 0021
17E7 0021
17F7 0021
-AA57 0021
+1817 0021
+194D 0021
+19D7 0021
+1A87 0021
+1A97 0021
1B57 0021
-A9D7 0021
1BB7 0021
-1817 0021
+1C47 0021
1C57 0021
-A627 0021
-110F7 0021
3027 0021
+A627 0021
+A8D7 0021
+A907 0021
+A9D7 0021
+AA57 0021
+ABF7 0021
1010D 0021
+104A7 0021
+10E66 0021
+11058 0021
+1106D 0021
+110F7 0021
+1113D 0021
+111D7 0021
+116C7 0021
12405 0021
1240C 0021
12412 0021
12441 0021
12442 0021
12443 0021
-111D7 0021
-116C7 0021
-1106D 0021
-11058 0021
1D366 0021
+FF17 0021
+0F30 0021
+1D7D5 0021
+1D7DF 0021
+1D7E9 0021
+1D7F3 0021
+1D7FD 0021
+2466 0021
+24FB 0021
+277C 0021
+2786 0021
+2790 0021
+2077 0021
+2087 0021
0037 003F
-FF17 003F
-1D7D5 003F
-1D7DF 003F
-1D7E9 003F
-1D7F3 003F
-1D7FD 003F
-2466 003F
-24FB 003F
-277C 003F
-2786 003F
-2790 003F
-2077 003F
-2087 003F
0667 003F
06F7 003F
-10E66 003F
07C7 003F
-136F 003F
-104A7 003F
096D 003F
09ED 003F
0A6D 003F
0C6D 003F
0CED 003F
0D6D 003F
-ABF7 003F
-A8D7 003F
-194D 003F
-19D7 003F
-1A87 003F
-1A97 003F
0E57 003F
0ED7 003F
0F27 003F
-0F30 003F
-1C47 003F
-A907 003F
1047 003F
1097 003F
-1113D 003F
+136F 003F
17E7 003F
17F7 003F
-AA57 003F
+1817 003F
+194D 003F
+19D7 003F
+1A87 003F
+1A97 003F
1B57 003F
-A9D7 003F
1BB7 003F
-1817 003F
+1C47 003F
1C57 003F
-A627 003F
-110F7 003F
3027 003F
+A627 003F
+A8D7 003F
+A907 003F
+A9D7 003F
+AA57 003F
+ABF7 003F
1010D 003F
+104A7 003F
+10E66 003F
+11058 003F
+1106D 003F
+110F7 003F
+1113D 003F
+111D7 003F
+116C7 003F
12405 003F
1240C 003F
12412 003F
12441 003F
12442 003F
12443 003F
-111D7 003F
-116C7 003F
-1106D 003F
-11058 003F
1D366 003F
+FF17 003F
+0F30 003F
+1D7D5 003F
+1D7DF 003F
+1D7E9 003F
+1D7F3 003F
+1D7FD 003F
+2466 003F
+24FB 003F
+277C 003F
+2786 003F
+2790 003F
+2077 003F
+2087 003F
248E 0021
248E 003F
248E 0061
324E 0041
324E 0062
0037 0061
-0037 0041
-FF17 0061
-FF17 0041
-1D7D5 0061
-1D7DF 0061
-1D7E9 0061
-1D7F3 0061
-1D7FD 0061
-1D7D5 0041
-1D7DF 0041
-1D7E9 0041
-1D7F3 0041
-1D7FD 0041
-2466 0061
-24FB 0061
-277C 0061
-2786 0061
-2790 0061
-2466 0041
-24FB 0041
-277C 0041
-2786 0041
-2790 0041
-2077 0061
-2077 0041
-2087 0061
-2087 0041
0667 0061
-0667 0041
06F7 0061
-06F7 0041
-10E66 0061
-10E66 0041
07C7 0061
-07C7 0041
-136F 0061
-136F 0041
-104A7 0061
-104A7 0041
096D 0061
-096D 0041
09ED 0061
-09ED 0041
0A6D 0061
-0A6D 0041
0AED 0061
-0AED 0041
0B6D 0061
-0B6D 0041
0BED 0061
-0BED 0041
0C6D 0061
-0C6D 0041
0CED 0061
-0CED 0041
0D6D 0061
-0D6D 0041
-ABF7 0061
-ABF7 0041
-A8D7 0061
-A8D7 0041
-194D 0061
-194D 0041
-19D7 0061
-19D7 0041
-1A87 0061
-1A87 0041
-1A97 0061
-1A97 0041
0E57 0061
-0E57 0041
0ED7 0061
-0ED7 0041
0F27 0061
-0F27 0041
-0F30 0061
-0F30 0041
-1C47 0061
-1C47 0041
-A907 0061
-A907 0041
1047 0061
-1047 0041
1097 0061
-1097 0041
-1113D 0061
-1113D 0041
+136F 0061
17E7 0061
-17E7 0041
17F7 0061
-17F7 0041
-AA57 0061
-AA57 0041
+1817 0061
+194D 0061
+19D7 0061
+1A87 0061
+1A97 0061
1B57 0061
-1B57 0041
-A9D7 0061
-A9D7 0041
1BB7 0061
-1BB7 0041
-1817 0061
-1817 0041
+1C47 0061
1C57 0061
-1C57 0041
-A627 0061
-A627 0041
-110F7 0061
-110F7 0041
3027 0061
-3027 0041
+A627 0061
+A8D7 0061
+A907 0061
+A9D7 0061
+AA57 0061
+ABF7 0061
1010D 0061
-1010D 0041
+104A7 0061
+10E66 0061
+11058 0061
+1106D 0061
+110F7 0061
+1113D 0061
+111D7 0061
+116C7 0061
12405 0061
1240C 0061
12412 0061
12441 0061
12442 0061
12443 0061
+1D366 0061
+0037 0041
+0667 0041
+06F7 0041
+07C7 0041
+096D 0041
+09ED 0041
+0A6D 0041
+0AED 0041
+0B6D 0041
+0BED 0041
+0C6D 0041
+0CED 0041
+0D6D 0041
+0E57 0041
+0ED7 0041
+0F27 0041
+1047 0041
+1097 0041
+136F 0041
+17E7 0041
+17F7 0041
+1817 0041
+194D 0041
+19D7 0041
+1A87 0041
+1A97 0041
+1B57 0041
+1BB7 0041
+1C47 0041
+1C57 0041
+3027 0041
+A627 0041
+A8D7 0041
+A907 0041
+A9D7 0041
+AA57 0041
+ABF7 0041
+1010D 0041
+104A7 0041
+10E66 0041
+11058 0041
+1106D 0041
+110F7 0041
+1113D 0041
+111D7 0041
+116C7 0041
12405 0041
1240C 0041
12412 0041
12441 0041
12442 0041
12443 0041
-111D7 0061
-111D7 0041
-116C7 0061
-116C7 0041
-1106D 0061
-1106D 0041
-11058 0061
-11058 0041
-1D366 0061
1D366 0041
+FF17 0061
+FF17 0041
+0F30 0061
+0F30 0041
+1D7D5 0061
+1D7DF 0061
+1D7E9 0061
+1D7F3 0061
+1D7FD 0061
+1D7D5 0041
+1D7DF 0041
+1D7E9 0041
+1D7F3 0041
+1D7FD 0041
+2466 0061
+24FB 0061
+277C 0061
+2786 0061
+2790 0061
+2466 0041
+24FB 0041
+277C 0041
+2786 0041
+2790 0041
+2077 0061
+2077 0041
+2087 0061
+2087 0041
0037 0062
-FF17 0062
-1D7D5 0062
-1D7DF 0062
-1D7E9 0062
-1D7F3 0062
-1D7FD 0062
-2466 0062
-24FB 0062
-277C 0062
-2786 0062
-2790 0062
-2077 0062
-2087 0062
0667 0062
06F7 0062
-10E66 0062
07C7 0062
-136F 0062
-104A7 0062
096D 0062
09ED 0062
0A6D 0062
0C6D 0062
0CED 0062
0D6D 0062
-ABF7 0062
-A8D7 0062
-194D 0062
-19D7 0062
-1A87 0062
-1A97 0062
0E57 0062
0ED7 0062
0F27 0062
-0F30 0062
-1C47 0062
-A907 0062
1047 0062
1097 0062
-1113D 0062
+136F 0062
17E7 0062
17F7 0062
-AA57 0062
+1817 0062
+194D 0062
+19D7 0062
+1A87 0062
+1A97 0062
1B57 0062
-A9D7 0062
1BB7 0062
-1817 0062
+1C47 0062
1C57 0062
-A627 0062
-110F7 0062
3027 0062
+A627 0062
+A8D7 0062
+A907 0062
+A9D7 0062
+AA57 0062
+ABF7 0062
1010D 0062
+104A7 0062
+10E66 0062
+11058 0062
+1106D 0062
+110F7 0062
+1113D 0062
+111D7 0062
+116C7 0062
12405 0062
1240C 0062
12412 0062
12441 0062
12442 0062
12443 0062
-111D7 0062
-116C7 0062
-1106D 0062
-11058 0062
1D366 0062
+FF17 0062
+0F30 0062
+1D7D5 0062
+1D7DF 0062
+1D7E9 0062
+1D7F3 0062
+1D7FD 0062
+2466 0062
+24FB 0062
+277C 0062
+2786 0062
+2790 0062
+2077 0062
+2087 0062
33E6 0021
33E6 003F
33E6 0061
1F109 0041
1F109 0062
0038 0021
-FF18 0021
-1D7D6 0021
-1D7E0 0021
-1D7EA 0021
-1D7F4 0021
-1D7FE 0021
-2467 0021
-24FC 0021
-277D 0021
-2787 0021
-2791 0021
-2078 0021
-2088 0021
0668 0021
06F8 0021
-10E67 0021
07C8 0021
-1370 0021
-104A8 0021
096E 0021
09EE 0021
0A6E 0021
0C6E 0021
0CEE 0021
0D6E 0021
-ABF8 0021
-A8D8 0021
-194E 0021
-19D8 0021
-1A88 0021
-1A98 0021
0E58 0021
0ED8 0021
0F28 0021
-0F31 0021
-1C48 0021
-A908 0021
1048 0021
1098 0021
-1113E 0021
+1370 0021
17E8 0021
17F8 0021
-AA58 0021
+1818 0021
+194E 0021
+19D8 0021
+1A88 0021
+1A98 0021
1B58 0021
-A9D8 0021
1BB8 0021
-1818 0021
+1C48 0021
1C58 0021
-A628 0021
-110F8 0021
3028 0021
+A628 0021
+A8D8 0021
+A908 0021
+A9D8 0021
+AA58 0021
+ABF8 0021
1010E 0021
+104A8 0021
+10E67 0021
+11059 0021
+1106E 0021
+110F8 0021
+1113E 0021
+111D8 0021
+116C8 0021
12406 0021
1240D 0021
12413 0021
1242A 0021
12444 0021
12445 0021
-111D8 0021
-116C8 0021
-1106E 0021
-11059 0021
1D367 0021
+FF18 0021
+0F31 0021
+1D7D6 0021
+1D7E0 0021
+1D7EA 0021
+1D7F4 0021
+1D7FE 0021
+2467 0021
+24FC 0021
+277D 0021
+2787 0021
+2791 0021
+2078 0021
+2088 0021
0038 003F
-FF18 003F
-1D7D6 003F
-1D7E0 003F
-1D7EA 003F
-1D7F4 003F
-1D7FE 003F
-2467 003F
-24FC 003F
-277D 003F
-2787 003F
-2791 003F
-2078 003F
-2088 003F
0668 003F
06F8 003F
-10E67 003F
07C8 003F
-1370 003F
-104A8 003F
096E 003F
09EE 003F
0A6E 003F
0C6E 003F
0CEE 003F
0D6E 003F
-ABF8 003F
-A8D8 003F
-194E 003F
-19D8 003F
-1A88 003F
-1A98 003F
0E58 003F
0ED8 003F
0F28 003F
-0F31 003F
-1C48 003F
-A908 003F
1048 003F
1098 003F
-1113E 003F
+1370 003F
17E8 003F
17F8 003F
-AA58 003F
+1818 003F
+194E 003F
+19D8 003F
+1A88 003F
+1A98 003F
1B58 003F
-A9D8 003F
1BB8 003F
-1818 003F
+1C48 003F
1C58 003F
-A628 003F
-110F8 003F
3028 003F
+A628 003F
+A8D8 003F
+A908 003F
+A9D8 003F
+AA58 003F
+ABF8 003F
1010E 003F
+104A8 003F
+10E67 003F
+11059 003F
+1106E 003F
+110F8 003F
+1113E 003F
+111D8 003F
+116C8 003F
12406 003F
1240D 003F
12413 003F
1242A 003F
12444 003F
12445 003F
-111D8 003F
-116C8 003F
-1106E 003F
-11059 003F
1D367 003F
+FF18 003F
+0F31 003F
+1D7D6 003F
+1D7E0 003F
+1D7EA 003F
+1D7F4 003F
+1D7FE 003F
+2467 003F
+24FC 003F
+277D 003F
+2787 003F
+2791 003F
+2078 003F
+2088 003F
248F 0021
248F 003F
248F 0061
324F 0041
324F 0062
0038 0061
-0038 0041
-FF18 0061
-FF18 0041
-1D7D6 0061
-1D7E0 0061
-1D7EA 0061
-1D7F4 0061
-1D7FE 0061
-1D7D6 0041
-1D7E0 0041
-1D7EA 0041
-1D7F4 0041
-1D7FE 0041
-2467 0061
-24FC 0061
-277D 0061
-2787 0061
-2791 0061
-2467 0041
-24FC 0041
-277D 0041
-2787 0041
-2791 0041
-2078 0061
-2078 0041
-2088 0061
-2088 0041
0668 0061
-0668 0041
06F8 0061
-06F8 0041
-10E67 0061
-10E67 0041
07C8 0061
-07C8 0041
-1370 0061
-1370 0041
-104A8 0061
-104A8 0041
096E 0061
-096E 0041
09EE 0061
-09EE 0041
0A6E 0061
-0A6E 0041
0AEE 0061
-0AEE 0041
0B6E 0061
-0B6E 0041
0BEE 0061
-0BEE 0041
0C6E 0061
-0C6E 0041
0CEE 0061
-0CEE 0041
0D6E 0061
-0D6E 0041
-ABF8 0061
-ABF8 0041
-A8D8 0061
-A8D8 0041
-194E 0061
-194E 0041
-19D8 0061
-19D8 0041
-1A88 0061
-1A88 0041
-1A98 0061
-1A98 0041
0E58 0061
-0E58 0041
0ED8 0061
-0ED8 0041
0F28 0061
-0F28 0041
-0F31 0061
-0F31 0041
-1C48 0061
-1C48 0041
-A908 0061
-A908 0041
1048 0061
-1048 0041
1098 0061
-1098 0041
-1113E 0061
-1113E 0041
+1370 0061
17E8 0061
-17E8 0041
17F8 0061
-17F8 0041
-AA58 0061
-AA58 0041
+1818 0061
+194E 0061
+19D8 0061
+1A88 0061
+1A98 0061
1B58 0061
-1B58 0041
-A9D8 0061
-A9D8 0041
1BB8 0061
-1BB8 0041
-1818 0061
-1818 0041
+1C48 0061
1C58 0061
-1C58 0041
-A628 0061
-A628 0041
-110F8 0061
-110F8 0041
3028 0061
-3028 0041
+A628 0061
+A8D8 0061
+A908 0061
+A9D8 0061
+AA58 0061
+ABF8 0061
1010E 0061
-1010E 0041
+104A8 0061
+10E67 0061
+11059 0061
+1106E 0061
+110F8 0061
+1113E 0061
+111D8 0061
+116C8 0061
12406 0061
1240D 0061
12413 0061
1242A 0061
12444 0061
12445 0061
+1D367 0061
+0038 0041
+0668 0041
+06F8 0041
+07C8 0041
+096E 0041
+09EE 0041
+0A6E 0041
+0AEE 0041
+0B6E 0041
+0BEE 0041
+0C6E 0041
+0CEE 0041
+0D6E 0041
+0E58 0041
+0ED8 0041
+0F28 0041
+1048 0041
+1098 0041
+1370 0041
+17E8 0041
+17F8 0041
+1818 0041
+194E 0041
+19D8 0041
+1A88 0041
+1A98 0041
+1B58 0041
+1BB8 0041
+1C48 0041
+1C58 0041
+3028 0041
+A628 0041
+A8D8 0041
+A908 0041
+A9D8 0041
+AA58 0041
+ABF8 0041
+1010E 0041
+104A8 0041
+10E67 0041
+11059 0041
+1106E 0041
+110F8 0041
+1113E 0041
+111D8 0041
+116C8 0041
12406 0041
1240D 0041
12413 0041
1242A 0041
12444 0041
12445 0041
-111D8 0061
-111D8 0041
-116C8 0061
-116C8 0041
-1106E 0061
-1106E 0041
-11059 0061
-11059 0041
-1D367 0061
1D367 0041
+FF18 0061
+FF18 0041
+0F31 0061
+0F31 0041
+1D7D6 0061
+1D7E0 0061
+1D7EA 0061
+1D7F4 0061
+1D7FE 0061
+1D7D6 0041
+1D7E0 0041
+1D7EA 0041
+1D7F4 0041
+1D7FE 0041
+2467 0061
+24FC 0061
+277D 0061
+2787 0061
+2791 0061
+2467 0041
+24FC 0041
+277D 0041
+2787 0041
+2791 0041
+2078 0061
+2078 0041
+2088 0061
+2088 0041
0038 0062
-FF18 0062
-1D7D6 0062
-1D7E0 0062
-1D7EA 0062
-1D7F4 0062
-1D7FE 0062
-2467 0062
-24FC 0062
-277D 0062
-2787 0062
-2791 0062
-2078 0062
-2088 0062
0668 0062
06F8 0062
-10E67 0062
07C8 0062
-1370 0062
-104A8 0062
096E 0062
09EE 0062
0A6E 0062
0C6E 0062
0CEE 0062
0D6E 0062
-ABF8 0062
-A8D8 0062
-194E 0062
-19D8 0062
-1A88 0062
-1A98 0062
0E58 0062
0ED8 0062
0F28 0062
-0F31 0062
-1C48 0062
-A908 0062
1048 0062
1098 0062
-1113E 0062
+1370 0062
17E8 0062
17F8 0062
-AA58 0062
+1818 0062
+194E 0062
+19D8 0062
+1A88 0062
+1A98 0062
1B58 0062
-A9D8 0062
1BB8 0062
-1818 0062
+1C48 0062
1C58 0062
-A628 0062
-110F8 0062
3028 0062
+A628 0062
+A8D8 0062
+A908 0062
+A9D8 0062
+AA58 0062
+ABF8 0062
1010E 0062
+104A8 0062
+10E67 0062
+11059 0062
+1106E 0062
+110F8 0062
+1113E 0062
+111D8 0062
+116C8 0062
12406 0062
1240D 0062
12413 0062
1242A 0062
12444 0062
12445 0062
-111D8 0062
-116C8 0062
-1106E 0062
-11059 0062
1D367 0062
+FF18 0062
+0F31 0062
+1D7D6 0062
+1D7E0 0062
+1D7EA 0062
+1D7F4 0062
+1D7FE 0062
+2467 0062
+24FC 0062
+277D 0062
+2787 0062
+2791 0062
+2078 0062
+2088 0062
33E7 0021
33E7 003F
33E7 0061
1F10A 0041
1F10A 0062
0039 0021
-FF19 0021
-1D7D7 0021
-1D7E1 0021
-1D7EB 0021
-1D7F5 0021
-1D7FF 0021
-2468 0021
-24FD 0021
-277E 0021
-2788 0021
-2792 0021
-2079 0021
-2089 0021
0669 0021
06F9 0021
-10E68 0021
07C9 0021
-1371 0021
-104A9 0021
096F 0021
09EF 0021
0A6F 0021
0C6F 0021
0CEF 0021
0D6F 0021
-ABF9 0021
-A8D9 0021
-194F 0021
-19D9 0021
-1A89 0021
-1A99 0021
0E59 0021
0ED9 0021
0F29 0021
-0F32 0021
-1C49 0021
-A909 0021
1049 0021
1099 0021
-1113F 0021
+1371 0021
17E9 0021
17F9 0021
-AA59 0021
+1819 0021
+194F 0021
+19D9 0021
+1A89 0021
+1A99 0021
1B59 0021
-A9D9 0021
1BB9 0021
-1819 0021
+1C49 0021
1C59 0021
-A629 0021
-110F9 0021
3029 0021
+A629 0021
+A8D9 0021
+A909 0021
+A9D9 0021
+AA59 0021
+ABF9 0021
1010F 0021
+104A9 0021
+10E68 0021
+1105A 0021
+1106F 0021
+110F9 0021
+1113F 0021
+111D9 0021
+116C9 0021
12407 0021
1240E 0021
12414 0021
12447 0021
12448 0021
12449 0021
-111D9 0021
-116C9 0021
-1106F 0021
-1105A 0021
1D368 0021
+FF19 0021
+0F32 0021
+1D7D7 0021
+1D7E1 0021
+1D7EB 0021
+1D7F5 0021
+1D7FF 0021
+2468 0021
+24FD 0021
+277E 0021
+2788 0021
+2792 0021
+2079 0021
+2089 0021
0039 003F
-FF19 003F
-1D7D7 003F
-1D7E1 003F
-1D7EB 003F
-1D7F5 003F
-1D7FF 003F
-2468 003F
-24FD 003F
-277E 003F
-2788 003F
-2792 003F
-2079 003F
-2089 003F
0669 003F
06F9 003F
-10E68 003F
07C9 003F
-1371 003F
-104A9 003F
096F 003F
09EF 003F
0A6F 003F
0C6F 003F
0CEF 003F
0D6F 003F
-ABF9 003F
-A8D9 003F
-194F 003F
-19D9 003F
-1A89 003F
-1A99 003F
0E59 003F
0ED9 003F
0F29 003F
-0F32 003F
-1C49 003F
-A909 003F
1049 003F
1099 003F
-1113F 003F
+1371 003F
17E9 003F
17F9 003F
-AA59 003F
+1819 003F
+194F 003F
+19D9 003F
+1A89 003F
+1A99 003F
1B59 003F
-A9D9 003F
1BB9 003F
-1819 003F
+1C49 003F
1C59 003F
-A629 003F
-110F9 003F
3029 003F
+A629 003F
+A8D9 003F
+A909 003F
+A9D9 003F
+AA59 003F
+ABF9 003F
1010F 003F
+104A9 003F
+10E68 003F
+1105A 003F
+1106F 003F
+110F9 003F
+1113F 003F
+111D9 003F
+116C9 003F
12407 003F
1240E 003F
12414 003F
12447 003F
12448 003F
12449 003F
-111D9 003F
-116C9 003F
-1106F 003F
-1105A 003F
1D368 003F
+FF19 003F
+0F32 003F
+1D7D7 003F
+1D7E1 003F
+1D7EB 003F
+1D7F5 003F
+1D7FF 003F
+2468 003F
+24FD 003F
+277E 003F
+2788 003F
+2792 003F
+2079 003F
+2089 003F
2490 0021
2490 003F
2490 0061
2490 0041
2490 0062
0039 0061
+0669 0061
+06F9 0061
+07C9 0061
+096F 0061
+09EF 0061
+0A6F 0061
+0AEF 0061
+0B6F 0061
+0BEF 0061
+0C6F 0061
+0CEF 0061
+0D6F 0061
+0E59 0061
+0ED9 0061
+0F29 0061
+1049 0061
+1099 0061
+1371 0061
+17E9 0061
+17F9 0061
+1819 0061
+194F 0061
+19D9 0061
+1A89 0061
+1A99 0061
+1B59 0061
+1BB9 0061
+1C49 0061
+1C59 0061
+3029 0061
+A629 0061
+A8D9 0061
+A909 0061
+A9D9 0061
+AA59 0061
+ABF9 0061
+1010F 0061
+104A9 0061
+10E68 0061
+1105A 0061
+1106F 0061
+110F9 0061
+1113F 0061
+111D9 0061
+116C9 0061
+12407 0061
+1240E 0061
+12414 0061
+1241D 0061
+1242B 0061
+12446 0061
+12447 0061
+12448 0061
+12449 0061
+1D368 0061
0039 0041
+0669 0041
+06F9 0041
+07C9 0041
+096F 0041
+09EF 0041
+0A6F 0041
+0AEF 0041
+0B6F 0041
+0BEF 0041
+0C6F 0041
+0CEF 0041
+0D6F 0041
+0E59 0041
+0ED9 0041
+0F29 0041
+1049 0041
+1099 0041
+1371 0041
+17E9 0041
+17F9 0041
+1819 0041
+194F 0041
+19D9 0041
+1A89 0041
+1A99 0041
+1B59 0041
+1BB9 0041
+1C49 0041
+1C59 0041
+3029 0041
+A629 0041
+A8D9 0041
+A909 0041
+A9D9 0041
+AA59 0041
+ABF9 0041
+1010F 0041
+104A9 0041
+10E68 0041
+1105A 0041
+1106F 0041
+110F9 0041
+1113F 0041
+111D9 0041
+116C9 0041
+12407 0041
+1240E 0041
+12414 0041
+1241D 0041
+1242B 0041
+12446 0041
+12447 0041
+12448 0041
+12449 0041
+1D368 0041
FF19 0061
FF19 0041
+0F32 0061
+0F32 0041
1D7D7 0061
1D7E1 0061
1D7EB 0061
2079 0041
2089 0061
2089 0041
-0669 0061
-0669 0041
-06F9 0061
-06F9 0041
-10E68 0061
-10E68 0041
-07C9 0061
-07C9 0041
-1371 0061
-1371 0041
-104A9 0061
-104A9 0041
-096F 0061
-096F 0041
-09EF 0061
-09EF 0041
-0A6F 0061
-0A6F 0041
-0AEF 0061
-0AEF 0041
-0B6F 0061
-0B6F 0041
-0BEF 0061
-0BEF 0041
-0C6F 0061
-0C6F 0041
-0CEF 0061
-0CEF 0041
-0D6F 0061
-0D6F 0041
-ABF9 0061
-ABF9 0041
-A8D9 0061
-A8D9 0041
-194F 0061
-194F 0041
-19D9 0061
-19D9 0041
-1A89 0061
-1A89 0041
-1A99 0061
-1A99 0041
-0E59 0061
-0E59 0041
-0ED9 0061
-0ED9 0041
-0F29 0061
-0F29 0041
-0F32 0061
-0F32 0041
-1C49 0061
-1C49 0041
-A909 0061
-A909 0041
-1049 0061
-1049 0041
-1099 0061
-1099 0041
-1113F 0061
-1113F 0041
-17E9 0061
-17E9 0041
-17F9 0061
-17F9 0041
-AA59 0061
-AA59 0041
-1B59 0061
-1B59 0041
-A9D9 0061
-A9D9 0041
-1BB9 0061
-1BB9 0041
-1819 0061
-1819 0041
-1C59 0061
-1C59 0041
-A629 0061
-A629 0041
-110F9 0061
-110F9 0041
-3029 0061
-3029 0041
-1010F 0061
-1010F 0041
-12407 0061
-1240E 0061
-12414 0061
-1241D 0061
-1242B 0061
-12446 0061
-12447 0061
-12448 0061
-12449 0061
-12407 0041
-1240E 0041
-12414 0041
-1241D 0041
-1242B 0041
-12446 0041
-12447 0041
-12448 0041
-12449 0041
-111D9 0061
-111D9 0041
-116C9 0061
-116C9 0041
-1106F 0061
-1106F 0041
-1105A 0061
-1105A 0041
-1D368 0061
-1D368 0041
0039 0062
-FF19 0062
-1D7D7 0062
-1D7E1 0062
-1D7EB 0062
-1D7F5 0062
-1D7FF 0062
-2468 0062
-24FD 0062
-277E 0062
-2788 0062
-2792 0062
-2079 0062
-2089 0062
0669 0062
06F9 0062
-10E68 0062
07C9 0062
-1371 0062
-104A9 0062
096F 0062
09EF 0062
0A6F 0062
0C6F 0062
0CEF 0062
0D6F 0062
-ABF9 0062
-A8D9 0062
-194F 0062
-19D9 0062
-1A89 0062
-1A99 0062
0E59 0062
0ED9 0062
0F29 0062
-0F32 0062
-1C49 0062
-A909 0062
1049 0062
1099 0062
-1113F 0062
+1371 0062
17E9 0062
17F9 0062
-AA59 0062
+1819 0062
+194F 0062
+19D9 0062
+1A89 0062
+1A99 0062
1B59 0062
-A9D9 0062
1BB9 0062
-1819 0062
+1C49 0062
1C59 0062
-A629 0062
-110F9 0062
3029 0062
+A629 0062
+A8D9 0062
+A909 0062
+A9D9 0062
+AA59 0062
+ABF9 0062
1010F 0062
+104A9 0062
+10E68 0062
+1105A 0062
+1106F 0062
+110F9 0062
+1113F 0062
+111D9 0062
+116C9 0062
12407 0062
1240E 0062
12414 0062
12447 0062
12448 0062
12449 0062
-111D9 0062
-116C9 0062
-1106F 0062
-1105A 0062
1D368 0062
+FF19 0062
+0F32 0062
+1D7D7 0062
+1D7E1 0062
+1D7EB 0062
+1D7F5 0062
+1D7FF 0062
+2468 0062
+24FD 0062
+277E 0062
+2788 0062
+2792 0062
+2079 0062
+2089 0062
33E8 0021
33E8 003F
33E8 0061
0618 0061
0619 0061
061A 0061
+061C 0061
0640 0061
06D6 0061
06D7 0061
180B 0061
180C 0061
180D 0061
+180E 0061
1A7F 0061
1B6B 0061
1B6C 0061
2062 0061
2063 0061
2064 0061
+2066 0061
+2067 0061
+2068 0061
+2069 0061
206A 0061
206B 0061
206C 0061
0618 0041
0619 0041
061A 0041
+061C 0041
0640 0041
06D6 0041
06D7 0041
180B 0041
180C 0041
180D 0041
+180E 0041
1A7F 0041
1B6B 0041
1B6C 0041
2062 0041
2063 0041
2064 0041
+2066 0041
+2067 0041
+2068 0041
+2069 0041
206A 0041
206B 0041
206C 0041
20E2 0041
20E3 0041
20E4 0041
+3099 0061
+3099 0041
+FF9E 0061
+FF9E 0041
+309A 0061
+309A 0041
+FF9F 0061
+FF9F 0041
+0335 0061
+0335 0041
0305 0061
0305 0041
0309 0061
0334 0041
0334 1DD3
1DD3 0334
-0335 0061
-0335 0041
0339 0061
0339 0041
0345 0061
302E 0041
302F 0061
302F 0041
-3099 0061
-3099 0041
-FF9E 0061
-FF9E 0041
-309A 0061
-309A 0041
-FF9F 0061
-FF9F 0041
20D0 0061
20D0 0041
20D1 0061
0618 0062
0619 0062
061A 0062
+061C 0062
0640 0062
06D6 0062
06D7 0062
180B 0062
180C 0062
180D 0062
+180E 0062
1A7F 0062
1B6B 0062
1B6C 0062
2062 0062
2063 0062
2064 0062
+2066 0062
+2067 0062
+2068 0062
+2069 0062
206A 0062
206B 0062
206C 0062
20E2 0062
20E3 0062
20E4 0062
+3099 0062
+FF9E 0062
+309A 0062
+FF9F 0062
+0335 0062
0305 0062
0309 0062
030F 0062
0330 0062
0331 0062
0334 0062
-0335 0062
0339 0062
0345 0062
0358 0062
302D 0062
302E 0062
302F 0062
-3099 0062
-FF9E 0062
-309A 0062
-FF9F 0062
20D0 0062
20D1 0062
20D2 0062
1E0A 0021
1E11 0021
1E10 0021
+0111 0021
+0110 0021
1E0D 0021
1E0C 0021
1E13 0021
1E12 0021
1E0F 0021
1E0E 0021
-0111 0021
-0110 0021
00F0 0021
1DD9 0021
00D0 0021
1E0A 003F
1E11 003F
1E10 003F
+0111 003F
+0110 003F
1E0D 003F
1E0C 003F
1E13 003F
1E12 003F
1E0F 003F
1E0E 003F
-0111 003F
-0110 003F
00F0 003F
1DD9 003F
00D0 003F
1E11 0041
1E10 0061
1E10 0041
+0111 0061
+0111 0041
+0110 0061
+0110 0041
1E0D 0061
1E0D 0041
1E0C 0061
1E0F 0041
1E0E 0061
1E0E 0041
-0111 0061
-0111 0041
-0110 0061
-0110 0041
00F0 0061
1DD9 0061
00F0 0041
1E0A 0062
1E11 0062
1E10 0062
+0111 0062
+0110 0062
1E0D 0062
1E0C 0062
1E13 0062
1E12 0062
1E0F 0062
1E0E 0062
-0111 0062
-0110 0062
00F0 0062
1DD9 0062
00D0 0062
1E22 0021
1E29 0021
1E28 0021
+0127 0021
+210F 0021
+0126 0021
+A7F8 0021
1E25 0021
1E24 0021
1E2B 0021
1E2A 0021
1E96 0021
-0127 0021
-210F 0021
-0126 0021
-A7F8 0021
0068 003F
FF48 003F
036A 003F
1E22 003F
1E29 003F
1E28 003F
+0127 003F
+210F 003F
+0126 003F
+A7F8 003F
1E25 003F
1E24 003F
1E2B 003F
1E2A 003F
1E96 003F
-0127 003F
-210F 003F
-0126 003F
-A7F8 003F
0068 0061
0068 0041
FF48 0061
1E29 0041
1E28 0061
1E28 0041
+0127 0061
+210F 0061
+0127 0041
+210F 0041
+0126 0061
+0126 0041
+A7F8 0061
+A7F8 0041
1E25 0061
1E25 0041
1E24 0061
1E2A 0041
1E96 0061
1E96 0041
-0127 0061
-210F 0061
-0127 0041
-210F 0041
-0126 0061
-0126 0041
-A7F8 0061
-A7F8 0041
33CA 0021
33CA 003F
33CA 0061
1E22 0062
1E29 0062
1E28 0062
+0127 0062
+210F 0062
+0126 0062
+A7F8 0062
1E25 0062
1E24 0062
1E2B 0062
1E2A 0062
1E96 0062
-0127 0062
-210F 0062
-0126 0062
-A7F8 0062
32CC 0021
32CC 003F
32CC 0061
0069 0308 0301 0334
0069 0308 0334 0341
00EF 0301 0334
-1E2F 0334
+00EF 0334 0341
0049 0308 0334 0301
0049 0308 0341 0334
00CF 0334 0301
013D 0021
013C 0021
013B 0021
+0142 0021
+0141 0021
1E37 0021
1E36 0021
1E39 0021
1E3C 0021
1E3B 0021
1E3A 0021
-0142 0021
-0141 0021
006C 00B7 0021
006C 0387 0021
0140 0021
013D 003F
013C 003F
013B 003F
+0142 003F
+0141 003F
1E37 003F
1E36 003F
1E39 003F
1E3C 003F
1E3B 003F
1E3A 003F
-0142 003F
-0141 003F
006C 00B7 003F
006C 0387 003F
0140 003F
013C 0041
013B 0061
013B 0041
+0142 0061
+0142 0041
+0141 0061
+0141 0041
1E37 0061
1E37 0041
1E36 0061
1E3B 0041
1E3A 0061
1E3A 0041
-0142 0061
-0142 0041
-0141 0061
-0141 0041
006C 00B7 0061
006C 0387 0061
0140 0061
013D 0062
013C 0062
013B 0062
+0142 0062
+0141 0062
1E37 0062
1E36 0062
1E39 0062
1E3C 0062
1E3B 0062
1E3A 0062
-0142 0062
-0141 0062
006C 00B7 0062
006C 0387 0062
0140 0062
0075 0334 0344
0075 0344 0334
01D8 0334
-0055 0308 0341 0334
+0055 0308 0301 0334
0055 0334 0308 0301
-0055 0334 0308 0341
00DC 0301 0334
+00DC 0334 0301
0075 0308 0340 0334
0075 0334 0308 0340
00FC 0300 0334
0391 0334 0313 0340
1F08 0300 0334
1F0A 0334
-03B1 0313 0300 0334 0345
03B1 0343 0300 0345 0334
03B1 0343 0345 0334 0340
03B1 0345 0313 0300 0334
+1F00 0345 0340 0334
0391 0343 0334 0345 0340
0391 0345 0313 0334 0340
1F08 0300 0345 0334
1F09 0300 0334
1F09 0340 0334
03B1 0314 0334 0300 0345
-03B1 0334 0314 0345 0300
-03B1 0345 0314 0300 0334
-03B1 0345 0334 0314 0340
+03B1 0314 0345 0340 0334
+1F81 0340 0334
+1F83 0334
0391 0334 0345 0314 0300
1F09 0334 0345 0300
-1F09 0345 0300 0334
1F0B 0345 0334
+1FBC 0314 0300 0334
03B1 0314 0334 0342
03B1 0314 0342 0334
03B1 0334 0314 0342
0397 0334 0343
1F28 0334
03B7 0334 0313 0341
-03B7 0334 0343 0301
03B7 0334 0343 0341
-1F20 0334 0341
+03B7 0343 0341 0334
+1F20 0341 0334
0397 0313 0334 0301
0397 0313 0341 0334
0397 0334 0313 0301
0397 0343 0300 0334
03B7 0313 0345 0334 0300
03B7 0313 0345 0340 0334
-03B7 0343 0340 0334 0345
03B7 0343 0345 0340 0334
+1FC3 0334 0313 0300
0397 0313 0334 0300 0345
0397 0343 0334 0345 0340
0397 0343 0340 0345 0334
1F29 0334 0301
1F2D 0334
03B7 0314 0301 0345 0334
+03B7 0314 0334 0345 0301
03B7 0314 0334 0345 0341
-1F21 0334 0341 0345
-1F21 0345 0301 0334
+03B7 0314 0345 0334 0301
0397 0314 0334 0345 0341
0397 0334 0314 0345 0301
1F29 0334 0301 0345
03C5 0334 0343 0341
1F50 0334 0301
1F50 0334 0341
-03C5 0334 0313 0340
+03C5 0313 0340 0334
03C5 0334 0343 0340
1F50 0334 0340
1F52 0334
03D2 0308 0334
03D2 0334 0308
03D4 0334
+03B0 0334
03C5 0308 0341 0334
03C5 0344 0334
-03CB 0334 0341
03CB 0341 0334
03C5 0308 0300 0334
03C5 0308 0340 0334
2126 0334 0314 0301
2126 0334 0314 0341
03C9 0314 0334 0345 0301
-1F61 0334 0301 0345
-1F61 0341 0345 0334
+03C9 0345 0314 0334 0301
+1F61 0301 0345 0334
1F65 0345 0334
03A9 0314 0301 0345 0334
03A9 0345 0334 0314 0301
2126 0314 0334 0340
2126 0334 0314 0340
03C9 0314 0334 0345 0300
-03C9 0314 0340 0334 0345
03C9 0314 0345 0300 0334
-03C9 0334 0314 0340 0345
+03C9 0345 0314 0300 0334
+03C9 0345 0314 0334 0300
03A9 0314 0300 0345 0334
03A9 0345 0334 0314 0300
1F6B 0345 0334
12262 0061
12262 0041
12262 0062
+122D4 0021
+122D4 003F
+122D4 0061
+122D4 0041
+122D4 0062
+122D5 0021
+122D5 003F
+122D5 0061
+122D5 0041
+122D5 0062
12263 0021
12263 003F
12263 0061
122D3 0061
122D3 0041
122D3 0062
-122D4 0021
-122D4 003F
-122D4 0061
-122D4 0041
-122D4 0062
-122D5 0021
-122D5 003F
-122D5 0061
-122D5 0041
-122D5 0062
122D6 0021
122D6 003F
122D6 0061
10FFFF 0061
10FFFF 0041
10FFFF 0062
+FFFD 0021
+FFFD 003F
+FFFD 0061
+FFFD 0041
+FFFD 0062
FFFF 0021
FFFF 003F
FFFF 0061
# File: CollationTest_CLDR_SHIFTED_SHORT.txt
-# UCA Version: 6.2.0
-# UCD Version: 6.2.0
-# Generated: 2012-08-15, 21:43:28 GMT [MD]
+# UCA Version: 6.3.0
+# UCD Version: 6.3.0
+# Generated: 2013-09-03 [MS]
# For a description of the format and usage, see CollationAuxiliary.html
0009 0021
000D 003F
0085 0021
0085 003F
-180E 0021
-180E 003F
2028 0021
2028 003F
2029 0021
0618 0021
0619 0021
061A 0021
+061C 0021
0640 0021
06D6 0021
06D7 0021
180B 0021
180C 0021
180D 0021
+180E 0021
1A7F 0021
1B6B 0021
1B6C 0021
2062 0021
2063 0021
2064 0021
+2066 0021
+2067 0021
+2068 0021
+2069 0021
206A 0021
206B 0021
206C 0021
0618 003F
0619 003F
061A 003F
+061C 003F
0640 003F
06D6 003F
06D7 003F
180B 003F
180C 003F
180D 003F
+180E 003F
1A7F 003F
1B6B 003F
1B6C 003F
2062 003F
2063 003F
2064 003F
+2066 003F
+2067 003F
+2068 003F
+2069 003F
206A 003F
206B 003F
206C 003F
2045 003F
2046 0021
2046 003F
+2308 0021
+2308 003F
+2309 0021
+2309 003F
+230A 0021
+230A 003F
+230B 0021
+230B 003F
29FC 0021
29FC 003F
29FD 0021
20E2 003F
20E3 003F
20E4 003F
+3099 0021
+3099 003F
+FF9E 0021
+FF9E 003F
+309A 0021
+309A 003F
+FF9F 0021
+FF9F 003F
+0335 0021
+0335 003F
+0335 0334
0305 0021
0305 003F
0309 0021
20EF 0334
0334 10A0D
10A0D 0334
+0334 3099
+3099 0334
+0334 309A
+309A 0334
0305 0334
0334 0305
0309 0334
302E 0334
0334 302F
302F 0334
-0334 3099
-3099 0334
-0334 309A
-309A 0334
0334 20D0
20D0 0334
0334 20D1
20E9 0334
0334 101FD
101FD 0334
-0335 0021
-0335 003F
-0335 0334
0339 0021
0339 003F
0345 0021
302E 003F
302F 0021
302F 003F
-3099 0021
-3099 003F
-FF9E 0021
-FF9E 003F
-309A 0021
-309A 003F
-FF9F 0021
-FF9F 003F
20D0 0021
20D0 003F
20D1 0021
003D 003F
FF1D 0021
FF1D 003F
+2A74 0021
+2A74 003F
FE66 0021
FE66 003F
207C 0021
207C 003F
208C 0021
208C 003F
-2A74 0021
-2A74 003F
2260 0021
2260 003F
003D 0338 0334
003D 0041
FF1D 0061
FF1D 0041
+2A74 0061
+2A74 0041
FE66 0061
FE66 0041
207C 0061
207C 0041
208C 0061
208C 0041
-2A74 0061
-2A74 0041
2260 0061
2260 0041
003D 0062
FF1D 0062
+2A74 0062
FE66 0062
207C 0062
208C 0062
-2A74 0062
2260 0062
003E 0021
003E 003F
2307 0061
2307 0041
2307 0062
-2308 0021
-2308 003F
-2308 0061
-2308 0041
-2308 0062
-2309 0021
-2309 003F
-2309 0061
-2309 0041
-2309 0062
-230A 0021
-230A 003F
-230A 0061
-230A 0041
-230A 0062
-230B 0021
-230B 003F
-230B 0061
-230B 0041
-230B 0062
230C 0021
230C 003F
230C 0061
FFFC 0061
FFFC 0041
FFFC 0062
-FFFD 0021
-FFFD 003F
-FFFD 0061
-FFFD 0041
-FFFD 0062
02D0 0021
02D0 003F
02D0 0061
3035 0062
309D 0021
309D 003F
+309E 0021
+309E 003F
309D 0334 3099
309D 3099 0334
309E 0334
-309E 0021
-309E 003F
309D 0061
309D 0041
309E 0061
FF70 0062
30FD 0021
30FD 003F
+30FE 0021
+30FE 003F
30FD 0334 3099
30FD 3099 0334
30FE 0334
-30FE 0021
-30FE 003F
30FD 0061
30FD 0041
30FE 0061
12433 0061
12433 0041
12433 0062
-12456 0021
-12456 003F
-12456 0334
-12456 0061
-12456 0041
-12456 0062
-12457 0021
-12457 003F
-12457 0334
-12457 0061
-12457 0041
-12457 0062
1245A 0021
1245A 003F
1245A 0334
1D371 0041
1D371 0062
0030 0021
-0030 003F
-FF10 0021
-FF10 003F
-1F101 0334
-1F101 0021
-1F101 003F
-1F100 0334
-1F100 0021
-1F100 003F
-1D7CE 0021
-1D7D8 0021
-1D7E2 0021
-1D7EC 0021
-1D7F6 0021
-1D7CE 003F
-1D7D8 003F
-1D7E2 003F
-1D7EC 003F
-1D7F6 003F
-24EA 0021
-24FF 0021
-24EA 003F
-24FF 003F
-2070 0021
-2070 003F
-2080 0021
-2080 003F
-1D7CE 0334
-1D7D8 0334
-1D7E2 0334
-1D7EC 0334
-1D7F6 0334
0660 0021
-0660 003F
06F0 0021
-06F0 003F
07C0 0021
-07C0 003F
-104A0 0021
-104A0 003F
-104A0 0334
0966 0021
-0966 003F
09E6 0021
-09E6 003F
0A66 0021
-0A66 003F
0AE6 0021
-0AE6 003F
0B66 0021
-0B66 003F
0BE6 0021
-0BE6 003F
0C66 0021
0C78 0021
-0C66 003F
-0C78 003F
0CE6 0021
-0CE6 003F
0D66 0021
-0D66 003F
-ABF0 0021
-ABF0 003F
-A8D0 0021
-A8D0 003F
+0E50 0021
+0ED0 0021
+0F20 0021
+1040 0021
+1090 0021
+17E0 0021
+17F0 0021
+1810 0021
1946 0021
-1946 003F
19D0 0021
-19D0 003F
1A80 0021
-1A80 003F
1A90 0021
-1A90 003F
-0E50 0021
+1B50 0021
+1BB0 0021
+1C40 0021
+1C50 0021
+3007 0021
+A620 0021
+A8D0 0021
+A900 0021
+A9D0 0021
+AA50 0021
+ABF0 0021
+1018A 0021
+104A0 0021
+11066 0021
+110F0 0021
+11136 0021
+111D0 0021
+116C0 0021
+0030 003F
+0660 003F
+06F0 003F
+07C0 003F
+0966 003F
+09E6 003F
+0A66 003F
+0AE6 003F
+0B66 003F
+0BE6 003F
+0C66 003F
+0C78 003F
+0CE6 003F
+0D66 003F
0E50 003F
-0ED0 0021
0ED0 003F
-0F20 0021
0F20 003F
-0F33 0021
-0F33 003F
-1C40 0021
-1C40 003F
-A900 0021
-A900 003F
-1040 0021
1040 003F
-1090 0021
1090 003F
-11136 0021
-11136 003F
-11136 0334
-17E0 0021
17E0 003F
-17F0 0021
17F0 003F
-AA50 0021
-AA50 003F
-1B50 0021
+1810 003F
+1946 003F
+19D0 003F
+1A80 003F
+1A90 003F
1B50 003F
-A9D0 0021
-A9D0 003F
-1BB0 0021
1BB0 003F
-1810 0021
-1810 003F
-1C50 0021
+1C40 003F
1C50 003F
-A620 0021
-A620 003F
-110F0 0021
-110F0 003F
-110F0 0334
-3007 0021
3007 003F
-1018A 0021
+A620 003F
+A8D0 003F
+A900 003F
+A9D0 003F
+AA50 003F
+ABF0 003F
1018A 003F
-1018A 0334
-111D0 0021
+104A0 003F
+11066 003F
+110F0 003F
+11136 003F
111D0 003F
-111D0 0334
-116C0 0021
116C0 003F
-116C0 0334
-11066 0021
-11066 003F
+FF10 0021
+FF10 003F
+1F101 0334
+1F101 0021
+1F101 003F
+0F33 0021
+0F33 003F
+1F100 0334
+1F100 0021
+1F100 003F
+1D7CE 0021
+1D7D8 0021
+1D7E2 0021
+1D7EC 0021
+1D7F6 0021
+1D7CE 003F
+1D7D8 003F
+1D7E2 003F
+1D7EC 003F
+1D7F6 003F
+24EA 0021
+24FF 0021
+24EA 003F
+24FF 003F
+2070 0021
+2070 003F
+2080 0021
+2080 003F
+1018A 0334
+104A0 0334
11066 0334
+110F0 0334
+11136 0334
+111D0 0334
+116C0 0334
+1D7CE 0334
+1D7D8 0334
+1D7E2 0334
+1D7EC 0334
+1D7F6 0334
2189 0021
2189 003F
2189 0061
2189 0041
2189 0062
0030 0061
-0030 0041
-FF10 0061
-FF10 0041
-1F101 0061
-1F100 0061
-1F101 0041
-1F100 0041
-1D7CE 0061
-1D7D8 0061
-1D7E2 0061
-1D7EC 0061
-1D7F6 0061
-1D7CE 0041
-1D7D8 0041
-1D7E2 0041
-1D7EC 0041
-1D7F6 0041
-24EA 0061
-24FF 0061
-24EA 0041
-24FF 0041
-2070 0061
-2070 0041
-2080 0061
-2080 0041
0660 0061
-0660 0041
06F0 0061
-06F0 0041
07C0 0061
-07C0 0041
-104A0 0061
-104A0 0041
0966 0061
-0966 0041
09E6 0061
-09E6 0041
0A66 0061
-0A66 0041
0AE6 0061
-0AE6 0041
0B66 0061
-0B66 0041
0BE6 0061
-0BE6 0041
0C66 0061
0C78 0061
-0C66 0041
-0C78 0041
0CE6 0061
-0CE6 0041
0D66 0061
-0D66 0041
-ABF0 0061
-ABF0 0041
-A8D0 0061
-A8D0 0041
+0E50 0061
+0ED0 0061
+0F20 0061
+1040 0061
+1090 0061
+17E0 0061
+17F0 0061
+1810 0061
1946 0061
-1946 0041
19D0 0061
-19D0 0041
1A80 0061
-1A80 0041
1A90 0061
-1A90 0041
-0E50 0061
+1B50 0061
+1BB0 0061
+1C40 0061
+1C50 0061
+3007 0061
+A620 0061
+A8D0 0061
+A900 0061
+A9D0 0061
+AA50 0061
+ABF0 0061
+1018A 0061
+104A0 0061
+11066 0061
+110F0 0061
+11136 0061
+111D0 0061
+116C0 0061
+0030 0041
+0660 0041
+06F0 0041
+07C0 0041
+0966 0041
+09E6 0041
+0A66 0041
+0AE6 0041
+0B66 0041
+0BE6 0041
+0C66 0041
+0C78 0041
+0CE6 0041
+0D66 0041
0E50 0041
-0ED0 0061
0ED0 0041
-0F20 0061
0F20 0041
-0F33 0061
-0F33 0041
-1C40 0061
-1C40 0041
-A900 0061
-A900 0041
-1040 0061
1040 0041
-1090 0061
1090 0041
-11136 0061
-11136 0041
-17E0 0061
17E0 0041
-17F0 0061
17F0 0041
-AA50 0061
-AA50 0041
-1B50 0061
+1810 0041
+1946 0041
+19D0 0041
+1A80 0041
+1A90 0041
1B50 0041
-A9D0 0061
-A9D0 0041
-1BB0 0061
1BB0 0041
-1810 0061
-1810 0041
-1C50 0061
+1C40 0041
1C50 0041
-A620 0061
-A620 0041
-110F0 0061
-110F0 0041
-3007 0061
3007 0041
-1018A 0061
+A620 0041
+A8D0 0041
+A900 0041
+A9D0 0041
+AA50 0041
+ABF0 0041
1018A 0041
-111D0 0061
+104A0 0041
+11066 0041
+110F0 0041
+11136 0041
111D0 0041
-116C0 0061
116C0 0041
-11066 0061
-11066 0041
+FF10 0061
+FF10 0041
+1F101 0061
+1F100 0061
+0F33 0061
+1F101 0041
+1F100 0041
+0F33 0041
+1D7CE 0061
+1D7D8 0061
+1D7E2 0061
+1D7EC 0061
+1D7F6 0061
+1D7CE 0041
+1D7D8 0041
+1D7E2 0041
+1D7EC 0041
+1D7F6 0041
+24EA 0061
+24FF 0061
+24EA 0041
+24FF 0041
+2070 0061
+2070 0041
+2080 0061
+2080 0041
0030 0062
-FF10 0062
-1F101 0062
-1F100 0062
-1D7CE 0062
-1D7D8 0062
-1D7E2 0062
-1D7EC 0062
-1D7F6 0062
-24EA 0062
-24FF 0062
-2070 0062
-2080 0062
0660 0062
06F0 0062
07C0 0062
-104A0 0062
0966 0062
09E6 0062
0A66 0062
0C78 0062
0CE6 0062
0D66 0062
-ABF0 0062
-A8D0 0062
-1946 0062
-19D0 0062
-1A80 0062
-1A90 0062
0E50 0062
0ED0 0062
0F20 0062
-0F33 0062
-1C40 0062
-A900 0062
1040 0062
1090 0062
-11136 0062
17E0 0062
17F0 0062
-AA50 0062
+1810 0062
+1946 0062
+19D0 0062
+1A80 0062
+1A90 0062
1B50 0062
-A9D0 0062
1BB0 0062
-1810 0062
+1C40 0062
1C50 0062
-A620 0062
-110F0 0062
3007 0062
+A620 0062
+A8D0 0062
+A900 0062
+A9D0 0062
+AA50 0062
+ABF0 0062
1018A 0062
+104A0 0062
+11066 0062
+110F0 0062
+11136 0062
111D0 0062
116C0 0062
-11066 0062
+FF10 0062
+1F101 0062
+1F100 0062
+0F33 0062
+1D7CE 0062
+1D7D8 0062
+1D7E2 0062
+1D7EC 0062
+1D7F6 0062
+24EA 0062
+24FF 0062
+2070 0062
+2080 0062
3358 0021
3358 003F
3358 0061
3358 0041
3358 0062
0031 0021
+0661 0021
+06F1 0021
+07C1 0021
+0967 0021
+09E7 0021
+0A67 0021
+0AE7 0021
+0B67 0021
+0BE7 0021
+0C67 0021
+0C79 0021
+0C7C 0021
+0CE7 0021
+0D67 0021
+0E51 0021
+0ED1 0021
+0F21 0021
+1041 0021
+1091 0021
+1369 0021
+17E1 0021
+17F1 0021
+1811 0021
+1947 0021
+19D1 0021
+19DA 0021
+1A81 0021
+1A91 0021
+1B51 0021
+1BB1 0021
+1C41 0021
+1C51 0021
+3021 0021
+A621 0021
+A8D1 0021
+A901 0021
+A9D1 0021
+AA51 0021
+ABF1 0021
+10107 0021
+10142 0021
+10158 0021
+10159 0021
+1015A 0021
+10320 0021
+103D1 0021
+104A1 0021
+10858 0021
+10916 0021
+10A40 0021
+10A7D 0021
+10B58 0021
+10B78 0021
+10E60 0021
+11052 0021
+11067 0021
+110F1 0021
+11137 0021
+111D1 0021
+116C1 0021
+12415 0021
+1241E 0021
+1242C 0021
+12434 0021
+1244F 0021
+12458 0021
+1D360 0021
0031 003F
+0661 003F
+06F1 003F
+07C1 003F
+0967 003F
+09E7 003F
+0A67 003F
+0AE7 003F
+0B67 003F
+0BE7 003F
+0C67 003F
+0C79 003F
+0C7C 003F
+0CE7 003F
+0D67 003F
+0E51 003F
+0ED1 003F
+0F21 003F
+1041 003F
+1091 003F
+1369 003F
+17E1 003F
+17F1 003F
+1811 003F
+1947 003F
+19D1 003F
+19DA 003F
+1A81 003F
+1A91 003F
+1B51 003F
+1BB1 003F
+1C41 003F
+1C51 003F
+3021 003F
+A621 003F
+A8D1 003F
+A901 003F
+A9D1 003F
+AA51 003F
+ABF1 003F
+10107 003F
+10142 003F
+10158 003F
+10159 003F
+1015A 003F
+10320 003F
+103D1 003F
+104A1 003F
+10858 003F
+10916 003F
+10A40 003F
+10A7D 003F
+10B58 003F
+10B78 003F
+10E60 003F
+11052 003F
+11067 003F
+110F1 003F
+11137 003F
+111D1 003F
+116C1 003F
+12415 003F
+1241E 003F
+1242C 003F
+12434 003F
+1244F 003F
+12458 003F
+1D360 003F
FF11 0021
FF11 003F
2474 0021
1F102 0334
1F102 0021
1F102 003F
+0F2A 0021
+0F2A 003F
2488 0021
2488 003F
1D7CF 0021
00B9 003F
2081 0021
2081 003F
-1D7CF 0334
-1D7D9 0334
-1D7E3 0334
-1D7ED 0334
-1D7F7 0334
-0661 0021
-0661 003F
-06F1 0021
-06F1 003F
-10E60 0021
-10E60 003F
-10E60 0334
-07C1 0021
-07C1 003F
-1369 0021
-1369 003F
-104A1 0021
-104A1 003F
-104A1 0334
-0967 0021
-0967 003F
-09E7 0021
-09E7 003F
-0A67 0021
-0A67 003F
-0AE7 0021
-0AE7 003F
-0B67 0021
-0B67 003F
-0BE7 0021
-0BE7 003F
-0C67 0021
-0C79 0021
-0C7C 0021
-0C67 003F
-0C79 003F
-0C7C 003F
-0CE7 0021
-0CE7 003F
-0D67 0021
-0D67 003F
-ABF1 0021
-ABF1 003F
-A8D1 0021
-A8D1 003F
-1947 0021
-1947 003F
-19D1 0021
-19DA 0021
-19D1 003F
-19DA 003F
-1A81 0021
-1A81 003F
-1A91 0021
-1A91 003F
-0E51 0021
-0E51 003F
-0ED1 0021
-0ED1 003F
-0F21 0021
-0F21 003F
-0F2A 0021
-0F2A 003F
-1C41 0021
-1C41 003F
-A901 0021
-A901 003F
-1041 0021
-1041 003F
-1091 0021
-1091 003F
-11137 0021
-11137 003F
-11137 0334
-17E1 0021
-17E1 003F
-17F1 0021
-17F1 003F
-AA51 0021
-AA51 003F
-1B51 0021
-1B51 003F
-A9D1 0021
-A9D1 003F
-1BB1 0021
-1BB1 003F
-1811 0021
-1811 003F
-1C51 0021
-1C51 003F
-A621 0021
-A621 003F
-110F1 0021
-110F1 003F
-110F1 0334
-3021 0021
-3021 003F
-10107 0021
-10107 003F
10107 0334
-10142 0021
-10158 0021
-10159 0021
-1015A 0021
-10142 003F
-10158 003F
-10159 003F
-1015A 003F
10142 0334
10158 0334
10159 0334
1015A 0334
-10320 0021
-10320 003F
10320 0334
-103D1 0021
-103D1 003F
103D1 0334
-12415 0021
-1241E 0021
-1242C 0021
-12434 0021
-1244F 0021
-12458 0021
-12415 003F
-1241E 003F
-1242C 003F
-12434 003F
-1244F 003F
-12458 003F
+104A1 0334
+10858 0334
+10916 0334
+10A40 0334
+10A7D 0334
+10B58 0334
+10B78 0334
+10E60 0334
+11052 0334
+11067 0334
+110F1 0334
+11137 0334
+111D1 0334
+116C1 0334
12415 0334
1241E 0334
1242C 0334
12434 0334
1244F 0334
12458 0334
-10A7D 0021
-10A7D 003F
-10A7D 0334
-10916 0021
-10916 003F
-10916 0334
-10858 0021
-10858 003F
-10858 0334
-10B58 0021
-10B58 003F
-10B58 0334
-10B78 0021
-10B78 003F
-10B78 0334
-111D1 0021
-111D1 003F
-111D1 0334
-116C1 0021
-116C1 003F
-116C1 0334
-11067 0021
-11067 003F
-11067 0334
-11052 0021
-11052 003F
-11052 0334
-10A40 0021
-10A40 003F
-10A40 0334
-1D360 0021
-1D360 003F
1D360 0334
+1D7CF 0334
+1D7D9 0334
+1D7E3 0334
+1D7ED 0334
+1D7F7 0334
215F 0021
215F 003F
2152 0021
336B 0041
336B 0062
0031 0061
-0031 0041
-FF11 0061
-FF11 0041
-2474 0061
-1F102 0061
-2488 0061
-2474 0041
-1F102 0041
-2488 0041
-1D7CF 0061
-1D7D9 0061
-1D7E3 0061
-1D7ED 0061
-1D7F7 0061
-1D7CF 0041
-1D7D9 0041
-1D7E3 0041
-1D7ED 0041
-1D7F7 0041
-2460 0061
-24F5 0061
-2776 0061
-2780 0061
-278A 0061
-2460 0041
-24F5 0041
-2776 0041
-2780 0041
-278A 0041
-00B9 0061
-00B9 0041
-2081 0061
-2081 0041
0661 0061
-0661 0041
06F1 0061
-06F1 0041
-10E60 0061
-10E60 0041
07C1 0061
-07C1 0041
-1369 0061
-1369 0041
-104A1 0061
-104A1 0041
0967 0061
-0967 0041
09E7 0061
-09E7 0041
0A67 0061
-0A67 0041
0AE7 0061
-0AE7 0041
0B67 0061
-0B67 0041
0BE7 0061
-0BE7 0041
0C67 0061
0C79 0061
0C7C 0061
-0C67 0041
-0C79 0041
-0C7C 0041
0CE7 0061
-0CE7 0041
0D67 0061
-0D67 0041
-ABF1 0061
-ABF1 0041
-A8D1 0061
-A8D1 0041
-1947 0061
-1947 0041
-19D1 0061
-19DA 0061
-19D1 0041
-19DA 0041
-1A81 0061
-1A81 0041
-1A91 0061
-1A91 0041
0E51 0061
-0E51 0041
0ED1 0061
-0ED1 0041
0F21 0061
-0F21 0041
-0F2A 0061
-0F2A 0041
-1C41 0061
-1C41 0041
-A901 0061
-A901 0041
1041 0061
-1041 0041
1091 0061
-1091 0041
-11137 0061
-11137 0041
+1369 0061
17E1 0061
-17E1 0041
17F1 0061
-17F1 0041
-AA51 0061
-AA51 0041
+1811 0061
+1947 0061
+19D1 0061
+19DA 0061
+1A81 0061
+1A91 0061
1B51 0061
-1B51 0041
-A9D1 0061
-A9D1 0041
1BB1 0061
-1BB1 0041
-1811 0061
-1811 0041
+1C41 0061
1C51 0061
-1C51 0041
-A621 0061
-A621 0041
-110F1 0061
-110F1 0041
3021 0061
-3021 0041
+A621 0061
+A8D1 0061
+A901 0061
+A9D1 0061
+AA51 0061
+ABF1 0061
10107 0061
-10107 0041
10142 0061
10158 0061
10159 0061
1015A 0061
-10142 0041
-10158 0041
-10159 0041
-1015A 0041
10320 0061
-10320 0041
103D1 0061
-103D1 0041
+104A1 0061
+10858 0061
+10916 0061
+10A40 0061
+10A7D 0061
+10B58 0061
+10B78 0061
+10E60 0061
+11052 0061
+11067 0061
+110F1 0061
+11137 0061
+111D1 0061
+116C1 0061
12415 0061
1241E 0061
1242C 0061
12434 0061
1244F 0061
12458 0061
+1D360 0061
+0031 0041
+0661 0041
+06F1 0041
+07C1 0041
+0967 0041
+09E7 0041
+0A67 0041
+0AE7 0041
+0B67 0041
+0BE7 0041
+0C67 0041
+0C79 0041
+0C7C 0041
+0CE7 0041
+0D67 0041
+0E51 0041
+0ED1 0041
+0F21 0041
+1041 0041
+1091 0041
+1369 0041
+17E1 0041
+17F1 0041
+1811 0041
+1947 0041
+19D1 0041
+19DA 0041
+1A81 0041
+1A91 0041
+1B51 0041
+1BB1 0041
+1C41 0041
+1C51 0041
+3021 0041
+A621 0041
+A8D1 0041
+A901 0041
+A9D1 0041
+AA51 0041
+ABF1 0041
+10107 0041
+10142 0041
+10158 0041
+10159 0041
+1015A 0041
+10320 0041
+103D1 0041
+104A1 0041
+10858 0041
+10916 0041
+10A40 0041
+10A7D 0041
+10B58 0041
+10B78 0041
+10E60 0041
+11052 0041
+11067 0041
+110F1 0041
+11137 0041
+111D1 0041
+116C1 0041
12415 0041
1241E 0041
1242C 0041
12434 0041
1244F 0041
12458 0041
-10A7D 0061
-10A7D 0041
-10916 0061
-10916 0041
-10858 0061
-10858 0041
-10B58 0061
-10B58 0041
-10B78 0061
-10B78 0041
-111D1 0061
-111D1 0041
-116C1 0061
-116C1 0041
-11067 0061
-11067 0041
-11052 0061
-11052 0041
-10A40 0061
-10A40 0041
-1D360 0061
1D360 0041
+FF11 0061
+FF11 0041
+2474 0061
+1F102 0061
+2488 0061
+0F2A 0061
+2474 0041
+1F102 0041
+2488 0041
+0F2A 0041
+1D7CF 0061
+1D7D9 0061
+1D7E3 0061
+1D7ED 0061
+1D7F7 0061
+1D7CF 0041
+1D7D9 0041
+1D7E3 0041
+1D7ED 0041
+1D7F7 0041
+2460 0061
+24F5 0061
+2776 0061
+2780 0061
+278A 0061
+2460 0041
+24F5 0041
+2776 0041
+2780 0041
+278A 0041
+00B9 0061
+00B9 0041
+2081 0061
+2081 0041
0031 0062
-FF11 0062
-2474 0062
-1F102 0062
-2488 0062
-1D7CF 0062
-1D7D9 0062
-1D7E3 0062
-1D7ED 0062
-1D7F7 0062
-2460 0062
-24F5 0062
-2776 0062
-2780 0062
-278A 0062
-00B9 0062
-2081 0062
0661 0062
06F1 0062
-10E60 0062
07C1 0062
-1369 0062
-104A1 0062
0967 0062
09E7 0062
0A67 0062
0C7C 0062
0CE7 0062
0D67 0062
-ABF1 0062
-A8D1 0062
-1947 0062
-19D1 0062
-19DA 0062
-1A81 0062
-1A91 0062
0E51 0062
0ED1 0062
0F21 0062
-0F2A 0062
-1C41 0062
-A901 0062
1041 0062
1091 0062
-11137 0062
+1369 0062
17E1 0062
17F1 0062
-AA51 0062
+1811 0062
+1947 0062
+19D1 0062
+19DA 0062
+1A81 0062
+1A91 0062
1B51 0062
-A9D1 0062
1BB1 0062
-1811 0062
+1C41 0062
1C51 0062
-A621 0062
-110F1 0062
3021 0062
+A621 0062
+A8D1 0062
+A901 0062
+A9D1 0062
+AA51 0062
+ABF1 0062
10107 0062
10142 0062
10158 0062
1015A 0062
10320 0062
103D1 0062
+104A1 0062
+10858 0062
+10916 0062
+10A40 0062
+10A7D 0062
+10B58 0062
+10B78 0062
+10E60 0062
+11052 0062
+11067 0062
+110F1 0062
+11137 0062
+111D1 0062
+116C1 0062
12415 0062
1241E 0062
1242C 0062
12434 0062
1244F 0062
12458 0062
-10A7D 0062
-10916 0062
-10858 0062
-10B58 0062
-10B78 0062
-111D1 0062
-116C1 0062
-11067 0062
-11052 0062
-10A40 0062
1D360 0062
+FF11 0062
+2474 0062
+1F102 0062
+2488 0062
+0F2A 0062
+1D7CF 0062
+1D7D9 0062
+1D7E3 0062
+1D7ED 0062
+1D7F7 0062
+2460 0062
+24F5 0062
+2776 0062
+2780 0062
+278A 0062
+00B9 0062
+2081 0062
33E0 0021
33E0 003F
33E0 0061
3359 0041
3359 0062
0032 0021
-0032 003F
-FF12 0021
-FF12 003F
-2475 0021
-2475 003F
-1F103 0334
-1F103 0021
-1F103 003F
-2489 0021
-2489 003F
-1D7D0 0021
-1D7DA 0021
-1D7E4 0021
-1D7EE 0021
-1D7F8 0021
-1D7D0 003F
-1D7DA 003F
-1D7E4 003F
-1D7EE 003F
-1D7F8 003F
-2461 0021
-24F6 0021
-2777 0021
-2781 0021
-278B 0021
-2461 003F
-24F6 003F
-2777 003F
-2781 003F
-278B 003F
-00B2 0021
-00B2 003F
-2082 0021
-2082 003F
-1D7D0 0334
-1D7DA 0334
-1D7E4 0334
-1D7EE 0334
-1D7F8 0334
0662 0021
-0662 003F
06F2 0021
-06F2 003F
-10E61 0021
-10E61 003F
-10E61 0334
07C2 0021
-07C2 003F
-136A 0021
-136A 003F
-104A2 0021
-104A2 003F
-104A2 0334
0968 0021
-0968 003F
09E8 0021
-09E8 003F
0A68 0021
-0A68 003F
0AE8 0021
-0AE8 003F
0B68 0021
-0B68 003F
0BE8 0021
-0BE8 003F
0C68 0021
0C7A 0021
0C7D 0021
-0C68 003F
-0C7A 003F
-0C7D 003F
0CE8 0021
-0CE8 003F
0D68 0021
-0D68 003F
-ABF2 0021
-ABF2 003F
-A8D2 0021
-A8D2 003F
-1948 0021
-1948 003F
-19D2 0021
-19D2 003F
-1A82 0021
-1A82 003F
-1A92 0021
-1A92 003F
0E52 0021
-0E52 003F
0ED2 0021
-0ED2 003F
0F22 0021
-0F22 003F
-0F2B 0021
-0F2B 003F
-1C42 0021
-1C42 003F
-A902 0021
-A902 003F
1042 0021
-1042 003F
1092 0021
-1092 003F
-11138 0021
-11138 003F
-11138 0334
+136A 0021
17E2 0021
-17E2 003F
17F2 0021
-17F2 003F
-AA52 0021
-AA52 003F
+1812 0021
+1948 0021
+19D2 0021
+1A82 0021
+1A92 0021
1B52 0021
-1B52 003F
-A9D2 0021
-A9D2 003F
1BB2 0021
-1BB2 003F
-1812 0021
-1812 003F
+1C42 0021
1C52 0021
-1C52 003F
-A622 0021
-A622 003F
-110F2 0021
-110F2 003F
-110F2 0334
3022 0021
-3022 003F
+A622 0021
+A8D2 0021
+A902 0021
+A9D2 0021
+AA52 0021
+ABF2 0021
10108 0021
-10108 003F
-10108 0334
1015B 0021
1015C 0021
1015D 0021
1015E 0021
-1015B 003F
-1015C 003F
-1015D 003F
-1015E 003F
-1015B 0334
-1015C 0334
-1015D 0334
-1015E 0334
103D2 0021
-103D2 003F
-103D2 0334
+104A2 0021
+10859 0021
+1091A 0021
+10A41 0021
+10B59 0021
+10B79 0021
+10E61 0021
+11053 0021
+11068 0021
+110F2 0021
+11138 0021
+111D2 0021
+116C2 0021
12400 0021
12416 0021
1241F 0021
12435 0021
1244A 0021
12450 0021
+12456 0021
12459 0021
+1D361 0021
+0032 003F
+0662 003F
+06F2 003F
+07C2 003F
+0968 003F
+09E8 003F
+0A68 003F
+0AE8 003F
+0B68 003F
+0BE8 003F
+0C68 003F
+0C7A 003F
+0C7D 003F
+0CE8 003F
+0D68 003F
+0E52 003F
+0ED2 003F
+0F22 003F
+1042 003F
+1092 003F
+136A 003F
+17E2 003F
+17F2 003F
+1812 003F
+1948 003F
+19D2 003F
+1A82 003F
+1A92 003F
+1B52 003F
+1BB2 003F
+1C42 003F
+1C52 003F
+3022 003F
+A622 003F
+A8D2 003F
+A902 003F
+A9D2 003F
+AA52 003F
+ABF2 003F
+10108 003F
+1015B 003F
+1015C 003F
+1015D 003F
+1015E 003F
+103D2 003F
+104A2 003F
+10859 003F
+1091A 003F
+10A41 003F
+10B59 003F
+10B79 003F
+10E61 003F
+11053 003F
+11068 003F
+110F2 003F
+11138 003F
+111D2 003F
+116C2 003F
12400 003F
12416 003F
1241F 003F
12435 003F
1244A 003F
12450 003F
+12456 003F
12459 003F
+1D361 003F
+FF12 0021
+FF12 003F
+2475 0021
+2475 003F
+1F103 0334
+1F103 0021
+1F103 003F
+0F2B 0021
+0F2B 003F
+2489 0021
+2489 003F
+1D7D0 0021
+1D7DA 0021
+1D7E4 0021
+1D7EE 0021
+1D7F8 0021
+1D7D0 003F
+1D7DA 003F
+1D7E4 003F
+1D7EE 003F
+1D7F8 003F
+2461 0021
+24F6 0021
+2777 0021
+2781 0021
+278B 0021
+2461 003F
+24F6 003F
+2777 003F
+2781 003F
+278B 003F
+00B2 0021
+00B2 003F
+2082 0021
+2082 003F
+10108 0334
+1015B 0334
+1015C 0334
+1015D 0334
+1015E 0334
+103D2 0334
+104A2 0334
+10859 0334
+1091A 0334
+10A41 0334
+10B59 0334
+10B79 0334
+10E61 0334
+11053 0334
+11068 0334
+110F2 0334
+11138 0334
+111D2 0334
+116C2 0334
12400 0334
12416 0334
1241F 0334
12435 0334
1244A 0334
12450 0334
+12456 0334
12459 0334
-1091A 0021
-1091A 003F
-1091A 0334
-10859 0021
-10859 003F
-10859 0334
-10B59 0021
-10B59 003F
-10B59 0334
-10B79 0021
-10B79 003F
-10B79 0334
-111D2 0021
-111D2 003F
-111D2 0334
-116C2 0021
-116C2 003F
-116C2 0334
-11068 0021
-11068 003F
-11068 0334
-11053 0021
-11053 003F
-11053 0334
-10A41 0021
-10A41 003F
-10A41 0334
-1D361 0021
-1D361 003F
1D361 0334
+1D7D0 0334
+1D7DA 0334
+1D7E4 0334
+1D7EE 0334
+1D7F8 0334
2154 0021
2154 003F
2154 0061
33FC 0041
33FC 0062
0032 0061
-0032 0041
-FF12 0061
-FF12 0041
-2475 0061
-1F103 0061
-2489 0061
-2475 0041
-1F103 0041
-2489 0041
-1D7D0 0061
-1D7DA 0061
-1D7E4 0061
-1D7EE 0061
-1D7F8 0061
-1D7D0 0041
-1D7DA 0041
-1D7E4 0041
-1D7EE 0041
-1D7F8 0041
-2461 0061
-24F6 0061
-2777 0061
-2781 0061
-278B 0061
-2461 0041
-24F6 0041
-2777 0041
-2781 0041
-278B 0041
-00B2 0061
-00B2 0041
-2082 0061
-2082 0041
0662 0061
-0662 0041
06F2 0061
-06F2 0041
-10E61 0061
-10E61 0041
07C2 0061
-07C2 0041
-136A 0061
-136A 0041
-104A2 0061
-104A2 0041
0968 0061
-0968 0041
09E8 0061
-09E8 0041
0A68 0061
-0A68 0041
0AE8 0061
-0AE8 0041
0B68 0061
-0B68 0041
0BE8 0061
-0BE8 0041
0C68 0061
0C7A 0061
0C7D 0061
-0C68 0041
-0C7A 0041
-0C7D 0041
0CE8 0061
-0CE8 0041
0D68 0061
-0D68 0041
-ABF2 0061
-ABF2 0041
-A8D2 0061
-A8D2 0041
-1948 0061
-1948 0041
-19D2 0061
-19D2 0041
-1A82 0061
-1A82 0041
-1A92 0061
-1A92 0041
0E52 0061
-0E52 0041
0ED2 0061
-0ED2 0041
0F22 0061
-0F22 0041
-0F2B 0061
-0F2B 0041
-1C42 0061
-1C42 0041
-A902 0061
-A902 0041
1042 0061
-1042 0041
1092 0061
-1092 0041
-11138 0061
-11138 0041
+136A 0061
17E2 0061
-17E2 0041
17F2 0061
-17F2 0041
-AA52 0061
-AA52 0041
+1812 0061
+1948 0061
+19D2 0061
+1A82 0061
+1A92 0061
1B52 0061
-1B52 0041
-A9D2 0061
-A9D2 0041
1BB2 0061
-1BB2 0041
-1812 0061
-1812 0041
+1C42 0061
1C52 0061
-1C52 0041
-A622 0061
-A622 0041
-110F2 0061
-110F2 0041
3022 0061
-3022 0041
+A622 0061
+A8D2 0061
+A902 0061
+A9D2 0061
+AA52 0061
+ABF2 0061
10108 0061
-10108 0041
1015B 0061
1015C 0061
1015D 0061
1015E 0061
-1015B 0041
-1015C 0041
-1015D 0041
-1015E 0041
103D2 0061
-103D2 0041
+104A2 0061
+10859 0061
+1091A 0061
+10A41 0061
+10B59 0061
+10B79 0061
+10E61 0061
+11053 0061
+11068 0061
+110F2 0061
+11138 0061
+111D2 0061
+116C2 0061
12400 0061
12416 0061
1241F 0061
12435 0061
1244A 0061
12450 0061
+12456 0061
12459 0061
+1D361 0061
+0032 0041
+0662 0041
+06F2 0041
+07C2 0041
+0968 0041
+09E8 0041
+0A68 0041
+0AE8 0041
+0B68 0041
+0BE8 0041
+0C68 0041
+0C7A 0041
+0C7D 0041
+0CE8 0041
+0D68 0041
+0E52 0041
+0ED2 0041
+0F22 0041
+1042 0041
+1092 0041
+136A 0041
+17E2 0041
+17F2 0041
+1812 0041
+1948 0041
+19D2 0041
+1A82 0041
+1A92 0041
+1B52 0041
+1BB2 0041
+1C42 0041
+1C52 0041
+3022 0041
+A622 0041
+A8D2 0041
+A902 0041
+A9D2 0041
+AA52 0041
+ABF2 0041
+10108 0041
+1015B 0041
+1015C 0041
+1015D 0041
+1015E 0041
+103D2 0041
+104A2 0041
+10859 0041
+1091A 0041
+10A41 0041
+10B59 0041
+10B79 0041
+10E61 0041
+11053 0041
+11068 0041
+110F2 0041
+11138 0041
+111D2 0041
+116C2 0041
12400 0041
12416 0041
1241F 0041
12435 0041
1244A 0041
12450 0041
+12456 0041
12459 0041
-1091A 0061
-1091A 0041
-10859 0061
-10859 0041
-10B59 0061
-10B59 0041
-10B79 0061
-10B79 0041
-111D2 0061
-111D2 0041
-116C2 0061
-116C2 0041
-11068 0061
-11068 0041
-11053 0061
-11053 0041
-10A41 0061
-10A41 0041
-1D361 0061
1D361 0041
-0032 0062
-FF12 0062
-2475 0062
-1F103 0062
-2489 0062
-1D7D0 0062
-1D7DA 0062
-1D7E4 0062
-1D7EE 0062
-1D7F8 0062
-2461 0062
-24F6 0062
-2777 0062
-2781 0062
-278B 0062
-00B2 0062
-2082 0062
+FF12 0061
+FF12 0041
+2475 0061
+1F103 0061
+2489 0061
+0F2B 0061
+2475 0041
+1F103 0041
+2489 0041
+0F2B 0041
+1D7D0 0061
+1D7DA 0061
+1D7E4 0061
+1D7EE 0061
+1D7F8 0061
+1D7D0 0041
+1D7DA 0041
+1D7E4 0041
+1D7EE 0041
+1D7F8 0041
+2461 0061
+24F6 0061
+2777 0061
+2781 0061
+278B 0061
+2461 0041
+24F6 0041
+2777 0041
+2781 0041
+278B 0041
+00B2 0061
+00B2 0041
+2082 0061
+2082 0041
+0032 0062
0662 0062
06F2 0062
-10E61 0062
07C2 0062
-136A 0062
-104A2 0062
0968 0062
09E8 0062
0A68 0062
0C7D 0062
0CE8 0062
0D68 0062
-ABF2 0062
-A8D2 0062
-1948 0062
-19D2 0062
-1A82 0062
-1A92 0062
0E52 0062
0ED2 0062
0F22 0062
-0F2B 0062
-1C42 0062
-A902 0062
1042 0062
1092 0062
-11138 0062
+136A 0062
17E2 0062
17F2 0062
-AA52 0062
+1812 0062
+1948 0062
+19D2 0062
+1A82 0062
+1A92 0062
1B52 0062
-A9D2 0062
1BB2 0062
-1812 0062
+1C42 0062
1C52 0062
-A622 0062
-110F2 0062
3022 0062
+A622 0062
+A8D2 0062
+A902 0062
+A9D2 0062
+AA52 0062
+ABF2 0062
10108 0062
1015B 0062
1015C 0062
1015D 0062
1015E 0062
103D2 0062
+104A2 0062
+10859 0062
+1091A 0062
+10A41 0062
+10B59 0062
+10B79 0062
+10E61 0062
+11053 0062
+11068 0062
+110F2 0062
+11138 0062
+111D2 0062
+116C2 0062
12400 0062
12416 0062
1241F 0062
12435 0062
1244A 0062
12450 0062
+12456 0062
12459 0062
-1091A 0062
-10859 0062
-10B59 0062
-10B79 0062
-111D2 0062
-116C2 0062
-11068 0062
-11053 0062
-10A41 0062
1D361 0062
+FF12 0062
+2475 0062
+1F103 0062
+2489 0062
+0F2B 0062
+1D7D0 0062
+1D7DA 0062
+1D7E4 0062
+1D7EE 0062
+1D7F8 0062
+2461 0062
+24F6 0062
+2777 0062
+2781 0062
+278B 0062
+00B2 0062
+2082 0062
33E1 0021
33E1 003F
33E1 0061
335A 0041
335A 0062
0033 0021
-0033 003F
-FF13 0021
-FF13 003F
-2476 0021
-2476 003F
-1F104 0334
-1F104 0021
-1F104 003F
-248A 0021
-248A 003F
-1D7D1 0021
-1D7DB 0021
-1D7E5 0021
-1D7EF 0021
-1D7F9 0021
-1D7D1 003F
-1D7DB 003F
-1D7E5 003F
-1D7EF 003F
-1D7F9 003F
-2462 0021
-24F7 0021
-2778 0021
-2782 0021
-278C 0021
-2462 003F
-24F7 003F
-2778 003F
-2782 003F
-278C 003F
-00B3 0021
-00B3 003F
-2083 0021
-2083 003F
-1D7D1 0334
-1D7DB 0334
-1D7E5 0334
-1D7EF 0334
-1D7F9 0334
0663 0021
-0663 003F
06F3 0021
-06F3 003F
-10E62 0021
-10E62 003F
-10E62 0334
07C3 0021
-07C3 003F
-136B 0021
-136B 003F
-104A3 0021
-104A3 003F
-104A3 0334
0969 0021
-0969 003F
09E9 0021
-09E9 003F
0A69 0021
-0A69 003F
0AE9 0021
-0AE9 003F
0B69 0021
-0B69 003F
0BE9 0021
-0BE9 003F
0C69 0021
0C7B 0021
0C7E 0021
-0C69 003F
-0C7B 003F
-0C7E 003F
0CE9 0021
-0CE9 003F
0D69 0021
-0D69 003F
-ABF3 0021
-ABF3 003F
-A8D3 0021
-A8D3 003F
-1949 0021
-1949 003F
-19D3 0021
-19D3 003F
-1A83 0021
-1A83 003F
-1A93 0021
-1A93 003F
0E53 0021
-0E53 003F
0ED3 0021
-0ED3 003F
0F23 0021
-0F23 003F
-0F2C 0021
-0F2C 003F
-1C43 0021
-1C43 003F
-A903 0021
-A903 003F
1043 0021
-1043 003F
1093 0021
-1093 003F
-11139 0021
-11139 003F
-11139 0334
+136B 0021
17E3 0021
-17E3 003F
17F3 0021
-17F3 003F
-AA53 0021
-AA53 003F
+1813 0021
+1949 0021
+19D3 0021
+1A83 0021
+1A93 0021
1B53 0021
-1B53 003F
-A9D3 0021
-A9D3 003F
1BB3 0021
-1BB3 003F
-1813 0021
-1813 003F
+1C43 0021
1C53 0021
-1C53 003F
-A623 0021
-A623 003F
-110F3 0021
-110F3 003F
-110F3 0334
3023 0021
-3023 003F
+A623 0021
+A8D3 0021
+A903 0021
+A9D3 0021
+AA53 0021
+ABF3 0021
10109 0021
-10109 003F
-10109 0334
+104A3 0021
+1085A 0021
+1091B 0021
+10A42 0021
+10B5A 0021
+10B7A 0021
+10E62 0021
+11054 0021
+11069 0021
+110F3 0021
+11139 0021
+111D3 0021
+116C3 0021
12401 0021
12408 0021
12417 0021
1243B 0021
1244B 0021
12451 0021
+12457 0021
+1D362 0021
+0033 003F
+0663 003F
+06F3 003F
+07C3 003F
+0969 003F
+09E9 003F
+0A69 003F
+0AE9 003F
+0B69 003F
+0BE9 003F
+0C69 003F
+0C7B 003F
+0C7E 003F
+0CE9 003F
+0D69 003F
+0E53 003F
+0ED3 003F
+0F23 003F
+1043 003F
+1093 003F
+136B 003F
+17E3 003F
+17F3 003F
+1813 003F
+1949 003F
+19D3 003F
+1A83 003F
+1A93 003F
+1B53 003F
+1BB3 003F
+1C43 003F
+1C53 003F
+3023 003F
+A623 003F
+A8D3 003F
+A903 003F
+A9D3 003F
+AA53 003F
+ABF3 003F
+10109 003F
+104A3 003F
+1085A 003F
+1091B 003F
+10A42 003F
+10B5A 003F
+10B7A 003F
+10E62 003F
+11054 003F
+11069 003F
+110F3 003F
+11139 003F
+111D3 003F
+116C3 003F
12401 003F
12408 003F
12417 003F
1243B 003F
1244B 003F
12451 003F
+12457 003F
+1D362 003F
+FF13 0021
+FF13 003F
+2476 0021
+2476 003F
+1F104 0334
+1F104 0021
+1F104 003F
+0F2C 0021
+0F2C 003F
+248A 0021
+248A 003F
+1D7D1 0021
+1D7DB 0021
+1D7E5 0021
+1D7EF 0021
+1D7F9 0021
+1D7D1 003F
+1D7DB 003F
+1D7E5 003F
+1D7EF 003F
+1D7F9 003F
+2462 0021
+24F7 0021
+2778 0021
+2782 0021
+278C 0021
+2462 003F
+24F7 003F
+2778 003F
+2782 003F
+278C 003F
+00B3 0021
+00B3 003F
+2083 0021
+2083 003F
+10109 0334
+104A3 0334
+1085A 0334
+1091B 0334
+10A42 0334
+10B5A 0334
+10B7A 0334
+10E62 0334
+11054 0334
+11069 0334
+110F3 0334
+11139 0334
+111D3 0334
+116C3 0334
12401 0334
12408 0334
12417 0334
1243B 0334
1244B 0334
12451 0334
-1091B 0021
-1091B 003F
-1091B 0334
-1085A 0021
-1085A 003F
-1085A 0334
-10B5A 0021
-10B5A 003F
-10B5A 0334
-10B7A 0021
-10B7A 003F
-10B7A 0334
-111D3 0021
-111D3 003F
-111D3 0334
-116C3 0021
-116C3 003F
-116C3 0334
-11069 0021
-11069 003F
-11069 0334
-11054 0021
-11054 003F
-11054 0334
-10A42 0021
-10A42 003F
-10A42 0334
-1D362 0021
-1D362 003F
+12457 0334
1D362 0334
+1D7D1 0334
+1D7DB 0334
+1D7E5 0334
+1D7EF 0334
+1D7F9 0334
00BE 0021
00BE 003F
00BE 0061
32B4 0041
32B4 0062
0033 0061
-0033 0041
-FF13 0061
-FF13 0041
-2476 0061
-1F104 0061
-248A 0061
-2476 0041
-1F104 0041
-248A 0041
-1D7D1 0061
-1D7DB 0061
-1D7E5 0061
-1D7EF 0061
-1D7F9 0061
-1D7D1 0041
-1D7DB 0041
-1D7E5 0041
-1D7EF 0041
-1D7F9 0041
-2462 0061
-24F7 0061
-2778 0061
-2782 0061
-278C 0061
-2462 0041
-24F7 0041
-2778 0041
-2782 0041
-278C 0041
-00B3 0061
-00B3 0041
-2083 0061
-2083 0041
0663 0061
-0663 0041
06F3 0061
-06F3 0041
-10E62 0061
-10E62 0041
07C3 0061
-07C3 0041
-136B 0061
-136B 0041
-104A3 0061
-104A3 0041
0969 0061
-0969 0041
09E9 0061
-09E9 0041
0A69 0061
-0A69 0041
0AE9 0061
-0AE9 0041
0B69 0061
-0B69 0041
0BE9 0061
-0BE9 0041
0C69 0061
0C7B 0061
0C7E 0061
-0C69 0041
-0C7B 0041
-0C7E 0041
0CE9 0061
-0CE9 0041
0D69 0061
-0D69 0041
-ABF3 0061
-ABF3 0041
-A8D3 0061
-A8D3 0041
-1949 0061
-1949 0041
-19D3 0061
-19D3 0041
-1A83 0061
-1A83 0041
-1A93 0061
-1A93 0041
0E53 0061
-0E53 0041
0ED3 0061
-0ED3 0041
0F23 0061
-0F23 0041
-0F2C 0061
-0F2C 0041
-1C43 0061
-1C43 0041
-A903 0061
-A903 0041
1043 0061
-1043 0041
1093 0061
-1093 0041
-11139 0061
-11139 0041
+136B 0061
17E3 0061
-17E3 0041
17F3 0061
-17F3 0041
-AA53 0061
-AA53 0041
+1813 0061
+1949 0061
+19D3 0061
+1A83 0061
+1A93 0061
1B53 0061
-1B53 0041
-A9D3 0061
-A9D3 0041
1BB3 0061
-1BB3 0041
-1813 0061
-1813 0041
+1C43 0061
1C53 0061
-1C53 0041
-A623 0061
-A623 0041
-110F3 0061
-110F3 0041
3023 0061
-3023 0041
+A623 0061
+A8D3 0061
+A903 0061
+A9D3 0061
+AA53 0061
+ABF3 0061
10109 0061
-10109 0041
+104A3 0061
+1085A 0061
+1091B 0061
+10A42 0061
+10B5A 0061
+10B7A 0061
+10E62 0061
+11054 0061
+11069 0061
+110F3 0061
+11139 0061
+111D3 0061
+116C3 0061
12401 0061
12408 0061
12417 0061
1243B 0061
1244B 0061
12451 0061
+12457 0061
+1D362 0061
+0033 0041
+0663 0041
+06F3 0041
+07C3 0041
+0969 0041
+09E9 0041
+0A69 0041
+0AE9 0041
+0B69 0041
+0BE9 0041
+0C69 0041
+0C7B 0041
+0C7E 0041
+0CE9 0041
+0D69 0041
+0E53 0041
+0ED3 0041
+0F23 0041
+1043 0041
+1093 0041
+136B 0041
+17E3 0041
+17F3 0041
+1813 0041
+1949 0041
+19D3 0041
+1A83 0041
+1A93 0041
+1B53 0041
+1BB3 0041
+1C43 0041
+1C53 0041
+3023 0041
+A623 0041
+A8D3 0041
+A903 0041
+A9D3 0041
+AA53 0041
+ABF3 0041
+10109 0041
+104A3 0041
+1085A 0041
+1091B 0041
+10A42 0041
+10B5A 0041
+10B7A 0041
+10E62 0041
+11054 0041
+11069 0041
+110F3 0041
+11139 0041
+111D3 0041
+116C3 0041
12401 0041
12408 0041
12417 0041
1243B 0041
1244B 0041
12451 0041
-1091B 0061
-1091B 0041
-1085A 0061
-1085A 0041
-10B5A 0061
-10B5A 0041
-10B7A 0061
-10B7A 0041
-111D3 0061
-111D3 0041
-116C3 0061
-116C3 0041
-11069 0061
-11069 0041
-11054 0061
-11054 0041
-10A42 0061
-10A42 0041
-1D362 0061
+12457 0041
1D362 0041
+FF13 0061
+FF13 0041
+2476 0061
+1F104 0061
+248A 0061
+0F2C 0061
+2476 0041
+1F104 0041
+248A 0041
+0F2C 0041
+1D7D1 0061
+1D7DB 0061
+1D7E5 0061
+1D7EF 0061
+1D7F9 0061
+1D7D1 0041
+1D7DB 0041
+1D7E5 0041
+1D7EF 0041
+1D7F9 0041
+2462 0061
+24F7 0061
+2778 0061
+2782 0061
+278C 0061
+2462 0041
+24F7 0041
+2778 0041
+2782 0041
+278C 0041
+00B3 0061
+00B3 0041
+2083 0061
+2083 0041
0033 0062
-FF13 0062
-2476 0062
-1F104 0062
-248A 0062
-1D7D1 0062
-1D7DB 0062
-1D7E5 0062
-1D7EF 0062
-1D7F9 0062
-2462 0062
-24F7 0062
-2778 0062
-2782 0062
-278C 0062
-00B3 0062
-2083 0062
0663 0062
06F3 0062
-10E62 0062
07C3 0062
-136B 0062
-104A3 0062
0969 0062
09E9 0062
0A69 0062
0C7E 0062
0CE9 0062
0D69 0062
-ABF3 0062
-A8D3 0062
-1949 0062
-19D3 0062
-1A83 0062
-1A93 0062
0E53 0062
0ED3 0062
0F23 0062
-0F2C 0062
-1C43 0062
-A903 0062
1043 0062
1093 0062
-11139 0062
+136B 0062
17E3 0062
17F3 0062
-AA53 0062
+1813 0062
+1949 0062
+19D3 0062
+1A83 0062
+1A93 0062
1B53 0062
-A9D3 0062
1BB3 0062
-1813 0062
+1C43 0062
1C53 0062
-A623 0062
-110F3 0062
3023 0062
+A623 0062
+A8D3 0062
+A903 0062
+A9D3 0062
+AA53 0062
+ABF3 0062
10109 0062
+104A3 0062
+1085A 0062
+1091B 0062
+10A42 0062
+10B5A 0062
+10B7A 0062
+10E62 0062
+11054 0062
+11069 0062
+110F3 0062
+11139 0062
+111D3 0062
+116C3 0062
12401 0062
12408 0062
12417 0062
1243B 0062
1244B 0062
12451 0062
-1091B 0062
-1085A 0062
-10B5A 0062
-10B7A 0062
-111D3 0062
-116C3 0062
-11069 0062
-11054 0062
-10A42 0062
+12457 0062
1D362 0062
+FF13 0062
+2476 0062
+1F104 0062
+248A 0062
+0F2C 0062
+1D7D1 0062
+1D7DB 0062
+1D7E5 0062
+1D7EF 0062
+1D7F9 0062
+2462 0062
+24F7 0062
+2778 0062
+2782 0062
+278C 0062
+00B3 0062
+2083 0062
33E2 0021
33E2 003F
33E2 0061
335B 0041
335B 0062
0034 0021
-0034 003F
-FF14 0021
-FF14 003F
-2477 0021
-2477 003F
-1F105 0334
-1F105 0021
-1F105 003F
-248B 0021
-248B 003F
-1D7D2 0021
-1D7DC 0021
-1D7E6 0021
-1D7F0 0021
-1D7FA 0021
-1D7D2 003F
-1D7DC 003F
-1D7E6 003F
-1D7F0 003F
-1D7FA 003F
-2463 0021
-24F8 0021
-2779 0021
-2783 0021
-278D 0021
-2463 003F
-24F8 003F
-2779 003F
-2783 003F
-278D 003F
-2074 0021
-2074 003F
-2084 0021
-2084 003F
-1D7D2 0334
-1D7DC 0334
-1D7E6 0334
-1D7F0 0334
-1D7FA 0334
0664 0021
-0664 003F
06F4 0021
-06F4 003F
-10E63 0021
-10E63 003F
-10E63 0334
07C4 0021
-07C4 003F
-136C 0021
-136C 003F
-104A4 0021
-104A4 003F
-104A4 0334
096A 0021
-096A 003F
09EA 0021
-09EA 003F
0A6A 0021
-0A6A 003F
0AEA 0021
-0AEA 003F
0B6A 0021
-0B6A 003F
0BEA 0021
-0BEA 003F
0C6A 0021
-0C6A 003F
0CEA 0021
-0CEA 003F
0D6A 0021
-0D6A 003F
-ABF4 0021
-ABF4 003F
-A8D4 0021
-A8D4 003F
-194A 0021
-194A 003F
-19D4 0021
-19D4 003F
-1A84 0021
-1A84 003F
-1A94 0021
-1A94 003F
0E54 0021
-0E54 003F
0ED4 0021
-0ED4 003F
0F24 0021
-0F24 003F
-0F2D 0021
-0F2D 003F
-1C44 0021
-1C44 003F
-A904 0021
-A904 003F
1044 0021
-1044 003F
1094 0021
-1094 003F
-1113A 0021
-1113A 003F
-1113A 0334
+136C 0021
17E4 0021
-17E4 003F
17F4 0021
-17F4 003F
-AA54 0021
-AA54 003F
+1814 0021
+194A 0021
+19D4 0021
+1A84 0021
+1A94 0021
1B54 0021
-1B54 003F
-A9D4 0021
-A9D4 003F
1BB4 0021
-1BB4 003F
-1814 0021
-1814 003F
+1C44 0021
1C54 0021
-1C54 003F
-A624 0021
-A624 003F
-110F4 0021
-110F4 003F
-110F4 0334
3024 0021
-3024 003F
+A624 0021
+A8D4 0021
+A904 0021
+A9D4 0021
+AA54 0021
+ABF4 0021
1010A 0021
-1010A 003F
-1010A 0334
+104A4 0021
+10A43 0021
+10B5B 0021
+10B7B 0021
+10E63 0021
+11055 0021
+1106A 0021
+110F4 0021
+1113A 0021
+111D4 0021
+116C4 0021
12402 0021
12409 0021
1240F 0021
1244C 0021
12452 0021
12453 0021
+1D363 0021
+0034 003F
+0664 003F
+06F4 003F
+07C4 003F
+096A 003F
+09EA 003F
+0A6A 003F
+0AEA 003F
+0B6A 003F
+0BEA 003F
+0C6A 003F
+0CEA 003F
+0D6A 003F
+0E54 003F
+0ED4 003F
+0F24 003F
+1044 003F
+1094 003F
+136C 003F
+17E4 003F
+17F4 003F
+1814 003F
+194A 003F
+19D4 003F
+1A84 003F
+1A94 003F
+1B54 003F
+1BB4 003F
+1C44 003F
+1C54 003F
+3024 003F
+A624 003F
+A8D4 003F
+A904 003F
+A9D4 003F
+AA54 003F
+ABF4 003F
+1010A 003F
+104A4 003F
+10A43 003F
+10B5B 003F
+10B7B 003F
+10E63 003F
+11055 003F
+1106A 003F
+110F4 003F
+1113A 003F
+111D4 003F
+116C4 003F
12402 003F
12409 003F
1240F 003F
1244C 003F
12452 003F
12453 003F
+1D363 003F
+FF14 0021
+FF14 003F
+2477 0021
+2477 003F
+1F105 0334
+1F105 0021
+1F105 003F
+0F2D 0021
+0F2D 003F
+248B 0021
+248B 003F
+1D7D2 0021
+1D7DC 0021
+1D7E6 0021
+1D7F0 0021
+1D7FA 0021
+1D7D2 003F
+1D7DC 003F
+1D7E6 003F
+1D7F0 003F
+1D7FA 003F
+2463 0021
+24F8 0021
+2779 0021
+2783 0021
+278D 0021
+2463 003F
+24F8 003F
+2779 003F
+2783 003F
+278D 003F
+2074 0021
+2074 003F
+2084 0021
+2084 003F
+1010A 0334
+104A4 0334
+10A43 0334
+10B5B 0334
+10B7B 0334
+10E63 0334
+11055 0334
+1106A 0334
+110F4 0334
+1113A 0334
+111D4 0334
+116C4 0334
12402 0334
12409 0334
1240F 0334
1244C 0334
12452 0334
12453 0334
-10B5B 0021
-10B5B 003F
-10B5B 0334
-10B7B 0021
-10B7B 003F
-10B7B 0334
-111D4 0021
-111D4 003F
-111D4 0334
-116C4 0021
-116C4 003F
-116C4 0334
-1106A 0021
-1106A 003F
-1106A 0334
-11055 0021
-11055 003F
-11055 0334
-10A43 0021
-10A43 003F
-10A43 0334
-1D363 0021
-1D363 003F
1D363 0334
+1D7D2 0334
+1D7DC 0334
+1D7E6 0334
+1D7F0 0334
+1D7FA 0334
2158 0021
2158 003F
2158 0061
32BE 0041
32BE 0062
0034 0061
-0034 0041
-FF14 0061
-FF14 0041
-2477 0061
-1F105 0061
-248B 0061
-2477 0041
-1F105 0041
-248B 0041
-1D7D2 0061
-1D7DC 0061
-1D7E6 0061
-1D7F0 0061
-1D7FA 0061
-1D7D2 0041
-1D7DC 0041
-1D7E6 0041
-1D7F0 0041
-1D7FA 0041
-2463 0061
-24F8 0061
-2779 0061
-2783 0061
-278D 0061
-2463 0041
-24F8 0041
-2779 0041
-2783 0041
-278D 0041
-2074 0061
-2074 0041
-2084 0061
-2084 0041
0664 0061
-0664 0041
06F4 0061
-06F4 0041
-10E63 0061
-10E63 0041
07C4 0061
-07C4 0041
-136C 0061
-136C 0041
-104A4 0061
-104A4 0041
096A 0061
-096A 0041
09EA 0061
-09EA 0041
0A6A 0061
-0A6A 0041
0AEA 0061
-0AEA 0041
0B6A 0061
-0B6A 0041
0BEA 0061
-0BEA 0041
0C6A 0061
-0C6A 0041
0CEA 0061
-0CEA 0041
0D6A 0061
-0D6A 0041
-ABF4 0061
-ABF4 0041
-A8D4 0061
-A8D4 0041
+0E54 0061
+0ED4 0061
+0F24 0061
+1044 0061
+1094 0061
+136C 0061
+17E4 0061
+17F4 0061
+1814 0061
194A 0061
-194A 0041
19D4 0061
-19D4 0041
1A84 0061
-1A84 0041
1A94 0061
-1A94 0041
-0E54 0061
-0E54 0041
-0ED4 0061
-0ED4 0041
-0F24 0061
-0F24 0041
-0F2D 0061
-0F2D 0041
-1C44 0061
-1C44 0041
-A904 0061
-A904 0041
-1044 0061
-1044 0041
-1094 0061
-1094 0041
-1113A 0061
-1113A 0041
-17E4 0061
-17E4 0041
-17F4 0061
-17F4 0041
-AA54 0061
-AA54 0041
1B54 0061
-1B54 0041
-A9D4 0061
-A9D4 0041
1BB4 0061
-1BB4 0041
-1814 0061
-1814 0041
+1C44 0061
1C54 0061
-1C54 0041
-A624 0061
-A624 0041
-110F4 0061
-110F4 0041
3024 0061
-3024 0041
+A624 0061
+A8D4 0061
+A904 0061
+A9D4 0061
+AA54 0061
+ABF4 0061
1010A 0061
-1010A 0041
+104A4 0061
+10A43 0061
+10B5B 0061
+10B7B 0061
+10E63 0061
+11055 0061
+1106A 0061
+110F4 0061
+1113A 0061
+111D4 0061
+116C4 0061
12402 0061
12409 0061
1240F 0061
1244C 0061
12452 0061
12453 0061
+1D363 0061
+0034 0041
+0664 0041
+06F4 0041
+07C4 0041
+096A 0041
+09EA 0041
+0A6A 0041
+0AEA 0041
+0B6A 0041
+0BEA 0041
+0C6A 0041
+0CEA 0041
+0D6A 0041
+0E54 0041
+0ED4 0041
+0F24 0041
+1044 0041
+1094 0041
+136C 0041
+17E4 0041
+17F4 0041
+1814 0041
+194A 0041
+19D4 0041
+1A84 0041
+1A94 0041
+1B54 0041
+1BB4 0041
+1C44 0041
+1C54 0041
+3024 0041
+A624 0041
+A8D4 0041
+A904 0041
+A9D4 0041
+AA54 0041
+ABF4 0041
+1010A 0041
+104A4 0041
+10A43 0041
+10B5B 0041
+10B7B 0041
+10E63 0041
+11055 0041
+1106A 0041
+110F4 0041
+1113A 0041
+111D4 0041
+116C4 0041
12402 0041
12409 0041
1240F 0041
1244C 0041
12452 0041
12453 0041
-10B5B 0061
-10B5B 0041
-10B7B 0061
-10B7B 0041
-111D4 0061
-111D4 0041
-116C4 0061
-116C4 0041
-1106A 0061
-1106A 0041
-11055 0061
-11055 0041
-10A43 0061
-10A43 0041
-1D363 0061
1D363 0041
+FF14 0061
+FF14 0041
+2477 0061
+1F105 0061
+248B 0061
+0F2D 0061
+2477 0041
+1F105 0041
+248B 0041
+0F2D 0041
+1D7D2 0061
+1D7DC 0061
+1D7E6 0061
+1D7F0 0061
+1D7FA 0061
+1D7D2 0041
+1D7DC 0041
+1D7E6 0041
+1D7F0 0041
+1D7FA 0041
+2463 0061
+24F8 0061
+2779 0061
+2783 0061
+278D 0061
+2463 0041
+24F8 0041
+2779 0041
+2783 0041
+278D 0041
+2074 0061
+2074 0041
+2084 0061
+2084 0041
0034 0062
-FF14 0062
-2477 0062
-1F105 0062
-248B 0062
-1D7D2 0062
-1D7DC 0062
-1D7E6 0062
-1D7F0 0062
-1D7FA 0062
-2463 0062
-24F8 0062
-2779 0062
-2783 0062
-278D 0062
-2074 0062
-2084 0062
0664 0062
06F4 0062
-10E63 0062
07C4 0062
-136C 0062
-104A4 0062
096A 0062
09EA 0062
0A6A 0062
0C6A 0062
0CEA 0062
0D6A 0062
-ABF4 0062
-A8D4 0062
-194A 0062
-19D4 0062
-1A84 0062
-1A94 0062
0E54 0062
0ED4 0062
0F24 0062
-0F2D 0062
-1C44 0062
-A904 0062
1044 0062
1094 0062
-1113A 0062
+136C 0062
17E4 0062
17F4 0062
-AA54 0062
+1814 0062
+194A 0062
+19D4 0062
+1A84 0062
+1A94 0062
1B54 0062
-A9D4 0062
1BB4 0062
-1814 0062
+1C44 0062
1C54 0062
-A624 0062
-110F4 0062
3024 0062
+A624 0062
+A8D4 0062
+A904 0062
+A9D4 0062
+AA54 0062
+ABF4 0062
1010A 0062
+104A4 0062
+10A43 0062
+10B5B 0062
+10B7B 0062
+10E63 0062
+11055 0062
+1106A 0062
+110F4 0062
+1113A 0062
+111D4 0062
+116C4 0062
12402 0062
12409 0062
1240F 0062
1244C 0062
12452 0062
12453 0062
-10B5B 0062
-10B7B 0062
-111D4 0062
-116C4 0062
-1106A 0062
-11055 0062
-10A43 0062
1D363 0062
+FF14 0062
+2477 0062
+1F105 0062
+248B 0062
+0F2D 0062
+1D7D2 0062
+1D7DC 0062
+1D7E6 0062
+1D7F0 0062
+1D7FA 0062
+2463 0062
+24F8 0062
+2779 0062
+2783 0062
+278D 0062
+2074 0062
+2084 0062
33E3 0021
33E3 003F
33E3 0061
335C 0041
335C 0062
0035 0021
-0035 003F
-FF15 0021
-FF15 003F
-2478 0021
-2478 003F
-1F106 0334
-1F106 0021
-1F106 003F
-248C 0021
-248C 003F
-1D7D3 0021
-1D7DD 0021
-1D7E7 0021
-1D7F1 0021
-1D7FB 0021
-1D7D3 003F
-1D7DD 003F
-1D7E7 003F
-1D7F1 003F
-1D7FB 003F
-2464 0021
-24F9 0021
-277A 0021
-2784 0021
-278E 0021
-2464 003F
-24F9 003F
-277A 003F
-2784 003F
-278E 003F
-2075 0021
-2075 003F
-2085 0021
-2085 003F
-1D7D3 0334
-1D7DD 0334
-1D7E7 0334
-1D7F1 0334
-1D7FB 0334
0665 0021
-0665 003F
06F5 0021
-06F5 003F
-10E64 0021
-10E64 003F
-10E64 0334
07C5 0021
-07C5 003F
-136D 0021
-136D 003F
-104A5 0021
-104A5 003F
-104A5 0334
096B 0021
-096B 003F
09EB 0021
-09EB 003F
0A6B 0021
-0A6B 003F
0AEB 0021
-0AEB 003F
0B6B 0021
-0B6B 003F
0BEB 0021
-0BEB 003F
0C6B 0021
-0C6B 003F
0CEB 0021
-0CEB 003F
0D6B 0021
-0D6B 003F
-ABF5 0021
-ABF5 003F
-A8D5 0021
-A8D5 003F
-194B 0021
-194B 003F
-19D5 0021
-19D5 003F
-1A85 0021
-1A85 003F
-1A95 0021
-1A95 003F
0E55 0021
-0E55 003F
0ED5 0021
-0ED5 003F
0F25 0021
-0F25 003F
-0F2E 0021
-0F2E 003F
-1C45 0021
-1C45 003F
-A905 0021
-A905 003F
1045 0021
-1045 003F
1095 0021
-1095 003F
-1113B 0021
-1113B 003F
-1113B 0334
+136D 0021
17E5 0021
-17E5 003F
17F5 0021
-17F5 003F
-AA55 0021
-AA55 003F
+1815 0021
+194B 0021
+19D5 0021
+1A85 0021
+1A95 0021
1B55 0021
-1B55 003F
-A9D5 0021
-A9D5 003F
1BB5 0021
-1BB5 003F
-1815 0021
-1815 003F
+1C45 0021
1C55 0021
-1C55 003F
-A625 0021
-A625 003F
-110F5 0021
-110F5 003F
-110F5 0334
3025 0021
-3025 003F
+A625 0021
+A8D5 0021
+A905 0021
+A9D5 0021
+AA55 0021
+ABF5 0021
1010B 0021
-1010B 003F
-1010B 0334
10143 0021
10148 0021
1014F 0021
1015F 0021
10173 0021
-10143 003F
-10148 003F
-1014F 003F
-1015F 003F
-10173 003F
-10143 0334
-10148 0334
-1014F 0334
-1015F 0334
-10173 0334
10321 0021
-10321 003F
-10321 0334
+104A5 0021
+10E64 0021
+11056 0021
+1106B 0021
+110F5 0021
+1113B 0021
+111D5 0021
+116C5 0021
12403 0021
1240A 0021
12410 0021
1244D 0021
12454 0021
12455 0021
+1D364 0021
+0035 003F
+0665 003F
+06F5 003F
+07C5 003F
+096B 003F
+09EB 003F
+0A6B 003F
+0AEB 003F
+0B6B 003F
+0BEB 003F
+0C6B 003F
+0CEB 003F
+0D6B 003F
+0E55 003F
+0ED5 003F
+0F25 003F
+1045 003F
+1095 003F
+136D 003F
+17E5 003F
+17F5 003F
+1815 003F
+194B 003F
+19D5 003F
+1A85 003F
+1A95 003F
+1B55 003F
+1BB5 003F
+1C45 003F
+1C55 003F
+3025 003F
+A625 003F
+A8D5 003F
+A905 003F
+A9D5 003F
+AA55 003F
+ABF5 003F
+1010B 003F
+10143 003F
+10148 003F
+1014F 003F
+1015F 003F
+10173 003F
+10321 003F
+104A5 003F
+10E64 003F
+11056 003F
+1106B 003F
+110F5 003F
+1113B 003F
+111D5 003F
+116C5 003F
12403 003F
1240A 003F
12410 003F
1244D 003F
12454 003F
12455 003F
+1D364 003F
+FF15 0021
+FF15 003F
+2478 0021
+2478 003F
+1F106 0334
+1F106 0021
+1F106 003F
+0F2E 0021
+0F2E 003F
+248C 0021
+248C 003F
+1D7D3 0021
+1D7DD 0021
+1D7E7 0021
+1D7F1 0021
+1D7FB 0021
+1D7D3 003F
+1D7DD 003F
+1D7E7 003F
+1D7F1 003F
+1D7FB 003F
+2464 0021
+24F9 0021
+277A 0021
+2784 0021
+278E 0021
+2464 003F
+24F9 003F
+277A 003F
+2784 003F
+278E 003F
+2075 0021
+2075 003F
+2085 0021
+2085 003F
+1010B 0334
+10143 0334
+10148 0334
+1014F 0334
+1015F 0334
+10173 0334
+10321 0334
+104A5 0334
+10E64 0334
+11056 0334
+1106B 0334
+110F5 0334
+1113B 0334
+111D5 0334
+116C5 0334
12403 0334
1240A 0334
12410 0334
1244D 0334
12454 0334
12455 0334
-111D5 0021
-111D5 003F
-111D5 0334
-116C5 0021
-116C5 003F
-116C5 0334
-1106B 0021
-1106B 003F
-1106B 0334
-11056 0021
-11056 003F
-11056 0334
-1D364 0021
-1D364 003F
1D364 0334
+1D7D3 0334
+1D7DD 0334
+1D7E7 0334
+1D7F1 0334
+1D7FB 0334
215A 0021
215A 003F
215A 0061
324C 0062
32BF 0062
0035 0061
-0035 0041
-FF15 0061
-FF15 0041
-2478 0061
-1F106 0061
-248C 0061
-2478 0041
-1F106 0041
-248C 0041
-1D7D3 0061
-1D7DD 0061
-1D7E7 0061
-1D7F1 0061
-1D7FB 0061
-1D7D3 0041
-1D7DD 0041
-1D7E7 0041
-1D7F1 0041
-1D7FB 0041
-2464 0061
-24F9 0061
-277A 0061
-2784 0061
-278E 0061
-2464 0041
-24F9 0041
-277A 0041
-2784 0041
-278E 0041
-2075 0061
-2075 0041
-2085 0061
-2085 0041
0665 0061
-0665 0041
06F5 0061
-06F5 0041
-10E64 0061
-10E64 0041
07C5 0061
-07C5 0041
-136D 0061
-136D 0041
-104A5 0061
-104A5 0041
096B 0061
-096B 0041
09EB 0061
-09EB 0041
0A6B 0061
-0A6B 0041
0AEB 0061
-0AEB 0041
0B6B 0061
-0B6B 0041
0BEB 0061
-0BEB 0041
0C6B 0061
-0C6B 0041
0CEB 0061
-0CEB 0041
0D6B 0061
-0D6B 0041
-ABF5 0061
-ABF5 0041
-A8D5 0061
-A8D5 0041
-194B 0061
-194B 0041
-19D5 0061
-19D5 0041
-1A85 0061
-1A85 0041
-1A95 0061
-1A95 0041
0E55 0061
-0E55 0041
0ED5 0061
-0ED5 0041
0F25 0061
-0F25 0041
-0F2E 0061
-0F2E 0041
-1C45 0061
-1C45 0041
-A905 0061
-A905 0041
1045 0061
-1045 0041
1095 0061
-1095 0041
-1113B 0061
-1113B 0041
+136D 0061
17E5 0061
-17E5 0041
17F5 0061
-17F5 0041
-AA55 0061
-AA55 0041
+1815 0061
+194B 0061
+19D5 0061
+1A85 0061
+1A95 0061
1B55 0061
-1B55 0041
-A9D5 0061
-A9D5 0041
1BB5 0061
-1BB5 0041
-1815 0061
-1815 0041
+1C45 0061
1C55 0061
-1C55 0041
-A625 0061
-A625 0041
-110F5 0061
-110F5 0041
3025 0061
-3025 0041
+A625 0061
+A8D5 0061
+A905 0061
+A9D5 0061
+AA55 0061
+ABF5 0061
1010B 0061
-1010B 0041
10143 0061
10148 0061
1014F 0061
1015F 0061
10173 0061
-10143 0041
-10148 0041
-1014F 0041
-1015F 0041
-10173 0041
10321 0061
-10321 0041
+104A5 0061
+10E64 0061
+11056 0061
+1106B 0061
+110F5 0061
+1113B 0061
+111D5 0061
+116C5 0061
12403 0061
1240A 0061
12410 0061
1244D 0061
12454 0061
12455 0061
+1D364 0061
+0035 0041
+0665 0041
+06F5 0041
+07C5 0041
+096B 0041
+09EB 0041
+0A6B 0041
+0AEB 0041
+0B6B 0041
+0BEB 0041
+0C6B 0041
+0CEB 0041
+0D6B 0041
+0E55 0041
+0ED5 0041
+0F25 0041
+1045 0041
+1095 0041
+136D 0041
+17E5 0041
+17F5 0041
+1815 0041
+194B 0041
+19D5 0041
+1A85 0041
+1A95 0041
+1B55 0041
+1BB5 0041
+1C45 0041
+1C55 0041
+3025 0041
+A625 0041
+A8D5 0041
+A905 0041
+A9D5 0041
+AA55 0041
+ABF5 0041
+1010B 0041
+10143 0041
+10148 0041
+1014F 0041
+1015F 0041
+10173 0041
+10321 0041
+104A5 0041
+10E64 0041
+11056 0041
+1106B 0041
+110F5 0041
+1113B 0041
+111D5 0041
+116C5 0041
12403 0041
1240A 0041
12410 0041
1244D 0041
12454 0041
12455 0041
-111D5 0061
-111D5 0041
-116C5 0061
-116C5 0041
-1106B 0061
-1106B 0041
-11056 0061
-11056 0041
-1D364 0061
1D364 0041
+FF15 0061
+FF15 0041
+2478 0061
+1F106 0061
+248C 0061
+0F2E 0061
+2478 0041
+1F106 0041
+248C 0041
+0F2E 0041
+1D7D3 0061
+1D7DD 0061
+1D7E7 0061
+1D7F1 0061
+1D7FB 0061
+1D7D3 0041
+1D7DD 0041
+1D7E7 0041
+1D7F1 0041
+1D7FB 0041
+2464 0061
+24F9 0061
+277A 0061
+2784 0061
+278E 0061
+2464 0041
+24F9 0041
+277A 0041
+2784 0041
+278E 0041
+2075 0061
+2075 0041
+2085 0061
+2085 0041
0035 0062
-FF15 0062
-2478 0062
-1F106 0062
-248C 0062
-1D7D3 0062
-1D7DD 0062
-1D7E7 0062
-1D7F1 0062
-1D7FB 0062
-2464 0062
-24F9 0062
-277A 0062
-2784 0062
-278E 0062
-2075 0062
-2085 0062
0665 0062
06F5 0062
-10E64 0062
07C5 0062
-136D 0062
-104A5 0062
096B 0062
09EB 0062
0A6B 0062
0C6B 0062
0CEB 0062
0D6B 0062
-ABF5 0062
-A8D5 0062
-194B 0062
-19D5 0062
-1A85 0062
-1A95 0062
0E55 0062
0ED5 0062
0F25 0062
-0F2E 0062
-1C45 0062
-A905 0062
1045 0062
1095 0062
-1113B 0062
+136D 0062
17E5 0062
17F5 0062
-AA55 0062
+1815 0062
+194B 0062
+19D5 0062
+1A85 0062
+1A95 0062
1B55 0062
-A9D5 0062
1BB5 0062
-1815 0062
+1C45 0062
1C55 0062
-A625 0062
-110F5 0062
3025 0062
+A625 0062
+A8D5 0062
+A905 0062
+A9D5 0062
+AA55 0062
+ABF5 0062
1010B 0062
10143 0062
10148 0062
1015F 0062
10173 0062
10321 0062
+104A5 0062
+10E64 0062
+11056 0062
+1106B 0062
+110F5 0062
+1113B 0062
+111D5 0062
+116C5 0062
12403 0062
1240A 0062
12410 0062
1244D 0062
12454 0062
12455 0062
-111D5 0062
-116C5 0062
-1106B 0062
-11056 0062
1D364 0062
+FF15 0062
+2478 0062
+1F106 0062
+248C 0062
+0F2E 0062
+1D7D3 0062
+1D7DD 0062
+1D7E7 0062
+1D7F1 0062
+1D7FB 0062
+2464 0062
+24F9 0062
+277A 0062
+2784 0062
+278E 0062
+2075 0062
+2085 0062
33E4 0021
33E4 003F
33E4 0061
335D 0041
335D 0062
0036 0021
-0036 003F
-FF16 0021
-FF16 003F
-2479 0021
-2479 003F
-1F107 0334
-1F107 0021
-1F107 003F
-248D 0021
-248D 003F
-1D7D4 0021
-1D7DE 0021
-1D7E8 0021
-1D7F2 0021
-1D7FC 0021
-1D7D4 003F
-1D7DE 003F
-1D7E8 003F
-1D7F2 003F
-1D7FC 003F
-2465 0021
-24FA 0021
-277B 0021
-2785 0021
-278F 0021
-2465 003F
-24FA 003F
-277B 003F
-2785 003F
-278F 003F
-2076 0021
-2076 003F
-2086 0021
-2086 003F
-1D7D4 0334
-1D7DE 0334
-1D7E8 0334
-1D7F2 0334
-1D7FC 0334
0666 0021
-0666 003F
06F6 0021
-06F6 003F
-10E65 0021
-10E65 003F
-10E65 0334
07C6 0021
-07C6 003F
-136E 0021
-136E 003F
-104A6 0021
-104A6 003F
-104A6 0334
096C 0021
-096C 003F
09EC 0021
-09EC 003F
0A6C 0021
-0A6C 003F
0AEC 0021
-0AEC 003F
0B6C 0021
-0B6C 003F
0BEC 0021
-0BEC 003F
0C6C 0021
-0C6C 003F
0CEC 0021
-0CEC 003F
0D6C 0021
-0D6C 003F
-ABF6 0021
-ABF6 003F
-A8D6 0021
-A8D6 003F
-194C 0021
-194C 003F
-19D6 0021
-19D6 003F
-1A86 0021
-1A86 003F
-1A96 0021
-1A96 003F
0E56 0021
-0E56 003F
0ED6 0021
-0ED6 003F
0F26 0021
-0F26 003F
-0F2F 0021
-0F2F 003F
-1C46 0021
-1C46 003F
-A906 0021
-A906 003F
1046 0021
-1046 003F
1096 0021
-1096 003F
-1113C 0021
-1113C 003F
-1113C 0334
+136E 0021
17E6 0021
-17E6 003F
17F6 0021
-17F6 003F
-AA56 0021
-AA56 003F
+1816 0021
+194C 0021
+19D6 0021
+1A86 0021
+1A96 0021
1B56 0021
-1B56 003F
-A9D6 0021
-A9D6 003F
1BB6 0021
-1BB6 003F
-1816 0021
-1816 003F
+1C46 0021
1C56 0021
-1C56 003F
-A626 0021
-A626 003F
-110F6 0021
-110F6 003F
-110F6 0334
+2185 0021
3026 0021
-3026 003F
+A626 0021
+A8D6 0021
+A906 0021
+A9D6 0021
+AA56 0021
+ABF6 0021
1010C 0021
-1010C 003F
-1010C 0334
-2185 0021
-2185 003F
+104A6 0021
+10E65 0021
+11057 0021
+1106C 0021
+110F6 0021
+1113C 0021
+111D6 0021
+116C6 0021
12404 0021
1240B 0021
12411 0021
12428 0021
12440 0021
1244E 0021
+1D365 0021
+0036 003F
+0666 003F
+06F6 003F
+07C6 003F
+096C 003F
+09EC 003F
+0A6C 003F
+0AEC 003F
+0B6C 003F
+0BEC 003F
+0C6C 003F
+0CEC 003F
+0D6C 003F
+0E56 003F
+0ED6 003F
+0F26 003F
+1046 003F
+1096 003F
+136E 003F
+17E6 003F
+17F6 003F
+1816 003F
+194C 003F
+19D6 003F
+1A86 003F
+1A96 003F
+1B56 003F
+1BB6 003F
+1C46 003F
+1C56 003F
+2185 003F
+3026 003F
+A626 003F
+A8D6 003F
+A906 003F
+A9D6 003F
+AA56 003F
+ABF6 003F
+1010C 003F
+104A6 003F
+10E65 003F
+11057 003F
+1106C 003F
+110F6 003F
+1113C 003F
+111D6 003F
+116C6 003F
12404 003F
1240B 003F
12411 003F
12428 003F
12440 003F
1244E 003F
+1D365 003F
+FF16 0021
+FF16 003F
+2479 0021
+2479 003F
+1F107 0334
+1F107 0021
+1F107 003F
+0F2F 0021
+0F2F 003F
+248D 0021
+248D 003F
+1D7D4 0021
+1D7DE 0021
+1D7E8 0021
+1D7F2 0021
+1D7FC 0021
+1D7D4 003F
+1D7DE 003F
+1D7E8 003F
+1D7F2 003F
+1D7FC 003F
+2465 0021
+24FA 0021
+277B 0021
+2785 0021
+278F 0021
+2465 003F
+24FA 003F
+277B 003F
+2785 003F
+278F 003F
+2076 0021
+2076 003F
+2086 0021
+2086 003F
+1010C 0334
+104A6 0334
+10E65 0334
+11057 0334
+1106C 0334
+110F6 0334
+1113C 0334
+111D6 0334
+116C6 0334
12404 0334
1240B 0334
12411 0334
12428 0334
12440 0334
1244E 0334
-111D6 0021
-111D6 003F
-111D6 0334
-116C6 0021
-116C6 003F
-116C6 0334
-1106C 0021
-1106C 003F
-1106C 0334
-11057 0021
-11057 003F
-11057 0334
-1D365 0021
-1D365 003F
1D365 0334
-324D 0021
-324D 003F
-324D 0061
+1D7D4 0334
+1D7DE 0334
+1D7E8 0334
+1D7F2 0334
+1D7FC 0334
+324D 0021
+324D 003F
+324D 0061
324D 0041
324D 0062
0036 0061
-0036 0041
-FF16 0061
-FF16 0041
-2479 0061
-1F107 0061
-248D 0061
-2479 0041
-1F107 0041
-248D 0041
-1D7D4 0061
-1D7DE 0061
-1D7E8 0061
-1D7F2 0061
-1D7FC 0061
-1D7D4 0041
-1D7DE 0041
-1D7E8 0041
-1D7F2 0041
-1D7FC 0041
-2465 0061
-24FA 0061
-277B 0061
-2785 0061
-278F 0061
-2465 0041
-24FA 0041
-277B 0041
-2785 0041
-278F 0041
-2076 0061
-2076 0041
-2086 0061
-2086 0041
0666 0061
-0666 0041
06F6 0061
-06F6 0041
-10E65 0061
-10E65 0041
07C6 0061
-07C6 0041
-136E 0061
-136E 0041
-104A6 0061
-104A6 0041
096C 0061
-096C 0041
09EC 0061
-09EC 0041
0A6C 0061
-0A6C 0041
0AEC 0061
-0AEC 0041
0B6C 0061
-0B6C 0041
0BEC 0061
-0BEC 0041
0C6C 0061
-0C6C 0041
0CEC 0061
-0CEC 0041
0D6C 0061
-0D6C 0041
-ABF6 0061
-ABF6 0041
-A8D6 0061
-A8D6 0041
-194C 0061
-194C 0041
-19D6 0061
-19D6 0041
-1A86 0061
-1A86 0041
-1A96 0061
-1A96 0041
0E56 0061
-0E56 0041
0ED6 0061
-0ED6 0041
0F26 0061
-0F26 0041
-0F2F 0061
-0F2F 0041
-1C46 0061
-1C46 0041
-A906 0061
-A906 0041
1046 0061
-1046 0041
1096 0061
-1096 0041
-1113C 0061
-1113C 0041
+136E 0061
17E6 0061
-17E6 0041
17F6 0061
-17F6 0041
-AA56 0061
-AA56 0041
+1816 0061
+194C 0061
+19D6 0061
+1A86 0061
+1A96 0061
1B56 0061
-1B56 0041
-A9D6 0061
-A9D6 0041
1BB6 0061
-1BB6 0041
-1816 0061
-1816 0041
+1C46 0061
1C56 0061
-1C56 0041
-A626 0061
-A626 0041
-110F6 0061
-110F6 0041
+2185 0061
3026 0061
-3026 0041
+A626 0061
+A8D6 0061
+A906 0061
+A9D6 0061
+AA56 0061
+ABF6 0061
1010C 0061
-1010C 0041
-2185 0061
-2185 0041
+104A6 0061
+10E65 0061
+11057 0061
+1106C 0061
+110F6 0061
+1113C 0061
+111D6 0061
+116C6 0061
12404 0061
1240B 0061
12411 0061
12428 0061
12440 0061
1244E 0061
+1D365 0061
+0036 0041
+0666 0041
+06F6 0041
+07C6 0041
+096C 0041
+09EC 0041
+0A6C 0041
+0AEC 0041
+0B6C 0041
+0BEC 0041
+0C6C 0041
+0CEC 0041
+0D6C 0041
+0E56 0041
+0ED6 0041
+0F26 0041
+1046 0041
+1096 0041
+136E 0041
+17E6 0041
+17F6 0041
+1816 0041
+194C 0041
+19D6 0041
+1A86 0041
+1A96 0041
+1B56 0041
+1BB6 0041
+1C46 0041
+1C56 0041
+2185 0041
+3026 0041
+A626 0041
+A8D6 0041
+A906 0041
+A9D6 0041
+AA56 0041
+ABF6 0041
+1010C 0041
+104A6 0041
+10E65 0041
+11057 0041
+1106C 0041
+110F6 0041
+1113C 0041
+111D6 0041
+116C6 0041
12404 0041
1240B 0041
12411 0041
12428 0041
12440 0041
1244E 0041
-111D6 0061
-111D6 0041
-116C6 0061
-116C6 0041
-1106C 0061
-1106C 0041
-11057 0061
-11057 0041
-1D365 0061
1D365 0041
+FF16 0061
+FF16 0041
+2479 0061
+1F107 0061
+248D 0061
+0F2F 0061
+2479 0041
+1F107 0041
+248D 0041
+0F2F 0041
+1D7D4 0061
+1D7DE 0061
+1D7E8 0061
+1D7F2 0061
+1D7FC 0061
+1D7D4 0041
+1D7DE 0041
+1D7E8 0041
+1D7F2 0041
+1D7FC 0041
+2465 0061
+24FA 0061
+277B 0061
+2785 0061
+278F 0061
+2465 0041
+24FA 0041
+277B 0041
+2785 0041
+278F 0041
+2076 0061
+2076 0041
+2086 0061
+2086 0041
0036 0062
-FF16 0062
-2479 0062
-1F107 0062
-248D 0062
-1D7D4 0062
-1D7DE 0062
-1D7E8 0062
-1D7F2 0062
-1D7FC 0062
-2465 0062
-24FA 0062
-277B 0062
-2785 0062
-278F 0062
-2076 0062
-2086 0062
0666 0062
06F6 0062
-10E65 0062
07C6 0062
-136E 0062
-104A6 0062
096C 0062
09EC 0062
0A6C 0062
0C6C 0062
0CEC 0062
0D6C 0062
-ABF6 0062
-A8D6 0062
-194C 0062
-19D6 0062
-1A86 0062
-1A96 0062
0E56 0062
0ED6 0062
0F26 0062
-0F2F 0062
-1C46 0062
-A906 0062
1046 0062
1096 0062
-1113C 0062
+136E 0062
17E6 0062
17F6 0062
-AA56 0062
+1816 0062
+194C 0062
+19D6 0062
+1A86 0062
+1A96 0062
1B56 0062
-A9D6 0062
1BB6 0062
-1816 0062
+1C46 0062
1C56 0062
-A626 0062
-110F6 0062
+2185 0062
3026 0062
+A626 0062
+A8D6 0062
+A906 0062
+A9D6 0062
+AA56 0062
+ABF6 0062
1010C 0062
-2185 0062
+104A6 0062
+10E65 0062
+11057 0062
+1106C 0062
+110F6 0062
+1113C 0062
+111D6 0062
+116C6 0062
12404 0062
1240B 0062
12411 0062
12428 0062
12440 0062
1244E 0062
-111D6 0062
-116C6 0062
-1106C 0062
-11057 0062
1D365 0062
+FF16 0062
+2479 0062
+1F107 0062
+248D 0062
+0F2F 0062
+1D7D4 0062
+1D7DE 0062
+1D7E8 0062
+1D7F2 0062
+1D7FC 0062
+2465 0062
+24FA 0062
+277B 0062
+2785 0062
+278F 0062
+2076 0062
+2086 0062
33E5 0021
33E5 003F
33E5 0061
335E 0041
335E 0062
0037 0021
-0037 003F
-FF17 0021
-FF17 003F
-247A 0021
-247A 003F
-1F108 0334
-1F108 0021
-1F108 003F
-248E 0021
-248E 003F
-1D7D5 0021
-1D7DF 0021
-1D7E9 0021
-1D7F3 0021
-1D7FD 0021
-1D7D5 003F
-1D7DF 003F
-1D7E9 003F
-1D7F3 003F
-1D7FD 003F
-2466 0021
-24FB 0021
-277C 0021
-2786 0021
-2790 0021
-2466 003F
-24FB 003F
-277C 003F
-2786 003F
-2790 003F
-2077 0021
-2077 003F
-2087 0021
-2087 003F
-1D7D5 0334
-1D7DF 0334
-1D7E9 0334
-1D7F3 0334
-1D7FD 0334
0667 0021
-0667 003F
06F7 0021
-06F7 003F
-10E66 0021
-10E66 003F
-10E66 0334
07C7 0021
-07C7 003F
-136F 0021
-136F 003F
-104A7 0021
-104A7 003F
-104A7 0334
096D 0021
-096D 003F
09ED 0021
-09ED 003F
0A6D 0021
-0A6D 003F
0AED 0021
-0AED 003F
0B6D 0021
-0B6D 003F
0BED 0021
-0BED 003F
0C6D 0021
-0C6D 003F
0CED 0021
-0CED 003F
0D6D 0021
-0D6D 003F
-ABF7 0021
-ABF7 003F
-A8D7 0021
-A8D7 003F
-194D 0021
-194D 003F
-19D7 0021
-19D7 003F
-1A87 0021
-1A87 003F
-1A97 0021
-1A97 003F
0E57 0021
-0E57 003F
0ED7 0021
-0ED7 003F
0F27 0021
-0F27 003F
-0F30 0021
-0F30 003F
-1C47 0021
-1C47 003F
-A907 0021
-A907 003F
1047 0021
-1047 003F
1097 0021
-1097 003F
-1113D 0021
-1113D 003F
-1113D 0334
+136F 0021
17E7 0021
-17E7 003F
17F7 0021
-17F7 003F
-AA57 0021
-AA57 003F
+1817 0021
+194D 0021
+19D7 0021
+1A87 0021
+1A97 0021
1B57 0021
-1B57 003F
-A9D7 0021
-A9D7 003F
1BB7 0021
-1BB7 003F
-1817 0021
-1817 003F
+1C47 0021
1C57 0021
-1C57 003F
-A627 0021
-A627 003F
-110F7 0021
-110F7 003F
-110F7 0334
3027 0021
-3027 003F
+A627 0021
+A8D7 0021
+A907 0021
+A9D7 0021
+AA57 0021
+ABF7 0021
1010D 0021
-1010D 003F
-1010D 0334
+104A7 0021
+10E66 0021
+11058 0021
+1106D 0021
+110F7 0021
+1113D 0021
+111D7 0021
+116C7 0021
12405 0021
1240C 0021
12412 0021
12441 0021
12442 0021
12443 0021
+1D366 0021
+0037 003F
+0667 003F
+06F7 003F
+07C7 003F
+096D 003F
+09ED 003F
+0A6D 003F
+0AED 003F
+0B6D 003F
+0BED 003F
+0C6D 003F
+0CED 003F
+0D6D 003F
+0E57 003F
+0ED7 003F
+0F27 003F
+1047 003F
+1097 003F
+136F 003F
+17E7 003F
+17F7 003F
+1817 003F
+194D 003F
+19D7 003F
+1A87 003F
+1A97 003F
+1B57 003F
+1BB7 003F
+1C47 003F
+1C57 003F
+3027 003F
+A627 003F
+A8D7 003F
+A907 003F
+A9D7 003F
+AA57 003F
+ABF7 003F
+1010D 003F
+104A7 003F
+10E66 003F
+11058 003F
+1106D 003F
+110F7 003F
+1113D 003F
+111D7 003F
+116C7 003F
12405 003F
1240C 003F
12412 003F
12441 003F
12442 003F
12443 003F
+1D366 003F
+FF17 0021
+FF17 003F
+247A 0021
+247A 003F
+1F108 0334
+1F108 0021
+1F108 003F
+0F30 0021
+0F30 003F
+248E 0021
+248E 003F
+1D7D5 0021
+1D7DF 0021
+1D7E9 0021
+1D7F3 0021
+1D7FD 0021
+1D7D5 003F
+1D7DF 003F
+1D7E9 003F
+1D7F3 003F
+1D7FD 003F
+2466 0021
+24FB 0021
+277C 0021
+2786 0021
+2790 0021
+2466 003F
+24FB 003F
+277C 003F
+2786 003F
+2790 003F
+2077 0021
+2077 003F
+2087 0021
+2087 003F
+1010D 0334
+104A7 0334
+10E66 0334
+11058 0334
+1106D 0334
+110F7 0334
+1113D 0334
+111D7 0334
+116C7 0334
12405 0334
1240C 0334
12412 0334
12441 0334
12442 0334
12443 0334
-111D7 0021
-111D7 003F
-111D7 0334
-116C7 0021
-116C7 003F
-116C7 0334
-1106D 0021
-1106D 003F
-1106D 0334
-11058 0021
-11058 003F
-11058 0334
-1D366 0021
-1D366 003F
1D366 0334
+1D7D5 0334
+1D7DF 0334
+1D7E9 0334
+1D7F3 0334
+1D7FD 0334
215E 0021
215E 003F
215E 0061
324E 0041
324E 0062
0037 0061
-0037 0041
-FF17 0061
-FF17 0041
-247A 0061
-1F108 0061
-248E 0061
-247A 0041
-1F108 0041
-248E 0041
-1D7D5 0061
-1D7DF 0061
-1D7E9 0061
-1D7F3 0061
-1D7FD 0061
-1D7D5 0041
-1D7DF 0041
-1D7E9 0041
-1D7F3 0041
-1D7FD 0041
-2466 0061
-24FB 0061
-277C 0061
-2786 0061
-2790 0061
-2466 0041
-24FB 0041
-277C 0041
-2786 0041
-2790 0041
-2077 0061
-2077 0041
-2087 0061
-2087 0041
0667 0061
-0667 0041
06F7 0061
-06F7 0041
-10E66 0061
-10E66 0041
07C7 0061
-07C7 0041
-136F 0061
-136F 0041
-104A7 0061
-104A7 0041
096D 0061
-096D 0041
09ED 0061
-09ED 0041
0A6D 0061
-0A6D 0041
0AED 0061
-0AED 0041
0B6D 0061
-0B6D 0041
0BED 0061
-0BED 0041
0C6D 0061
-0C6D 0041
0CED 0061
-0CED 0041
0D6D 0061
-0D6D 0041
-ABF7 0061
-ABF7 0041
-A8D7 0061
-A8D7 0041
-194D 0061
-194D 0041
-19D7 0061
-19D7 0041
-1A87 0061
-1A87 0041
-1A97 0061
-1A97 0041
0E57 0061
-0E57 0041
0ED7 0061
-0ED7 0041
0F27 0061
-0F27 0041
-0F30 0061
-0F30 0041
-1C47 0061
-1C47 0041
-A907 0061
-A907 0041
1047 0061
-1047 0041
1097 0061
-1097 0041
-1113D 0061
-1113D 0041
+136F 0061
17E7 0061
-17E7 0041
17F7 0061
-17F7 0041
-AA57 0061
-AA57 0041
+1817 0061
+194D 0061
+19D7 0061
+1A87 0061
+1A97 0061
1B57 0061
-1B57 0041
-A9D7 0061
-A9D7 0041
1BB7 0061
-1BB7 0041
-1817 0061
-1817 0041
+1C47 0061
1C57 0061
-1C57 0041
-A627 0061
-A627 0041
-110F7 0061
-110F7 0041
3027 0061
-3027 0041
+A627 0061
+A8D7 0061
+A907 0061
+A9D7 0061
+AA57 0061
+ABF7 0061
1010D 0061
-1010D 0041
+104A7 0061
+10E66 0061
+11058 0061
+1106D 0061
+110F7 0061
+1113D 0061
+111D7 0061
+116C7 0061
12405 0061
1240C 0061
12412 0061
12441 0061
12442 0061
12443 0061
+1D366 0061
+0037 0041
+0667 0041
+06F7 0041
+07C7 0041
+096D 0041
+09ED 0041
+0A6D 0041
+0AED 0041
+0B6D 0041
+0BED 0041
+0C6D 0041
+0CED 0041
+0D6D 0041
+0E57 0041
+0ED7 0041
+0F27 0041
+1047 0041
+1097 0041
+136F 0041
+17E7 0041
+17F7 0041
+1817 0041
+194D 0041
+19D7 0041
+1A87 0041
+1A97 0041
+1B57 0041
+1BB7 0041
+1C47 0041
+1C57 0041
+3027 0041
+A627 0041
+A8D7 0041
+A907 0041
+A9D7 0041
+AA57 0041
+ABF7 0041
+1010D 0041
+104A7 0041
+10E66 0041
+11058 0041
+1106D 0041
+110F7 0041
+1113D 0041
+111D7 0041
+116C7 0041
12405 0041
1240C 0041
12412 0041
12441 0041
12442 0041
12443 0041
-111D7 0061
-111D7 0041
-116C7 0061
-116C7 0041
-1106D 0061
-1106D 0041
-11058 0061
-11058 0041
-1D366 0061
1D366 0041
+FF17 0061
+FF17 0041
+247A 0061
+1F108 0061
+248E 0061
+0F30 0061
+247A 0041
+1F108 0041
+248E 0041
+0F30 0041
+1D7D5 0061
+1D7DF 0061
+1D7E9 0061
+1D7F3 0061
+1D7FD 0061
+1D7D5 0041
+1D7DF 0041
+1D7E9 0041
+1D7F3 0041
+1D7FD 0041
+2466 0061
+24FB 0061
+277C 0061
+2786 0061
+2790 0061
+2466 0041
+24FB 0041
+277C 0041
+2786 0041
+2790 0041
+2077 0061
+2077 0041
+2087 0061
+2087 0041
0037 0062
-FF17 0062
-247A 0062
-1F108 0062
-248E 0062
-1D7D5 0062
-1D7DF 0062
-1D7E9 0062
-1D7F3 0062
-1D7FD 0062
-2466 0062
-24FB 0062
-277C 0062
-2786 0062
-2790 0062
-2077 0062
-2087 0062
0667 0062
06F7 0062
-10E66 0062
07C7 0062
-136F 0062
-104A7 0062
096D 0062
09ED 0062
0A6D 0062
0C6D 0062
0CED 0062
0D6D 0062
-ABF7 0062
-A8D7 0062
-194D 0062
-19D7 0062
-1A87 0062
-1A97 0062
0E57 0062
0ED7 0062
0F27 0062
-0F30 0062
-1C47 0062
-A907 0062
1047 0062
1097 0062
-1113D 0062
+136F 0062
17E7 0062
17F7 0062
-AA57 0062
+1817 0062
+194D 0062
+19D7 0062
+1A87 0062
+1A97 0062
1B57 0062
-A9D7 0062
1BB7 0062
-1817 0062
+1C47 0062
1C57 0062
-A627 0062
-110F7 0062
3027 0062
+A627 0062
+A8D7 0062
+A907 0062
+A9D7 0062
+AA57 0062
+ABF7 0062
1010D 0062
+104A7 0062
+10E66 0062
+11058 0062
+1106D 0062
+110F7 0062
+1113D 0062
+111D7 0062
+116C7 0062
12405 0062
1240C 0062
12412 0062
12441 0062
12442 0062
12443 0062
-111D7 0062
-116C7 0062
-1106D 0062
-11058 0062
1D366 0062
+FF17 0062
+247A 0062
+1F108 0062
+248E 0062
+0F30 0062
+1D7D5 0062
+1D7DF 0062
+1D7E9 0062
+1D7F3 0062
+1D7FD 0062
+2466 0062
+24FB 0062
+277C 0062
+2786 0062
+2790 0062
+2077 0062
+2087 0062
33E6 0021
33E6 003F
33E6 0061
335F 0041
335F 0062
0038 0021
-0038 003F
-FF18 0021
-FF18 003F
-247B 0021
-247B 003F
-1F109 0334
-1F109 0021
-1F109 003F
-248F 0021
-248F 003F
-1D7D6 0021
-1D7E0 0021
-1D7EA 0021
-1D7F4 0021
-1D7FE 0021
-1D7D6 003F
-1D7E0 003F
-1D7EA 003F
-1D7F4 003F
-1D7FE 003F
-2467 0021
-24FC 0021
-277D 0021
-2787 0021
-2791 0021
-2467 003F
-24FC 003F
-277D 003F
-2787 003F
-2791 003F
-2078 0021
-2078 003F
-2088 0021
-2088 003F
-1D7D6 0334
-1D7E0 0334
-1D7EA 0334
-1D7F4 0334
-1D7FE 0334
0668 0021
-0668 003F
06F8 0021
-06F8 003F
-10E67 0021
-10E67 003F
-10E67 0334
07C8 0021
-07C8 003F
-1370 0021
-1370 003F
-104A8 0021
-104A8 003F
-104A8 0334
096E 0021
-096E 003F
09EE 0021
-09EE 003F
0A6E 0021
-0A6E 003F
0AEE 0021
-0AEE 003F
0B6E 0021
-0B6E 003F
0BEE 0021
-0BEE 003F
0C6E 0021
-0C6E 003F
0CEE 0021
-0CEE 003F
0D6E 0021
-0D6E 003F
-ABF8 0021
-ABF8 003F
-A8D8 0021
-A8D8 003F
-194E 0021
-194E 003F
-19D8 0021
-19D8 003F
-1A88 0021
-1A88 003F
-1A98 0021
-1A98 003F
0E58 0021
-0E58 003F
0ED8 0021
-0ED8 003F
0F28 0021
-0F28 003F
-0F31 0021
-0F31 003F
-1C48 0021
-1C48 003F
-A908 0021
-A908 003F
1048 0021
-1048 003F
1098 0021
-1098 003F
-1113E 0021
-1113E 003F
-1113E 0334
+1370 0021
17E8 0021
-17E8 003F
17F8 0021
-17F8 003F
-AA58 0021
-AA58 003F
+1818 0021
+194E 0021
+19D8 0021
+1A88 0021
+1A98 0021
1B58 0021
-1B58 003F
-A9D8 0021
-A9D8 003F
1BB8 0021
-1BB8 003F
-1818 0021
-1818 003F
+1C48 0021
1C58 0021
-1C58 003F
-A628 0021
-A628 003F
-110F8 0021
-110F8 003F
-110F8 0334
3028 0021
-3028 003F
+A628 0021
+A8D8 0021
+A908 0021
+A9D8 0021
+AA58 0021
+ABF8 0021
1010E 0021
-1010E 003F
-1010E 0334
+104A8 0021
+10E67 0021
+11059 0021
+1106E 0021
+110F8 0021
+1113E 0021
+111D8 0021
+116C8 0021
12406 0021
1240D 0021
12413 0021
1242A 0021
12444 0021
12445 0021
+1D367 0021
+0038 003F
+0668 003F
+06F8 003F
+07C8 003F
+096E 003F
+09EE 003F
+0A6E 003F
+0AEE 003F
+0B6E 003F
+0BEE 003F
+0C6E 003F
+0CEE 003F
+0D6E 003F
+0E58 003F
+0ED8 003F
+0F28 003F
+1048 003F
+1098 003F
+1370 003F
+17E8 003F
+17F8 003F
+1818 003F
+194E 003F
+19D8 003F
+1A88 003F
+1A98 003F
+1B58 003F
+1BB8 003F
+1C48 003F
+1C58 003F
+3028 003F
+A628 003F
+A8D8 003F
+A908 003F
+A9D8 003F
+AA58 003F
+ABF8 003F
+1010E 003F
+104A8 003F
+10E67 003F
+11059 003F
+1106E 003F
+110F8 003F
+1113E 003F
+111D8 003F
+116C8 003F
12406 003F
1240D 003F
12413 003F
1242A 003F
12444 003F
12445 003F
+1D367 003F
+FF18 0021
+FF18 003F
+247B 0021
+247B 003F
+1F109 0334
+1F109 0021
+1F109 003F
+0F31 0021
+0F31 003F
+248F 0021
+248F 003F
+1D7D6 0021
+1D7E0 0021
+1D7EA 0021
+1D7F4 0021
+1D7FE 0021
+1D7D6 003F
+1D7E0 003F
+1D7EA 003F
+1D7F4 003F
+1D7FE 003F
+2467 0021
+24FC 0021
+277D 0021
+2787 0021
+2791 0021
+2467 003F
+24FC 003F
+277D 003F
+2787 003F
+2791 003F
+2078 0021
+2078 003F
+2088 0021
+2088 003F
+1010E 0334
+104A8 0334
+10E67 0334
+11059 0334
+1106E 0334
+110F8 0334
+1113E 0334
+111D8 0334
+116C8 0334
12406 0334
1240D 0334
12413 0334
1242A 0334
12444 0334
12445 0334
-111D8 0021
-111D8 003F
-111D8 0334
-116C8 0021
-116C8 003F
-116C8 0334
-1106E 0021
-1106E 003F
-1106E 0334
-11059 0021
-11059 003F
-11059 0334
-1D367 0021
-1D367 003F
1D367 0334
+1D7D6 0334
+1D7E0 0334
+1D7EA 0334
+1D7F4 0334
+1D7FE 0334
324F 0021
324F 003F
324F 0061
324F 0041
324F 0062
0038 0061
-0038 0041
-FF18 0061
-FF18 0041
-247B 0061
-1F109 0061
-248F 0061
-247B 0041
-1F109 0041
-248F 0041
-1D7D6 0061
-1D7E0 0061
-1D7EA 0061
-1D7F4 0061
-1D7FE 0061
-1D7D6 0041
-1D7E0 0041
-1D7EA 0041
-1D7F4 0041
-1D7FE 0041
-2467 0061
-24FC 0061
-277D 0061
-2787 0061
-2791 0061
-2467 0041
-24FC 0041
-277D 0041
-2787 0041
-2791 0041
-2078 0061
-2078 0041
-2088 0061
-2088 0041
0668 0061
-0668 0041
06F8 0061
-06F8 0041
-10E67 0061
-10E67 0041
07C8 0061
-07C8 0041
-1370 0061
-1370 0041
-104A8 0061
-104A8 0041
096E 0061
-096E 0041
09EE 0061
-09EE 0041
0A6E 0061
-0A6E 0041
0AEE 0061
-0AEE 0041
0B6E 0061
-0B6E 0041
0BEE 0061
-0BEE 0041
0C6E 0061
-0C6E 0041
0CEE 0061
-0CEE 0041
0D6E 0061
-0D6E 0041
-ABF8 0061
-ABF8 0041
-A8D8 0061
-A8D8 0041
-194E 0061
-194E 0041
-19D8 0061
-19D8 0041
-1A88 0061
-1A88 0041
-1A98 0061
-1A98 0041
0E58 0061
-0E58 0041
0ED8 0061
-0ED8 0041
0F28 0061
-0F28 0041
-0F31 0061
-0F31 0041
-1C48 0061
-1C48 0041
-A908 0061
-A908 0041
1048 0061
-1048 0041
1098 0061
-1098 0041
-1113E 0061
-1113E 0041
+1370 0061
17E8 0061
-17E8 0041
17F8 0061
-17F8 0041
-AA58 0061
-AA58 0041
+1818 0061
+194E 0061
+19D8 0061
+1A88 0061
+1A98 0061
1B58 0061
-1B58 0041
-A9D8 0061
-A9D8 0041
1BB8 0061
-1BB8 0041
-1818 0061
-1818 0041
+1C48 0061
1C58 0061
-1C58 0041
-A628 0061
-A628 0041
-110F8 0061
-110F8 0041
3028 0061
-3028 0041
+A628 0061
+A8D8 0061
+A908 0061
+A9D8 0061
+AA58 0061
+ABF8 0061
1010E 0061
-1010E 0041
+104A8 0061
+10E67 0061
+11059 0061
+1106E 0061
+110F8 0061
+1113E 0061
+111D8 0061
+116C8 0061
12406 0061
1240D 0061
12413 0061
1242A 0061
12444 0061
12445 0061
+1D367 0061
+0038 0041
+0668 0041
+06F8 0041
+07C8 0041
+096E 0041
+09EE 0041
+0A6E 0041
+0AEE 0041
+0B6E 0041
+0BEE 0041
+0C6E 0041
+0CEE 0041
+0D6E 0041
+0E58 0041
+0ED8 0041
+0F28 0041
+1048 0041
+1098 0041
+1370 0041
+17E8 0041
+17F8 0041
+1818 0041
+194E 0041
+19D8 0041
+1A88 0041
+1A98 0041
+1B58 0041
+1BB8 0041
+1C48 0041
+1C58 0041
+3028 0041
+A628 0041
+A8D8 0041
+A908 0041
+A9D8 0041
+AA58 0041
+ABF8 0041
+1010E 0041
+104A8 0041
+10E67 0041
+11059 0041
+1106E 0041
+110F8 0041
+1113E 0041
+111D8 0041
+116C8 0041
12406 0041
1240D 0041
12413 0041
1242A 0041
12444 0041
12445 0041
-111D8 0061
-111D8 0041
-116C8 0061
-116C8 0041
-1106E 0061
-1106E 0041
-11059 0061
-11059 0041
-1D367 0061
1D367 0041
+FF18 0061
+FF18 0041
+247B 0061
+1F109 0061
+248F 0061
+0F31 0061
+247B 0041
+1F109 0041
+248F 0041
+0F31 0041
+1D7D6 0061
+1D7E0 0061
+1D7EA 0061
+1D7F4 0061
+1D7FE 0061
+1D7D6 0041
+1D7E0 0041
+1D7EA 0041
+1D7F4 0041
+1D7FE 0041
+2467 0061
+24FC 0061
+277D 0061
+2787 0061
+2791 0061
+2467 0041
+24FC 0041
+277D 0041
+2787 0041
+2791 0041
+2078 0061
+2078 0041
+2088 0061
+2088 0041
0038 0062
-FF18 0062
-247B 0062
-1F109 0062
-248F 0062
-1D7D6 0062
-1D7E0 0062
-1D7EA 0062
-1D7F4 0062
-1D7FE 0062
-2467 0062
-24FC 0062
-277D 0062
-2787 0062
-2791 0062
-2078 0062
-2088 0062
0668 0062
06F8 0062
-10E67 0062
07C8 0062
-1370 0062
-104A8 0062
096E 0062
09EE 0062
0A6E 0062
0C6E 0062
0CEE 0062
0D6E 0062
-ABF8 0062
-A8D8 0062
-194E 0062
-19D8 0062
-1A88 0062
-1A98 0062
0E58 0062
0ED8 0062
0F28 0062
-0F31 0062
-1C48 0062
-A908 0062
1048 0062
1098 0062
-1113E 0062
+1370 0062
17E8 0062
17F8 0062
-AA58 0062
+1818 0062
+194E 0062
+19D8 0062
+1A88 0062
+1A98 0062
1B58 0062
-A9D8 0062
1BB8 0062
-1818 0062
+1C48 0062
1C58 0062
-A628 0062
-110F8 0062
3028 0062
+A628 0062
+A8D8 0062
+A908 0062
+A9D8 0062
+AA58 0062
+ABF8 0062
1010E 0062
+104A8 0062
+10E67 0062
+11059 0062
+1106E 0062
+110F8 0062
+1113E 0062
+111D8 0062
+116C8 0062
12406 0062
1240D 0062
12413 0062
1242A 0062
12444 0062
12445 0062
-111D8 0062
-116C8 0062
-1106E 0062
-11059 0062
1D367 0062
+FF18 0062
+247B 0062
+1F109 0062
+248F 0062
+0F31 0062
+1D7D6 0062
+1D7E0 0062
+1D7EA 0062
+1D7F4 0062
+1D7FE 0062
+2467 0062
+24FC 0062
+277D 0062
+2787 0062
+2791 0062
+2078 0062
+2088 0062
33E7 0021
33E7 003F
33E7 0061
3360 0041
3360 0062
0039 0021
-0039 003F
-FF19 0021
-FF19 003F
-247C 0021
-247C 003F
-1F10A 0334
-1F10A 0021
-1F10A 003F
-2490 0021
-2490 003F
-1D7D7 0021
-1D7E1 0021
-1D7EB 0021
-1D7F5 0021
-1D7FF 0021
-1D7D7 003F
-1D7E1 003F
-1D7EB 003F
-1D7F5 003F
-1D7FF 003F
-2468 0021
-24FD 0021
-277E 0021
-2788 0021
-2792 0021
-2468 003F
-24FD 003F
-277E 003F
-2788 003F
-2792 003F
-2079 0021
-2079 003F
-2089 0021
-2089 003F
-1D7D7 0334
-1D7E1 0334
-1D7EB 0334
-1D7F5 0334
-1D7FF 0334
0669 0021
-0669 003F
06F9 0021
-06F9 003F
-10E68 0021
-10E68 003F
-10E68 0334
07C9 0021
-07C9 003F
-1371 0021
-1371 003F
-104A9 0021
-104A9 003F
-104A9 0334
096F 0021
-096F 003F
09EF 0021
-09EF 003F
0A6F 0021
-0A6F 003F
0AEF 0021
-0AEF 003F
0B6F 0021
-0B6F 003F
0BEF 0021
-0BEF 003F
0C6F 0021
-0C6F 003F
0CEF 0021
-0CEF 003F
0D6F 0021
-0D6F 003F
-ABF9 0021
-ABF9 003F
-A8D9 0021
-A8D9 003F
-194F 0021
-194F 003F
-19D9 0021
-19D9 003F
-1A89 0021
-1A89 003F
-1A99 0021
-1A99 003F
0E59 0021
-0E59 003F
0ED9 0021
-0ED9 003F
0F29 0021
-0F29 003F
-0F32 0021
-0F32 003F
-1C49 0021
-1C49 003F
-A909 0021
-A909 003F
1049 0021
-1049 003F
1099 0021
-1099 003F
-1113F 0021
-1113F 003F
-1113F 0334
+1371 0021
17E9 0021
-17E9 003F
17F9 0021
-17F9 003F
-AA59 0021
-AA59 003F
+1819 0021
+194F 0021
+19D9 0021
+1A89 0021
+1A99 0021
1B59 0021
-1B59 003F
-A9D9 0021
-A9D9 003F
1BB9 0021
-1BB9 003F
-1819 0021
-1819 003F
+1C49 0021
1C59 0021
-1C59 003F
-A629 0021
-A629 003F
-110F9 0021
-110F9 003F
-110F9 0334
3029 0021
-3029 003F
+A629 0021
+A8D9 0021
+A909 0021
+A9D9 0021
+AA59 0021
+ABF9 0021
1010F 0021
-1010F 003F
-1010F 0334
+104A9 0021
+10E68 0021
+1105A 0021
+1106F 0021
+110F9 0021
+1113F 0021
+111D9 0021
+116C9 0021
12407 0021
1240E 0021
12414 0021
12447 0021
12448 0021
12449 0021
+1D368 0021
+0039 003F
+0669 003F
+06F9 003F
+07C9 003F
+096F 003F
+09EF 003F
+0A6F 003F
+0AEF 003F
+0B6F 003F
+0BEF 003F
+0C6F 003F
+0CEF 003F
+0D6F 003F
+0E59 003F
+0ED9 003F
+0F29 003F
+1049 003F
+1099 003F
+1371 003F
+17E9 003F
+17F9 003F
+1819 003F
+194F 003F
+19D9 003F
+1A89 003F
+1A99 003F
+1B59 003F
+1BB9 003F
+1C49 003F
+1C59 003F
+3029 003F
+A629 003F
+A8D9 003F
+A909 003F
+A9D9 003F
+AA59 003F
+ABF9 003F
+1010F 003F
+104A9 003F
+10E68 003F
+1105A 003F
+1106F 003F
+110F9 003F
+1113F 003F
+111D9 003F
+116C9 003F
12407 003F
1240E 003F
12414 003F
12447 003F
12448 003F
12449 003F
+1D368 003F
+FF19 0021
+FF19 003F
+247C 0021
+247C 003F
+1F10A 0334
+1F10A 0021
+1F10A 003F
+0F32 0021
+0F32 003F
+2490 0021
+2490 003F
+1D7D7 0021
+1D7E1 0021
+1D7EB 0021
+1D7F5 0021
+1D7FF 0021
+1D7D7 003F
+1D7E1 003F
+1D7EB 003F
+1D7F5 003F
+1D7FF 003F
+2468 0021
+24FD 0021
+277E 0021
+2788 0021
+2792 0021
+2468 003F
+24FD 003F
+277E 003F
+2788 003F
+2792 003F
+2079 0021
+2079 003F
+2089 0021
+2089 003F
+1010F 0334
+104A9 0334
+10E68 0334
+1105A 0334
+1106F 0334
+110F9 0334
+1113F 0334
+111D9 0334
+116C9 0334
12407 0334
1240E 0334
12414 0334
12447 0334
12448 0334
12449 0334
-111D9 0021
-111D9 003F
-111D9 0334
-116C9 0021
-116C9 003F
-116C9 0334
-1106F 0021
-1106F 003F
-1106F 0334
-1105A 0021
-1105A 003F
-1105A 0334
-1D368 0021
-1D368 003F
1D368 0334
+1D7D7 0334
+1D7E1 0334
+1D7EB 0334
+1D7F5 0334
+1D7FF 0334
0039 0061
+0669 0061
+06F9 0061
+07C9 0061
+096F 0061
+09EF 0061
+0A6F 0061
+0AEF 0061
+0B6F 0061
+0BEF 0061
+0C6F 0061
+0CEF 0061
+0D6F 0061
+0E59 0061
+0ED9 0061
+0F29 0061
+1049 0061
+1099 0061
+1371 0061
+17E9 0061
+17F9 0061
+1819 0061
+194F 0061
+19D9 0061
+1A89 0061
+1A99 0061
+1B59 0061
+1BB9 0061
+1C49 0061
+1C59 0061
+3029 0061
+A629 0061
+A8D9 0061
+A909 0061
+A9D9 0061
+AA59 0061
+ABF9 0061
+1010F 0061
+104A9 0061
+10E68 0061
+1105A 0061
+1106F 0061
+110F9 0061
+1113F 0061
+111D9 0061
+116C9 0061
+12407 0061
+1240E 0061
+12414 0061
+1241D 0061
+1242B 0061
+12446 0061
+12447 0061
+12448 0061
+12449 0061
+1D368 0061
0039 0041
+0669 0041
+06F9 0041
+07C9 0041
+096F 0041
+09EF 0041
+0A6F 0041
+0AEF 0041
+0B6F 0041
+0BEF 0041
+0C6F 0041
+0CEF 0041
+0D6F 0041
+0E59 0041
+0ED9 0041
+0F29 0041
+1049 0041
+1099 0041
+1371 0041
+17E9 0041
+17F9 0041
+1819 0041
+194F 0041
+19D9 0041
+1A89 0041
+1A99 0041
+1B59 0041
+1BB9 0041
+1C49 0041
+1C59 0041
+3029 0041
+A629 0041
+A8D9 0041
+A909 0041
+A9D9 0041
+AA59 0041
+ABF9 0041
+1010F 0041
+104A9 0041
+10E68 0041
+1105A 0041
+1106F 0041
+110F9 0041
+1113F 0041
+111D9 0041
+116C9 0041
+12407 0041
+1240E 0041
+12414 0041
+1241D 0041
+1242B 0041
+12446 0041
+12447 0041
+12448 0041
+12449 0041
+1D368 0041
FF19 0061
FF19 0041
247C 0061
1F10A 0061
2490 0061
+0F32 0061
247C 0041
1F10A 0041
2490 0041
+0F32 0041
1D7D7 0061
1D7E1 0061
1D7EB 0061
2079 0041
2089 0061
2089 0041
-0669 0061
-0669 0041
-06F9 0061
-06F9 0041
-10E68 0061
-10E68 0041
-07C9 0061
-07C9 0041
-1371 0061
-1371 0041
-104A9 0061
-104A9 0041
-096F 0061
-096F 0041
-09EF 0061
-09EF 0041
-0A6F 0061
-0A6F 0041
-0AEF 0061
-0AEF 0041
-0B6F 0061
-0B6F 0041
-0BEF 0061
-0BEF 0041
-0C6F 0061
-0C6F 0041
-0CEF 0061
-0CEF 0041
-0D6F 0061
-0D6F 0041
-ABF9 0061
-ABF9 0041
-A8D9 0061
-A8D9 0041
-194F 0061
-194F 0041
-19D9 0061
-19D9 0041
-1A89 0061
-1A89 0041
-1A99 0061
-1A99 0041
-0E59 0061
-0E59 0041
-0ED9 0061
-0ED9 0041
-0F29 0061
-0F29 0041
-0F32 0061
-0F32 0041
-1C49 0061
-1C49 0041
-A909 0061
-A909 0041
-1049 0061
-1049 0041
-1099 0061
-1099 0041
-1113F 0061
-1113F 0041
-17E9 0061
-17E9 0041
-17F9 0061
-17F9 0041
-AA59 0061
-AA59 0041
-1B59 0061
-1B59 0041
-A9D9 0061
-A9D9 0041
-1BB9 0061
-1BB9 0041
-1819 0061
-1819 0041
-1C59 0061
-1C59 0041
-A629 0061
-A629 0041
-110F9 0061
-110F9 0041
-3029 0061
-3029 0041
-1010F 0061
-1010F 0041
-12407 0061
-1240E 0061
-12414 0061
-1241D 0061
-1242B 0061
-12446 0061
-12447 0061
-12448 0061
-12449 0061
-12407 0041
-1240E 0041
-12414 0041
-1241D 0041
-1242B 0041
-12446 0041
-12447 0041
-12448 0041
-12449 0041
-111D9 0061
-111D9 0041
-116C9 0061
-116C9 0041
-1106F 0061
-1106F 0041
-1105A 0061
-1105A 0041
-1D368 0061
-1D368 0041
0039 0062
-FF19 0062
-247C 0062
-1F10A 0062
-2490 0062
-1D7D7 0062
-1D7E1 0062
-1D7EB 0062
-1D7F5 0062
-1D7FF 0062
-2468 0062
-24FD 0062
-277E 0062
-2788 0062
-2792 0062
-2079 0062
-2089 0062
0669 0062
06F9 0062
-10E68 0062
07C9 0062
-1371 0062
-104A9 0062
096F 0062
09EF 0062
0A6F 0062
0C6F 0062
0CEF 0062
0D6F 0062
-ABF9 0062
-A8D9 0062
-194F 0062
-19D9 0062
-1A89 0062
-1A99 0062
0E59 0062
0ED9 0062
0F29 0062
-0F32 0062
-1C49 0062
-A909 0062
1049 0062
1099 0062
-1113F 0062
+1371 0062
17E9 0062
17F9 0062
-AA59 0062
+1819 0062
+194F 0062
+19D9 0062
+1A89 0062
+1A99 0062
1B59 0062
-A9D9 0062
1BB9 0062
-1819 0062
+1C49 0062
1C59 0062
-A629 0062
-110F9 0062
3029 0062
+A629 0062
+A8D9 0062
+A909 0062
+A9D9 0062
+AA59 0062
+ABF9 0062
1010F 0062
+104A9 0062
+10E68 0062
+1105A 0062
+1106F 0062
+110F9 0062
+1113F 0062
+111D9 0062
+116C9 0062
12407 0062
1240E 0062
12414 0062
12447 0062
12448 0062
12449 0062
-111D9 0062
-116C9 0062
-1106F 0062
-1105A 0062
1D368 0062
+FF19 0062
+247C 0062
+1F10A 0062
+2490 0062
+0F32 0062
+1D7D7 0062
+1D7E1 0062
+1D7EB 0062
+1D7F5 0062
+1D7FF 0062
+2468 0062
+24FD 0062
+277E 0062
+2788 0062
+2792 0062
+2079 0062
+2089 0062
33E8 0021
33E8 003F
33E8 0061
000C 0061
000D 0061
0085 0061
-180E 0061
2028 0061
2029 0061
0020 0061
169C 0061
2045 0061
2046 0061
+2308 0061
+2309 0061
+230A 0061
+230B 0061
29FC 0061
29FD 0061
2983 0061
0618 0061
0619 0061
061A 0061
+061C 0061
0640 0061
06D6 0061
06D7 0061
180B 0061
180C 0061
180D 0061
+180E 0061
1A7F 0061
1B6B 0061
1B6C 0061
2062 0061
2063 0061
2064 0061
+2066 0061
+2067 0061
+2068 0061
+2069 0061
206A 0061
206B 0061
206C 0061
000C 0041
000D 0041
0085 0041
-180E 0041
2028 0041
2029 0041
0020 0041
169C 0041
2045 0041
2046 0041
+2308 0041
+2309 0041
+230A 0041
+230B 0041
29FC 0041
29FD 0041
2983 0041
0618 0041
0619 0041
061A 0041
+061C 0041
0640 0041
06D6 0041
06D7 0041
180B 0041
180C 0041
180D 0041
+180E 0041
1A7F 0041
1B6B 0041
1B6C 0041
2062 0041
2063 0041
2064 0041
+2066 0041
+2067 0041
+2068 0041
+2069 0041
206A 0041
206B 0041
206C 0041
20E2 0041
20E3 0041
20E4 0041
+3099 0061
+3099 0041
+FF9E 0061
+FF9E 0041
+309A 0061
+309A 0041
+FF9F 0061
+FF9F 0041
+0335 0061
+0335 0041
0305 0061
0305 0041
0309 0061
0334 0041
0334 1DD3
1DD3 0334
-0335 0061
-0335 0041
0339 0061
0339 0041
0345 0061
302E 0041
302F 0061
302F 0041
-3099 0061
-3099 0041
-FF9E 0061
-FF9E 0041
-309A 0061
-309A 0041
-FF9F 0061
-FF9F 0041
20D0 0061
20D0 0041
20D1 0061
000C 0062
000D 0062
0085 0062
-180E 0062
2028 0062
2029 0062
0020 0062
169C 0062
2045 0062
2046 0062
+2308 0062
+2309 0062
+230A 0062
+230B 0062
29FC 0062
29FD 0062
2983 0062
0618 0062
0619 0062
061A 0062
+061C 0062
0640 0062
06D6 0062
06D7 0062
180B 0062
180C 0062
180D 0062
+180E 0062
1A7F 0062
1B6B 0062
1B6C 0062
2062 0062
2063 0062
2064 0062
+2066 0062
+2067 0062
+2068 0062
+2069 0062
206A 0062
206B 0062
206C 0062
20E2 0062
20E3 0062
20E4 0062
+3099 0062
+FF9E 0062
+309A 0062
+FF9F 0062
+0335 0062
0305 0062
0309 0062
030F 0062
0330 0062
0331 0062
0334 0062
-0335 0062
0339 0062
0345 0062
0358 0062
302D 0062
302E 0062
302F 0062
-3099 0062
-FF9E 0062
-309A 0062
-FF9F 0062
20D0 0062
20D1 0062
20D2 0062
1E11 003F
1E10 0021
1E10 003F
+0111 0021
+0111 003F
+0110 0021
+0110 003F
1E0D 0021
1E0D 003F
1E0C 0021
0044 0331 0334
0044 0334 0331
1E0E 0334
-0111 0021
-0111 003F
-0110 0021
-0110 003F
00F0 0021
1DD9 0021
00F0 003F
1E11 0041
1E10 0061
1E10 0041
+0111 0061
+0111 0041
+0110 0061
+0110 0041
1E0D 0061
1E0D 0041
1E0C 0061
1E0F 0041
1E0E 0061
1E0E 0041
-0111 0061
-0111 0041
-0110 0061
-0110 0041
00F0 0061
1DD9 0061
00F0 0041
1E0A 0062
1E11 0062
1E10 0062
+0111 0062
+0110 0062
1E0D 0062
1E0C 0062
1E13 0062
1E12 0062
1E0F 0062
1E0E 0062
-0111 0062
-0110 0062
00F0 0062
1DD9 0062
00D0 0062
1E29 003F
1E28 0021
1E28 003F
+0127 0021
+210F 0021
+0127 003F
+210F 003F
+0126 0021
+0126 003F
+A7F8 0021
+A7F8 003F
1E25 0021
1E25 003F
1E24 0021
0068 0331 0334
0068 0334 0331
1E96 0334
-0127 0021
-210F 0021
-0127 003F
-210F 003F
-0126 0021
-0126 003F
-A7F8 0021
-A7F8 003F
0334 036A
036A 0334
0068 0061
1E29 0041
1E28 0061
1E28 0041
+0127 0061
+210F 0061
+0127 0041
+210F 0041
+0126 0061
+0126 0041
+A7F8 0061
+A7F8 0041
1E25 0061
1E25 0041
1E24 0061
1E2A 0041
1E96 0061
1E96 0041
-0127 0061
-210F 0061
-0127 0041
-210F 0041
-0126 0061
-0126 0041
-A7F8 0061
-A7F8 0041
33CA 0061
33CA 0041
33CA 0062
1E22 0062
1E29 0062
1E28 0062
+0127 0062
+210F 0062
+0126 0062
+A7F8 0062
1E25 0062
1E24 0062
1E2B 0062
1E2A 0062
1E96 0062
-0127 0062
-210F 0062
-0126 0062
-A7F8 0062
32CC 0021
32CC 003F
32CC 0061
0069 0308 0301 0334
0069 0308 0334 0341
00EF 0301 0334
-1E2F 0334
+00EF 0334 0341
0049 0308 0334 0301
0049 0308 0341 0334
00CF 0334 0301
013C 003F
013B 0021
013B 003F
+0142 0021
+0142 003F
+0141 0021
+0141 003F
1E37 0021
1E37 003F
1E36 0021
004C 0331 0334
004C 0334 0331
1E3A 0334
-0142 0021
-0142 003F
-0141 0021
-0141 003F
006C 00B7 0021
006C 0387 0021
0140 0021
013C 0041
013B 0061
013B 0041
+0142 0061
+0142 0041
+0141 0061
+0141 0041
1E37 0061
1E37 0041
1E36 0061
1E3B 0041
1E3A 0061
1E3A 0041
-0142 0061
-0142 0041
-0141 0061
-0141 0041
006C 00B7 0061
006C 0387 0061
0140 0061
013D 0062
013C 0062
013B 0062
+0142 0062
+0141 0062
1E37 0062
1E36 0062
1E39 0062
1E3C 0062
1E3B 0062
1E3A 0062
-0142 0062
-0141 0062
006C 00B7 0062
006C 0387 0062
0140 0062
0075 0334 0344
0075 0344 0334
01D8 0334
-0055 0308 0341 0334
+0055 0308 0301 0334
0055 0334 0308 0301
-0055 0334 0308 0341
00DC 0301 0334
+00DC 0334 0301
0075 0308 0340 0334
0075 0334 0308 0340
00FC 0300 0334
0391 0334 0313 0340
1F08 0300 0334
1F0A 0334
-03B1 0313 0300 0334 0345
03B1 0343 0300 0345 0334
03B1 0343 0345 0334 0340
03B1 0345 0313 0300 0334
+1F00 0345 0340 0334
0391 0343 0334 0345 0340
0391 0345 0313 0334 0340
1F08 0300 0345 0334
1F09 0300 0334
1F09 0340 0334
03B1 0314 0334 0300 0345
-03B1 0334 0314 0345 0300
-03B1 0345 0314 0300 0334
-03B1 0345 0334 0314 0340
+03B1 0314 0345 0340 0334
+1F81 0340 0334
+1F83 0334
0391 0334 0345 0314 0300
1F09 0334 0345 0300
-1F09 0345 0300 0334
1F0B 0345 0334
+1FBC 0314 0300 0334
03B1 0314 0334 0342
03B1 0314 0342 0334
03B1 0334 0314 0342
0397 0334 0343
1F28 0334
03B7 0334 0313 0341
-03B7 0334 0343 0301
03B7 0334 0343 0341
-1F20 0334 0341
+03B7 0343 0341 0334
+1F20 0341 0334
0397 0313 0334 0301
0397 0313 0341 0334
0397 0334 0313 0301
0397 0343 0300 0334
03B7 0313 0345 0334 0300
03B7 0313 0345 0340 0334
-03B7 0343 0340 0334 0345
03B7 0343 0345 0340 0334
+1FC3 0334 0313 0300
0397 0313 0334 0300 0345
0397 0343 0334 0345 0340
0397 0343 0340 0345 0334
1F29 0334 0301
1F2D 0334
03B7 0314 0301 0345 0334
+03B7 0314 0334 0345 0301
03B7 0314 0334 0345 0341
-1F21 0334 0341 0345
-1F21 0345 0301 0334
+03B7 0314 0345 0334 0301
0397 0314 0334 0345 0341
0397 0334 0314 0345 0301
1F29 0334 0301 0345
03C5 0334 0343 0341
1F50 0334 0301
1F50 0334 0341
-03C5 0334 0313 0340
+03C5 0313 0340 0334
03C5 0334 0343 0340
1F50 0334 0340
1F52 0334
03D2 0308 0334
03D2 0334 0308
03D4 0334
+03B0 0334
03C5 0308 0341 0334
03C5 0344 0334
-03CB 0334 0341
03CB 0341 0334
03C5 0308 0300 0334
03C5 0308 0340 0334
2126 0334 0314 0301
2126 0334 0314 0341
03C9 0314 0334 0345 0301
-1F61 0334 0301 0345
-1F61 0341 0345 0334
+03C9 0345 0314 0334 0301
+1F61 0301 0345 0334
1F65 0345 0334
03A9 0314 0301 0345 0334
03A9 0345 0334 0314 0301
2126 0314 0334 0340
2126 0334 0314 0340
03C9 0314 0334 0345 0300
-03C9 0314 0340 0334 0345
03C9 0314 0345 0300 0334
-03C9 0334 0314 0340 0345
+03C9 0345 0314 0300 0334
+03C9 0345 0314 0334 0300
03A9 0314 0300 0345 0334
03A9 0345 0334 0314 0300
1F6B 0345 0334
FF73 003F
32D2 0021
32D2 003F
+3094 0021
+3094 003F
+30F4 0021
+30F4 003F
3046 0334 3099
3046 3099 0334
3094 0334
30A6 0334 3099
30A6 3099 0334
30F4 0334
-3094 0021
-3094 003F
-30F4 0021
-30F4 003F
3045 0061
3045 0041
3046 0061
FF76 003F
32D5 0021
32D5 003F
+304C 0021
+304C 003F
+30AC 0021
+30AC 003F
304B 0334 3099
304B 3099 0334
304C 0334
30AB 0334 3099
30AB 3099 0334
30AC 0334
-304C 0021
-304C 003F
-30AC 0021
-30AC 003F
3095 0061
3095 0041
304B 0061
FF77 003F
32D6 0021
32D6 003F
+304E 0021
+304E 003F
+30AE 0021
+30AE 003F
304D 0334 3099
304D 3099 0334
304E 0334
30AD 0334 3099
30AD 3099 0334
30AE 0334
-304E 0021
-304E 003F
-30AE 0021
-30AE 003F
304D 0061
304D 0041
30AD 0061
FF78 003F
32D7 0021
32D7 003F
+3050 0021
+3050 003F
+30B0 0021
+30B0 003F
304F 0334 3099
304F 3099 0334
3050 0334
30AF 0334 3099
30AF 3099 0334
30B0 0334
-3050 0021
-3050 003F
-30B0 0021
-30B0 003F
304F 0061
304F 0041
31F0 0061
FF79 003F
32D8 0021
32D8 003F
+3052 0021
+3052 003F
+30B2 0021
+30B2 003F
3051 0334 3099
3051 3099 0334
3052 0334
30B1 0334 3099
30B1 3099 0334
30B2 0334
-3052 0021
-3052 003F
-30B2 0021
-30B2 003F
331C 0021
331C 003F
331C 0061
FF7A 003F
32D9 0021
32D9 003F
+3054 0021
+3054 003F
+30B4 0021
+30B4 003F
3053 0334 3099
3053 3099 0334
3054 0334
30B3 0334 3099
30B3 3099 0334
30B4 0334
-3054 0021
-3054 003F
-30B4 0021
-30B4 003F
331E 0021
331E 003F
331E 0061
32DA 003F
1F202 0021
1F202 003F
+3056 0021
+3056 003F
+30B6 0021
+30B6 003F
1F202 0334
3055 0334 3099
3055 3099 0334
30B5 0334 3099
30B5 3099 0334
30B6 0334
-3056 0021
-3056 003F
-30B6 0021
-30B6 003F
3055 0061
3055 0041
30B5 0061
FF7C 003F
32DB 0021
32DB 003F
+3058 0021
+3058 003F
+30B8 0021
+30B8 003F
3057 0334 3099
3057 3099 0334
3058 0334
30B7 0334 3099
30B7 3099 0334
30B8 0334
-3058 0021
-3058 003F
-30B8 0021
-30B8 003F
3057 0061
3057 0041
31F1 0061
FF7D 003F
32DC 0021
32DC 003F
+305A 0021
+305A 003F
+30BA 0021
+30BA 003F
3059 0334 3099
3059 3099 0334
305A 0334
30B9 0334 3099
30B9 3099 0334
30BA 0334
-305A 0021
-305A 003F
-30BA 0021
-30BA 003F
3059 0061
3059 0041
31F2 0061
FF7E 003F
32DD 0021
32DD 003F
+305C 0021
+305C 003F
+30BC 0021
+30BC 003F
305B 0334 3099
305B 3099 0334
305C 0334
30BB 0334 3099
30BB 3099 0334
30BC 0334
-305C 0021
-305C 003F
-30BC 0021
-30BC 003F
305B 0061
305B 0041
30BB 0061
FF7F 003F
32DE 0021
32DE 003F
+305E 0021
+305E 003F
+30BE 0021
+30BE 003F
305D 0334 3099
305D 3099 0334
305E 0334
30BD 0334 3099
30BD 3099 0334
30BE 0334
-305E 0021
-305E 003F
-30BE 0021
-30BE 003F
305D 0061
305D 0041
30BD 0061
FF80 003F
32DF 0021
32DF 003F
+3060 0021
+3060 003F
+30C0 0021
+30C0 003F
305F 0334 3099
305F 3099 0334
3060 0334
30BF 0334 3099
30BF 3099 0334
30C0 0334
-3060 0021
-3060 003F
-30C0 0021
-30C0 003F
3324 0021
3324 003F
3324 0061
FF81 003F
32E0 0021
32E0 003F
+3062 0021
+3062 003F
+30C2 0021
+30C2 003F
3061 0334 3099
3061 3099 0334
3062 0334
30C1 0334 3099
30C1 3099 0334
30C2 0334
-3062 0021
-3062 003F
-30C2 0021
-30C2 003F
3061 0061
3061 0041
30C1 0061
FF82 003F
32E1 0021
32E1 003F
+3065 0021
+3065 003F
+30C5 0021
+30C5 003F
3064 0334 3099
3064 3099 0334
3065 0334
30C4 0334 3099
30C4 3099 0334
30C5 0334
-3065 0021
-3065 003F
-30C5 0021
-30C5 003F
3063 0061
3063 0041
3064 0061
FF83 003F
32E2 0021
32E2 003F
-3066 0334 3099
-3066 3099 0334
-3067 0334
-30C6 0334 3099
-30C6 3099 0334
-30C7 0334
3067 0021
3067 003F
30C7 0021
1F213 0021
1F213 003F
1F213 0334
+3066 0334 3099
+3066 3099 0334
+3067 0334
+30C6 0334 3099
+30C6 3099 0334
+30C7 0334
3066 0061
3066 0041
30C6 0061
FF84 003F
32E3 0021
32E3 003F
+3069 0021
+3069 003F
+30C9 0021
+30C9 003F
3068 0334 3099
3068 3099 0334
3069 0334
30C8 0334 3099
30C8 3099 0334
30C9 0334
-3069 0021
-3069 003F
-30C9 0021
-30C9 003F
3068 0061
3068 0041
31F3 0061
FF8A 003F
32E9 0021
32E9 003F
+3070 0021
+3070 003F
+30D0 0021
+30D0 003F
+3071 0021
+3071 003F
+30D1 0021
+30D1 003F
306F 0334 3099
306F 3099 0334
3070 0334
30CF 0334 309A
30CF 309A 0334
30D1 0334
-3070 0021
-3070 003F
-30D0 0021
-30D0 003F
-3071 0021
-3071 003F
-30D1 0021
-30D1 003F
332B 0021
332B 003F
332B 0061
FF8B 003F
32EA 0021
32EA 003F
+3073 0021
+3073 003F
+30D3 0021
+30D3 003F
+3074 0021
+3074 003F
+30D4 0021
+30D4 003F
3072 0334 3099
3072 3099 0334
3073 0334
30D2 0334 309A
30D2 309A 0334
30D4 0334
-3073 0021
-3073 003F
-30D3 0021
-30D3 003F
-3074 0021
-3074 003F
-30D4 0021
-30D4 003F
3072 0061
3072 0041
31F6 0061
FF8C 003F
32EB 0021
32EB 003F
+3076 0021
+3076 003F
+30D6 0021
+30D6 003F
+3077 0021
+3077 003F
+30D7 0021
+30D7 003F
3075 0334 3099
3075 3099 0334
3076 0334
30D5 0334 309A
30D5 309A 0334
30D7 0334
-3076 0021
-3076 003F
-30D6 0021
-30D6 003F
-3077 0021
-3077 003F
-30D7 0021
-30D7 003F
3075 0061
3075 0041
31F7 0061
FF8D 003F
32EC 0021
32EC 003F
+3079 0021
+3079 003F
+30D9 0021
+30D9 003F
+307A 0021
+307A 003F
+30DA 0021
+30DA 003F
3078 0334 3099
3078 3099 0334
3079 0334
30D8 0334 309A
30D8 309A 0334
30DA 0334
-3079 0021
-3079 003F
-30D9 0021
-30D9 003F
-307A 0021
-307A 003F
-30DA 0021
-30DA 003F
333B 0021
333B 003F
333B 0061
FF8E 003F
32ED 0021
32ED 003F
+307C 0021
+307C 003F
+30DC 0021
+30DC 003F
+307D 0021
+307D 003F
+30DD 0021
+30DD 003F
307B 0334 3099
307B 3099 0334
307C 0334
30DB 0334 309A
30DB 309A 0334
30DD 0334
-307C 0021
-307C 003F
-30DC 0021
-30DC 003F
-307D 0021
-307D 003F
-30DD 0021
-30DD 003F
3341 0021
3341 003F
3341 0061
FF9C 003F
32FB 0021
32FB 003F
+30F7 0021
+30F7 003F
30EF 0334 3099
30EF 3099 0334
30F7 0334
-30F7 0021
-30F7 003F
308E 0061
308E 0041
308F 0061
30F0 003F
32FC 0021
32FC 003F
+30F8 0021
+30F8 003F
30F0 0334 3099
30F0 3099 0334
30F8 0334
-30F8 0021
-30F8 003F
3090 0061
3090 0041
30F0 0061
30F1 003F
32FD 0021
32FD 003F
+30F9 0021
+30F9 003F
30F1 0334 3099
30F1 3099 0334
30F9 0334
-30F9 0021
-30F9 003F
3091 0061
3091 0041
30F1 0061
FF66 003F
32FE 0021
32FE 003F
+30FA 0021
+30FA 003F
30F2 0334 3099
30F2 3099 0334
30FA 0334
-30FA 0021
-30FA 003F
3092 0061
3092 0041
30F2 0061
12262 0061
12262 0041
12262 0062
+122D4 0021
+122D4 003F
+122D4 0334
+122D4 0061
+122D4 0041
+122D4 0062
+122D5 0021
+122D5 003F
+122D5 0334
+122D5 0061
+122D5 0041
+122D5 0062
12263 0021
12263 003F
12263 0334
122D3 0061
122D3 0041
122D3 0062
-122D4 0021
-122D4 003F
-122D4 0334
-122D4 0061
-122D4 0041
-122D4 0062
-122D5 0021
-122D5 003F
-122D5 0334
-122D5 0061
-122D5 0041
-122D5 0062
122D6 0021
122D6 003F
122D6 0334
10FFFF 0061
10FFFF 0041
10FFFF 0062
+FFFD 0021
+FFFD 003F
+FFFD 0061
+FFFD 0041
+FFFD 0062
FFFF 0021
FFFF 003F
FFFF 0061
--- /dev/null
+# Copyright (c) 2012-2014 International Business Machines
+# Corporation and others. All Rights Reserved.
+#
+# This file should be in UTF-8 with a signature byte sequence ("BOM").
+#
+# collationtest.txt: Collation test data.
+#
+# created on: 2012apr13
+# created by: Markus W. Scherer
+
+# A line with "** test: description" is used for verbose and error output.
+
+# A collator can be set with "@ root" or "@ locale language-tag",
+# for example "@ locale de-u-co-phonebk".
+
+# A collator can be built with "@ rules".
+# An "@ rules" line is followed by one or more lines with the tailoring rules.
+
+# A collator can be modified with "% attribute=value".
+
+# "* compare" tests the order (= or <) of the following strings.
+# The relation can be "=" or "<" (the level of the difference is not specified)
+# or "<1", "<2", "<c", "<3", "<4" (indicating the level of the difference).
+
+# Test sections ("* compare") are terminated by
+# definitions of new collators, changing attributes, or new test sections.
+
+** test: simple CEs & expansions
+# Many types of mappings are tested elsewhere, including via the UCA conformance tests.
+# Here we mostly cover a few unusual mappings.
+@ rules
+&\x01 # most control codes are ignorable
+<<<\u0300 # tertiary CE
+&9<\x00 # NUL not ignorable
+&\uA00A\uA00B=\uA002 # two long-primary CEs
+&\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits
+
+* compare
+= \x01
+= \x02
+<3 \u0300
+<1 9
+<1 \x00
+= \x01\x00\x02
+<1 a
+<3 a\u0300
+<2 a\u0308
+= ä
+<1 b
+<1 か # Hiragana Ka (U+304B)
+<2 か\u3099 # plus voiced sound mark
+= が # Hiragana Ga (U+304C)
+<1 \uA00A\uA00B
+= \uA002
+<1 \uA00A\uA00B\u00050004
+<1 \uA00A\uA00B\u00050005
+= \uA003
+<1 \uA00A\uA00B\u00050006
+
+** test: contractions
+# Create some interesting mappings, and map some normalization-inert characters
+# (which are not subject to canonical reordering)
+# to some of the same CEs to check the sequence of CEs.
+@ rules
+
+# Contractions starting with 'a' should not continue with any character < U+0300
+# so that we can test a shortcut for that.
+&a=ⓐ
+&b<bz=ⓑ
+&d<dz\u0301=ⓓ # d+z+acute
+&z
+<a\u0301=Ⓐ # a+acute sorts after z
+<a\u0301\u0301=Ⓑ # a+acute+acute
+<a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right
+<a\u030a=Ⓓ # a+ring
+<a\u0323=Ⓔ # a+dot below
+<a\u0323\u0358=Ⓕ # a+dot below+dot above right
+<a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring
+<a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z
+
+&\U0001D158=⁰ # musical notehead black (has a symbol primary)
+<\U0001D158\U0001D165=¼ # musical quarter note
+
+# deliberately missing prefix contractions:
+# dz
+# a\u0327
+# a\u0327\u0323
+# a\u0327\u0323b
+
+&\x01
+<<<\U0001D165=¹ # musical stem (ccc=216)
+<<<\U0001D16D=² # musical augmentation dot (ccc=226)
+<<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226)
+&\u0301=❶ # acute (ccc=230)
+&\u030a=❷ # ring (ccc=230)
+&\u0308=❸ # diaeresis (ccc=230)
+<<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230)
+&\u0327=❺ # cedilla (ccc=202)
+&\u0323=❻ # dot below (ccc=220)
+&\u0331=❼ # macron below (ccc=220)
+<<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232)
+&\u0334=❾ # tilde overlay (ccc=1)
+&\u0358=❿ # dot above right (ccc=232)
+
+&\u0f71=① # tibetan vowel sign aa
+&\u0f72=② # tibetan vowel sign i
+# \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73
+&\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129)
+
+** test: simple contractions
+
+# Some strings are chosen to cause incremental contiguous contraction matching to
+# go into partial matches for prefixes of contractions
+# (where the prefixes are deliberately not also contractions).
+# When there is no complete match, then the matching code must back out of those
+# so that discontiguous contractions work as specified.
+
+* compare
+# contraction starter with no following text, or mismatch, or blocked
+<1 a
+= ⓐ
+<1 aa
+= ⓐⓐ
+<1 ab
+= ⓐb
+<1 az
+= ⓐz
+
+* compare
+<1 a
+<2 a\u0308\u030a # ring blocked by diaeresis
+= ⓐ❸❷
+<2 a\u0327
+= ⓐ❺
+
+* compare
+<2 \u0308
+= ❸
+<2 \u0308\u030a\u0301 # acute blocked by ring
+= ❸❷❶
+
+* compare
+<1 \U0001D158
+= ⁰
+<1 \U0001D158\U0001D165
+= ¼
+
+# no discontiguous contraction because of missing prefix contraction d+z,
+# and a starter ('z') after the 'd'
+* compare
+<1 dz\u0323\u0301
+= dz❻❶
+
+# contiguous contractions
+* compare
+<1 abz
+= ⓐⓑ
+<1 abzz
+= ⓐⓑz
+
+* compare
+<1 a
+<1 z
+<1 a\u0301
+= Ⓐ
+<1 a\u0301\u0301
+= Ⓑ
+<1 a\u0301\u0301\u0358
+= Ⓒ
+<1 a\u030a
+= Ⓓ
+<1 a\u0323\u0358
+= Ⓕ
+<1 a\u0327\u0323\u030a # match despite missing prefix
+= Ⓖ
+<1 a\u0327\u0323bz
+= Ⓗ
+
+* compare
+<2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second
+= ❸❹
+
+* compare
+<1 \U0001D158\U0001D165
+= ¼
+
+* compare
+<3 \U0001D165\U0001D16D
+= ³
+
+** test: discontiguous contractions
+* compare
+<1 a\u0327\u030a # a+ring skips cedilla
+= Ⓓ❺
+<2 a\u0327\u0327\u030a # a+ring skips 2 cedillas
+= Ⓓ❺❺
+<2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas
+= Ⓓ❺❺❺
+<2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas
+= Ⓓ❾❺❺
+<1 a\u0327\u0323 # a+dot below skips cedilla
+= Ⓔ❺
+<1 a\u0323\u0301\u0358 # a+dot below+dot ab.r.: 2-char match, then skips acute
+= Ⓕ❶
+<2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay
+= Ⓕ❾
+
+* compare
+<2 \u0331\u0331\u0358 # macron below+dot ab.r. skips the second macron below
+= ❽❼
+
+* compare
+<1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron)
+= Ⓓ❺❼❻
+<1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla
+= Ⓔ❺²❷
+<2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas
+= Ⓔ❺❺❷
+<2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla
+= Ⓔ❺❻❷
+<2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla
+= Ⓔ❾❺❷
+
+* compare
+<1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla
+= ¼❺
+<1 a\U0001D165\u0323 # a+dot below skips stem
+= Ⓔ¹
+
+# partial contiguous match, backs up, matches discontiguous contraction
+<1 a\u0327\u0323b
+= Ⓔ❺b
+<1 a\u0327\u0323ba
+= Ⓔ❺bⓐ
+
+# a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks
+* compare
+<1 a\u0327\u0301\u0301\u0358
+= Ⓒ❺
+
+# FCD but not NFD
+* compare
+<1 a\u0f73\u0301 # a+acute skips tibetan ii
+= Ⓐ③
+
+# FCD but the 0f71 inside the 0f73 must be skipped
+# to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73
+* compare
+<1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72
+= ③①
+
+** test: discontiguous contractions with nested contractions
+* compare
+<1 a\u0323\u0308\u0301\u0358
+= Ⓕ❹
+<2 a\u0323\u0308\u0301\u0308\u0301\u0358
+= Ⓕ❹❹
+
+** test: discontiguous contractions with interleaved contractions
+* compare
+# a+ring & cedilla & macron below+dot above right
+<1 a\u0327\u0331\u030a\u0358
+= Ⓓ❺❽
+
+# a+ring & 1x..3x macron below+dot above right
+<2 a\u0331\u030a\u0358
+= Ⓓ❽
+<2 a\u0331\u0331\u030a\u0358\u0358
+= Ⓓ❽❽
+# also skips acute
+<2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358
+= Ⓓ❽❽❽❶
+
+# a+dot below & stem+augmentation dot, followed by contiguous d+z+acute
+<1 a\U0001D165\u0323\U0001D16Ddz\u0301
+= Ⓔ³ⓓ
+
+** test: some simple string comparisons
+@ root
+* compare
+# first string compares against ""
+= \u0000
+< a
+<1 b
+<3 B
+= \u0000B\u0000
+
+** test: compare with strength=primary
+% strength=primary
+* compare
+<1 a
+<1 b
+= B
+
+** test: compare with strength=secondary
+% strength=secondary
+* compare
+<1 a
+<1 b
+= B
+
+** test: compare with strength=tertiary
+% strength=tertiary
+* compare
+<1 a
+<1 b
+<3 B
+
+** test: compare with strength=quaternary
+% strength=quaternary
+* compare
+<1 a
+<1 b
+<3 B
+
+** test: compare with strength=identical
+% strength=identical
+* compare
+<1 a
+<1 b
+<3 B
+
+** test: côté with forwards secondary
+@ root
+* compare
+<1 cote
+<2 coté
+<2 côte
+<2 côté
+
+** test: côté with forwards secondary vs. U+FFFE merge separator
+# Merged sort keys: On each level, any difference in the first segment
+# must trump any further difference.
+* compare
+<1 cote\uFFFEcôté
+<2 coté\uFFFEcôte
+<2 côte\uFFFEcoté
+<2 côté\uFFFEcote
+
+** test: côté with backwards secondary
+% backwards=on
+* compare
+<1 cote
+<2 côte
+<2 coté
+<2 côté
+
+** test: côté with backwards secondary vs. U+FFFE merge separator
+# Merged sort keys: On each level, any difference in the first segment
+# must trump any further difference.
+* compare
+<1 cote\uFFFEcôté
+<2 côte\uFFFEcoté
+<2 coté\uFFFEcôte
+<2 côté\uFFFEcote
+
+** test: U+FFFE on identical level
+@ root
+% strength=identical
+* compare
+# All of these control codes are completely-ignorable, so that
+# their low code points are compared with the merge separator.
+# The merge separator must compare less than any other character.
+<1 \uFFFE\u0001\u0002\u0003
+<i \u0001\uFFFE\u0002\u0003
+<i \u0001\u0002\uFFFE\u0003
+<i \u0001\u0002\u0003\uFFFE
+
+* compare
+# The merge separator must even compare less than U+0000.
+<1 \uFFFE\u0000\u0000
+<i \u0000\uFFFE\u0000
+<i \u0000\u0000\uFFFE
+
+** test: Hani < surrogates < U+FFFD
+# Note: compareUTF8() treats unpaired surrogates like U+FFFD,
+# so with that the strings with surrogates will compare equal to each other
+# and equal to the string with U+FFFD.
+@ root
+% strength=identical
+* compare
+<1 abz
+<1 a\u4e00z
+<1 a\U00020000z
+<1 a\ud800z
+<1 a\udbffz
+<1 a\udc00z
+<1 a\udfffz
+<1 a\ufffdz
+
+** test: script reordering
+@ root
+% reorder Hani Zzzz digit
+* compare
+<1 ?
+<1 +
+<1 丂
+<1 a
+<1 α
+<1 5
+
+% reorder default
+* compare
+<1 ?
+<1 +
+<1 5
+<1 a
+<1 α
+<1 丂
+
+** test: empty rules
+@ rules
+* compare
+<1 a
+<2 ä
+<3 Ä
+<1 b
+
+** test: very simple rules
+@ rules
+&a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z
+% strength=quaternary
+* compare
+<1 a
+= e
+<4 q
+<4 r
+<1 x
+<3 X
+<2 y
+<3 Y
+<2 z
+<3 Z
+
+** test: tailoring twice before a root position: primary
+@ rules
+&[before 1]b<p
+&[before 1]b<q
+* compare
+<1 a
+<1 p
+<1 q
+<1 b
+
+** test: tailoring twice before a root position: secondary
+@ rules
+&[before 2]ſ<<p
+&[before 2]ſ<<q
+* compare
+<1 s
+<2 p
+<2 q
+<2 ſ
+
+# secondary-before common weight
+@ rules
+&[before 2]b<<p
+&[before 2]b<<q
+* compare
+<1 a
+<1 p
+<2 q
+<2 b
+
+** test: tailoring twice before a root position: tertiary
+@ rules
+&[before 3]B<<<p
+&[before 3]B<<<q
+* compare
+<1 b
+<3 p
+<3 q
+<3 B
+
+# tertiary-before common weight
+@ rules
+&[before 3]b<<<p
+&[before 3]b<<<q
+* compare
+<1 a
+<1 p
+<3 q
+<3 b
+
+@ rules
+&[before 2]b<<s
+&[before 3]s<<<p
+&[before 3]s<<<q
+* compare
+<1 a
+<1 p
+<3 q
+<3 s
+<2 b
+
+** test: tailor after completely ignorable
+@ rules
+&\x00<<<x<<y
+* compare
+= \x00
+= \x1F
+<3 x
+<2 y
+
+** test: secondary tailoring gaps, ICU ticket 9362
+@ rules
+&[before 2]s<<'_'
+&s<<r # secondary between s and ſ (long s)
+&ſ<<*a-q # more than 15 between ſ and secondary CE boundary
+&[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE
+&[last primary ignorable]<<y<<z
+
+* compare
+<2 u
+<2 v
+<2 \u0332 # lowest secondary CE
+<2 \u0308
+<2 y
+<2 z
+<1 s_
+<2 ss
+<2 sr
+<2 sſ
+<2 sa
+<2 sb
+<2 sp
+<2 sq
+<2 sus
+<2 svs
+<2 rs
+
+** test: tertiary tailoring gaps, ICU ticket 9362
+@ rules
+&[before 3]t<<<'_'
+&t<<<r # tertiary between t and fullwidth t
+&ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary
+&[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE
+&[last secondary ignorable]<<<y<<<z
+
+* compare
+<3 u
+<3 v
+# Note: The root collator currently does not map any characters to tertiary CEs.
+<3 y
+<3 z
+<1 t_
+<3 tt
+<3 tr
+<3 tt
+<3 tᵀ
+<3 ta
+<3 tb
+<3 tp
+<3 tq
+<3 tut
+<3 tvt
+<3 rt
+
+** test: secondary & tertiary around root character
+@ rules
+&[before 2]m<<r
+&m<<s
+&[before 3]m<<<u
+&m<<<v
+* compare
+<1 l
+<1 r
+<2 u
+<3 m
+<3 v
+<2 s
+<1 n
+
+** test: secondary & tertiary around tailored item
+@ rules
+&m<x
+&[before 2]x<<r
+&x<<s
+&[before 3]x<<<u
+&x<<<v
+* compare
+<1 m
+<1 r
+<2 u
+<3 x
+<3 v
+<2 s
+<1 n
+
+** test: more nesting of secondary & tertiary before
+@ rules
+&[before 3]m<<<u
+&[before 2]m<<r
+&[before 3]r<<<q
+&m<<<w
+&m<<t
+&[before 3]w<<<v
+&w<<<x
+&w<<s
+* compare
+<1 l
+<1 q
+<3 r
+<2 u
+<3 m
+<3 v
+<3 w
+<3 x
+<2 s
+<2 t
+<1 n
+
+** test: case bits
+@ rules
+&w<x # tailored CE getting case bits
+ =uv=uV=Uv=UV # 2 chars -> 1 CE
+&ae=ch=cH=Ch=CH # 2 chars -> 2 CEs
+&rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs
+% caseFirst=lower
+* compare
+<1 ae
+= ch
+<3 cH
+<3 Ch
+<3 CH
+<1 rst
+= yz
+<3 yZ
+<3 Yz
+<3 YZ
+<1 w
+<1 x
+= uv
+<3 uV
+= Uv # mixed case on single CE cannot distinguish variations
+<3 UV
+
+** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower
+@ rules
+&\u0001<<<t<<<T # tertiary CEs
+% caseFirst=lower
+* compare
+<1 aa
+<3 aat
+<3 aaT
+<3 aA
+<3 aAt
+<3 ata
+<3 aTa
+
+** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper
+% caseFirst=upper
+* compare
+<1 aA
+<3 aAt
+<3 aa
+<3 aat
+<3 aaT
+<3 ata
+<3 aTa
+
+** test: reset on expansion, ICU tickets 9415 & 9593
+@ rules
+&æ<x # tailor the last primary CE so that x sorts between ae and af
+&æb=bæ # copy all reset CEs to make bæ sort the same
+&각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂
+&⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference
+&l·=z # handle the pre-context for · when fetching reset CEs
+ <<u # copy/tailor 2 CEs
+
+* compare
+<1 ae
+<2 æ
+<1 x
+<1 af
+
+* compare
+<1 aeb
+<2 æb
+= bæ
+
+* compare
+<1 각
+<1 h
+<1 갂
+<1 갃
+
+* compare
+<1 · # by itself: primary CE
+<1 l
+<2 l· # l+middle dot has only a secondary difference from l
+= z
+<2 u
+
+* compare
+<1 (13)
+<3 ⒀ # DUCET sets special tertiary weights in all CEs
+<2 y
+<1 (13[
+
+% alternate=shifted
+* compare
+<1 (13)
+= 13
+<3 ⒀
+= y # alternate=shifted removes the tailoring difference on the last CE
+<1 14
+
+** test: contraction inside extension, ICU ticket 9378
+@ rules
+&а<<х/й # all letters are Cyrillic
+* compare
+<1 ай
+<2 х
+
+** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104
+@ rules
+&t<x &ᵀ<y # same primary weights
+&q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent
+* compare
+<1 q
+<1 u
+<1 v
+<1 ꝗ
+<1 t
+<3 ᵀ
+<1 y
+<1 x
+
+# Principle: Each rule builds on the state of preceding rules and ignores following rules.
+
+** test: later rule does not affect earlier reset position, ICU ticket 10105
+@ rules
+&a < u < v < w &ov < x &b < v
+* compare
+<1 oa
+<1 ou
+<1 x # CE(o) followed by CE between u and w
+<1 ow
+<1 ob
+<1 ov
+
+** test: later rule does not affect earlier extension (1), ICU ticket 10105
+@ rules
+&a=x/b &v=b
+% strength=secondary
+* compare
+<1 B
+<1 c
+<1 v
+= b
+* compare
+<1 AB
+= x
+<1 ac
+<1 av
+= ab
+
+** test: later rule does not affect earlier extension (2), ICU ticket 10105
+@ rules
+&a <<< c / e &g <<< e / l
+% strength=secondary
+* compare
+<1 AE
+= c
+<2 æ
+<1 agl
+= ae
+
+** test: later rule does not affect earlier extension (3), ICU ticket 10105
+@ rules
+&a = b / c &d = c / e
+% strength=secondary
+* compare
+<1 AC # C is still only tertiary different from the original c
+= b
+<1 ade
+= ac
+
+** test: extension contains tailored character, ICU ticket 10105
+@ rules
+&a=e &b=u/e
+* compare
+<1 a
+= e
+<1 ba
+= be
+= u
+
+** test: add simple mappings for characters with root context
+@ rules
+&z=· # middle dot has a prefix mapping in the CLDR root
+&n=и # и (U+0438) has contractions in the root
+* compare
+<1 l
+<2 l· # root mapping for l|· still works
+<1 z
+= ·
+* compare
+<1 n
+= и
+<1 И
+<1 и\u0306 # root mapping for й=и\u0306 still works
+= й
+<3 Й
+
+** test: add context mappings around characters with root context
+@ rules
+&z=·h # middle dot has a prefix mapping in the CLDR root
+&n=ә|и # и (U+0438) has contractions in the root
+* compare
+<1 l
+<2 l· # root mapping for l|· still works
+<1 z
+= ·h
+* compare
+<1 и
+<3 И
+<1 и\u0306 # root mapping for й=и\u0306 still works
+= й
+* compare
+<1 әn
+= әи
+<1 әo
+
+** test: many secondary CEs at the top of their range
+@ rules
+&[last primary ignorable]<<*\u2801-\u28ff
+* compare
+<2 \u0308
+<2 \u2801
+<2 \u2802
+<2 \u2803
+<2 \u2804
+<2 \u28fd
+<2 \u28fe
+<2 \u28ff
+<1 \x20
+
+** test: many tertiary CEs at the top of their range
+@ rules
+&[last secondary ignorable]<<<*a-z
+* compare
+<3 a
+<3 b
+<3 c
+<3 d
+# e..w
+<3 x
+<3 y
+<3 z
+<2 \u0308
+
+** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101
+@ rules
+&a=p|x &b=px &c=op
+* compare
+<1 b
+= px
+<3 B
+<1 c
+= op
+<3 C
+* compare
+<1 ca
+= opx # first contraction op, then prefix p|x
+<3 cA
+<3 Ca
+
+** test: reset position with prefix (pre-context), ICU ticket 10102
+@ rules
+&a=p|x &px=y
+* compare
+<1 pa
+= px
+= y
+<3 pA
+<1 q
+<1 x
+
+** test: prefix+contraction together (1), ICU ticket 10071
+@ rules
+&x=a|bc
+* compare
+<1 ab
+<1 Abc
+<1 abd
+<1 ac
+<1 aw
+<1 ax
+= abc
+<3 aX
+<3 Ax
+<1 b
+<1 bb
+<1 bc
+<3 bC
+<3 Bc
+<1 bd
+
+** test: prefix+contraction together (2), ICU ticket 10071
+@ rules
+&w=bc &x=a|b
+* compare
+<1 w
+= bc
+<3 W
+* compare
+<1 aw
+<1 ax
+= ab
+<3 aX
+<1 axb
+<1 axc
+= abc # prefix match a|b takes precedence over contraction match bc
+<3 abC
+<1 abd
+<1 ay
+
+** test: prefix+contraction together (3), ICU ticket 10071
+@ rules
+&x=a|b &w=bc # reverse order of rules as previous test, order should not matter here
+* compare # same "compare" sequences as previous test
+<1 w
+= bc
+<3 W
+* compare
+<1 aw
+<1 ax
+= ab
+<3 aX
+<1 axb
+<1 axc
+= abc # prefix match a|b takes precedence over contraction match bc
+<3 abC
+<1 abd
+<1 ay
+
+** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962
+@ rules
+&d=ch &v=p|ci
+* compare
+<1 pc
+<3 pC
+<1 pcH
+<1 pcI
+<1 pd
+= pch # no-prefix contraction ch matches
+<3 pD
+<1 pv
+= pci # prefix+contraction p|ci matches
+<3 pV
+
+** test: tailor in & around compact ranges of root primaries
+# The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs
+# which should be reliably encoded as one range in the root elements data.
+@ rules
+&[before 1]ᚁ<a
+&ᚁ<b
+&[before 1]ᚂ<c
+&ᚂ<d
+&[before 1]ᚚ<y
+&ᚚ<z
+&[before 2]ᚁ<<r
+&ᚁ<<s
+&[before 3]ᚚ<<<t
+&ᚚ<<<u
+* compare
+<1 ᣵ # U+18F5 last Canadian Aboriginal
+<1 a
+<1 r
+<2 ᚁ
+<2 s
+<1 b
+<1 c
+<1 ᚂ
+<1 d
+<1 ᚃ
+<1 ᚙ
+<1 y
+<1 t
+<3 ᚚ
+<3 u
+<1 z
+<1 ᚠ # U+16A0 first Runic
+
+** test: suppressContractions
+@ rules
+&z<ch<әж [suppressContractions [·cә]]
+* compare
+<1 ch
+<3 cH # ch was suppressed
+<1 l
+<1 l· # primary difference, not secondary, because l|· was suppressed
+<1 ә
+<2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed
+<1 әж
+<3 әЖ
+
+** test: Hangul & Jamo
+@ rules
+&L=\u1100 # first Jamo L
+&V=\u1161 # first Jamo V
+&T=\u11A8 # first Jamo T
+&\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs
+* compare
+<1 Lv
+<3 LV
+= \u1100\u1161
+= \uAC00
+<1 LVt
+<3 LVT
+= \u1100\u1161\u11A8
+= \uAC00\u11A8
+= \uAC01
+<2 LVT\u0308
+<2 \u4E00
+<2 \u4E01
+<2 \u4E80
+<2 \u4EFF
+<2 LV\u0308T
+<1 \uAC02
+
+** test: adjust special reset positions according to previous rules, CLDR ticket 6070
+@ rules
+&[last variable]<x
+[maxVariable space] # has effect only after building, no effect on following rules
+&[last variable]<y
+&[before 1][first regular]<z
+* compare
+<1 ? # some punctuation
+<1 x
+<1 y
+<1 z
+<1 $ # some symbol
+
+@ rules
+&[last primary ignorable]<<x<<<y
+&[last primary ignorable]<<z
+* compare
+<2 \u0358
+<2 x
+<3 y
+<2 z
+<1 \x20
+
+@ rules
+&[last secondary ignorable]<<<x
+&[last secondary ignorable]<<<y
+* compare
+<3 x
+<3 y
+<2 \u0358
+
+@ rules
+&[before 2][first variable]<<z
+&[before 2][first variable]<<y
+&[before 3][first variable]<<<x
+&[before 3][first variable]<<<w
+&[before 1][first variable]<v
+&[before 2][first variable]<<u
+&[before 3][first variable]<<<t
+&[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary
+* compare
+<2 \u0358
+<1 s
+<2 \uFDD1\xA0
+<1 t
+<3 u
+<2 v
+<1 w
+<3 x
+<3 y
+<2 z
+<2 \t
+
+@ rules
+&[before 2][first regular]<<z
+&[before 3][first regular]<<<y
+&[before 1][first regular]<x
+&[before 3][first regular]<<<w
+&[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary
+&[before 3][first regular]<<<u
+&[before 1][first regular]<p # primary before the boundary: becomes variable
+&[before 3][first regular]<<<t # not affected by p
+&[last variable]<q # after p!
+* compare
+<1 ?
+<1 p
+<1 q
+<1 t
+<3 u
+<3 v
+<1 w
+<3 x
+<1 y
+<3 z
+<1 $
+
+# check that p & q are indeed variable
+% alternate=shifted
+* compare
+= ?
+= p
+= q
+<1 t
+<3 u
+<3 v
+<1 w
+<3 x
+<1 y
+<3 z
+<1 $
+
+@ rules
+&[before 2][first trailing]<<z
+&[before 1][first trailing]<y
+&[before 3][first trailing]<<<x
+* compare
+<1 \u4E00 # first Han, first implicit
+<1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary
+# Note: The root collator currently does not map any characters to the trailing first boundary primary.
+<1 x
+<3 y
+<1 z
+<2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary.
+
+@ rules
+&[before 2][first primary ignorable]<<z
+&[before 2][first primary ignorable]<<y
+&[before 3][first primary ignorable]<<<x
+&[before 3][first primary ignorable]<<<w
+* compare
+= \x01
+<2 w
+<3 x
+<3 y
+<2 z
+<2 \u0301
+
+@ rules
+&[before 3][first secondary ignorable]<<<y
+&[before 3][first secondary ignorable]<<<x
+* compare
+= \x01
+<3 x
+<3 y
+<2 \u0301
+
+** test: canonical closure
+@ rules
+&X=A &U=Â
+* compare
+<1 U
+= Â
+= A\u0302
+<2 Ú # U with acute
+= U\u0301
+= Ấ # A with circumflex & acute
+= Â\u0301
+= A\u0302\u0301
+<1 X
+= A
+<2 X\u030A # with ring above
+= Å
+= A\u030A
+= \u212B # Angstrom sign
+
+@ rules
+&x=\u5140\u55C0
+* compare
+<1 x
+= \u5140\u55C0
+= \u5140\uFA0D
+= \uFA0C\u55C0
+= \uFA0C\uFA0D # CJK compatibility characters
+<3 X
+
+# canonical closure on prefix rules, ICU ticket 9444
+@ rules
+&x=ä|ŝ
+* compare
+<1 äs # not tailored
+<1 äx
+= äŝ
+= a\u0308s\u0302
+= a\u0308ŝ
+= äs\u0302
+<3 äX
+
+** test: conjoining Jamo map to expansions
+@ rules
+&gg=\u1101 # Jamo Lead consonant GG
+&nj=\u11AC # Jamo Trail consonant NJ
+* compare
+<1 gg\u1161nj
+= \u1101\u1161\u11AC
+= \uAE4C\u11AC
+= \uAE51
+<3 gg\u1161nJ
+<1 \u1100\u1100
+
+** test: canonical tail closure, ICU ticket 5913
+@ rules
+&a<â
+* compare
+<1 a
+<1 â # tailored
+= a\u0302
+<2 a\u0323\u0302 # discontiguous contraction
+= ạ\u0302 # equivalent
+= ậ # equivalent
+<1 b
+
+@ rules
+&a<ạ
+* compare
+<1 a
+<1 ạ # tailored
+= a\u0323
+<2 a\u0323\u0302 # contiguous contraction plus extra diacritic
+= ạ\u0302 # equivalent
+= ậ # equivalent
+<1 b
+
+# Tail closure should work even if there is a prefix and/or contraction.
+@ rules
+&a<\u5140|câ
+# In order to find discontiguous contractions for \u5140|câ
+# there must exist a mapping for \u5140|ca, regardless of what it maps to.
+# (This follows from the UCA spec.)
+&x=\u5140|ca
+* compare
+<1 \u5140a
+= \uFA0Ca
+<1 \u5140câ # tailored
+= \uFA0Ccâ
+= \u5140ca\u0302
+= \uFA0Cca\u0302
+<2 \u5140ca\u0323\u0302 # discontiguous contraction
+= \uFA0Cca\u0323\u0302
+= \u5140cạ\u0302
+= \uFA0Ccạ\u0302
+= \u5140cậ
+= \uFA0Ccậ
+<1 \u5140b
+= \uFA0Cb
+<1 \u5140x
+= \u5140ca
+
+# Double-check that without the extra mapping there will be no discontiguous match.
+@ rules
+&a<\u5140|câ
+* compare
+<1 \u5140a
+= \uFA0Ca
+<1 \u5140câ # tailored
+= \uFA0Ccâ
+= \u5140ca\u0302
+= \uFA0Cca\u0302
+<1 \u5140b
+= \uFA0Cb
+<1 \u5140ca\u0323\u0302 # no discontiguous contraction
+= \uFA0Cca\u0323\u0302
+= \u5140cạ\u0302
+= \uFA0Ccạ\u0302
+= \u5140cậ
+= \uFA0Ccậ
+
+@ rules
+&a<cạ
+* compare
+<1 a
+<1 cạ # tailored
+= ca\u0323
+<2 ca\u0323\u0302 # contiguous contraction plus extra diacritic
+= cạ\u0302 # equivalent
+= cậ # equivalent
+<1 b
+
+# ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+# = 03C9 0313 0300 0345
+# ccc = 0, 230, 230, 240
+@ rules
+&δ=αῳ
+# In order to find discontiguous contractions for αῳ
+# there must exist a mapping for αω, regardless of what it maps to.
+# (This follows from the UCA spec.)
+&ε=αω
+* compare
+<1 δ
+= αῳ
+= αω\u0345
+<2 αω\u0313\u0300\u0345 # discontiguous contraction
+= αὠ\u0300\u0345
+= αὢ\u0345
+= αᾢ
+<2 αω\u0300\u0313\u0345
+= αὼ\u0313\u0345
+= αῲ\u0313 # not FCD
+<1 ε
+= αω
+
+# Double-check that without the extra mapping there will be no discontiguous match.
+@ rules
+&δ=αῳ
+* compare
+<1 αω\u0313\u0300\u0345 # no discontiguous contraction
+= αὠ\u0300\u0345
+= αὢ\u0345
+= αᾢ
+<2 αω\u0300\u0313\u0345
+= αὼ\u0313\u0345
+= αῲ\u0313 # not FCD
+<1 δ
+= αῳ
+= αω\u0345
+
+# Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232.
+# Tests code paths where the tailored string has a combining mark
+# that does not occur in any composite's decomposition.
+@ rules
+&δ=αὼ\u0315
+* compare
+<1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above.
+= αὠ\u0300\u0315
+= αὢ\u0315
+<1 δ
+= αὼ\u0315
+= αω\u0300\u0315
+<2 αω\u0300\u0315\u0345
+= αὼ\u0315\u0345
+= αῲ\u0315 # not FCD
+
+** test: danish a+a vs. a-umlaut, ICU ticket 9319
+@ rules
+&z<aa
+* compare
+<1 z
+<1 aa
+<2 aa\u0308
+= aä
+
+** test: Jamo L with and in prefix
+# Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L).
+@ rules
+# Jamo Lead consonant G after G or GG
+&[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100
+# Jamo Lead consonant GG sorts like G+G
+&\u1100\u1100=\u1101
+# Note: Making G|GG and GG|GG sort the same as G|G+G
+# would require the ability to reset on G|G+G,
+# or we could make G-after-G equal to some secondary-CE character,
+# and reset on a pair of those.
+# (It does not matter much if there are at most two G in a row in real text.)
+* compare
+<1 \u1100
+<2 \u1100\u1100 # only one primary from a sequence of G lead consonants
+= \u1101
+<2 \u1100\u1100\u1100
+= \u1101\u1100
+# but not = \u1100\u1101, see above
+<1 \u1100\u1161
+= \uAC00
+<2 \u1100\u1100\u1161
+= \u1100\uAC00 # prefix match from the L of the LV syllable
+= \u1101\u1161
+= \uAE4C
+
+** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546
+@ rules
+# Low secondary CEs for Jamo V & T.
+# Note: T should sort before V for proper syllable order.
+&\u0332 # COMBINING LOW LINE (first primary ignorable)
+<<\u1161<<\u1162
+
+# Korean Jamo lead consonant search rules, part 2:
+# Make modern compound L jamo primary equivalent to non-compound forms.
+
+# Secondary CEs for Jamo L-after-L, greater than Jamo V & T.
+&\u0313 # COMBINING COMMA ABOVE (second primary ignorable)
+=\u1100|\u1100
+=\u1103|\u1103
+=\u1107|\u1107
+=\u1109|\u1109
+=\u110C|\u110C
+
+# Compound L Jamo map to equivalent expansions of primary+secondary CE.
+&\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK
+&\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT
+&\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP
+&\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS
+&\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC
+
+* compare
+<1 \u1100\u1161
+= \uAC00
+<2 \u1100\u1162
+= \uAC1C
+<2 \u1100\u1100\u1161
+= \u1100\uAC00
+= \u1101\u1161
+= \uAE4C
+<3 \u3132\u1161
+
+** test: Hangul syllables in prefix & in the interior of a contraction
+@ rules
+&x=\u1100\u1161|a\u1102\u1162z
+* compare
+<1 \u1100\u1161x
+= \u1100\u1161a\u1102\u1162z
+= \u1100\u1161a\uB0B4z
+= \uAC00a\u1102\u1162z
+= \uAC00a\uB0B4z
+
+** test: digits are unsafe-backwards when numeric=on
+@ root
+% numeric=on
+* compare
+# If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a".
+# We need to back up before the identical prefix "1" and compare the full numbers.
+<1 11b
+<1 101a
+
+** test: simple locale data test
+@ locale de
+* compare
+<1 a
+<2 ä
+<1 ae
+<2 æ
+
+@ locale de-u-co-phonebk
+* compare
+<1 a
+<1 ae
+<2 ä
+<2 æ
+
+# The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt.
+
+** test: DataDrivenCollationTest/TestMorePinyin
+# Testing the primary strength.
+@ locale zh
+% strength=primary
+* compare
+< lā
+= lĀ
+= Lā
+= LĀ
+< lān
+= lĀn
+< lē
+= lĒ
+= Lē
+= LĒ
+< lēn
+= lĒn
+
+** test: DataDrivenCollationTest/TestLithuanian
+# Lithuanian sort order.
+@ locale lt
+* compare
+< cz
+< č
+< d
+< iz
+< j
+< sz
+< š
+< t
+< zz
+< ž
+
+** test: DataDrivenCollationTest/TestLatvian
+# Latvian sort order.
+@ locale lv
+* compare
+< cz
+< č
+< d
+< gz
+< ģ
+< h
+< iz
+< j
+< kz
+< ķ
+< l
+< lz
+< ļ
+< m
+< nz
+< ņ
+< o
+< rz
+< ŗ
+< s
+< sz
+< š
+< t
+< zz
+< ž
+
+** test: DataDrivenCollationTest/TestEstonian
+# Estonian sort order.
+@ locale et
+* compare
+< sy
+< š
+< šy
+< z
+< zy
+< ž
+< v
+< w
+< va
+< õ
+< õy
+< ä
+< äy
+< ö
+< öy
+< ü
+< üy
+< x
+
+** test: DataDrivenCollationTest/TestAlbanian
+# Albanian sort order.
+@ locale sq
+* compare
+< cz
+< ç
+< d
+< dz
+< dh
+< e
+< ez
+< ë
+< f
+< gz
+< gj
+< h
+< lz
+< ll
+< m
+< nz
+< nj
+< o
+< rz
+< rr
+< s
+< sz
+< sh
+< t
+< tz
+< th
+< u
+< xz
+< xh
+< y
+< zz
+< zh
+
+** test: DataDrivenCollationTest/TestSimplifiedChineseOrder
+# Sorted file has different order.
+@ root
+# normalization=on turned on & off automatically.
+* compare
+< \u5F20
+< \u5F20\u4E00\u8E3F
+
+** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash
+# This pretty much crashes.
+@ root
+* compare
+< \u0f71\u0f72\u0f80\u0f71\u0f72
+< \u0f80
+
+** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems
+# These are examples of strings that caused trouble in partial sort key testing.
+@ locale th-TH
+* compare
+< \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C
+< \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18
+* compare
+< \u0E01\u0E07\u0E01\u0E32\u0E23
+< \u0E01\u0E07\u0E42\u0E01\u0E49
+* compare
+< \u0E01\u0E23\u0E19\u0E17\u0E32
+< \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32
+* compare
+< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27
+< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27
+* compare
+< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D
+< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32
+
+** test: DataDrivenCollationTest/TestJavaStyleRule
+# java.text allows rules to start as '<<<x<<<y...'
+# we emulate this by assuming a &[first tertiary ignorable] in this case.
+@ rules
+&\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b
+* compare
+= a
+= equal
+< z
+< x
+= b # x had become the new first primary ignorable
+< w
+
+** test: DataDrivenCollationTest/TestShiftedIgnorable
+# The UCA states that primary ignorables should be completely
+# ignorable when following a shifted code point.
+@ root
+% alternate=shifted
+% strength=quaternary
+* compare
+< a\u0020b
+= a\u0020\u0300b
+= a\u0020\u0301b
+< a_b
+= a_\u0300b
+= a_\u0301b
+< A\u0020b
+= A\u0020\u0300b
+= A\u0020\u0301b
+< A_b
+= A_\u0300b
+= A_\u0301b
+< a\u0301b
+< A\u0301b
+< a\u0300b
+< A\u0300b
+
+** test: DataDrivenCollationTest/TestNShiftedIgnorable
+# The UCA states that primary ignorables should be completely
+# ignorable when following a shifted code point.
+@ root
+% alternate=non-ignorable
+% strength=tertiary
+* compare
+< a\u0020b
+< A\u0020b
+< a\u0020\u0301b
+< A\u0020\u0301b
+< a\u0020\u0300b
+< A\u0020\u0300b
+< a_b
+< A_b
+< a_\u0301b
+< A_\u0301b
+< a_\u0300b
+< A_\u0300b
+< a\u0301b
+< A\u0301b
+< a\u0300b
+< A\u0300b
+
+** test: DataDrivenCollationTest/TestSafeSurrogates
+# It turned out that surrogates were not skipped properly
+# when iterating backwards if they were in the middle of a
+# contraction. This test assures that this is fixed.
+@ rules
+&a < x\ud800\udc00b
+* compare
+< a
+< x\ud800\udc00b
+
+** test: DataDrivenCollationTest/da_TestPrimary
+# This test goes through primary strength cases
+@ locale da
+% strength=primary
+* compare
+< Lvi
+< Lwi
+* compare
+< L\u00e4vi
+< L\u00f6wi
+* compare
+< L\u00fcbeck
+= Lybeck
+
+** test: DataDrivenCollationTest/da_TestTertiary
+# This test goes through tertiary strength cases
+@ locale da
+% strength=tertiary
+* compare
+< Luc
+< luck
+* compare
+< luck
+< L\u00fcbeck
+* compare
+< lybeck
+< L\u00fcbeck
+* compare
+< L\u00e4vi
+< L\u00f6we
+* compare
+< L\u00f6ww
+< mast
+
+* compare
+< A/S
+< ANDRE
+< ANDR\u00c9
+< ANDREAS
+< AS
+< CA
+< \u00c7A
+< CB
+< \u00c7C
+< D.S.B.
+< DA
+< \u00d0A
+< DB
+< \u00d0C
+< DSB
+< DSC
+< EKSTRA_ARBEJDE
+< EKSTRABUD0
+< H\u00d8ST
+< HAAG
+< H\u00c5NDBOG
+< HAANDV\u00c6RKSBANKEN
+< Karl
+< karl
+< NIELS\u0020J\u00d8RGEN
+< NIELS-J\u00d8RGEN
+< NIELSEN
+< R\u00c9E,\u0020A
+< REE,\u0020B
+< R\u00c9E,\u0020L
+< REE,\u0020V
+< SCHYTT,\u0020B
+< SCHYTT,\u0020H
+< SCH\u00dcTT,\u0020H
+< SCHYTT,\u0020L
+< SCH\u00dcTT,\u0020M
+< SS
+< \u00df
+< SSA
+< STORE\u0020VILDMOSE
+< STOREK\u00c6R0
+< STORM\u0020PETERSEN
+< STORMLY
+< THORVALD
+< THORVARDUR
+< \u00feORVAR\u00d0UR
+< THYGESEN
+< VESTERG\u00c5RD,\u0020A
+< VESTERGAARD,\u0020A
+< VESTERG\u00c5RD,\u0020B
+< \u00c6BLE
+< \u00c4BLE
+< \u00d8BERG
+< \u00d6BERG
+
+* compare
+< andere
+< chaque
+< chemin
+< cote
+< cot\u00e9
+< c\u00f4te
+< c\u00f4t\u00e9
+< \u010du\u010d\u0113t
+< Czech
+< hi\u0161a
+< irdisch
+< lie
+< lire
+< llama
+< l\u00f5ug
+< l\u00f2za
+< lu\u010d
+< luck
+< L\u00fcbeck
+< lye
+< l\u00e4vi
+< L\u00f6wen
+< m\u00e0\u0161ta
+< m\u00eer
+< myndig
+< M\u00e4nner
+< m\u00f6chten
+< pi\u00f1a
+< pint
+< pylon
+< \u0161\u00e0ran
+< savoir
+< \u0160erb\u016bra
+< Sietla
+< \u015blub
+< subtle
+< symbol
+< s\u00e4mtlich
+< verkehrt
+< vox
+< v\u00e4ga
+< waffle
+< wood
+< yen
+< yuan
+< yucca
+< \u017eal
+< \u017eena
+< \u017den\u0113va
+< zoo0
+< Zviedrija
+< Z\u00fcrich
+< zysk0
+< \u00e4ndere
+
+** test: DataDrivenCollationTest/hi_TestNewRules
+# This test goes through new rules and tests against old rules
+@ locale hi
+* compare
+< कॐ
+< कं
+< कँ
+< कः
+
+** test: DataDrivenCollationTest/ro_TestNewRules
+# This test goes through new rules and tests against old rules
+@ locale ro
+* compare
+< xAx
+< xă
+< xĂ
+< Xă
+< XĂ
+< xăx
+< xĂx
+< xâ
+< xÂ
+< Xâ
+< XÂ
+< xâx
+< xÂx
+< xb
+< xIx
+< xî
+< xÎ
+< Xî
+< XÎ
+< xîx
+< xÎx
+< xj
+< xSx
+< xș
+= xş
+< xȘ
+= xŞ
+< Xș
+= Xş
+< XȘ
+= XŞ
+< xșx
+= xşx
+< xȘx
+= xŞx
+< xT
+< xTx
+< xț
+= xţ
+< xȚ
+= xŢ
+< Xț
+= Xţ
+< XȚ
+= XŢ
+< xțx
+= xţx
+< xȚx
+= xŢx
+< xU
+
+** test: DataDrivenCollationTest/testOffsets
+# This tests cases where forwards and backwards iteration get different offsets
+@ locale en
+% strength=tertiary
+* compare
+< a\uD800\uDC00\uDC00
+< b\uD800\uDC00\uDC00
+* compare
+< \u0301A\u0301\u0301
+< \u0301B\u0301\u0301
+* compare
+< abcd\r\u0301
+< abce\r\u0301
+# TODO: test offsets in new CollationTest
+
+# End of test cases moved here from ICU 52's DataDrivenCollationTest.txt.
+
+** test: was ICU 52 cmsccoll/TestRedundantRules
+@ rules
+& a < b < c < d& [before 1] c < m
+* compare
+<1 a
+<1 b
+<1 m
+<1 c
+<1 d
+
+@ rules
+& a < b <<< c << d <<< e& [before 3] e <<< x
+* compare
+<1 a
+<1 b
+<3 c
+<2 d
+<3 x
+<3 e
+
+@ rules
+& a < b <<< c << d <<< e <<< f < g& [before 1] g < x
+* compare
+<1 a
+<1 b
+<3 c
+<2 d
+<3 e
+<3 f
+<1 x
+<1 g
+
+@ rules
+& a <<< b << c < d& a < m
+* compare
+<1 a
+<3 b
+<2 c
+<1 m
+<1 d
+
+@ rules
+&a<b<<b\u0301 &z<b
+* compare
+<1 a
+<1 b\u0301
+<1 z
+<1 b
+
+@ rules
+&z<m<<<q<<<m
+* compare
+<1 z
+<1 q
+<3 m
+
+@ rules
+&z<<<m<q<<<m
+* compare
+<1 z
+<1 q
+<3 m
+
+@ rules
+& a < b < c < d& r < c
+* compare
+<1 a
+<1 b
+<1 d
+<1 r
+<1 c
+
+@ rules
+& a < b < c < d& c < m
+* compare
+<1 a
+<1 b
+<1 c
+<1 m
+<1 d
+
+@ rules
+& a < b < c < d& a < m
+* compare
+<1 a
+<1 m
+<1 b
+<1 c
+<1 d
+
+** test: was ICU 52 cmsccoll/TestExpansionSyntax
+# The following two rules should sort the particular list of strings the same.
+@ rules
+&AE <<< a << b <<< c &d <<< f
+* compare
+<1 AE
+<3 a
+<2 b
+<3 c
+<1 d
+<3 f
+
+@ rules
+&A <<< a / E << b / E <<< c /E &d <<< f
+* compare
+<1 AE
+<3 a
+<2 b
+<3 c
+<1 d
+<3 f
+
+# The following two rules should sort the particular list of strings the same.
+@ rules
+&AE <<< a <<< b << c << d < e < f <<< g
+* compare
+<1 AE
+<3 a
+<3 b
+<2 c
+<2 d
+<1 e
+<1 f
+<3 g
+
+@ rules
+&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g
+* compare
+<1 AE
+<3 a
+<3 b
+<2 c
+<2 d
+<1 e
+<1 f
+<3 g
+
+# The following two rules should sort the particular list of strings the same.
+@ rules
+&AE <<< B <<< C / D <<< F
+* compare
+<1 AE
+<3 B
+<3 F
+<1 AED
+<3 C
+
+@ rules
+&A <<< B / E <<< C / ED <<< F / E
+* compare
+<1 AE
+<3 B
+<3 F
+<1 AED
+<3 C
+
+** test: never reorder trailing primaries
+@ root
+% reorder Zzzz Grek
+* compare
+<1 L
+<1 字
+<1 Ω
+<1 \uFFFD
+<1 \uFFFF
+
+** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes
+@ rules
+&u=ab|cd
+&v=b|ce
+* compare
+<1 abc
+<1 abcc
+<1 abcf
+<1 abcd
+= abu
+<1 abce
+= abv
+
+# With the following rules, there is only one prefix per composite ĉ or ç,
+# but both prefixes apply to just c in NFD form.
+# We would get different results for composed vs. NFD input
+# if we fell back directly from longest-prefix mappings to no-prefix mappings.
+@ rules
+&x=op|ĉ
+&y=p|ç
+* compare
+<1 opc
+<2 opć
+<1 opcz
+<1 opd
+<1 opĉ
+= opc\u0302
+= opx
+<1 opç
+= opc\u0327
+= opy
+
+# The mapping is used which has the longest matching prefix for which
+# there is also a suffix match, with the longest suffix match among several for that prefix.
+@ rules
+&❶=d
+&❷=de
+&❸=def
+&①=c|d
+&②=c|de
+&③=c|def
+&④=bc|d
+&⑤=bc|de
+&⑥=bc|def
+&⑦=abc|d
+&⑧=abc|de
+&⑨=abc|def
+* compare
+<1 9aadzz
+= 9aa❶zz
+<1 9aadez
+= 9aa❷z
+<1 9aadef
+= 9aa❸
+<1 9acdzz
+= 9ac①zz
+<1 9acdez
+= 9ac②z
+<1 9acdef
+= 9ac③
+<1 9bcdzz
+= 9bc④zz
+<1 9bcdez
+= 9bc⑤z
+<1 9bcdef
+= 9bc⑥
+<1 abcdzz
+= abc⑦zz
+<1 abcdez
+= abc⑧z
+<1 abcdef
+= abc⑨
+
+** test: prefix + discontiguous contraction with missing prefix contraction
+# Unfortunate terminology: The first "prefix" here is the pre-context,
+# the second "prefix" refers to the contraction/relation string that is
+# one shorter than the one being tested.
+@ rules
+&x=p|e
+&y=p|ê
+&z=op|ê
+# No mapping for op|e:
+# Discontiguous contraction matching should not match op|ê in opệ
+# because it would have to skip the dot below and extend a match on op|e by the circumflex,
+# but there is no match on op|e.
+* compare
+<1 oPe
+<1 ope
+= opx
+<1 opệ
+= opy\u0323 # y not z
+<1 opê
+= opz
+
+# We cannot test for fallback by whether the contraction default CE32
+# is for another contraction. With the following rules, there is no mapping for op|e,
+# and the fallback to prefix p has no contractions.
+@ rules
+&x=p|e
+&z=op|ê
+* compare
+<1 oPe
+<1 ope
+= opx
+<2 opệ
+= opx\u0323\u0302 # x not z
+<1 opê
+= opz
+
+# One more variation: Fallback to the simple code point, no shorter non-empty prefix.
+@ rules
+&x=e
+&z=op|ê
+* compare
+<1 ope
+= opx
+<3 oPe
+= oPx
+<2 opệ
+= opx\u0323\u0302 # x not z
+<1 opê
+= opz
+
+** test: maxVariable via rules
+@ rules
+[maxVariable space][alternate shifted]
+* compare
+= \u0020
+= \u000A
+<1 .
+<1 ° # degree sign
+<1 $
+<1 0
+
+** test: maxVariable via setting
+@ root
+% maxVariable=currency
+% alternate=shifted
+* compare
+= \u0020
+= \u000A
+= .
+= ° # degree sign
+= $
+<1 0
+
+** test: ICU4J CollationMiscTest/TestContractionClosure (ää)
+# This tests canonical closure, but it also tests that CollationFastLatin
+# bails out properly for contractions with combining marks.
+# For that we need pairs of strings that remain in the Latin fastpath
+# long enough, hence the extra "= b" lines.
+@ rules
+&b=\u00e4\u00e4
+* compare
+<1 b
+= \u00e4\u00e4
+= b
+= a\u0308a\u0308
+= b
+= \u00e4a\u0308
+= b
+= a\u0308\u00e4
+
+** test: ICU4J CollationMiscTest/TestContractionClosure (Å)
+@ rules
+&b=\u00C5
+* compare
+<1 b
+= \u00C5
+= b
+= A\u030A
+= b
+= \u212B
+
+** test: reset-before on already-tailored characters, ICU ticket 10108
+@ rules
+&a<w<<x &[before 2]x<<y
+* compare
+<1 a
+<1 w
+<2 y
+<2 x
+
+@ rules
+&a<<w<<<x &[before 2]x<<y
+* compare
+<1 a
+<2 y
+<2 w
+<3 x
+
+@ rules
+&a<w<x &[before 2]x<<y
+* compare
+<1 a
+<1 w
+<1 y
+<2 x
+
+@ rules
+&a<w<<<x &[before 2]x<<y
+* compare
+<1 a
+<1 y
+<2 w
+<3 x
+
+** test: numeric collation with other settings, ICU ticket 9092
+@ root
+% strength=identical
+% caseFirst=upper
+% numeric=on
+* compare
+<1 100\u0020a
+<1 101
+
+** test: collation type fallback from unsupported type, ICU ticket 10149
+@ locale fr-CA-u-co-phonebk
+# Expect the same result as with fr-CA, using backwards-secondary order.
+# That is, we should fall back from the unsupported collation type
+# to the locale's default collation type.
+* compare
+<1 cote
+<2 côte
+<2 coté
+<2 côté
+
+** test: @ is equivalent to [backwards 2], ICU ticket 9956
+@ rules
+&b<a @ &v<<w
+* compare
+<1 b
+<1 a
+<1 cote
+<2 côte
+<2 coté
+<2 côté
+<1 v
+<2 w
+<1 x
+
+** test: shifted+reordering, ICU ticket 9507
+@ root
+% reorder Grek punct space
+% alternate=shifted
+% strength=quaternary
+# Which primaries are "variable" should be determined without script reordering,
+# and then primaries should be reordered whether they are shifted to quaternary or not.
+* compare
+<4 ( # punctuation
+<4 )
+<4 \u0020 # space
+<1 ` # symbol
+<1 ^
+<1 $ # currency symbol
+<1 €
+<1 0 # numbers
+<1 ε # Greek
+<1 e # Latin
+<1 e(e
+<4 e)e
+<4 e\u0020e
+<4 ee
+<3 e(E
+<4 e)E
+<4 e\u0020E
+<4 eE
+
+** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351
+@ rules
+&\u0001<<<b<<<B
+% caseFirst=upper
+* compare
+<1 aaa
+<3 aaaB
+
+** test: secondary+case ignores secondary ignorables, ICU ticket 9355
+@ rules
+&\u0001<<<b<<<B
+% strength=secondary
+% caseLevel=on
+* compare
+<1 a
+= ab
+= aB
+
+** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328
+@ rules
+&[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57
+* compare
+<1 ൗx
+<2 ൌx
+<1 ൗy
+<2 ൌy
+
+** test: quoted apostrophe in compact syntax, ICU ticket 8204
+@ rules
+&q<<*a''c
+* compare
+<1 d
+<1 p
+<1 q
+<2 a
+<2 \u0027
+<2 c
+<1 r
/*
*******************************************************************************
- * Copyright (C) 2008-2013, International Business Machines Corporation and
+ * Copyright (C) 2008-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
AlphabeticIndex alphabeticIndex = new AlphabeticIndex(Locale.ENGLISH);
RuleBasedCollator collator = alphabeticIndex.getCollator();
collator.setStrength(Collator.IDENTICAL);
- Collection<String> firsts = AlphabeticIndex.getFirstCharactersInScripts();
+ Collection<String> firsts = alphabeticIndex.getFirstCharactersInScripts();
// Verify that each script is represented exactly once.
UnicodeSet missingScripts = new UnicodeSet("[^[:sc=inherited:][:sc=unknown:][:sc=common:][:Script=Braille:]]");
String last = "";
for (String index : firsts) {
- if (index.equals("\uFFFF")) {
- continue;
- }
if (collator.compare(last,index) >= 0) {
errln("Characters not in order: " + last + " !< " + index);
}
- int script = UScript.getScript(index.codePointAt(0)); // we actually look at just the first char
+ int script = getFirstRealScript(index);
+ if (script == UScript.UNKNOWN) { continue; }
UnicodeSet s = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script);
if (missingScripts.containsNone(s)) {
errln("2nd character in script: " + index + "\t" + new UnicodeSet(missingScripts).retainAll(s).toPattern(false));
}
}
+ private static final int getFirstRealScript(CharSequence s) {
+ for (int i = 0; i < s.length();) {
+ int c = Character.codePointAt(s, i);
+ int script = UScript.getScript(c);
+ if (script != UScript.UNKNOWN && script != UScript.INHERITED && script != UScript.COMMON) {
+ return script;
+ }
+ i += Character.charCount(c);
+ }
+ return UScript.UNKNOWN;
+ }
+
public void TestBuckets() {
ULocale additionalLocale = ULocale.ENGLISH;
}
public void TestFirstScriptCharacters() {
- Collection<String> firstCharacters = AlphabeticIndex.getFirstCharactersInScripts();
+ Collection<String> firstCharacters =
+ new AlphabeticIndex(ULocale.ENGLISH).getFirstCharactersInScripts();
Collection<String> expectedFirstCharacters = firstStringsInScript((RuleBasedCollator) Collator.getInstance(ULocale.ROOT));
Collection<String> diff = new TreeSet<String>(firstCharacters);
diff.removeAll(expectedFirstCharacters);
private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) {
String[] results = new String[UScript.CODE_LIMIT];
for (String current : TO_TRY) {
- if (ruleBasedCollator.compare(current, "a") < 0) { // TODO fix; we only want "real" script characters, not
- // symbols.
+ if (ruleBasedCollator.compare(current, "a") < 0) { // we only want "real" script characters, not symbols.
continue;
}
int script = UScript.getScript(current.codePointAt(0));
if (extras.size() != 0) {
Normalizer2 normalizer = Normalizer2.getNFKCInstance();
for (String current : extras) {
- if (!TO_TRY.containsAll(current))
- continue;
- if (!normalizer.isNormalized(current) || ruleBasedCollator.compare(current, "a") < 0) {
+ if (!normalizer.isNormalized(current) || ruleBasedCollator.compare(current, "9") <= 0) {
continue;
}
- int script = UScript.getScript(current.codePointAt(0));
+ int script = getFirstRealScript(current);
+ if (script == UScript.UNKNOWN && !isUnassignedBoundary(current)) { continue; }
if (results[script] == null) {
results[script] = current;
} else if (ruleBasedCollator.compare(current, results[script]) < 0) {
} catch (Exception e) {
} // why have a checked exception???
- results[UScript.LATIN] = "A"; // See comment about en_US_POSIX in the implementation.
// TODO: We should not test that we get the same strings, but that we
// get strings that sort primary-equal to those from the implementation.
- // This whole test becomes obsolete when the root collator adds script-first-primary mappings
- // and the AlphabeticIndex implementation starts using them.
Collection<String> result = new ArrayList<String>();
for (int i = 0; i < results.length; ++i) {
result.add(results[i]);
}
}
- // AlphabeticIndex also has a boundary string for the ultimate overflow bucket,
- // for unassigned code points and trailing/special primary weights.
- result.add("\uFFFF");
return result;
}
+ private static final boolean isUnassignedBoundary(CharSequence s) {
+ // The root collator provides a script-first-primary boundary contraction
+ // for the unassigned-implicit range.
+ return s.charAt(0) == 0xfdd1 &&
+ UScript.getScript(Character.codePointAt(s, 1)) == UScript.UNKNOWN;
+ }
public void TestZZZ() {
// int x = 3;
assertEquals("getBucketIndex(i)", 9, bucketIndex);
bucketIndex = index.getBucketIndex("\u03B1");
assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
- // TODO: Test with an unassigned code point (not just U+FFFF)
- // when unassigned code points are not in the Hani reordering group any more.
- // String unassigned = UTF16.valueOf(0x50005);
+ // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group.
+ bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005));
+ assertEquals("getBucketIndex(U+50005)", 27, bucketIndex);
bucketIndex = index.getBucketIndex("\uFFFF");
assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
}
RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.CHINESE);
coll.setReorderCodes(UScript.HAN);
AlphabeticIndex index = new AlphabeticIndex(coll);
- assertEquals("getBucketCount()", 1, index.getBucketCount()); // ... (underflow only)
+ assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ...
index.addLabels(ULocale.CHINESE);
assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ...
int bucketIndex = index.getBucketIndex("\u897f");
assertEquals("getBucketIndex(i)", 9, bucketIndex);
bucketIndex = index.getBucketIndex("\u03B1");
assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
- // TODO: Test with an unassigned code point (not just U+FFFF)
- // when unassigned code points are not in the Hani reordering group any more.
- // String unassigned = UTF16.valueOf(0x50005);
+ // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group.
+ bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005));
+ assertEquals("getBucketIndex(U+50005)", 27, bucketIndex);
bucketIndex = index.getBucketIndex("\uFFFF");
assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
}
assertEquals("label 4", "ㄈ", immIndex.getBucket(4).getLabel());
assertEquals("label 5", "ㄉ", immIndex.getBucket(5).getLabel());
}
+
+ public void TestJapaneseKanji() {
+ AlphabeticIndex index = new AlphabeticIndex(ULocale.JAPANESE);
+ AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex();
+ // There are no index characters for Kanji in the Japanese standard collator.
+ // They should all go into the overflow bucket.
+ final int[] kanji = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 };
+ int overflowIndex = immIndex.getBucketCount() - 1;
+ for(int i = 0; i < kanji.length; ++i) {
+ String msg = String.format("kanji[%d]=U+%04X in overflow bucket", i, kanji[i]);
+ assertEquals(msg, overflowIndex, immIndex.getBucketIndex(UTF16.valueOf(kanji[i])));
+ }
+ }
+
+ public void TestFrozenCollator() {
+ // Ticket #9472
+ RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(new ULocale("da"));
+ coll.setStrength(Collator.IDENTICAL);
+ coll.freeze();
+ // The AlphabeticIndex constructor used to throw an exception
+ // because it cloned the collator (which preserves frozenness)
+ // and set the clone's strength to PRIMARY.
+ AlphabeticIndex index = new AlphabeticIndex(coll);
+ assertEquals("same strength as input Collator",
+ Collator.IDENTICAL, index.getCollator().getStrength());
+ }
}
/*
*******************************************************************************
- * Copyright (C) 2002-2013, International Business Machines Corporation and
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
byte[] bytes = sortk1.toByteArray();
doAssert(bytes.length == 3 && bytes[0] == 1 && bytes[1] == 1
&& bytes[2] == 0,
- "Empty string should return an empty collation key");
+ "Empty string should return a collation key with empty levels");
+
+ // Most control codes and CGJ are completely ignorable.
+ // A string with only completely ignorables must compare equal to an empty string.
+ CollationKey sortkIgnorable = col.getCollationKey("\u0001\u034f");
+ doAssert(sortkIgnorable != null && sortkIgnorable.toByteArray().length == 3,
+ "Completely ignorable string should return a collation key with empty levels");
+ doAssert(sortkIgnorable.compareTo(sortk1) == 0,
+ "Completely ignorable string should compare equal to empty string");
+
// bogus key returned here
sortk1 = col.getCollationKey(null);
doAssert(sortk1 == null, "Error code should return bogus collation key");
// Collator col2 = (Collator)col1.clone();
// doAssert(col1.equals(col2), "Cloned object is not equal to the orginal");
- String ruleset = "< a, A < b, B < c, C < d, D, e, E";
+ String ruleset = "&9 < a, A < b, B < c, C < d, D, e, E";
RuleBasedCollator col3 = null;
try {
col3 = new RuleBasedCollator(ruleset);
} catch (Exception e) {
- errln("Failure creating RuleBasedCollator with rule:" + ruleset);
+ errln("Failure creating RuleBasedCollator with rule: \"" + ruleset + "\"\n" + e);
return;
}
doAssert(!col1.equals(col3), "Cloned object is equal to some dummy");
order1 = iterator1.next();
doAssert(!(iterator1.equals(iterator2)), "The first iterator advance failed");
order2 = iterator2.next();
-
- doAssert((iterator1.equals(iterator2)), "The second iterator advance failed");
+
+ // In ICU 52 and earlier we had iterator1.equals(iterator2)
+ // but in ICU 53 this fails because the iterators differ (String vs. CharacterIterator).
+ // doAssert((iterator1.equals(iterator2)), "The second iterator advance failed");
+ doAssert(iterator1.getOffset() == iterator2.getOffset(), "The second iterator advance failed");
doAssert((order1 == order2), "The order result should be the same");
order3 = iterator3.next();
doAssert(!(iterator1.equals(iterator2)), "The first iterator advance failed");
order2 = iterator2.next();
-
- doAssert((iterator1.equals(iterator2)), "The second iterator advance failed");
+
+ // In ICU 52 and earlier we had iterator1.equals(iterator2)
+ // but in ICU 53 this fails because the iterators differ (String vs. CharacterIterator).
+ // doAssert((iterator1.equals(iterator2)), "The second iterator advance failed");
+ doAssert(iterator1.getOffset() == iterator2.getOffset(), "The second iterator advance failed");
doAssert((order1 == order2), "The order result should be the same");
order3 = iterator3.next();
// rather than hardcoding (and updating each time) a particular UCA version.
VersionInfo ucdVersion = UCharacter.getUnicodeVersion();
VersionInfo ucaVersion = col.getUCAVersion();
- doAssert(logKnownIssue("9101", "update to collv2 & UCA 6.3") ?
- ucdVersion.getMajor() == 6 && ucdVersion.getMinor() == 3 :
- ucaVersion.equals(ucdVersion),
+ doAssert(ucaVersion.equals(ucdVersion),
"Expected UCA version "+ucdVersion.toString()+" got "+col.getUCAVersion().toString());
doAssert((col.compare("ab", "abc") < 0), "ab < abc comparison failed");
coll = new RuleBasedCollator(rules[i]);
set = coll.getTailoredSet();
logln("Got set: "+set.toPattern(true));
- if(set.size() != data[i].length) {
- errln("Tailored set size different ("+set.size()+") than expected ("+data[i].length+")");
+ if(set.size() < data[i].length) {
+ errln("Tailored set size smaller ("+set.size()+") than expected ("+data[i].length+")");
}
for(j = 0; j < data[i].length; j++) {
logln("Checking to see whether "+data[i][j]+" is in set");
{
class TestCollator extends Collator
{
+ @Override
public boolean equals(Object that) {
return this == that;
}
+ @Override
public int hashCode() {
return 0;
}
+ @Override
public int compare(String source, String target) {
return source.compareTo(target);
}
+ @Override
public CollationKey getCollationKey(String source)
{ return new CollationKey(source,
getRawCollationKey(source, new RawCollationKey()));
}
+ @Override
public RawCollationKey getRawCollationKey(String source,
RawCollationKey key)
{
return key;
}
+ @Override
public void setVariableTop(int ce)
{
if (isFrozen()) {
}
}
+ @Override
public int setVariableTop(String str)
{
if (isFrozen()) {
return 0;
}
+ @Override
public int getVariableTop()
{
return 0;
}
+ @Override
public VersionInfo getVersion()
{
return VersionInfo.getInstance(0);
}
+ @Override
public VersionInfo getUCAVersion()
{
return VersionInfo.getInstance(0);
errln("Error getting default tailored set");
}
}
-
- /**
- * Simple test the collator setter and getters
+
+ /**
+ * Simple test the collator setter and getters.
+ * Similar to C++ apicoll.cpp TestAttribute().
*/
public void TestSetGet()
{
errln("Setting case first handling default failed");
}
}
-
+
+ public void TestVariableTopSetting() {
+ RuleBasedCollator coll = (RuleBasedCollator)Collator.getInstance();
+
+ int oldVarTop = coll.getVariableTop();
+
+ // ICU 53+: The character must be in a supported reordering group,
+ // and the variable top is pinned to the end of that group.
+ try {
+ coll.setVariableTop("A");
+ errln("setVariableTop(letter) did not detect illegal argument");
+ } catch(IllegalArgumentException expected) {
+ }
+
+ // dollar sign (currency symbol)
+ int newVarTop = coll.setVariableTop("$");
+
+ if(newVarTop != coll.getVariableTop()) {
+ errln("setVariableTop(dollar sign) != following getVariableTop()");
+ }
+
+ String dollar = "$";
+ String euro = "\u20AC";
+ int newVarTop2 = coll.setVariableTop(euro);
+ assertEquals("setVariableTop(Euro sign) == following getVariableTop()",
+ newVarTop2, coll.getVariableTop());
+ assertEquals("setVariableTop(Euro sign) == setVariableTop(dollar sign) (should pin to top of currency group)",
+ newVarTop2, newVarTop);
+
+ coll.setAlternateHandlingShifted(true);
+ assertEquals("empty==dollar", 0, coll.compare("", dollar)); // UCOL_EQUAL
+ assertEquals("empty==euro", 0, coll.compare("", euro)); // UCOL_EQUAL
+ assertEquals("dollar<zero", -1, coll.compare(dollar, "0")); // UCOL_LESS
+
+ coll.setVariableTop(oldVarTop);
+
+ int newerVarTop = coll.setVariableTop("$");
+
+ if(newVarTop != newerVarTop) {
+ errln("Didn't set vartop properly from String!\n");
+ }
+ }
+
+ public void TestMaxVariable() {
+ RuleBasedCollator coll = (RuleBasedCollator)Collator.getInstance(ULocale.ROOT);
+
+ try {
+ coll.setMaxVariable(Collator.ReorderCodes.OTHERS);
+ errln("setMaxVariable(others) did not detect illegal argument");
+ } catch(IllegalArgumentException expected) {
+ }
+
+ coll.setMaxVariable(Collator.ReorderCodes.CURRENCY);
+
+ if(Collator.ReorderCodes.CURRENCY != coll.getMaxVariable()) {
+ errln("setMaxVariable(currency) != following getMaxVariable()");
+ }
+
+ coll.setAlternateHandlingShifted(true);
+ assertEquals("empty==dollar", 0, coll.compare("", "$")); // UCOL_EQUAL
+ assertEquals("empty==euro", 0, coll.compare("", "\u20AC")); // UCOL_EQUAL
+ assertEquals("dollar<zero", -1, coll.compare("$", "0")); // UCOL_LESS
+ }
+
+ public void TestGetLocale() {
+ String rules = "&a<x<y<z";
+
+ Collator coll = Collator.getInstance(new ULocale("root"));
+ ULocale locale = coll.getLocale(ULocale.ACTUAL_LOCALE);
+ if(!locale.equals(ULocale.ROOT)) {
+ errln("Collator.getInstance(\"root\").getLocale(actual) != ULocale.ROOT; " +
+ "getLocale().getName() = \"" + locale.getName() + "\"");
+ }
+
+ coll = Collator.getInstance(new ULocale(""));
+ locale = coll.getLocale(ULocale.ACTUAL_LOCALE);
+ if(!locale.equals(ULocale.ROOT)) {
+ errln("Collator.getInstance(\"\").getLocale(actual) != ULocale.ROOT; " +
+ "getLocale().getName() = \"" + locale.getName() + "\"");
+ }
+
+ int i = 0;
+
+ String[][] testStruct = {
+ // requestedLocale, validLocale, actualLocale
+ // Note: ULocale.ROOT.getName() == "" not "root".
+ { "de_DE", "de_DE", "" },
+ { "sr_RS", "sr_Cyrl_RS", "sr" },
+ { "en_US_CALIFORNIA", "en_US", "" },
+ { "fr_FR_NONEXISTANT", "fr_FR", "" },
+ // pinyin is the default, therefore suppressed.
+ { "zh_CN", "zh_Hans_CN", "zh" },
+ // zh_Hant has default=stroke but the data is in zh.
+ { "zh_TW", "zh_Hant_TW", "zh@collation=stroke" },
+ { "zh_TW@collation=pinyin", "zh_Hant_TW@collation=pinyin", "zh" },
+ { "zh_CN@collation=stroke", "zh_Hans_CN@collation=stroke", "zh@collation=stroke" }
+ };
+
+ /* test opening collators for different locales */
+ for(i = 0; i<testStruct.length; i++) {
+ String requestedLocale = testStruct[i][0];
+ String validLocale = testStruct[i][1];
+ String actualLocale = testStruct[i][2];
+ try {
+ coll = Collator.getInstance(new ULocale(requestedLocale));
+ } catch(Exception e) {
+ errln(String.format("Failed to open collator for %s with %s", requestedLocale, e));
+ continue;
+ }
+ // Note: C++ getLocale() recognizes ULOC_REQUESTED_LOCALE
+ // which does not exist in Java.
+ locale = coll.getLocale(ULocale.VALID_LOCALE);
+ if(!locale.equals(new ULocale(validLocale))) {
+ errln(String.format("[Coll %s]: Error in valid locale, expected %s, got %s",
+ requestedLocale, validLocale, locale.getName()));
+ }
+ locale = coll.getLocale(ULocale.ACTUAL_LOCALE);
+ if(!locale.equals(new ULocale(actualLocale))) {
+ errln(String.format("[Coll %s]: Error in actual locale, expected %s, got %s",
+ requestedLocale, actualLocale, locale.getName()));
+ }
+ // If we open a collator for the actual locale, we should get an equivalent one again.
+ Collator coll2;
+ try {
+ coll2 = Collator.getInstance(locale);
+ } catch(Exception e) {
+ errln(String.format("Failed to open collator for actual locale \"%s\" with %s",
+ locale.getName(), e));
+ continue;
+ }
+ ULocale actual2 = coll2.getLocale(ULocale.ACTUAL_LOCALE);
+ if(!actual2.equals(locale)) {
+ errln(String.format("[Coll actual \"%s\"]: Error in actual locale, got different one: \"%s\"",
+ locale.getName(), actual2.getName()));
+ }
+ if(!coll2.equals(coll)) {
+ errln(String.format("[Coll actual \"%s\"]: Got different collator than before",
+ locale.getName()));
+ }
+ }
+
+ /* completely non-existent locale for collator should get a default collator */
+ {
+ Collator defaultColl = Collator.getInstance();
+ try {
+ coll = Collator.getInstance(new ULocale("blahaha"));
+ } catch(Exception e) {
+ errln("Failed to open collator with " + e);
+ return;
+ }
+ if(!coll.getLocale(ULocale.VALID_LOCALE).equals(
+ defaultColl.getLocale(ULocale.VALID_LOCALE))) {
+ errln("Valid locale for nonexisting locale locale collator differs " +
+ "from valid locale for default collator");
+ }
+ if(!coll.getLocale(ULocale.ACTUAL_LOCALE).equals(
+ defaultColl.getLocale(ULocale.ACTUAL_LOCALE))) {
+ errln("Actual locale for nonexisting locale locale collator differs " +
+ "from actual locale for default collator");
+ }
+ }
+
+ /* collator instantiated from rules should have all locales null */
+ try {
+ coll = new RuleBasedCollator(rules);
+ } catch (Exception e) {
+ errln("RuleBasedCollator(" + rules + ") failed: " + e);
+ return;
+ }
+ locale = coll.getLocale(ULocale.VALID_LOCALE);
+ if(locale != null) {
+ errln(String.format("For collator instantiated from rules, valid locale %s is not bogus",
+ locale.getName()));
+ }
+ locale = coll.getLocale(ULocale.ACTUAL_LOCALE);
+ if(locale != null) {
+ errln(String.format("For collator instantiated from rules, actual locale %s is not bogus",
+ locale.getName()));
+ }
+ }
+
public void TestBounds()
{
Collator coll = Collator.getInstance(new Locale("sh", ""));
}
return ok;
}
-
- public void TestGetContractions()throws Exception {
+
+ // capitst.c/TestGetContractionsAndUnsafes()
+ public void TestGetContractions() throws Exception {
/* static struct {
const char* locale;
const char* inConts;
"[jabv]"
},
{ "ja",
- "[{\u3053\u3099\u309D}{\u3053\u3099\u309D\u3099}{\u3053\u3099\u309E}{\u3053\u3099\u30FC}{\u3053\u309D}{\u3053\u309D\u3099}{\u3053\u309E}{\u3053\u30FC}{\u30B3\u3099\u30FC}{\u30B3\u3099\u30FD}{\u30B3\u3099\u30FD\u3099}{\u30B3\u3099\u30FE}{\u30B3\u30FC}{\u30B3\u30FD}{\u30B3\u30FD\u3099}{\u30B3\u30FE}]",
+ /*
+ * The "collv2" builder omits mappings if the collator maps their
+ * character sequences to the same CEs.
+ * For example, it omits Japanese contractions for NFD forms
+ * of the voiced iteration mark (U+309E = U+309D + U+3099), such as
+ * {\u3053\u3099\u309D\u3099}{\u3053\u309D\u3099}
+ * {\u30B3\u3099\u30FD\u3099}{\u30B3\u30FD\u3099}.
+ * It does add mappings for the precomposed forms.
+ */
+ "[{\u3053\u3099\u309D}{\u3053\u3099\u309E}{\u3053\u3099\u30FC}" +
+ "{\u3053\u309D}{\u3053\u309E}{\u3053\u30FC}" +
+ "{\u30B3\u3099\u30FC}{\u30B3\u3099\u30FD}{\u30B3\u3099\u30FE}" +
+ "{\u30B3\u30FC}{\u30B3\u30FD}{\u30B3\u30FE}]",
"[{\u30FD\u3099}{\u309D\u3099}{\u3053\u3099}{\u30B3\u3099}{lj}{nj}]",
"[\u30FE\u00e6]",
"[a]",
"[]"
}
};
-
-
-
-
+
RuleBasedCollator coll = null;
int i = 0;
UnicodeSet conts = new UnicodeSet();
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
StringBuilder msg = new StringBuilder();
msg.append("With ").append(localeName).append(" collator\n and input string: ").append(string1).append('\n');
msg.append(" failed to produce identical keys on both collators\n");
- msg.append(" localeCollator key: ").append(CollationMiscTest.prettify(k1)).append('\n');
- msg.append(" ruleCollator key: ").append(CollationMiscTest.prettify(k2)).append('\n');
+ msg.append(" localeCollator key: ").append(CollationTest.prettify(k1)).append('\n');
+ msg.append(" ruleCollator key: ").append(CollationTest.prettify(k2)).append('\n');
errln(msg.toString());
}
}
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target){
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
-}
\ No newline at end of file
+}
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
public class CollationDummyTest extends TestFmwk {
public static void main(String[] args) throws Exception {
new CollationDummyTest().run(args);
- // new CollationDummyTest().TestVariableTop();
}
//testSourceCases[][] and testTargetCases[][], testCases[][] are ported from the file callcoll.c in icu4c
errln("Failed : non-tailored supplementary characters should have the same value\n");
}
}
-
+
+ private static final boolean SUPPORT_VARIABLE_TOP_RELATION = false;
//TestVariableTop() is ported from cintltst/callcoll.c
/**
* Tests the [variable top] tag in rule syntax. Since the default [alternate]
* a primary ce of 0.
*/
public void TestVariableTop() {
+ /*
+ * Starting with ICU 53, setting the variable top via a pseudo relation string
+ * is not supported any more.
+ * It was replaced by the [maxVariable symbol] setting.
+ * See ICU tickets #9958 and #8032.
+ */
+ if(!SUPPORT_VARIABLE_TOP_RELATION) { return; }
String rule = "&z = [variable top]";
Collator myColl;
Collator enColl;
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target){
- if (result == -1) //LESS
- {
- target += "LESS";
- }
- else if (result == 0) //EQUAL
- {
- target += "EQUAL";
- }
- else if (result == 1) //GREATER
- {
- target += "GREATER";
- }
- else
- {
- String huh = "?";
-
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
- }
\ No newline at end of file
+ }
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target) {
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
}
-
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target) {
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
}
/*
*******************************************************************************
- * Copyright (C) 2011, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2011-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String msg4 = "; expected ";
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + src + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + src + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + src + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + src + msg2 + target + msg3 + sResult);
} else {
}
}
}
-
- String appendCompareResult(int result, String target) {
- if (result == -1) { //LESS
- target += "LESS";
- } else if (result == 0) { //EQUAL
- target += "EQUAL";
- } else if (result == 1) { //GREATER
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
}
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target){
- if (result == -1) //LESS
- {
- target += "LESS";
- }
- else if (result == 0) //EQUAL
- {
- target += "EQUAL";
- }
- else if (result == 1) //GREATER
- {
- target += "GREATER";
- }
- else
- {
- String huh = "?";
-
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
-}
\ No newline at end of file
+}
/*
*******************************************************************************
- * Copyright (C) 2002-2011, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
}
/**
- * TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
- * normalization on AND jamo tailoring, among other things.
- */
+ * TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
+ * normalization on AND jamo tailoring, among other things.
+ *
+ * Note: This test is sensitive to changes of the root collator,
+ * for example whether the ae-ligature maps to three CEs (as in the DUCET)
+ * or to two CEs (as in the CLDR 24 FractionalUCA.txt).
+ * It is also sensitive to how those CEs map to the iterator's 32-bit CE encoding.
+ * For example, the DUCET's artificial secondary CE in the ae-ligature
+ * may map to two 32-bit iterator CEs (as it did until ICU 52).
+ */
public void TestSearchCollatorElements()
{
String tsceText =
12, 13,14,15,
16, 17,18,19,
20, 21,22,23,
- 24, 25,26,26,26,
+ 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */
26, 27,28,28,
28,
29
12, 13,14,15,
16, 17,18,19,20,
20, 21,22,22,23,23,23,24,
- 24, 25,26,26,26,
+ 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */
26, 27,28,28,
28,
29
do {
offset = uce.getOffset();
element = uce.next();
+ logln(String.format("(%s) offset=%2d ce=%08x\n", tsceItem.localeString, offset, element));
if (element == 0) {
errln("Error: in locale " + localeString + ", CEIterator next() returned element 0");
}
if ( ioff < noff ) {
errln("Error: in locale " + localeString + ", CEIterator next() returned fewer elements than expected");
}
-
- /*
- // Skip the backwards test until ticket #8382 is fixed
+
+ // backwards test
uce.setOffset(tsceText.length());
ioff = noff;
do {
if ( ioff > 0 ) {
errln("Error: in locale " + localeString + ", CEIterator previous() returned fewer elements than expected");
}
- */
}
}
}
/*
*******************************************************************************
- * Copyright (C) 2002-2012, International Business Machines Corporation and
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target){
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
-}
\ No newline at end of file
+}
/*
*******************************************************************************
- * Copyright (C) 2002-2012, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.ICUResourceBundle;
-import com.ibm.icu.impl.ImplicitCEGenerator;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
// have a code point associated to it anymore
// "&[before 3][last primary ignorable]<<<k",
// - all befores here amount to zero
+ /* "you cannot go before ...": The parser now sets an error for such nonsensical rules.
"&[before 3][first tertiary ignorable]<<<a",
- "&[before 3][last tertiary ignorable]<<<a",
+ "&[before 3][last tertiary ignorable]<<<a", */
+ /*
+ * However, there is a real secondary ignorable (artificial addition in FractionalUCA.txt),
+ * and it *is* possible to "go before" that.
+ */
"&[before 3][first secondary ignorable]<<<a",
"&[before 3][last secondary ignorable]<<<a",
// 'normal' befores
- "&[before 3][first primary ignorable]<<<c<<<b &[first primary ignorable]<a",
+ /*
+ * Note: With a "SPACE first primary" boundary CE in FractionalUCA.txt,
+ * it is not possible to tailor &[first primary ignorable]<a or &[last primary ignorable]<a
+ * because there is no tailoring space before that boundary.
+ * Made the tests work by tailoring to a space instead.
+ */
+ "&[before 3][first primary ignorable]<<<c<<<b &' '<a", /* was &[first primary ignorable]<a */
// we don't have a code point that corresponds to the last primary
// ignorable
- "&[before 3][last primary ignorable]<<<c<<<b &[last primary ignorable]<a",
+ "&[before 3][last primary ignorable]<<<c<<<b &' '<a", /* was &[last primary ignorable]<a */
"&[before 3][first variable]<<<c<<<b &[first variable]<a",
"&[last variable]<a &[before 3][last variable]<<<c<<<b ",
"&[first regular]<a &[before 1][first regular]<b",
"&[before 1][last regular]<b &[last regular]<a",
"&[before 1][first implicit]<b &[first implicit]<a",
- "&[before 1][last implicit]<b &[last implicit]<a",
- "&[last variable]<z&[last primary ignorable]<x&[last secondary ignorable]<<y&[last tertiary ignorable]<<<w&[top]<u",
+ /* The current builder does not support tailoring to unassigned-implicit CEs (seems unnecessary, adds complexity).
+ "&[before 1][last implicit]<b &[last implicit]<a", */
+ "&[last variable]<z" +
+ "&' '<x" + /* was &[last primary ignorable]<x, see above */
+ "&[last secondary ignorable]<<y&[last tertiary ignorable]<<<w&[top]<u",
};
String[][] data = {
// {"k", "\u20e3"},
+ /* "you cannot go before ...": The parser now sets an error for such nonsensical rules.
{"\\u0000", "a"}, // you cannot go before first tertiary ignorable
- {"\\u0000", "a"}, // you cannot go before last tertiary ignorable
- {"\\u0000", "a"}, // you cannot go before first secondary ignorable
- {"\\u0000", "a"}, // you cannot go before first secondary ignorable
+ {"\\u0000", "a"}, // you cannot go before last tertiary ignorable */
+ /*
+ * However, there is a real secondary ignorable (artificial addition in FractionalUCA.txt),
+ * and it *is* possible to "go before" that.
+ */
+ {"\\u0000", "a"},
+ {"\\u0000", "a"},
+ /*
+ * Note: With a "SPACE first primary" boundary CE in FractionalUCA.txt,
+ * it is not possible to tailor &[first primary ignorable]<a or &[last primary ignorable]<a
+ * because there is no tailoring space before that boundary.
+ * Made the tests work by tailoring to a space instead.
+ */
{"c", "b", "\\u0332", "a"},
{"\\u0332", "\\u20e3", "c", "b", "a"},
{"c", "b", "\\u0009", "a", "\\u000a"},
// [last regular [CE 27, 05, 05]] # U+1342E EGYPTIAN HIEROGLYPH AA032
{LAST_REGULAR_CHAR_STRING, "b", /* [last regular] */ "a", "\\u4e00"},
{"b", "\\u4e00", "a", "\\u4e01"},
- {"b", "\\U0010FFFD", "a"},
+ /* The current builder does not support tailoring to unassigned-implicit CEs (seems unnecessary, adds complexity).
+ {"b", "\\U0010FFFD", "a"}, */
{"\ufffb", "w", "y", "\u20e3", "x", LAST_VARIABLE_CHAR_STRING, "z", "u"},
};
// logln("Rules starter for " + rules);
genericOrderingTestWithResult(coll, s, result);
} catch (Exception e) {
- warnln("Unable to open collator with rules " + rules);
+ warnln("Unable to open collator with rules " + rules + ": " + e);
}
}
String msg4 = "; expected ";
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
// logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
// logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
// logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
}
}
- String appendCompareResult(int result, String target) {
- if (result == -1) { //LESS
- target += "LESS";
- } else if (result == 0) { //EQUAL
- target += "EQUAL";
- } else if (result == 1) { //GREATER
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- static String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- StringBuilder target = new StringBuilder("[");
-
- for (i = 0; i < bytes.length; i++) {
- String numStr = Integer.toHexString(bytes[i] & 0xff);
- if (numStr.length() < 2) {
- target.append('0');
- }
- target.append(numStr).append(' ');
- }
- target.append(']');
- return target.toString();
- }
-
public void TestBeforePrefixFailure() {
String[] rules = {
"&g <<< a&[before 3]\uff41 <<< x",
}
public void TestContractionClosure() {
+ // Note: This was also ported to the data-driven test, see collationtest.txt.
String[] rules = {
"&b=\u00e4\u00e4",
"&b=\u00C5",
};
// logln("mixed case test");
// logln("lower first, case level off");
- genericRulesStarter("[casefirst lower]&H<ch<<<Ch<<<CH", lowerFirst);
+ genericRulesStarter("[caseFirst lower]&H<ch<<<Ch<<<CH", lowerFirst);
// logln("upper first, case level off");
- genericRulesStarter("[casefirst upper]&H<ch<<<Ch<<<CH", upperFirst);
+ genericRulesStarter("[caseFirst upper]&H<ch<<<Ch<<<CH", upperFirst);
// logln("lower first, case level on");
- genericRulesStarter("[casefirst lower][caselevel on]&H<ch<<<Ch<<<CH", lowerFirst);
+ genericRulesStarter("[caseFirst lower][caseLevel on]&H<ch<<<Ch<<<CH", lowerFirst);
// logln("upper first, case level on");
- genericRulesStarter("[casefirst upper][caselevel on]&H<ch<<<Ch<<<CH", upperFirst);
+ genericRulesStarter("[caseFirst upper][caseLevel on]&H<ch<<<Ch<<<CH", upperFirst);
}
}
}
public void TestImplicitTailoring() {
- String rules[] = { "&[before 1]\u4e00 < b < c &[before 1]\u4e00 < d < e",
- "&\u4e00 < a <<< A < b <<< B",
- "&[before 1]\u4e00 < \u4e01 < \u4e02",
- "&[before 1]\u4e01 < \u4e02 < \u4e03",
+ String rules[] = {
+ /* Tailor b and c before U+4E00. */
+ "&[before 1]\u4e00 < b < c " +
+ /* Now, before U+4E00 is c; put d and e after that. */
+ "&[before 1]\u4e00 < d < e",
+ "&\u4e00 < a <<< A < b <<< B",
+ "&[before 1]\u4e00 < \u4e01 < \u4e02",
+ "&[before 1]\u4e01 < \u4e02 < \u4e03",
};
String cases[][] = {
- { "d", "e", "b", "c", "\u4e00"},
- { "\u4e00", "a", "A", "b", "B", "\u4e01"},
- { "\u4e01", "\u4e02", "\u4e00"},
- { "\u4e02", "\u4e03", "\u4e01"},
+ { "b", "c", "d", "e", "\u4e00" },
+ { "\u4e00", "a", "A", "b", "B", "\u4e01" },
+ { "\u4e01", "\u4e02", "\u4e00" },
+ { "\u4e02", "\u4e03", "\u4e01" },
};
int i = 0;
for(i = 0; i < rules.length; i++) {
genericRulesStarter(rules[i], cases[i]);
}
-
}
public void TestFCDProblem() {
public void TestJ3087()
{
- String rule[] = {"&h<H&CH=\u0427",
- "&CH=\u0427&h<H",
- "&CH=\u0427"};
+ String rule[] = {
+ "&h<H&CH=\u0427",
+ /*
+ * The ICU 53 builder adheres to the principle that
+ * a rule is affected by previous rules but not following ones.
+ * Therefore, setting CH=\u0427 and then re-tailoring H makes CH != \u0427.
+ "&CH=\u0427&h<H", */
+ "&CH=\u0427"
+ };
RuleBasedCollator rbc = null;
CollationElementIterator iter1;
CollationElementIterator iter2;
rbc = new RuleBasedCollator(rule[i]);
} catch (Exception e) {
warnln(e.getMessage());
- return;
+ continue;
}
iter1 = rbc.getCollationElementIterator("CH");
iter2 = rbc.getCollationElementIterator("\u0427");
int ce1 = CollationElementIterator.IGNORABLE;
int ce2 = CollationElementIterator.IGNORABLE;
+ // The ICU 53 builder code sets the uppercase flag only on the first CE.
+ int mask = ~0;
while (ce1 != CollationElementIterator.NULLORDER
&& ce2 != CollationElementIterator.NULLORDER) {
ce1 = iter1.next();
ce2 = iter2.next();
- if (ce1 != ce2) {
+ if ((ce1 & mask) != (ce2 & mask)) {
errln("Error generating RuleBasedCollator with the rule "
+ rule[i]);
errln("CH != \\u0427");
}
+ mask = ~0xc0; // mask off case/continuation bits
}
}
}
+ "&u<\u01d6<\u01d8<\u01da<\u01dc<\u00fc", data);
}
- public void TestRedundantRules() {
- String[] rules = {
- //"& a <<< b <<< c << d <<< e& [before 1] e <<< x",
- "& b <<< c <<< d << e <<< f& [before 3] f <<< x",
- "& a < b <<< c << d <<< e& [before 1] e <<< x",
- "& a < b < c < d& [before 1] c < m",
- "& a < b <<< c << d <<< e& [before 3] e <<< x",
- "& a < b <<< c << d <<< e& [before 2] e <<< x",
- "& a < b <<< c << d <<< e <<< f < g& [before 1] g < x",
- "& a <<< b << c < d& a < m",
- "&a<b<<b\u0301 &z<b",
- "&z<m<<<q<<<m",
- "&z<<<m<q<<<m",
- "& a < b < c < d& r < c",
- "& a < b < c < d& r < c",
- "& a < b < c < d& c < m",
- "& a < b < c < d& a < m"
- };
-
- String[] expectedRules = {
- //"&\u2089<<<x",
- "&\u0252<<<x",
- "& a <<< x < b <<< c << d <<< e",
- "& a < b < m < c < d",
- "& a < b <<< c << d <<< x <<< e",
- "& a < b <<< c <<< x << d <<< e",
- "& a < b <<< c << d <<< e <<< f < x < g",
- "& a <<< b << c < m < d",
- "&a<b\u0301 &z<b",
- "&z<q<<<m",
- "&z<q<<<m",
- "& a < b < d& r < c",
- "& a < b < d& r < c",
- "& a < b < c < m < d",
- "& a < m < b < c < d"
- };
-
- String[][] testdata = {
- // {"\u2089", "x"},
- {"\u0252", "x"},
- {"a", "x", "b", "c", "d", "e"},
- {"a", "b", "m", "c", "d"},
- {"a", "b", "c", "d", "x", "e"},
- {"a", "b", "c", "x", "d", "e"},
- {"a", "b", "c", "d", "e", "f", "x", "g"},
- {"a", "b", "c", "m", "d"},
- {"a", "b\u0301", "z", "b"},
- {"z", "q", "m"},
- {"z", "q", "m"},
- {"a", "b", "d"},
- {"r", "c"},
- {"a", "b", "c", "m", "d"},
- {"a", "m", "b", "c", "d"}
- };
-
- String rlz = "";
- for(int i = 0; i<rules.length; i++) {
- logln("testing rule " + rules[i] + ", expected to be" + expectedRules[i]);
- try {
- rlz = rules[i];
- Collator credundant = new RuleBasedCollator(rlz);
- rlz = expectedRules[i];
- Collator cresulting = new RuleBasedCollator(rlz);
- logln(" credundant Rule:" + ((RuleBasedCollator)credundant).getRules());
- logln(" cresulting Rule:" + ((RuleBasedCollator)cresulting).getRules());
- } catch (Exception e) {
- warnln("Cannot create RuleBasedCollator");
- }
- //testAgainstUCA(cresulting, credundant, "expected", TRUE, &status);
- // logln("testing using data\n");
- genericRulesStarter(rules[i], testdata[i]);
- }
- }
-
- public void TestExpansionSyntax() {
- String[] rules = {
- "&AE <<< a << b <<< c &d <<< f",
- "&AE <<< a <<< b << c << d < e < f <<< g",
- "&AE <<< B <<< C / D <<< F"
- };
-
- String[] expectedRules = {
- "&A <<< a / E << b / E <<< c /E &d <<< f",
- "&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g",
- "&A <<< B / E <<< C / ED <<< F / E"
- };
-
- String[][] testdata = {
- {"AE", "a", "b", "c"},
- {"AE", "a", "b", "c", "d", "e", "f", "g"},
- {"AE", "B", "C"} // / ED <<< F / E"},
- };
-
- for(int i = 0; i<rules.length; i++) {
- // logln("testing rule " + rules[i] + ", expected to be " + expectedRules[i]);
- try {
- String rlz = rules[i];
- Collator credundant = new RuleBasedCollator(rlz);
- rlz = expectedRules[i];
- Collator cresulting = new RuleBasedCollator(rlz);
- logln(" credundant Rule:" + ((RuleBasedCollator)credundant).getRules());
- logln(" cresulting Rule:" + ((RuleBasedCollator)cresulting).getRules());
- } catch (Exception e) {
- warnln(e.getMessage());
- }
- // testAgainstUCA still doesn't handle expansions correctly, so this is not run
- // as a hard error test, but only in information mode
- //testAgainstUCA(cresulting, credundant, "expected", FALSE, &status);
-
- // logln("testing using data");
- genericRulesStarter(rules[i], testdata[i]);
- }
- }
-
public void TestHangulTailoring() {
String[] koreanData = {
"\uac00", "\u4f3d", "\u4f73", "\u5047", "\u50f9", "\u52a0", "\u53ef", "\u5475",
}
// logln("Using start of korean rules\n");
genericOrderingTest(coll, koreanData);
- // logln("Setting jamoSpecial to TRUE and testing once more\n");
-
- // can't set jamo in icu4j
- // ((UCATableHeader *)coll->image)->jamoSpecial = TRUE; // don't try this at home
- // genericOrderingTest(coll, koreanData);
// no such locale in icu4j
// logln("Using ko__LOTUS locale\n");
"\u0063\u0068",
"\u0063\u006C",
};
+ /*
+ * These pairs of rule strings are not guaranteed to yield the very same mappings.
+ * In fact, LDML 24 recommends an improved way of creating mappings
+ * which always yields different mappings for such pairs. See
+ * http://www.unicode.org/reports/tr35/tr35-33/tr35-collation.html#Orderings
String[] testrules3 = {
"&z < xyz &xyzw << B",
"&z < xyz &xyz << B / w",
"&\ud800\udc00 << B / w",
"&a\ud800\udc00m << B",
"&a << B / \ud800\udc00m",
- };
+ }; */
RuleBasedCollator coll = null;
for (int i = 0; i < testrules.length; i ++) {
errln("Expected " + testdata2[1] + " < " + testdata2[2]);
return;
}
- for (int i = 0; i < testrules3.length; i += 2) {
+ /* see above -- for (int i = 0; i < testrules3.length; i += 2) {
RuleBasedCollator coll1, coll2;
CollationElementIterator iter1, iter2;
char ch = 0x0042;
errln("CEs not exhausted\n");
return;
}
- }
+ } */
}
public void TestExpansion() {
String[] testrules = {
+ /*
+ * This seems to have tested that M was not mapped to an expansion.
+ * I believe the old builder just did that because it computed the extension CEs
+ * at the very end, which was a bug.
+ * Among other problems, it violated the core tailoring principle
+ * by making an earlier rule depend on a later one.
+ * And, of course, if M did not get an expansion, then it was primary different from K,
+ * unlike what the rule &K<<M says.
"&J << K / B & K << M",
+ */
"&J << K / B << M"
};
String[] testdata = {
>= 0) {
errln("Error while comparing prefixed keys @ strength "
+ strength);
- errln(prettify(mergedPrefixKeys[i-1]));
- errln(prettify(mergedPrefixKeys[i]));
+ errln(CollationTest.prettify(mergedPrefixKeys[i-1]));
+ errln(CollationTest.prettify(mergedPrefixKeys[i]));
}
if (mergedSuffixKeys[i-1].compareTo(mergedSuffixKeys[i])
>= 0) {
errln("Error while comparing suffixed keys @ strength "
+ strength);
- errln(prettify(mergedSuffixKeys[i-1]));
- errln(prettify(mergedSuffixKeys[i]));
+ errln(CollationTest.prettify(mergedSuffixKeys[i-1]));
+ errln(CollationTest.prettify(mergedSuffixKeys[i]));
}
}
}
public void TestVariableTop()
{
+ // ICU 53+: The character must be in a supported reordering group,
+ // and the variable top is pinned to the end of that group.
// parseNextToken is not released as public so i create my own rules
- String rules = "& a < b < c < de < fg & hi = j";
+ String rules = "& ' ' < b < c < de < fg & hi = j";
try {
RuleBasedCollator coll = new RuleBasedCollator(rules);
- String tokens[] = {"a", "b", "c", "de", "fg", "hi", "j", "ab"};
+ String tokens[] = {" ", "b", "c", "de", "fg", "hi", "j", "ab"};
coll.setAlternateHandlingShifted(true);
for (int i = 0; i < tokens.length; i ++) {
int varTopOriginal = coll.getVariableTop();
}
}
+ // ported from cmsccoll.c
+ public void TestVariableTopSetting() {
+ int varTopOriginal = 0, varTop1, varTop2;
+ Collator coll = Collator.getInstance(ULocale.ROOT);
+
+ String empty = "";
+ String space = " ";
+ String dot = "."; /* punctuation */
+ String degree = "\u00b0"; /* symbol */
+ String dollar = "$"; /* currency symbol */
+ String zero = "0"; /* digit */
+
+ varTopOriginal = coll.getVariableTop();
+ logln(String.format("coll.getVariableTop(root) -> %08x", varTopOriginal));
+ ((RuleBasedCollator)coll).setAlternateHandlingShifted(true);
+
+ varTop1 = coll.setVariableTop(space);
+ varTop2 = coll.getVariableTop();
+ logln(String.format("coll.setVariableTop(space) -> %08x", varTop1));
+ if(varTop1 != varTop2 ||
+ !coll.equals(empty, space) ||
+ coll.equals(empty, dot) ||
+ coll.equals(empty, degree) ||
+ coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(space, dot) >= 0) {
+ errln("coll.setVariableTop(space) did not work");
+ }
+
+ varTop1 = coll.setVariableTop(dot);
+ varTop2 = coll.getVariableTop();
+ logln(String.format("coll.setVariableTop(dot) -> %08x", varTop1));
+ if(varTop1 != varTop2 ||
+ !coll.equals(empty, space) ||
+ !coll.equals(empty, dot) ||
+ coll.equals(empty, degree) ||
+ coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(dot, degree) >= 0) {
+ errln("coll.setVariableTop(dot) did not work");
+ }
+
+ varTop1 = coll.setVariableTop(degree);
+ varTop2 = coll.getVariableTop();
+ logln(String.format("coll.setVariableTop(degree) -> %08x", varTop1));
+ if(varTop1 != varTop2 ||
+ !coll.equals(empty, space) ||
+ !coll.equals(empty, dot) ||
+ !coll.equals(empty, degree) ||
+ coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(degree, dollar) >= 0) {
+ errln("coll.setVariableTop(degree) did not work");
+ }
+
+ varTop1 = coll.setVariableTop(dollar);
+ varTop2 = coll.getVariableTop();
+ logln(String.format("coll.setVariableTop(dollar) -> %08x", varTop1));
+ if(varTop1 != varTop2 ||
+ !coll.equals(empty, space) ||
+ !coll.equals(empty, dot) ||
+ !coll.equals(empty, degree) ||
+ !coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(dollar, zero) >= 0) {
+ errln("coll.setVariableTop(dollar) did not work");
+ }
+
+ logln("Testing setting variable top to contractions");
+ try {
+ coll.setVariableTop("@P");
+ errln("Invalid contraction succeeded in setting variable top!");
+ } catch(Exception expected) {
+ }
+
+ logln("Test restoring variable top");
+ coll.setVariableTop(varTopOriginal);
+ if(varTopOriginal != coll.getVariableTop()) {
+ errln("Couldn't restore old variable top");
+ }
+ }
+
+ // ported from cmsccoll.c
+ public void TestMaxVariable() {
+ int oldMax, max;
+
+ String empty = "";
+ String space = " ";
+ String dot = "."; /* punctuation */
+ String degree = "\u00b0"; /* symbol */
+ String dollar = "$"; /* currency symbol */
+ String zero = "0"; /* digit */
+
+ Collator coll = Collator.getInstance(ULocale.ROOT);
+
+ oldMax = coll.getMaxVariable();
+ logln(String.format("coll.getMaxVariable(root) -> %04x", oldMax));
+ ((RuleBasedCollator)coll).setAlternateHandlingShifted(true);
+
+ coll.setMaxVariable(Collator.ReorderCodes.SPACE);
+ max = coll.getMaxVariable();
+ logln(String.format("coll.setMaxVariable(space) -> %04x", max));
+ if(max != Collator.ReorderCodes.SPACE ||
+ !coll.equals(empty, space) ||
+ coll.equals(empty, dot) ||
+ coll.equals(empty, degree) ||
+ coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(space, dot) >= 0) {
+ errln("coll.setMaxVariable(space) did not work");
+ }
+
+ coll.setMaxVariable(Collator.ReorderCodes.PUNCTUATION);
+ max = coll.getMaxVariable();
+ logln(String.format("coll.setMaxVariable(punctuation) -> %04x", max));
+ if(max != Collator.ReorderCodes.PUNCTUATION ||
+ !coll.equals(empty, space) ||
+ !coll.equals(empty, dot) ||
+ coll.equals(empty, degree) ||
+ coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(dot, degree) >= 0) {
+ errln("coll.setMaxVariable(punctuation) did not work");
+ }
+
+ coll.setMaxVariable(Collator.ReorderCodes.SYMBOL);
+ max = coll.getMaxVariable();
+ logln(String.format("coll.setMaxVariable(symbol) -> %04x", max));
+ if(max != Collator.ReorderCodes.SYMBOL ||
+ !coll.equals(empty, space) ||
+ !coll.equals(empty, dot) ||
+ !coll.equals(empty, degree) ||
+ coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(degree, dollar) >= 0) {
+ errln("coll.setMaxVariable(symbol) did not work");
+ }
+
+ coll.setMaxVariable(Collator.ReorderCodes.CURRENCY);
+ max = coll.getMaxVariable();
+ logln(String.format("coll.setMaxVariable(currency) -> %04x", max));
+ if(max != Collator.ReorderCodes.CURRENCY ||
+ !coll.equals(empty, space) ||
+ !coll.equals(empty, dot) ||
+ !coll.equals(empty, degree) ||
+ !coll.equals(empty, dollar) ||
+ coll.equals(empty, zero) ||
+ coll.compare(dollar, zero) >= 0) {
+ errln("coll.setMaxVariable(currency) did not work");
+ }
+
+ logln("Test restoring maxVariable");
+ coll.setMaxVariable(oldMax);
+ if(oldMax != coll.getMaxVariable()) {
+ errln("Couldn't restore old maxVariable");
+ }
+ }
+
public void TestUCARules()
{
try {
genericLocaleStarter(new Locale("zh", "", "PINYIN"), test);
}
- static final long topByte = 0xFF000000L;
- static final long bottomByte = 0xFFL;
- static final long fourBytes = 0xFFFFFFFFL;
-
- static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
-
- private void show(int i, ImplicitCEGenerator imp) {
- if (i >= 0 && i <= MAX_INPUT) {
- logln(Utility.hex(i) + "\t" + Utility.hex(imp.getImplicitFromRaw(i) & fourBytes));
- }
- }
-
- private void throwError(String title, int cp, ImplicitCEGenerator imp) {
- throw new IllegalArgumentException(title + "\t" + Utility.hex(cp, 6) + "\t" + Utility.hex(imp.getImplicitFromRaw(cp) & fourBytes));
- }
-
- private void throwError(String title, long ce) {
- errln(title + "\t" + Utility.hex(ce & fourBytes));
- }
-
- public void TestImplicitGeneration()
- {
- logln("Start");
- try {
- ImplicitCEGenerator foo = new ImplicitCEGenerator(0xE0, 0xE4);
-
- //int x = foo.getRawImplicit(0xF810);
- foo.getRawFromImplicit(0xE20303E7);
-
- //int gap4 = foo.getGap4();
- //logln("Gap4: " + gap4);
- //int gap3 = foo.getGap3();
- //int minTrail = foo.getMinTrail();
- //int maxTrail = foo.getMaxTrail();
- long last = 0;
- long current;
- for (int i = 0; i <= MAX_INPUT; ++i) {
- current = foo.getImplicitFromRaw(i) & fourBytes;
-
- // check that it round-trips AND that all intervening ones are illegal
- int roundtrip = foo.getRawFromImplicit((int)current);
- if (roundtrip != i) {
- throwError("No roundtrip", i, foo);
- }
- if (last != 0) {
- for (long j = last + 1; j < current; ++j) {
- roundtrip = foo.getRawFromImplicit((int)j);
- // raise an error if it *doesn't* find an error
- if (roundtrip != -1) {
- throwError("Fails to recognize illegal", j);
- }
- }
- }
- // now do other consistency checks
- long lastBottom = last & bottomByte;
- long currentBottom = current & bottomByte;
- long lastTop = last & topByte;
- long currentTop = current & topByte;
-
- // do some consistency checks
- /*
- long gap = current - last;
- if (currentBottom != 0) { // if we are a 4-byte
- // gap has to be at least gap4
- // and gap from minTrail, maxTrail has to be at least gap4
- if (gap <= gap4) foo.throwError("Failed gap4 between", i);
- if (currentBottom < minTrail + gap4) foo.throwError("Failed gap4 before", i);
- if (currentBottom > maxTrail - gap4) foo.throwError("Failed gap4 after", i);
- } else { // we are a three-byte
- gap = gap >> 8; // move gap down for comparison.
- long current3Bottom = (current >> 8) & bottomByte;
- if (gap <= gap3) foo.throwError("Failed gap3 between ", i);
- if (current3Bottom < minTrail + gap3) foo.throwError("Failed gap3 before", i);
- if (current3Bottom > maxTrail - gap3) foo.throwError("Failed gap3 after", i);
- }
- */
- // print out some values for spot-checking
- if (lastTop != currentTop || i == 0x10000 || i == 0x110000) {
- show(i-3, foo);
- show(i-2, foo);
- show(i-1, foo);
- if (i == 0) {
- // do nothing
- } else if (lastBottom == 0 && currentBottom != 0) {
- logln("+ primary boundary, 4-byte CE's below");
- } else if (lastTop != currentTop) {
- logln("+ primary boundary");
- }
- show(i, foo);
- show(i+1, foo);
- show(i+2, foo);
- logln("...");
- }
- last = current;
- if(foo.getCodePointFromRaw(foo.getRawFromCodePoint(i)) != i) {
- errln("No raw <-> code point roundtrip for "+Utility.hex(i));
- }
- }
- show(MAX_INPUT-2, foo);
- show(MAX_INPUT-1, foo);
- show(MAX_INPUT, foo);
- } catch (Exception e) {
- e.printStackTrace();
- warnln(e.getMessage());
- } finally {
- logln("End");
- }
- }
-
/* supercedes TestJ784 */
public void TestBeforePinyin() {
String rules =
"&a < \u00e2 <<< \u00c2",
"&a < \u1FF3 ", // OMEGA WITH YPOGEGRAMMENI
"&s < \u0161 ", // &s < s with caron
- "&z < a\u00EA", // &z < a+e with circumflex
+ /*
+ * Note: Just tailoring &z<ae^ does not work as expected:
+ * The UCA spec requires for discontiguous contractions that they
+ * extend an *existing match* by one combining mark at a time.
+ * Therefore, ae must be a contraction so that the builder finds
+ * discontiguous contractions for ae^, for example with an intervening underdot.
+ * Only then do we get the expected tail closure with a\u1EC7, a\u1EB9\u0302, etc.
+ */
+ "&x < ae &z < a\u00EA", // &x < ae &z < a+e with circumflex
};
String cases[][] = {
{ "\u1EAC", "A\u0323\u0302", "\u1EA0\u0302", "\u00C2\u0323", },
CollationKey key = coll.getCollationKey(cases[i][j]);
if ( key.compareTo(expectingKey)!=0) {
errln("Error! Test case["+i+"]:"+"source:" + key.getSourceString());
- errln("expecting:"+prettify(expectingKey)+ "got:"+ prettify(key));
+ errln("expecting:"+CollationTest.prettify(expectingKey)+ "got:"+ CollationTest.prettify(key));
}
- logln(" Key:"+ prettify(key));
+ logln(" Key:"+ CollationTest.prettify(key));
}
}
// errln("source:" + key.getSourceString());
// errln("expecting:"+prettify(expectingKey)+ "got:"+ prettify(key));
logln("Error!! in Vietnese sort - source:" + key.getSourceString());
- logln("expecting:"+prettify(expectingKey)+ "got:"+ prettify(key));
+ logln("expecting:"+CollationTest.prettify(expectingKey)+ "got:"+ CollationTest.prettify(key));
}
// logln("source:" + key.getSourceString());
- logln(" Key:"+ prettify(key));
+ logln(" Key:"+ CollationTest.prettify(key));
}
} catch (Exception e) {
warnln("Error creating Vietnese collator");
try {
coll = new RuleBasedCollator(rules[i]);
} catch (Exception e) {
- warnln("Unable to open collator with rules " + rules[i]);
+ warnln("Unable to open collator with rules " + rules[i] + ": " + e);
+ return;
}
logln("Test rule["+i+"]"+rules[i]);
CollationKey keyA = coll.getCollationKey("a");
- logln("Key for \"a\":"+ prettify(keyA));
+ logln("Key for \"a\":"+ CollationTest.prettify(keyA));
if (keyA.compareTo(coll.getCollationKey(lastPrimIgn))<=0) {
CollationKey key = coll.getCollationKey(lastPrimIgn);
- logln("Collation key for 0xD800 0xDDFD: "+prettify(key));
+ logln("Collation key for 0xD800 0xDDFD: "+CollationTest.prettify(key));
errln("Error! String \"a\" must be greater than \uD800\uDDFD -"+
"[Last Primary Ignorable]");
}
if (keyA.compareTo(coll.getCollationKey(firstVariable))>=0) {
CollationKey key = coll.getCollationKey(firstVariable);
- logln("Collation key for 0x0009: "+prettify(key));
+ logln("Collation key for 0x0009: "+CollationTest.prettify(key));
errln("Error! String \"a\" must be less than 0x0009 - [First Variable]");
}
CollationKey keyB = coll.getCollationKey("b");
- logln("Key for \"b\":"+ prettify(keyB));
+ logln("Key for \"b\":"+ CollationTest.prettify(keyB));
if (keyB.compareTo(coll.getCollationKey(firstPrimIgn))<=0) {
CollationKey key = coll.getCollationKey(firstPrimIgn);
- logln("Collation key for 0x0332: "+prettify(key));
+ logln("Collation key for 0x0332: "+CollationTest.prettify(key));
errln("Error! String \"b\" must be greater than 0x0332 -"+
"[First Primary Ignorable]");
}
if (keyB.compareTo(coll.getCollationKey(firstVariable))>=0) {
CollationKey key = coll.getCollationKey(firstVariable);
- logln("Collation key for 0x0009: "+prettify(key));
+ logln("Collation key for 0x0009: "+CollationTest.prettify(key));
errln("Error! String \"b\" must be less than 0x0009 - [First Variable]");
}
}
logln("Test rule["+i+"]"+rules[i]);
CollationKey keyA = coll.getCollationKey("a");
- logln("Key for \"a\":"+ prettify(keyA));
+ logln("Key for \"a\":"+ CollationTest.prettify(keyA));
byte[] keyAInBytes = keyA.toByteArray();
for (int j=0; j<keyAInBytes.length && j<secIgnKey.length; j++) {
if (keyAInBytes[j]!=secIgnKey[j]) {
if (keyA.compareTo(coll.getCollationKey(firstVariable))>=0) {
errln("Error! String \"a\" must be less than 0x0009 - [First Variable]");
CollationKey key = coll.getCollationKey(firstVariable);
- logln("Collation key for 0x0009: "+prettify(key));
+ logln("Collation key for 0x0009: "+CollationTest.prettify(key));
}
CollationKey keyB = coll.getCollationKey("b");
- logln("Key for \"b\":"+ prettify(keyB));
+ logln("Key for \"b\":"+ CollationTest.prettify(keyB));
byte[] keyBInBytes = keyB.toByteArray();
for (int j=0; j<keyBInBytes.length && j<secIgnKey.length; j++) {
if (keyBInBytes[j]!=secIgnKey[j]) {
}
if (keyB.compareTo(coll.getCollationKey(firstVariable))>=0) {
CollationKey key = coll.getCollationKey(firstVariable);
- logln("Collation key for 0x0009: "+prettify(key));
+ logln("Collation key for 0x0009: "+CollationTest.prettify(key));
errln("Error! String \"b\" must be less than 0x0009 - [First Variable]");
}
}
errln("expecting:"+prettify(expectingKey)+ "got:"+ prettify(key));
}
*/
- logln("String:"+cases[j]+" Key:"+ prettify(key));
+ logln("String:"+cases[j]+" Key:"+ CollationTest.prettify(key));
}
} catch (Exception e) {
warnln("Error creating English collator");
" is not >= previous test string.");
}
}
- logln("String:"+cases[j]+" Key:"+ prettify(key));
+ logln("String:"+cases[j]+" Key:"+ CollationTest.prettify(key));
}
} catch (Exception e) {
warnln("Error creating Japanese collator");
}
}
}
- logln("String:"+cases[j]+" Key:"+ prettify(key));
+ logln("String:"+cases[j]+" Key:"+ CollationTest.prettify(key));
}
}
}
try {
myCollation = new RuleBasedCollator(rule);
} catch (Exception e) {
- warnln("ERROR: in creation of rule based collator");
+ warnln("ERROR: in creation of rule based collator: " + e);
return;
}
// are working fine.
private OneTestCase[] m_rangeTestCasesSupplemental_ = {
// Left Right Result
- new OneTestCase( "\ufffe", "\uffff", -1 ),
- new OneTestCase( "\uffff", "\ud800\udc00", -1 ), // U+FFFF < U+10000
+ new OneTestCase( "\u4e00", "\ufffb", -1 ),
+ new OneTestCase( "\ufffb", "\ud800\udc00", -1 ), // U+FFFB < U+10000
new OneTestCase( "\ud800\udc00", "\ud800\udc01", -1 ), // U+10000 < U+10001
- new OneTestCase( "\ufffe", "\ud800\udc01", -1 ), // U+FFFE < U+10001
+ new OneTestCase( "\u4e00", "\ud800\udc01", -1 ), // U+4E00 < U+10001
new OneTestCase( "\ud800\udc01", "\ud800\udc02", -1 ), // U+10001 < U+10002
new OneTestCase( "\ud800\udc00", "\ud840\udc02", -1 ), // U+10000 < U+10002
- new OneTestCase( "\ufffe", "\u0d840\udc02", -1 ), // U+FFFF < U+10002
+ new OneTestCase( "\u4e00", "\ud840\udc02", -1 ), // U+4E00 < U+20002
};
public void TestSameStrengthListWithSupplementalCharacters() {
String[] rules = new String[] {
// ** Rule without compact list syntax **
- // \ufffe < \uffff < \U00010000 < \U00010001 < \U00010002
- "&'\ufffe'<'\uffff'<'\ud800\udc00'<'\ud800\udc01'<'\ud800\udc02' " +
+ // \u4e00 < \ufffb < \U00010000 < \U00010001 < \U00010002
+ "&\u4e00<\ufffb<'\ud800\udc00'<'\ud800\udc01'<'\ud800\udc02' " +
// \U00010000 << \U00020001 << \U00020002 \U00020002
"&'\ud800\udc00'<<'\ud840\udc01'<<'\ud840\udc02'<<'\ud840\udc02' " +
// \U00020001 = \U0003001 = \U0004001 = \U0004002
- "&'\ud840\udc01'='\ud880\udc01'='\ud8c0\udc01'='\ud8c0\udc02'" +
- // \U00040008 < \U00030008 < \U00020008
- "&'\ud8c0\udc08'<'\ud880\udc08'<'\ud840\udc08'",
+ "&'\ud840\udc01'='\ud880\udc01'='\ud8c0\udc01'='\ud8c0\udc02'",
// ** Rule with compact list syntax **
- // \ufffe <* \uffff\U00010000 \U00010001
- "&'\ufffe'<*'\uffff\ud800\udc00\ud800\udc01\ud800\udc02' " +
+ // \u4e00 <* \ufffb \U00010000 \U00010001 \U00010002
+ "&\u4e00<*'\ufffb\ud800\udc00\ud800\udc01\ud800\udc02' " +
// \U00010000 <<* \U00020001 \U00020002
"&'\ud800\udc00'<<*'\ud840\udc01\ud840\udc02\ud840\udc03' " +
// \U00020001 =* \U0003001 \U0003002 \U0003003 \U0004001
- "&'\ud840\udc01'=*'\ud880\udc01\ud880\udc02\ud880\udc03\ud8c0\udc01' " +
- // \U00040008 <* \U00030008 \U00030009 \U0003000a \U00020008
- "&'\ud8c0\udc08'<*'\ud880\udc08\ud880\udc09\ud880\udc0a\ud840\udc08'",
+ "&'\ud840\udc01'=*'\ud880\udc01\ud880\udc02\ud880\udc03\ud8c0\udc01' "
};
doTestCollation(m_rangeTestCasesSupplemental_, rules);
// Tests the compact range syntax with supplemental codepoints.
public void TestSameStrengthListRangesWithSupplementalCharacters() {
String[] rules = new String[] {
- // \ufffe <* \uffff\U00010000 \U00010001
- "&'\ufffe'<*'\uffff'-'\ud800\udc02' " +
+ // \u4e00 <* \ufffb, then the range \U00010000-\U00010002
+ "&\u4e00<*'\ufffb'\ud800\udc00-'\ud800\udc02' " +
// \U00010000 <<* \U00020001 - \U00020003
"&'\ud800\udc00'<<*'\ud840\udc01'-'\ud840\udc03' " +
// \U00020001 =* \U0003001 \U0004001
- "&'\ud840\udc01'=*'\ud880\udc01'-'\ud880\udc03\ud8c0\udc01' " +
- // \U00040008 <* \U00030008 \U00020008
- "&'\ud8c0\udc08'<*'\ud880\udc08'-'\ud880\udc0a\ud840\udc08'",
+ "&'\ud840\udc01'=*'\ud880\udc01'-'\ud880\udc03\ud8c0\udc01' "
};
doTestCollation(m_rangeTestCasesSupplemental_, rules);
}
public void TestInvalidListsAndRanges() {
String[] invalidRules = new String[] {
// Range not in starred expression
- "&'\ufffe'<'\uffff'-'\ud800\udc02'",
+ "&\u4e00<\ufffb-'\ud800\udc02'",
// Range without start
"&a<*-c",
if (sortKey1.compareTo(sortKey2) >= 0) {
errln("TestHungarianTailoring getCollationKey(\"" + str1 +"\") was suppose "+
"less than getCollationKey(\""+ str2 + "\").");
- errln(" getCollationKey(\"ggy\"):" + prettify(sortKey1) +
- " getCollationKey(\"GGY\"):" + prettify(sortKey2));
+ errln(" getCollationKey(\"ggy\"):" + CollationTest.prettify(sortKey1) +
+ " getCollationKey(\"GGY\"):" + CollationTest.prettify(sortKey2));
}
CollationElementIterator iter1 = coll.getCollationElementIterator(str1);
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String msg4 = "; expected ";
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + src + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + src + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + src + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + src + msg2 + target + msg3 + sResult);
} else {
}
}
}
-
- String appendCompareResult(int result, String target) {
- if (result == -1) { //LESS
- target += "LESS";
- } else if (result == 0) { //EQUAL
- target += "EQUAL";
- } else if (result == 1) { //GREATER
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
}
/*
*******************************************************************************
- * Copyright (C) 2002-2013, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
// Collator -> rules -> Collator round-trip broken for expanding characters
//
public void Test4051866() {
- String rules = "< o & oe ,o\u3080& oe ,\u1530 ,O& OE ,O\u3080& OE ,\u1520< p ,P";
+ String rules = "&n < o & oe ,o\u3080& oe ,\u1530 ,O& OE ,O\u3080& OE ,\u1520< p ,P";
// Build a collator containing expanding characters
RuleBasedCollator c1 = null;
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- String appendCompareResult(int result, String target) {
- if (result == -1) { //LESS
- target += "LESS";
- } else if (result == 0) { //EQUAL
- target += "EQUAL";
- } else if (result == 1) { //GREATER
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
-
+
// @bug 4054736
//
// Full Decomposition mode not implemented
//
public void Test4059820(/* char* par */) {
RuleBasedCollator c = null;
- String rules = "< a < b , c/a < d < z";
+ String rules = "&9 < a < b , c/a < d < z";
try {
c = new RuleBasedCollator(rules);
} catch (Exception e) {
// MergeCollation::fixEntry broken for "& H < \u0131, \u0130, i, I"
//
public void Test4060154(/* char* par */) {
- String rules ="< g, G < h, H < i, I < j, J & H < \u0131, \u0130, i, I";
+ String rules ="&f < g, G < h, H < i, I < j, J & H < \u0131, \u0130, i, I";
RuleBasedCollator c = null;
try {
c = new RuleBasedCollator(rules);
} catch (Exception e) {
//System.out.println(e);
- errln("failure building collator.");
+ errln("failure building collator:" + e);
return;
}
public void Test4078588(/* char *par */) {
RuleBasedCollator rbc = null;
try {
- rbc = new RuleBasedCollator("< a < bb");
+ rbc = new RuleBasedCollator("&9 < a < bb");
} catch (Exception e) {
errln("Failed to create RuleBasedCollator.");
return;
c.setStrength(Collator.SECONDARY);
String tests[] = {
"\u007a", "\u003c", "\u00E6", // z < ae
- "\u0061\u0308", "\u003c", "\u0061\u030A", // a-unlaut < a-ring
+ "\u0061\u0308", "\u003c", "\u0061\u030A", // a-umlaut < a-ring
"\u0059", "\u003c", "\u0075\u0308", // Y < u-umlaut
};
compareArray(c, tests);
//
public void Test4101940(/* char* par */) {
RuleBasedCollator c = null;
- String rules = "< a < b";
+ String rules = "&9 < a < b";
String nothing = "";
try {
c = new RuleBasedCollator(rules);
iter.setOffset(5);
int elt5 = CollationElementIterator.primaryOrder(iter.next());
- if (elt4 != elt0 || elt5 != elt0)
- errln("The collation elements at positions 0 (" + elt0 + "), 4 ("
- + elt4 + "), and 5 (" + elt5 + ") don't match.");
+ // Compares and prints only 16-bit primary weights.
+ if (elt4 != elt0 || elt5 != elt0) {
+ errln(String.format("The collation elements at positions 0 (0x%04x), " +
+ "4 (0x%04x), and 5 (0x%04x) don't match.",
+ elt0, elt4, elt5));
+ }
// test that the "cat" combination works properly
iter.setOffset(14);
iter.setOffset(19);
int elt19 = CollationElementIterator.primaryOrder(iter.next());
+ // Compares and prints only 16-bit primary weights.
if (elt14 != elt15 || elt14 != elt16 || elt14 != elt17
- || elt14 != elt18 || elt14 != elt19)
- errln("\"cat\" elements don't match: elt14 = " + elt14 + ", elt15 = "
- + elt15 + ", elt16 = " + elt16 + ", elt17 = " + elt17
- + ", elt18 = " + elt18 + ", elt19 = " + elt19);
+ || elt14 != elt18 || elt14 != elt19) {
+ errln(String.format("\"cat\" elements don't match: elt14 = 0x%04x, " +
+ "elt15 = 0x%04x, elt16 = 0x%04x, elt17 = 0x%04x, " +
+ "elt18 = 0x%04x, elt19 = 0x%04x",
+ elt14, elt15, elt16, elt17, elt18, elt19));
+ }
// now generate a complete list of the collation elements,
// first using next() and then using setOffset(), and
throw new Exception("\"a<a\" collation sequence didn't cause parse error!");
}
- RuleBasedCollator collator = new RuleBasedCollator("<\u00e0=a\u0300");
+ RuleBasedCollator collator = new RuleBasedCollator("&a<\u00e0=a\u0300");
//commented by Kevin 2003/10/21
//for "FULL_DECOMPOSITION is not supported here." in ICU4J DOC
//collator.setDecomposition(Collator.FULL_DECOMPOSITION);
caseFirstCompressionSub(col, "lower first");
}
+ public void TestTrailingComment() throws Exception {
+ // ICU ticket #8070:
+ // Check that the rule parser handles a comment without terminating end-of-line.
+ RuleBasedCollator coll = new RuleBasedCollator("&c<b#comment1\n<a#comment2");
+ assertTrue("c<b", coll.compare("c", "b") < 0);
+ assertTrue("b<a", coll.compare("b", "a") < 0);
+ }
+
+ public void TestBeforeWithTooStrongAfter() {
+ // ICU ticket #9959:
+ // Forbid rules with a before-reset followed by a stronger relation.
+ try {
+ new RuleBasedCollator("&[before 2]x<<q<p");
+ errln("should forbid before-2-reset followed by primary relation");
+ } catch(Exception expected) {
+ }
+ try {
+ new RuleBasedCollator("&[before 3]x<<<q<<s<p");
+ errln("should forbid before-3-reset followed by primary or secondary relation");
+ } catch(Exception expected) {
+ }
+ }
+
/*
* Compare two strings - "aaa...A" and "aaa...a" with
* Collation#compare and CollationKey#compareTo, called from
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target){
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
}
/**
*******************************************************************************
- * Copyright (C) 2001-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2001-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
+ * CollationTest.java, ported from collationtest.cpp
+ * C++ version created on: 2012apr27
+ * created by: Markus W. Scherer
*/
package com.ibm.icu.dev.test.collator;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Locale;
-import java.util.MissingResourceException;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.text.ParseException;
+import java.util.HashSet;
+import java.util.Set;
-import com.ibm.icu.dev.test.ModuleTest;
-import com.ibm.icu.dev.test.TestDataModule.DataMap;
import com.ibm.icu.dev.test.TestFmwk;
-import com.ibm.icu.impl.LocaleUtility;
+import com.ibm.icu.dev.test.TestUtil;
+import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Utility;
-import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.impl.coll.Collation;
+import com.ibm.icu.impl.coll.CollationData;
+import com.ibm.icu.impl.coll.CollationFCD;
+import com.ibm.icu.impl.coll.CollationIterator;
+import com.ibm.icu.impl.coll.CollationRoot;
+import com.ibm.icu.impl.coll.CollationRootElements;
+import com.ibm.icu.impl.coll.CollationRuleParser;
+import com.ibm.icu.impl.coll.CollationWeights;
+import com.ibm.icu.impl.coll.FCDIterCollationIterator;
+import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
+import com.ibm.icu.impl.coll.UTF16CollationIterator;
+import com.ibm.icu.impl.coll.UVector32;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.CollationKey;
import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.Collator.ReorderCodes;
+import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.IllformedLocaleException;
+import com.ibm.icu.util.Output;
+import com.ibm.icu.util.ULocale;
-public class CollationTest extends ModuleTest{
- // public methods --------------------------------------------------------
-
+public class CollationTest extends TestFmwk {
public static void main(String[] args) throws Exception{
new CollationTest().run(args);
}
public CollationTest() {
- super("com/ibm/icu/dev/data/testdata/", "DataDrivenCollationTest");
- }
-
- public void processModules() {
- for (Iterator iter = t.getSettingsIterator(); iter.hasNext();) {
- DataMap setting = (DataMap) iter.next();
- processSetting(setting);
- }
}
-
+
+ // Fields
+ Normalizer2 fcd, nfd;
+ Collator coll;
+ String fileLine;
+ int fileLineNumber;
+ String fileTestName;
+ Throwable error;
+
// package private methods ----------------------------------------------
static void doTest(TestFmwk test, RuleBasedCollator col, String source,
}
if (o != orders[index]) {
test.errln("Mismatch at index " + index + ": 0x"
- + Integer.toHexString(orders[index]) + " vs 0x" + Integer.toHexString(o));
+ + Utility.hex(orders[index]) + " vs 0x" + Utility.hex(o));
break;
}
}
iter.reset();
test.err("next: ");
while ((o = iter.next()) != CollationElementIterator.NULLORDER) {
- String hexString = "0x" + Integer.toHexString(o) + " ";
+ String hexString = "0x" + Utility.hex(o) + " ";
test.err(hexString);
}
test.errln("");
test.err("prev: ");
while ((o = iter.previous()) != CollationElementIterator.NULLORDER) {
- String hexString = "0x" + Integer.toHexString(o) + " ";
+ String hexString = "0x" + Utility.hex(o) + " ";
test.err(hexString);
}
test.errln("");
}
}
- // private data members --------------------------------------------------
-
- private String m_sequence_;
- private int m_sequenceIndex_;
- private String m_source_;
- private StringBuffer m_target_ = new StringBuffer();
- private int m_nextRelation_;
- private int m_relation_;
-
- // private methods -------------------------------------------------------
-
- private void processSetting(DataMap settings) {
- RuleBasedCollator col = null;
- // ok i have to be careful here since it seems like we can have
- // multiple locales for each test
- String locale = settings.getString("TestLocale");
-
- if (locale != null) {
- // this is a case where we have locale
- try {
- Locale l = LocaleUtility.getLocaleFromName(locale);
- col = (RuleBasedCollator)Collator.getInstance(l);
- }catch (MissingResourceException e){
- warnln("Could not load the locale data for locale " + locale);
- }catch (Exception e) {
- errln("Error creating collator for locale " + locale);
- }
- logln("Testing collator for locale " + locale);
- processSetting2(settings, col);
- }
- String rules = settings.getString("Rules");
- // ok i have to be careful here since it seems like we can have
- // multiple rules for each test
- if (rules != null) {
- // here we deal with rules
- try {
- col = new RuleBasedCollator(rules);
- }catch (MissingResourceException e){
- warnln("Could not load the locale data: " + e.getMessage());
- } catch (Exception e) {
- errln("Error creating collator for rules " + rules);
- }
- processSetting2(settings, col);
+ static final String appendCompareResult(int result, String target){
+ if (result == -1) {
+ target += "LESS";
+ } else if (result == 0) {
+ target += "EQUAL";
+ } else if (result == 1) {
+ target += "GREATER";
+ } else {
+ String huh = "?";
+ target += huh + result;
}
+ return target;
}
- private void processSetting2(DataMap settings,RuleBasedCollator col)
- {
-
- // ok i have to be careful here since it seems like we can have
- // multiple rules for each test
- String arguments = settings.getString("Arguments");
- if (arguments != null) {
- handleArguments(col, arguments);
- }
- processTestCases(col);
+ static final String prettify(CollationKey key) {
+ byte[] bytes = key.toByteArray();
+ return prettify(bytes, bytes.length);
}
- /**
- * Reads the options string and sets appropriate attributes in collator
- */
- private void handleArguments(RuleBasedCollator col, String argument) {
- int i = 0;
- boolean printInfo = false;
- while (i < argument.length()) {
- if (!UCharacter.isWhitespace(argument.charAt(i))) {
- // eat whitespace
- break;
- }
- i ++;
- }
- while (i < argument.length()) {
- // skip opening '['
- if (argument.charAt(i) == '[') {
- i ++;
- }
- else {
- if(!isModularBuild()){
- errln("Error in collation arguments, missing ["); // no opening '['
- }
- // !!! following line has no effect
- printInfo=true;
- return;
- }
-
- int value = argument.indexOf(' ', i);
- String option = argument.substring(i, value);
- i = argument.indexOf(']', value);
- String optionvalue = argument.substring(value + 1, i);
- i ++;
- // some options are not added because they have no public apis yet
- // TODO add the rest of the options
- if (option.equalsIgnoreCase("alternate")) {
- if (optionvalue.equalsIgnoreCase("non-ignorable")) {
- col.setAlternateHandlingShifted(false);
- }
- else {
- col.setAlternateHandlingShifted(true);
- }
- }
- else if (option.equals("strength")) {
- if (optionvalue.equalsIgnoreCase("1")) {
- col.setStrength(Collator.PRIMARY);
- }
- else if (optionvalue.equalsIgnoreCase("2")) {
- col.setStrength(Collator.SECONDARY);
- }
- else if (optionvalue.equalsIgnoreCase("3")) {
- col.setStrength(Collator.TERTIARY);
- }
- else if (optionvalue.equalsIgnoreCase("4")) {
- col.setStrength(Collator.QUATERNARY);
- }
- }
- }
- if (printInfo) {
- warnln("Could not load the locale data. Skipping...");
- }
- // !!! effect is odd, if no modular build, this emits no
- // message at all. How come? Hmmm. printInfo is never
- // true if we get here, so this code is never executed.
- /*
- if(printInfo == true && isModularBuild()){
- infoln("Could not load the locale data. Skipping...");
- }
- */
- }
-
- private void processTestCases(RuleBasedCollator col) {
- for (Iterator iter = t.getDataIterator(); iter.hasNext();) {
- DataMap e1 = (DataMap) iter.next();
- processSequence(col, e1.getString("sequence"));
- }
- }
-
- private void processSequence(RuleBasedCollator col, String sequence) {
- // TODO: have a smarter tester that remembers the sequence and ensures
- // that the complete sequence is in order. That is why I have made a
- // constraint in the sequence format.
- m_sequence_ = sequence;
- m_sequenceIndex_ = 0;
- m_nextRelation_ = -1;
- m_target_.delete(0, m_target_.length());
- List vector = new ArrayList();
- int lastsmallerthanindex = -1;
- getNextInSequence();
- while (getNextInSequence()) {
- String target = m_target_.toString();
- doTest(this, col, m_source_, target, m_relation_);
- int vsize = vector.size();
- for (int i = vsize - 1; i >= 0; i --) {
- String source = (String)vector.get(i);
- if (i > lastsmallerthanindex) {
- doTest(this, col, source, target, m_relation_);
- }
- else {
- doTest(this, col, source, target, -1);
- }
- }
- vector.add(target);
- if (m_relation_ < 0) {
- lastsmallerthanindex = vsize - 1;
- }
- }
+ static final String prettify(RawCollationKey key) {
+ return prettify(key.bytes, key.size);
}
- /**
- * Parses the sequence to be tested
- */
- private boolean getNextInSequence() {
- if (m_sequenceIndex_ >= m_sequence_.length()) {
- return false;
+ static final String prettify(byte[] skBytes, int length) {
+ StringBuilder target = new StringBuilder(length * 3 + 2).append('[');
+
+ for (int i = 0; i < length; i++) {
+ String numStr = Integer.toHexString(skBytes[i] & 0xff);
+ if (numStr.length() < 2) {
+ target.append('0');
+ }
+ target.append(numStr).append(' ');
}
-
- boolean quoted = false;
- boolean quotedsingle = false;
- boolean done = false;
- int i = m_sequenceIndex_;
- int offset = 0;
- m_source_ = m_target_.toString();
- m_relation_ = m_nextRelation_;
- m_target_.delete(0, m_target_.length());
- while (i < m_sequence_.length() && !done) {
- int ch = UTF16.charAt(m_sequence_, i);
- if (UCharacter.isSupplementary(ch)) {
- i += 2;
- }
- else {
- i ++;
- }
- if (!quoted) {
- if (UCharacter.isWhitespace(ch)) {
- continue;
- }
- switch (ch) {
- case 0x003C : // <
- m_nextRelation_ = -1;
- done = true;
- break;
- case 0x003D : // =
- m_nextRelation_ = 0;
- done = true;
- break;
- case 0x003E : // >
- m_nextRelation_ = 1;
- done = true;
- break;
- case 0x0027 : // ' very basic quoting
- quoted = true;
- quotedsingle = false;
- break;
- case 0x005c : // \ single quote
- quoted = true;
- quotedsingle = true;
- break;
- default:
- UTF16.insert(m_target_, offset, ch);
- if (UCharacter.isSupplementary(ch)) {
- offset += 2;
- }
- else {
- offset ++;
- }
- }
- }
- else {
- if (ch == 0x0027) {
- quoted = false;
- }
- else {
- UTF16.insert(m_target_, offset, ch);
- if (UCharacter.isSupplementary(ch)) {
- offset += 2;
- }
- else {
- offset ++;
- }
- }
- if (quotedsingle) {
- quoted = false;
- }
- }
- }
- if (quoted == true) {
- errln("Quote in sequence not closed!");
- return false;
- }
-
-
- m_sequenceIndex_ = i;
- return true;
+ target.append(']');
+ return target.toString();
}
private static void doTestVariant(TestFmwk test,
test.warnln("Could not load locale data skipping.");
}
}
+
+ public void TestMinMax() {
+ setRootCollator();
+ RuleBasedCollator rbc = (RuleBasedCollator)coll;
+
+ final String s = "\uFFFE\uFFFF";
+ long[] ces;
+
+ ces = rbc.internalGetCEs(s);
+ if (ces.length != 2) {
+ errln("expected 2 CEs for <FFFE, FFFF>, got " + ces.length);
+ return;
+ }
+
+ long ce = ces[0];
+ long expected = (Collation.MERGE_SEPARATOR_PRIMARY << 32) |
+ Collation.MERGE_SEPARATOR_LOWER32;
+ if (ce != expected) {
+ errln("CE(U+fffe)=0x" + Utility.hex(ce) + " != 02.02.02");
+ }
+
+ ce = ces[1];
+ expected = Collation.makeCE(Collation.MAX_PRIMARY);
+ if (ce != expected) {
+ errln("CE(U+ffff)=0x" + Utility.hex(ce) + " != max..");
+ }
+ }
+
+ public void TestImplicits() {
+ CollationData cd = CollationRoot.getData();
+
+ // Implicit primary weights should be assigned for the following sets,
+ // and sort in ascending order by set and then code point.
+ // See http://www.unicode.org/reports/tr10/#Implicit_Weights
+ // core Han Unified Ideographs
+ UnicodeSet coreHan = new UnicodeSet("[\\p{unified_ideograph}&"
+ + "[\\p{Block=CJK_Unified_Ideographs}"
+ + "\\p{Block=CJK_Compatibility_Ideographs}]]");
+ // all other Unified Han ideographs
+ UnicodeSet otherHan = new UnicodeSet("[\\p{unified ideograph}-"
+ + "[\\p{Block=CJK_Unified_Ideographs}"
+ + "\\p{Block=CJK_Compatibility_Ideographs}]]");
+
+ UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]");
+ unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
+
+ UnicodeSet[] sets = { coreHan, otherHan, unassigned };
+ int prev = 0;
+ long prevPrimary = 0;
+ UTF16CollationIterator ci = new UTF16CollationIterator(cd, false, "", 0);
+ for (int i = 0; i < sets.length; ++i) {
+ UnicodeSetIterator iter = new UnicodeSetIterator(sets[i]);
+ while (iter.next()) {
+ String s = iter.getString();
+ int c = s.codePointAt(0);
+ ci.setText(false, s, 0);
+ long ce = ci.nextCE();
+ long ce2 = ci.nextCE();
+ if (ce == Collation.NO_CE || ce2 != Collation.NO_CE) {
+ errln("CollationIterator.nextCE(0x" + Utility.hex(c)
+ + ") did not yield exactly one CE");
+ continue;
+
+ }
+ if ((ce & 0xffffffffL) != Collation.COMMON_SEC_AND_TER_CE) {
+ errln("CollationIterator.nextCE(U+" + Utility.hex(c, 4)
+ + ") has non-common sec/ter weights: 0x" + Utility.hex(ce & 0xffffffffL, 8));
+ continue;
+ }
+ long primary = ce >>> 32;
+ if (!(primary > prevPrimary)) {
+ errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary)
+ + ".. not greater than CE(U+" + Utility.hex(prev)
+ + ")=0x" + Utility.hex(prevPrimary) + "..");
+
+ }
+ prev = c;
+ prevPrimary = primary;
+ }
+ }
+ }
+
+ // ICU4C: TestNulTerminated / renamed for ICU4J
+ public void TestSubSequence() {
+ CollationData data = CollationRoot.getData();
+ final String s = "abab"; // { 0x61, 0x62, 0x61, 0x62 }
+
+ UTF16CollationIterator ci1 = new UTF16CollationIterator(data, false, s, 0);
+ UTF16CollationIterator ci2 = new UTF16CollationIterator(data, false, s, 2);
+
+ for (int i = 0; i < 2; ++i) {
+ long ce1 = ci1.nextCE();
+ long ce2 = ci2.nextCE();
+
+ if (ce1 != ce2) {
+ errln("CollationIterator.nextCE(with start position at 0) != "
+ + "nextCE(with start position at 2) at CE " + i);
+ }
+ }
+ }
+
+
+ // ICU4C: TestIllegalUTF8 / not applicable to ICU4J
+
+
+ private static void addLeadSurrogatesForSupplementary(UnicodeSet src, UnicodeSet dest) {
+ for(int c = 0x10000; c < 0x110000;) {
+ int next = c + 0x400;
+ if(src.containsSome(c, next - 1)) {
+ dest.add(UTF16.getLeadSurrogate(c));
+ }
+ c = next;
+ }
+ }
+
+ public void TestShortFCDData() {
+ UnicodeSet expectedLccc = new UnicodeSet("[:^lccc=0:]");
+ expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
+ addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
+
+ UnicodeSet lccc = new UnicodeSet(); // actual
+ for (int c = 0; c <= 0xffff; ++c) {
+ if (CollationFCD.hasLccc(c)) {
+ lccc.add(c);
+ }
+ }
+
+ UnicodeSet diff = new UnicodeSet(expectedLccc);
+ diff.removeAll(lccc);
+ diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
+
+ String empty = "[]";
+ String diffString;
+
+ diffString = diff.toPattern(true);
+ assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
+
+ diff = lccc;
+ diff.removeAll(expectedLccc);
+ diffString = diff.toPattern(true);
+ assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString);
+
+ UnicodeSet expectedTccc = new UnicodeSet("[:^tccc=0:]");
+ addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
+ addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
+
+ UnicodeSet tccc = new UnicodeSet(); // actual
+ for(int c = 0; c <= 0xffff; ++c) {
+ if (CollationFCD.hasTccc(c)) {
+ tccc.add(c);
+ }
+ }
+
+ diff = new UnicodeSet(expectedTccc);
+ diff.removeAll(tccc);
+ diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
+ assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
+
+ diff = tccc;
+ diff.removeAll(expectedTccc);
+ diffString = diff.toPattern(true);
+ assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
+ }
+
+ private static class CodePointIterator {
+ int[] cp;
+ int length;
+ int pos;
+
+ CodePointIterator(int[] cp) {
+ this.cp = cp;
+ this.length = cp.length;
+ this.pos = 0;
+ }
+
+ void resetToStart() {
+ pos = 0;
+ }
+
+ int next() {
+ return (pos < length) ? cp[pos++] : Collation.SENTINEL_CP;
+ }
+
+ int previous() {
+ return (pos > 0) ? cp[--pos] : Collation.SENTINEL_CP;
+ }
+
+ int getLength() {
+ return length;
+ }
+
+ int getIndex() {
+ return pos;
+ }
+ }
+
+ private void checkFCD(String name, CollationIterator ci, CodePointIterator cpi) {
+ // Iterate forward to the limit.
+ for (;;) {
+ int c1 = ci.nextCodePoint();
+ int c2 = cpi.next();
+ if (c1 != c2) {
+ errln(name + ".nextCodePoint(to limit, 1st pass) = U+" + Utility.hex(c1)
+ + " != U+" + Utility.hex(c1) + " at " + cpi.getIndex());
+ return;
+ }
+ if (c1 < 0) {
+ break;
+ }
+ }
+
+ // Iterate backward most of the way.
+ for (int n = (cpi.getLength() * 2) / 3; n > 0; --n) {
+ int c1 = ci.previousCodePoint();
+ int c2 = cpi.previous();
+ if (c1 != c2) {
+ errln(name + ".previousCodePoint() = U+" + Utility.hex(c1) +
+ " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
+ return;
+ }
+ }
+
+ // Forward again.
+ for (;;) {
+ int c1 = ci.nextCodePoint();
+ int c2 = cpi.next();
+ if (c1 != c2) {
+ errln(name + ".nextCodePoint(to limit again) = U+" + Utility.hex(c1)
+ + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
+ return;
+ }
+ if (c1 < 0) {
+ break;
+ }
+ }
+
+ // Iterate backward to the start.
+ for (;;) {
+ int c1 = ci.previousCodePoint();
+ int c2 = cpi.previous();
+ if (c1 != c2) {
+ errln(name + ".nextCodePoint(to start) = U+" + Utility.hex(c1)
+ + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
+ return;
+ }
+ if (c1 < 0) {
+ break;
+ }
+ }
+ }
+
+ public void TestFCD() {
+ CollationData data = CollationRoot.getData();
+
+ // Input string, not FCD.
+ StringBuilder buf = new StringBuilder();
+ buf.append("\u0308\u00e1\u0062\u0301\u0327\u0430\u0062")
+ .appendCodePoint(0x1D15F) // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
+ .append("\u0327\u0308") // ccc=202, 230
+ .appendCodePoint(0x1D16D) // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
+ .appendCodePoint(0x1D15F)
+ .appendCodePoint(0x1D16D)
+ .append("\uac01")
+ .append("\u00e7") // Character with tccc!=0 decomposed together with mis-ordered sequence.
+ .appendCodePoint(0x1D16D).appendCodePoint(0x1D165)
+ .append("\u00e1") // Character with tccc!=0 decomposed together with decomposed sequence.
+ .append("\u0f73\u0f75") // Tibetan composite vowels must be decomposed.
+ .append("\u4e00\u0f81");
+ String s = buf.toString();
+
+ // Expected code points.
+ int[] cp = {
+ 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
+ 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
+ 0x1D15F, 0x1D16D,
+ 0xac01,
+ 0x63, 0x327, 0x1D165, 0x1D16D,
+ 0x61,
+ 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
+ 0x4e00, 0xf71, 0xf80
+ };
+
+ FCDUTF16CollationIterator u16ci = new FCDUTF16CollationIterator(data, false, s, 0);
+ CodePointIterator cpi = new CodePointIterator(cp);
+ checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
+
+ cpi.resetToStart();
+ UCharacterIterator iter = UCharacterIterator.getInstance(s);
+ FCDIterCollationIterator uici = new FCDIterCollationIterator(data, false, iter, 0);
+ checkFCD("FCDIterCollationIterator", uici, cpi);
+ }
+
+ private void checkAllocWeights(CollationWeights cw, long lowerLimit, long upperLimit,
+ int n, int someLength, int minCount) {
+
+ if (!cw.allocWeights(lowerLimit, upperLimit, n)) {
+ errln("CollationWeights::allocWeights(0x"
+ + Utility.hex(lowerLimit) + ",0x"
+ + Utility.hex(upperLimit) + ","
+ + n + ") = false");
+ return;
+ }
+ long previous = lowerLimit;
+ int count = 0; // number of weights that have someLength
+ for (int i = 0; i < n; ++i) {
+ long w = cw.nextWeight();
+ if (w == 0xffffffffL) {
+ errln("CollationWeights::allocWeights(0x"
+ + Utility.hex(lowerLimit) + ",0x"
+ + Utility.hex(upperLimit) + ",0x"
+ + n + ").nextWeight() returns only "
+ + i + " weights");
+ return;
+ }
+ if (!(previous < w && w < upperLimit)) {
+ errln("CollationWeights::allocWeights(0x"
+ + Utility.hex(lowerLimit) + ",0x"
+ + Utility.hex(upperLimit) + ","
+ + n + ").nextWeight() number "
+ + (i + 1) + " -> 0x" + Utility.hex(w)
+ + " not between "
+ + Utility.hex(previous) + " and "
+ + Utility.hex(upperLimit));
+ return;
+ }
+ if (CollationWeights.lengthOfWeight(w) == someLength) {
+ ++count;
+ }
+ }
+ if (count < minCount) {
+ errln("CollationWeights::allocWeights(0x"
+ + Utility.hex(lowerLimit) + ",0x"
+ + Utility.hex(upperLimit) + ","
+ + n + ").nextWeight() returns only "
+ + count + " < " + minCount + " weights of length "
+ + someLength);
+
+ }
+ }
+
+ public void TestCollationWeights() {
+ CollationWeights cw = new CollationWeights();
+
+ // Non-compressible primaries use 254 second bytes 02..FF.
+ logln("CollationWeights.initForPrimary(non-compressible)");
+ cw.initForPrimary(false);
+ // Expect 1 weight 11 and 254 weights 12xx.
+ checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 1, 1);
+ checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 2, 254);
+ // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
+ checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 260, 2, 255);
+ // Expect 254 two-byte weights from the ranges 10ff and 11xx.
+ checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 600, 2, 254);
+ // Expect 254^2=64516 three-byte weights.
+ // During computation, there should be 3 three-byte ranges
+ // 10ffff, 11xxxx, 120202.
+ // The middle one should be split 64515:1,
+ // and the newly-split-off range and the last ranged lengthened.
+ checkAllocWeights(cw, 0x10fffe00L, 0x12020300L, 1 + 64516 + 254 + 1, 3, 64516);
+ // Expect weights 1102 & 1103.
+ checkAllocWeights(cw, 0x10ff0000L, 0x11040000L, 2, 2, 2);
+ // Expect weights 102102 & 102103.
+ checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2);
+
+ // Compressible primaries use 251 second bytes 04..FE.
+ logln("CollationWeights.initForPrimary(compressible)");
+ cw.initForPrimary(true);
+ // Expect 1 weight 11 and 251 weights 12xx.
+ checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 1, 1);
+ checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 2, 251);
+ // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
+ checkAllocWeights(cw, 0x10fdfe40L, 0x12050300L, 260, 2, 252);
+ // Expect weights 1104 & 1105.
+ checkAllocWeights(cw, 0x10fe0000L, 0x11060000L, 2, 2, 2);
+ // Expect weights 102102 & 102103.
+ checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2);
+
+ // Secondary and tertiary weights use only bytes 3 & 4.
+ logln("CollationWeights.initForSecondary()");
+ cw.initForSecondary();
+ // Expect weights fbxx and all four fc..ff.
+ checkAllocWeights(cw, 0xfb20L, 0x10000L, 20, 3, 4);
+
+ logln("CollationWeights.initForTertiary()");
+ cw.initForTertiary();
+ // Expect weights 3dxx and both 3e & 3f.
+ checkAllocWeights(cw, 0x3d02L, 0x4000L, 10, 3, 2);
+ }
+
+ private static boolean isValidCE(CollationRootElements re, CollationData data, long p, long s, long ctq) {
+ long p1 = p >>> 24;
+ long p2 = (p >>> 16) & 0xff;
+ long p3 = (p >>> 8) & 0xff;
+ long p4 = p & 0xff;
+ long s1 = s >>> 8;
+ long s2 = s & 0xff;
+ // ctq = Case, Tertiary, Quaternary
+ long c = (ctq & Collation.CASE_MASK) >>> 14;
+ long t = ctq & Collation.ONLY_TERTIARY_MASK;
+ long t1 = t >>> 8;
+ long t2 = t & 0xff;
+ long q = ctq & Collation.QUATERNARY_MASK;
+ // No leading zero bytes.
+ if ((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
+ return false;
+ }
+ // No intermediate zero bytes.
+ if (p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
+ return false;
+ }
+ if (p2 != 0 && p3 == 0 && p4 != 0) {
+ return false;
+ }
+ // Minimum & maximum lead bytes.
+ if ((p1 != 0 && p1 <= Collation.MERGE_SEPARATOR_BYTE)
+ || (s1 != 0 && s1 <= Collation.MERGE_SEPARATOR_BYTE)
+ || (t1 != 0 && t1 <= Collation.MERGE_SEPARATOR_BYTE)) {
+ return false;
+ }
+ if (t1 != 0 && t1 > 0x3f) {
+ return false;
+ }
+ if (c > 2) {
+ return false;
+ }
+ // The valid byte range for the second primary byte depends on compressibility.
+ if (p2 != 0) {
+ if (data.isCompressibleLeadByte((int)p1)) {
+ if (p2 <= Collation.PRIMARY_COMPRESSION_LOW_BYTE
+ || Collation.PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
+ return false;
+ }
+ } else {
+ if (p2 <= Collation.LEVEL_SEPARATOR_BYTE) {
+ return false;
+ }
+ }
+ }
+ // Other bytes just need to avoid the level separator.
+ // Trailing zeros are ok.
+ // assert (Collation.LEVEL_SEPARATOR_BYTE == 1);
+ if (p3 == Collation.LEVEL_SEPARATOR_BYTE || p4 == Collation.LEVEL_SEPARATOR_BYTE
+ || s2 == Collation.LEVEL_SEPARATOR_BYTE || t2 == Collation.LEVEL_SEPARATOR_BYTE) {
+ return false;
+ }
+ // Well-formed CEs.
+ if (p == 0) {
+ if (s == 0) {
+ if (t == 0) {
+ // Completely ignorable CE.
+ // Quaternary CEs are not supported.
+ if (c != 0 || q != 0) {
+ return false;
+ }
+ } else {
+ // Tertiary CE.
+ if (t < re.getTertiaryBoundary() || c != 2) {
+ return false;
+ }
+ }
+ } else {
+ // Secondary CE.
+ if (s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
+ return false;
+ }
+ }
+ } else {
+ // Primary CE.
+ if (s == 0 || (Collation.COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary())
+ || s >= re.getSecondaryBoundary()) {
+ return false;
+ }
+ if (t == 0 || t >= re.getTertiaryBoundary()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private static boolean isValidCE(CollationRootElements re, CollationData data, long ce) {
+ long p = ce >>> 32;
+ long secTer = ce & 0xffffffffL;
+ return isValidCE(re, data, p, secTer >>> 16, secTer & 0xffff);
+ }
+
+ private static class RootElementsIterator {
+ CollationData data;
+ long[] elements;
+ int length;
+
+ long pri;
+ long secTer;
+ int index;
+
+ RootElementsIterator(CollationData root) {
+ data = root;
+ elements = root.rootElements;
+ length = elements.length;
+ pri = 0;
+ secTer = 0;
+ index = (int)elements[CollationRootElements.IX_FIRST_TERTIARY_INDEX];
+ }
+
+ boolean next() {
+ if (index >= length) {
+ return false;
+ }
+ long p = elements[index];
+ if (p == CollationRootElements.PRIMARY_SENTINEL) {
+ return false;
+ }
+ if ((p & CollationRootElements.SEC_TER_DELTA_FLAG) != 0) {
+ ++index;
+ secTer = p & ~CollationRootElements.SEC_TER_DELTA_FLAG;
+ return true;
+ }
+ if ((p & CollationRootElements.PRIMARY_STEP_MASK) != 0) {
+ // End of a range, enumerate the primaries in the range.
+ int step = (int)p & CollationRootElements.PRIMARY_STEP_MASK;
+ p &= 0xffffff00;
+ if (pri == p) {
+ // Finished the range, return the next CE after it.
+ ++index;
+ return next();
+ }
+ assert (pri < p);
+ // Return the next primary in this range.
+ boolean isCompressible = data.isCompressiblePrimary(pri);
+ if ((pri & 0xffff) == 0) {
+ pri = Collation.incTwoBytePrimaryByOffset(pri, isCompressible, step);
+ } else {
+ pri = Collation.incThreeBytePrimaryByOffset(pri, isCompressible, step);
+ }
+ return true;
+ }
+ // Simple primary CE.
+ ++index;
+ pri = p;
+ secTer = Collation.COMMON_SEC_AND_TER_CE;
+ return true;
+ }
+
+ long getPrimary() {
+ return pri;
+ }
+
+ long getSecTer() {
+ return secTer;
+ }
+ }
+
+ public void TestRootElements() {
+ CollationData root = CollationRoot.getData();
+
+ CollationRootElements rootElements = new CollationRootElements(root.rootElements);
+ RootElementsIterator iter = new RootElementsIterator(root);
+
+ // We check each root CE for validity,
+ // and we also verify that there is a tailoring gap between each two CEs.
+ CollationWeights cw1c = new CollationWeights(); // compressible primary weights
+ CollationWeights cw1u = new CollationWeights(); // uncompressible primary weights
+ CollationWeights cw2 = new CollationWeights();
+ CollationWeights cw3 = new CollationWeights();
+
+ cw1c.initForPrimary(true);
+ cw1u.initForPrimary(false);
+ cw2.initForSecondary();
+ cw3.initForTertiary();
+
+ // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
+ // nor the special merge-separator CE for U+FFFE.
+ long prevPri = 0;
+ long prevSec = 0;
+ long prevTer = 0;
+
+ while (iter.next()) {
+ long pri = iter.getPrimary();
+ long secTer = iter.getSecTer();
+ // CollationRootElements CEs must have 0 case and quaternary bits.
+ if ((secTer & Collation.CASE_AND_QUATERNARY_MASK) != 0) {
+ errln("CollationRootElements CE has non-zero case and/or quaternary bits: "
+ + "0x" + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
+ }
+ long sec = secTer >>> 16;
+ long ter = secTer & Collation.ONLY_TERTIARY_MASK;
+ long ctq = ter;
+ if (pri == 0 && sec == 0 && ter != 0) {
+ // Tertiary CEs must have uppercase bits,
+ // but they are not stored in the CollationRootElements.
+ ctq |= 0x8000;
+ }
+ if (!isValidCE(rootElements, root, pri, sec, ctq)) {
+ errln("invalid root CE 0x"
+ + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
+ } else {
+ if (pri != prevPri) {
+ long newWeight = 0;
+ if (prevPri == 0 || prevPri >= Collation.FFFD_PRIMARY) {
+ // There is currently no tailoring gap after primary ignorables,
+ // and we forbid tailoring after U+FFFD and U+FFFF.
+ } else if (root.isCompressiblePrimary(prevPri)) {
+ if (!cw1c.allocWeights(prevPri, pri, 1)) {
+ errln("no primary/compressible tailoring gap between "
+ + "0x" + Utility.hex(prevPri, 8)
+ + " and 0x" + Utility.hex(pri, 8));
+ } else {
+ newWeight = cw1c.nextWeight();
+ }
+ } else {
+ if (!cw1u.allocWeights(prevPri, pri, 1)) {
+ errln("no primary/uncompressible tailoring gap between "
+ + "0x" + Utility.hex(prevPri, 8)
+ + " and 0x" + Utility.hex(pri, 8));
+ } else {
+ newWeight = cw1u.nextWeight();
+ }
+ }
+ if (newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
+ errln("mis-allocated primary weight, should get "
+ + "0x" + Utility.hex(prevPri, 8)
+ + " < 0x" + Utility.hex(newWeight, 8)
+ + " < 0x" + Utility.hex(pri, 8));
+ }
+ } else if (sec != prevSec) {
+ long lowerLimit = prevSec == 0 ?
+ rootElements.getSecondaryBoundary() - 0x100 : prevSec;
+ if (!cw2.allocWeights(lowerLimit, sec, 1)) {
+ errln("no secondary tailoring gap between "
+ + "0x" + Utility.hex(lowerLimit)
+ + " and 0x" + Utility.hex(sec));
+ } else {
+ long newWeight = cw2.nextWeight();
+ if (!(prevSec < newWeight && newWeight < sec)) {
+ errln("mis-allocated secondary weight, should get "
+ + "0x" + Utility.hex(lowerLimit)
+ + " < 0x" + Utility.hex(newWeight)
+ + " < 0x" + Utility.hex(sec));
+ }
+ }
+ } else if (ter != prevTer) {
+ long lowerLimit = prevTer == 0 ?
+ rootElements.getTertiaryBoundary() - 0x100 : prevTer;
+ if (!cw3.allocWeights(lowerLimit, ter, 1)) {
+ errln("no tertiary tailoring gap between "
+ + "0x" + Utility.hex(lowerLimit)
+ + " and 0x" + Utility.hex(ter));
+ } else {
+ long newWeight = cw3.nextWeight();
+ if (!(prevTer < newWeight && newWeight < ter)) {
+ errln("mis-allocated tertiary weight, should get "
+ + "0x" + Utility.hex(lowerLimit)
+ + " < 0x" + Utility.hex(newWeight)
+ + " < 0x" + Utility.hex(ter));
+ }
+ }
+ } else {
+ errln("duplicate root CE 0x"
+ + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
+ }
+ }
+ prevPri = pri;
+ prevSec = sec;
+ prevTer = ter;
+ }
+ }
+
+ public void TestTailoredElements() {
+ CollationData root = CollationRoot.getData();
+ CollationRootElements rootElements = new CollationRootElements(root.rootElements);
+
+ Set<String> prevLocales = new HashSet<String>();
+ prevLocales.add("");
+ prevLocales.add("root");
+ prevLocales.add("root@collation=standard");
+
+ long[] ces;
+ ULocale[] locales = Collator.getAvailableULocales();
+ String localeID = "root";
+ int locIdx = 0;
+
+ for (; locIdx < locales.length; localeID = locales[locIdx++].getName()) {
+ ULocale locale = new ULocale(localeID);
+ String[] types = Collator.getKeywordValuesForLocale("collation", locale, false);
+ String type = null; // default type
+ int typeIdx = 0;
+ for (; typeIdx < types.length; type = types[typeIdx++]) {
+ ULocale localeWithType = locale;
+ if (type != null) {
+ localeWithType = localeWithType.setKeywordValue("collation", type);
+ }
+ Collator coll = Collator.getInstance(localeWithType);
+ ULocale actual = coll.getLocale(ULocale.ACTUAL_LOCALE);
+ if (prevLocales.contains(actual.getName())) {
+ continue;
+ }
+ prevLocales.add(actual.getName());
+ logln("TestTailoredElements(): requested " + localeWithType.getName()
+ + " -> actual " + actual.getName());
+ if (!(coll instanceof RuleBasedCollator)) {
+ continue;
+ }
+ RuleBasedCollator rbc = (RuleBasedCollator) coll;
+
+ // Note: It would be better to get tailored strings such that we can
+ // identify the prefix, and only get the CEs for the prefix+string,
+ // not also for the prefix.
+ // There is currently no API for that.
+ // It would help in an unusual case where a contraction starting in the prefix
+ // extends past its end, and we do not see the intended mapping.
+ // For example, for a mapping p|st, if there is also a contraction ps,
+ // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
+ UnicodeSet tailored = coll.getTailoredSet();
+ UnicodeSetIterator iter = new UnicodeSetIterator(tailored);
+ while (iter.next()) {
+ String s = iter.getString();
+ ces = rbc.internalGetCEs(s);
+ for (int i = 0; i < ces.length; ++i) {
+ long ce = ces[i];
+ if (!isValidCE(rootElements, root, ce)) {
+ logln(prettify(s));
+ errln("invalid tailored CE 0x" + Utility.hex(ce, 16)
+ + " at CE index " + i + " from string:");
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private static boolean isSpace(char c) {
+ return (c == 0x09 || c == 0x20 || c == 0x3000);
+ }
+
+ private static boolean isSectionStarter(char c) {
+ return (c == '%' || c == '*' || c == '@');
+ }
+
+ private int skipSpaces(int i) {
+ while (isSpace(fileLine.charAt(i))) {
+ ++i;
+ }
+ return i;
+ }
+
+ private String printSortKey(byte[] p) {
+ StringBuilder s = new StringBuilder();
+ for (int i = 0; i < p.length; ++i) {
+ if (i > 0) {
+ s.append(' ');
+ }
+ byte b = p[i];
+ if (b == 0) {
+ s.append('.');
+ } else if (b == 1) {
+ s.append('|');
+ } else {
+ s.append(String.format("%02x", b & 0xff));
+ }
+ }
+ return s.toString();
+ }
+
+ private String printCollationKey(CollationKey key) {
+ byte[] p = key.toByteArray();
+ return printSortKey(p);
+ }
+
+ private boolean readLine(BufferedReader in) throws IOException {
+ String line = in.readLine();
+ if (line == null) {
+ fileLine = null;
+ return false;
+ }
+ ++fileLineNumber;
+ // Strip trailing comments and spaces
+ int idx = line.indexOf('#');
+ if (idx < 0) {
+ idx = line.length();
+ }
+ for (; idx > 0; idx--) {
+ if (!isSpace(line.charAt(idx -1))) {
+ break;
+ }
+ }
+
+ fileLine = idx < line.length() ? line.substring(0, idx) : line;
+ return true;
+ }
+
+ private int parseString(int start, Output<String> prefix, Output<String> s) {
+ int length = fileLine.length();
+ int i;
+ for (i = start; i < length && !isSpace(fileLine.charAt(i)); ++i) {
+ }
+ int pipeIndex = fileLine.indexOf('|', start);
+ if (pipeIndex >= 0 && pipeIndex < i) {
+ String tmpPrefix = Utility.unescape(fileLine.substring(start, pipeIndex));
+ if (tmpPrefix.length() == 0) {
+ prefix.value = null;
+ logln(fileLine);
+ error = new ParseException("empty prefix on line " + fileLineNumber, fileLineNumber);
+ errln("empty prefix on line " + fileLineNumber);
+ return start;
+ }
+ prefix.value = tmpPrefix;
+ start = pipeIndex + 1;
+ } else {
+ prefix.value = null;
+ }
+
+ String tmp = Utility.unescape(fileLine.substring(start, i));
+ if (tmp.length() == 0) {
+ s.value = null;
+ logln(fileLine);
+ error = new ParseException("empty string on line " + fileLineNumber, fileLineNumber);
+ errln("empty string on line " + fileLineNumber);
+ return start;
+ }
+ s.value = tmp;
+ return i;
+ }
+
+ private int parseRelationAndString(Output<String> s) {
+ int relation = Collation.NO_LEVEL;
+ int start;
+ if (fileLine.charAt(0) == '<') {
+ char second = fileLine.charAt(1);
+ start = 2;
+ switch(second) {
+ case 0x31: // <1
+ relation = Collation.PRIMARY_LEVEL;
+ break;
+ case 0x32: // <2
+ relation = Collation.SECONDARY_LEVEL;
+ break;
+ case 0x33: // <3
+ relation = Collation.TERTIARY_LEVEL;
+ break;
+ case 0x34: // <4
+ relation = Collation.QUATERNARY_LEVEL;
+ break;
+ case 0x63: // <c
+ relation = Collation.CASE_LEVEL;
+ break;
+ case 0x69: // <i
+ relation = Collation.IDENTICAL_LEVEL;
+ break;
+ default: // just <
+ relation = Collation.NO_LEVEL;
+ start = 1;
+ break;
+ }
+ } else if (fileLine.charAt(0) == '=') {
+ relation = Collation.ZERO_LEVEL;
+ start = 1;
+ } else {
+ start = 0;
+ }
+
+ if (start == 0 || !isSpace(fileLine.charAt(start))) {
+ logln(fileLine);
+ error = new ParseException("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line "
+ + fileLineNumber, fileLineNumber);
+ errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line " + fileLineNumber);
+ return Collation.NO_LEVEL;
+ }
+
+ start = skipSpaces(start);
+ Output<String> prefixOut = new Output<String>();
+ start = parseString(start, prefixOut, s);
+ if (error == null && prefixOut.value != null) {
+ logln(fileLine);
+ error = new ParseException("prefix string not allowed for test string: on line "
+ + fileLineNumber, fileLineNumber);
+ errln("prefix string not allowed for test string: on line " + fileLineNumber);
+ return Collation.NO_LEVEL;
+ }
+ if (start < fileLine.length()) {
+ logln(fileLine);
+ error = new ParseException("unexpected line contents after test string on line "
+ + fileLineNumber, fileLineNumber);
+ errln("unexpected line contents after test string on line " + fileLineNumber);
+ return Collation.NO_LEVEL;
+ }
+
+ return relation;
+ }
+
+ private void parseAndSetAttribute() {
+ int start = skipSpaces(1);
+ int equalPos = fileLine.indexOf('=');
+ if (equalPos < 0) {
+ if (fileLine.regionMatches(start, "reorder", 0, 7)) {
+ parseAndSetReorderCodes(start + 7);
+ return;
+ }
+ logln(fileLine);
+ error = new ParseException("missing '=' on line " + fileLineNumber, fileLineNumber);
+ errln("missing '=' on line " + fileLineNumber);
+ return;
+ }
+
+ String attrString = fileLine.substring(start, equalPos);
+ String valueString = fileLine.substring(equalPos + 1);
+ if (attrString.equals("maxVariable")) {
+ int max;
+ if (valueString.equals("space")) {
+ max = ReorderCodes.SPACE;
+ } else if(valueString.equals("punct")) {
+ max = ReorderCodes.PUNCTUATION;
+ } else if(valueString.equals("symbol")) {
+ max = ReorderCodes.SYMBOL;
+ } else if(valueString.equals("currency")) {
+ max = ReorderCodes.CURRENCY;
+ } else {
+ logln(fileLine);
+ error = new ParseException("invalid attribute value name on line "
+ + fileLineNumber, fileLineNumber);
+ errln("invalid attribute value name on line " + fileLineNumber);
+ return;
+ }
+ coll.setMaxVariable(max);
+ fileLine = null;
+ return;
+ }
+
+ boolean parsed = true;
+ RuleBasedCollator rbc = (RuleBasedCollator)coll;
+ if (attrString.equals("backwards")) {
+ if (valueString.equals("on")) {
+ rbc.setFrenchCollation(true);
+ } else if (valueString.equals("off")) {
+ rbc.setFrenchCollation(false);
+ } else if (valueString.equals("default")) {
+ rbc.setFrenchCollationDefault();
+ } else {
+ parsed = false;
+ }
+ } else if (attrString.equals("alternate")) {
+ if (valueString.equals("non-ignorable")) {
+ rbc.setAlternateHandlingShifted(false);
+ } else if (valueString.equals("shifted")) {
+ rbc.setAlternateHandlingShifted(true);
+ } else if (valueString.equals("default")) {
+ rbc.setAlternateHandlingDefault();
+ } else {
+ parsed = false;
+ }
+ } else if (attrString.equals("caseFirst")) {
+ if (valueString.equals("upper")) {
+ rbc.setUpperCaseFirst(true);
+ } else if (valueString.equals("lower")) {
+ rbc.setLowerCaseFirst(true);
+ } else if (valueString.equals("default")) {
+ rbc.setCaseFirstDefault();
+ } else {
+ parsed = false;
+ }
+ } else if (attrString.equals("caseLevel")) {
+ if (valueString.equals("on")) {
+ rbc.setCaseLevel(true);
+ } else if (valueString.equals("off")) {
+ rbc.setCaseLevel(false);
+ } else if (valueString.equals("default")) {
+ rbc.setCaseLevelDefault();
+ } else {
+ parsed = false;
+ }
+ } else if (attrString.equals("strength")) {
+ if (valueString.equals("primary")) {
+ rbc.setStrength(Collator.PRIMARY);
+ } else if (valueString.equals("secondary")) {
+ rbc.setStrength(Collator.SECONDARY);
+ } else if (valueString.equals("tertiary")) {
+ rbc.setStrength(Collator.TERTIARY);
+ } else if (valueString.equals("quaternary")) {
+ rbc.setStrength(Collator.QUATERNARY);
+ } else if (valueString.equals("identical")) {
+ rbc.setStrength(Collator.IDENTICAL);
+ } else if (valueString.equals("default")) {
+ rbc.setStrengthDefault();
+ } else {
+ parsed = false;
+ }
+ } else if (attrString.equals("numeric")) {
+ if (valueString.equals("on")) {
+ rbc.setNumericCollation(true);
+ } else if (valueString.equals("off")) {
+ rbc.setNumericCollation(false);
+ } else if (valueString.equals("default")) {
+ rbc.setNumericCollationDefault();
+ } else {
+ parsed = false;
+ }
+ } else {
+ logln(fileLine);
+ error = new ParseException("invalid attribute value name on line "
+ + fileLineNumber, fileLineNumber);
+ errln("invalid attribute value name on line " + fileLineNumber);
+ return;
+ }
+ if (!parsed) {
+ logln(fileLine);
+ error = new ParseException("invalid attribute=value combination on line "
+ + fileLineNumber, fileLineNumber);
+ errln("invalid attribute=value combination on line " + fileLineNumber);
+ return;
+ }
+
+ fileLine = null;
+ }
+
+ private void parseAndSetReorderCodes(int start) {
+ UVector32 reorderCodes = new UVector32();
+ while (start < fileLine.length()) {
+ start = skipSpaces(start);
+ int limit = start;
+ while (limit < fileLine.length() && !isSpace(fileLine.charAt(limit))) {
+ ++limit;
+ }
+ String name = fileLine.substring(start, limit);
+ int code = CollationRuleParser.getReorderCode(name);
+ if (code < -1) {
+ logln(fileLine);
+ error = new ParseException("invalid reorder code '" + name + "' on line "
+ + fileLineNumber, fileLineNumber);
+ return;
+ }
+ reorderCodes.addElement(code);
+ start = limit;
+ }
+ int[] reorderCodesArray = new int[reorderCodes.size()];
+ System.arraycopy(reorderCodes.getBuffer(), 0,
+ reorderCodesArray, 0, reorderCodes.size());
+ coll.setReorderCodes(reorderCodesArray);
+
+ fileLine = null;
+ }
+
+ private void buildTailoring(BufferedReader in) throws IOException {
+ StringBuilder rules = new StringBuilder();
+ while (readLine(in)) {
+ if (fileLine.length() == 0) {
+ continue;
+ }
+ if (isSectionStarter(fileLine.charAt(0))) {
+ break;
+ }
+ rules.append(Utility.unescape(fileLine));
+ }
+
+ try {
+ coll = new RuleBasedCollator(rules.toString());
+ } catch (Exception e) {
+ logln(rules.toString());
+ errln("RuleBasedCollator(rules) failed - " + e.getMessage());
+ error = e;
+ }
+ }
+
+ private void setRootCollator() {
+ coll = Collator.getInstance(ULocale.ROOT);
+ }
+
+ private void setLocaleCollator() {
+ ULocale locale = null;
+ if (fileLine.length() > 9) {
+ String langTag = fileLine.substring(9); // "@ locale <langTag>"
+
+ ULocale.Builder locBld = new ULocale.Builder();
+ try {
+ locale = locBld.setLanguageTag(langTag).build();
+ } catch (IllformedLocaleException e) {
+ locale = null;
+ }
+ }
+ if (locale == null) {
+ logln(fileLine);
+ errln("invalid language tag on line " + fileLineNumber);
+ error = new ParseException("invalid language tag on line " + fileLineNumber, fileLineNumber);
+ return;
+ }
+
+ logln("creating a collator for locale ID " + locale.getName());
+ coll = Collator.getInstance(locale);
+ }
+
+ private boolean needsNormalization(String s) {
+ if (!fcd.isNormalized(s)) {
+ return true;
+ }
+ // In some sequences with Tibetan composite vowel signs,
+ // even if the string passes the FCD check,
+ // those composites must be decomposed.
+ // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
+ int index = 0;
+ while((index = s.indexOf(0xf71, index)) >= 0) {
+ if (++index < s.length()) {
+ char c = s.charAt(index);
+ if (c == 0xf73 || c == 0xf75 || c == 0xf81) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ private boolean getCollationKey(String norm, String line, String s, Output<CollationKey> keyOut) {
+ CollationKey key = coll.getCollationKey(s);
+ keyOut.value = key;
+
+ byte[] keyBytes = key.toByteArray();
+ if (keyBytes.length == 0 || keyBytes[keyBytes.length - 1] != 0) {
+ logln(fileTestName);
+ logln(line);
+ logln(printCollationKey(key));
+ errln("Collator(" + norm + ").getCollationKey() wrote an empty or unterminated key");
+ return false;
+ }
+
+ int numLevels = coll.getStrength();
+ if (numLevels < Collator.IDENTICAL) {
+ ++numLevels;
+ } else {
+ numLevels = 5;
+ }
+ if (((RuleBasedCollator)coll).isCaseLevel()) {
+ ++numLevels;
+ }
+ int numLevelSeparators = 0;
+ for (int i = 0; i < (keyBytes.length - 1); ++i) {
+ byte b = keyBytes[i];
+ if (b == 0) {
+ logln(fileTestName);
+ logln(line);
+ logln(printCollationKey(key));
+ errln("Collator(" + norm + ").getCollationKey() contains a 00 byte");
+ return false;
+ }
+ if (b == 1) {
+ ++numLevelSeparators;
+ }
+ }
+ if (numLevelSeparators != (numLevels - 1)) {
+ logln(fileTestName);
+ logln(line);
+ logln(printCollationKey(key));
+ errln("Collator(" + norm + ").getCollationKey() has "
+ + numLevelSeparators + " level separators for "
+ + numLevels + " levels");
+ return false;
+ }
+
+ // If s contains U+FFFE, check that merged segments make the same key.
+ CollationKey mergedKey = null;
+ int sLength = s.length();
+ int segmentStart = 0;
+ for (int i = 0;;) {
+ if (i == sLength) {
+ if (segmentStart == 0) {
+ // s does not contain any U+FFFE.
+ break;
+ }
+ } else if (s.charAt(i) != '\uFFFE') {
+ ++i;
+ continue;
+ }
+ // Get the sort key for another segment and merge it into mergedKey.
+ CollationKey tmpKey = coll.getCollationKey(s.substring(segmentStart, i));
+ if (mergedKey == null) {
+ mergedKey = tmpKey;
+ } else {
+ mergedKey = mergedKey.merge(tmpKey);
+ }
+ if (i == sLength) {
+ break;
+ }
+ segmentStart = ++i;
+ }
+ if (segmentStart != 0 && key.compareTo(mergedKey) != 0) {
+ logln(fileTestName);
+ logln(line);
+ logln(printCollationKey(key));
+ logln(printCollationKey(mergedKey));
+ errln("Collator(" + norm
+ + ").getCollationKey(with U+FFFE) != CollationKey.merge(segments)");
+ return false;
+ }
+
+ // No nextSortKeyPart support in ICU4J
+
+ return true;
+ }
+
+ private boolean checkCompareTwo(String norm, String prevFileLine, String prevString, String s,
+ int expectedOrder, int expectedLevel) {
+ // Get the sort keys first, for error debug output.
+ Output<CollationKey> prevKeyOut = new Output<CollationKey>();
+ CollationKey prevKey;
+ if (!getCollationKey(norm, fileLine, prevString, prevKeyOut)) {
+ return false;
+ }
+ prevKey = prevKeyOut.value;
+
+ Output<CollationKey> keyOut = new Output<CollationKey>();
+ CollationKey key;
+ if (!getCollationKey(norm, fileLine, s, keyOut)) {
+ return false;
+ }
+ key = keyOut.value;
+
+ int order = coll.compare(prevString, s);
+ if (order != expectedOrder) {
+ logln(fileTestName);
+ logln(prevFileLine);
+ logln(fileLine);
+ logln(printCollationKey(prevKey));
+ logln(printCollationKey(key));
+ errln("line " + fileLineNumber
+ + " Collator(" + norm + ").compare(previous, current) wrong order: "
+ + order + " != " + expectedOrder);
+ return false;
+ }
+ order = coll.compare(s, prevString);
+ if (order != -expectedOrder) {
+ logln(fileTestName);
+ logln(prevFileLine);
+ logln(fileLine);
+ logln(printCollationKey(prevKey));
+ logln(printCollationKey(key));
+ errln("line " + fileLineNumber
+ + " Collator(" + norm + ").compare(current, previous) wrong order: "
+ + order + " != " + -expectedOrder);
+ return false;
+ }
+
+ order = prevKey.compareTo(key);
+ if (order != expectedOrder) {
+ logln(fileTestName);
+ logln(prevFileLine);
+ logln(fileLine);
+ logln(printCollationKey(prevKey));
+ logln(printCollationKey(key));
+ errln("line " + fileLineNumber
+ + " Collator(" + norm + ").getCollationKey(previous, current).compareTo() wrong order: "
+ + order + " != " + expectedOrder);
+ return false;
+ }
+ if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) {
+ byte[] prevBytes = prevKey.toByteArray();
+ byte[] bytes = key.toByteArray();
+ int level = Collation.PRIMARY_LEVEL;
+ for (int i = 0;; ++i) {
+ byte b = prevBytes[i];
+ if (b != bytes[i]) {
+ break;
+ }
+ if ((int)b == Collation.LEVEL_SEPARATOR_BYTE) {
+ ++level;
+ if (level == Collation.CASE_LEVEL
+ && !((RuleBasedCollator)coll).isCaseLevel()) {
+ ++level;
+ }
+ }
+ }
+ if (level != expectedLevel) {
+ logln(fileTestName);
+ logln(prevFileLine);
+ logln(fileLine);
+ logln(printCollationKey(prevKey));
+ logln(printCollationKey(key));
+ errln("line " + fileLineNumber
+ + " Collator(" + norm + ").getCollationKey(previous, current).compareTo()="
+ + level + " wrong level: " + level + " != " + expectedLevel);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void checkCompareStrings(BufferedReader in) throws IOException {
+ String prevFileLine = "(none)";
+ String prevString = "";
+ String s;
+ Output<String> sOut = new Output<String>();
+ while (readLine(in)) {
+ if (fileLine.length() == 0) {
+ continue;
+ }
+ if (isSectionStarter(fileLine.charAt(0))) {
+ break;
+ }
+ int relation = parseRelationAndString(sOut);
+ s = sOut.value;
+ int expectedOrder = (relation == Collation.ZERO_LEVEL) ? Collation.EQUAL : Collation.LESS;
+ int expectedLevel = relation;
+ boolean isOk = true;
+ if (!needsNormalization(prevString) && !needsNormalization(s)) {
+ coll.setDecomposition(Collator.NO_DECOMPOSITION);
+ isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
+ expectedOrder, expectedLevel);
+ }
+ if (isOk) {
+ coll.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
+ isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
+ expectedOrder, expectedLevel);
+ }
+ if (isOk && (!nfd.isNormalized(prevString) || !nfd.isNormalized(s))) {
+ String pn = nfd.normalize(prevString);
+ String n = nfd.normalize(s);
+ isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
+ expectedOrder, expectedLevel);
+ }
+ prevFileLine = fileLine;
+ prevString = s;
+ }
+ }
+
+ public void TestDataDriven() {
+ nfd = Normalizer2.getNFDInstance();
+ fcd = Norm2AllModes.getFCDNormalizer2();
+
+ BufferedReader in = null;
+
+ try {
+ in = TestUtil.getDataReader("collationtest.txt", "UTF-8");
+
+ // read first line and remove BOM if present
+ readLine(in);
+ if (fileLine != null && fileLine.charAt(0) == '\uFEFF') {
+ fileLine = fileLine.substring(1);
+ }
+
+ while (error == null) {
+ if (fileLine == null || fileLine.length() == 0) {
+ if (!readLine(in)) {
+ break;
+ }
+ continue;
+ }
+ if (!isSectionStarter(fileLine.charAt(0))) {
+ logln(fileLine);
+ errln("syntax error on line " + fileLineNumber);
+ return;
+ }
+ if (fileLine.startsWith("** test: ")) {
+ fileTestName = fileLine;
+ logln(fileLine);
+ fileLine = null;
+ } else if (fileLine.equals("@ root")) {
+ setRootCollator();
+ fileLine = null;
+ } else if (fileLine.startsWith("@ locale ")) {
+ setLocaleCollator();
+ fileLine = null;
+ } else if (fileLine.equals("@ rules")) {
+ buildTailoring(in);
+ } else if (fileLine.charAt(0) == '%'
+ && fileLine.length() > 1 && isSpace(fileLine.charAt(1))) {
+ parseAndSetAttribute();
+ } else if (fileLine.equals("* compare")) {
+ checkCompareStrings(in);
+ } else {
+ logln(fileLine);
+ errln("syntax error on line " + fileLineNumber);
+ return;
+ }
+ }
+ } catch (IOException e) {
+ errln(e.getMessage());
+ } finally {
+ try {
+ if (in != null) {
+ in.close();
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
}
/*
*******************************************************************************
- * Copyright (C) 2002-2012, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
errln("Fail: getCollationKey returned ");
return;
}
- msg += "key1: " + prettify(k1) + "\n"
- + "key2: " + prettify(k2);
+ msg += "key1: " + CollationTest.prettify(k1) + "\n"
+ + "key2: " + CollationTest.prettify(k2);
errln(msg);
}
}
}
compareArray(collator, testcontraction);
}
-
-
-
-
-
-
-
-
-
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
-
// private inner class -------------------------------------------------
private static final class StrCmp implements Comparator<String>
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target) {
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
-}
\ No newline at end of file
+}
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target){
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
}
/*
*******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
String sExpect = new String("");
String sResult = new String("");
- sResult = appendCompareResult(compareResult, sResult);
- sExpect = appendCompareResult(expectedResult, sExpect);
+ sResult = CollationTest.appendCompareResult(compareResult, sResult);
+ sExpect = CollationTest.appendCompareResult(expectedResult, sExpect);
if (ok1) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
msg1 = ok2 ? "Ok: key(\"" : "FAIL: key(\"";
msg2 = "\").compareTo(key(\"";
msg3 = "\")) returned ";
- sResult = appendCompareResult(keyResult, sResult);
+ sResult = CollationTest.appendCompareResult(keyResult, sResult);
if (ok2) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
} else {
errln(msg1 + source + msg2 + target + msg3 + sResult + msg4 + sExpect);
msg1 = " ";
msg2 = " vs. ";
- errln(msg1 + prettify(sourceKey) + msg2 + prettify(targetKey));
+ errln(msg1 + CollationTest.prettify(sourceKey) + msg2 + CollationTest.prettify(targetKey));
}
msg1 = ok3 ? "Ok: incCompare(\"" : "FAIL: incCompare(\"";
msg2 = "\", \"";
msg3 = "\") returned ";
- sResult = appendCompareResult(incResult, sResult);
+ sResult = CollationTest.appendCompareResult(incResult, sResult);
if (ok3) {
logln(msg1 + source + msg2 + target + msg3 + sResult);
}
}
}
-
- private String appendCompareResult(int result, String target){
- if (result == -1) {
- target += "LESS";
- } else if (result == 0) {
- target += "EQUAL";
- } else if (result == 1) {
- target += "GREATER";
- } else {
- String huh = "?";
- target += huh + result;
- }
- return target;
- }
-
- String prettify(CollationKey sourceKey) {
- int i;
- byte[] bytes= sourceKey.toByteArray();
- String target = "[";
-
- for (i = 0; i < bytes.length; i++) {
- target += Integer.toHexString(bytes[i]);
- target += " ";
- }
- target += "]";
- return target;
- }
-}
\ No newline at end of file
+}
/********************************************************************
- * COPYRIGHT:
- * Copyright (c) 2002-2012, International Business Machines Corporation and
+ * Copyright (c) 2002-2014, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
package com.ibm.icu.dev.test.collator;
import java.io.BufferedReader;
-import java.util.Locale;
+import java.io.IOException;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.text.CollationKey;
import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.UTF16;
+import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
public class UCAConformanceTest extends TestFmwk {
public UCAConformanceTest() {
}
- protected void init()throws Exception{
- UCA = (RuleBasedCollator)Collator.getInstance(
- new Locale("root", "", ""));
+ @Override
+ protected void init() throws Exception{
+ UCA = (RuleBasedCollator)Collator.getInstance(ULocale.ROOT);
comparer = new UTF16.StringComparator(true, false, UTF16.StringComparator.FOLD_CASE_DEFAULT);
}
}
public void TestRulesNonIgnorable() {
+ if(logKnownIssue("cldrbug:6745", "UCARules.txt has problems")) { return; }
initRbUCA();
if(rbUCA == null) { return; }
String collationTest = "CollationTest_";
String ext = ".txt";
try {
- if(in != null) {
- in.close();
- }
- } catch (Exception e) {
- errln("Could not close the opened file!");
- return;
- }
- try {
- in = TestUtil.getDataReader(collationTest+type+ext);
+ in = TestUtil.getDataReader(collationTest+type+"_SHORT"+ext);
} catch (Exception e) {
try {
- in = TestUtil.getDataReader(collationTest+type+"_SHORT"+ext);
+ in = TestUtil.getDataReader(collationTest+type+ext);
} catch (Exception e1) {
try {
in = TestUtil.getDataReader(collationTest+type+"_STUB"+ext);
logln( "INFO: Working with the stub file.\n"+
"If you need the full conformance test, please\n"+
"download the appropriate data files from:\n"+
- "http://source.icu-project.org/repos/icu/tools/trunk/unicodetools/com/ibm/text/data/");
+ "http://unicode.org/cldr/trac/browser/trunk/common/uca");
} catch (Exception e11) {
errln("ERROR: Could not find any of the test files");
}
private static final int FROM_RULES = 2;
private static boolean skipLineBecauseOfBug(String s, int flags) {
- // TODO: Fix ICU ticket #8052
- if(s.length() >= 3 &&
- (s.charAt(0) == 0xfb2 || s.charAt(0) == 0xfb3) &&
- s.charAt(1) == 0x334 &&
- (s.charAt(2) == 0xf73 || s.charAt(2) == 0xf75 || s.charAt(2) == 0xf81)) {
- return true;
- }
- // TODO: Fix ICU ticket #9361
- if((flags & IS_SHIFTED) != 0 && s.length() >= 2 && s.charAt(0) == 0xfffe) {
- return true;
- }
- // TODO: Fix ICU ticket #9494
- int c;
- if(s.length() >= 2 && 0xe0100 <= (c = s.codePointAt(0)) && c <= 0xe01ef) {
- return true;
- }
- // TODO: Fix ICU ticket #8923
- if((flags & FROM_RULES) != 0 && 0xac00 <= (c = s.charAt(0)) && c <= 0xd7a3) {
- return true;
- }
- // TODO: Fix tailoring builder, ICU ticket #9593.
- if((flags & FROM_RULES) != 0 && s.length() >= 2 && ((c = s.charAt(1)) == 0xedc || c == 0xedd)) {
- return true;
- }
+ // Add temporary exceptions here if there are ICU bugs, until we can fix them.
+ // For examples see the ICU 52 version of this file.
return false;
}
skipFlags |= FROM_RULES;
}
+ logln("-prop:ucaconfnosortkeys=1 turns off getSortKey() in UCAConformanceTest");
+ boolean withSortKeys = getProperty("ucaconfnosortkeys") == null;
+
int lineNo = 0;
String line = null, oldLine = null, buffer = null, oldB = null;
- CollationKey oldSk = null, newSk = null;
+ RawCollationKey sk1 = new RawCollationKey(), sk2 = new RawCollationKey();
+ RawCollationKey oldSk = null, newSk = sk1;
try {
while ((line = in.readLine()) != null) {
continue;
}
- newSk = coll.getCollationKey(buffer);
+ if(withSortKeys) {
+ coll.getRawCollationKey(buffer, newSk);
+ }
if(oldSk != null) {
- int skres = oldSk.compareTo(newSk);
+ boolean ok = true;
+ int skres = withSortKeys ? oldSk.compareTo(newSk) : 0;
int cmpres = coll.compare(oldB, buffer);
int cmpres2 = coll.compare(buffer, oldB);
if(cmpres != -cmpres2) {
- errln("Compare result not symmetrical on line "+lineNo);
+ errln(String.format(
+ "Compare result not symmetrical on line %d: " +
+ "previous vs. current (%d) / current vs. previous (%d)",
+ lineNo, cmpres, cmpres2));
+ ok = false;
}
- if(normalizeResult(cmpres) != normalizeResult(skres)) {
+
+ // TODO: Compare with normalization turned off if the input passes the FCD test.
+
+ if(withSortKeys && cmpres != normalizeResult(skres)) {
errln("Difference between coll.compare (" + cmpres + ") and sortkey compare (" + skres + ") on line " + lineNo);
- errln(oldLine);
- errln(line);
+ ok = false;
}
int res = cmpres;
}
if(res > 0) {
errln("Line " + lineNo + " is not greater or equal than previous line");
- errln(oldLine);
- errln(line);
+ ok = false;
+ }
+
+ if(!ok) {
+ errln(" Previous data line " + oldLine);
+ errln(" Current data line " + line);
+ if(withSortKeys) {
+ errln(" Previous key: " + CollationTest.prettify(oldSk));
+ errln(" Current key: " + CollationTest.prettify(newSk));
+ }
}
}
oldSk = newSk;
oldB = buffer;
oldLine = line;
+ if(oldSk == sk1) {
+ newSk = sk2;
+ } else {
+ newSk = sk1;
+ }
}
} catch (Exception e) {
errln("Unexpected exception "+e);
+ } finally {
+ try {
+ in.close();
+ } catch (IOException ignored) {
+ }
+ in = null;
}
}
}
/*
*******************************************************************************
- * Copyright (C) 2000-2013, International Business Machines Corporation and *
+ * Copyright (C) 2000-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.search;
+import static com.ibm.icu.text.Collator.IDENTICAL;
+import static com.ibm.icu.text.Collator.PRIMARY;
+import static com.ibm.icu.text.Collator.QUATERNARY;
+import static com.ibm.icu.text.Collator.SECONDARY;
+import static com.ibm.icu.text.Collator.TERTIARY;
+import static com.ibm.icu.text.SearchIterator.ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD;
+import static com.ibm.icu.text.SearchIterator.ElementComparisonType.PATTERN_BASE_WEIGHT_IS_WILDCARD;
+import static com.ibm.icu.text.SearchIterator.ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
+
import java.text.StringCharacterIterator;
import java.util.Locale;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.SearchIterator;
+import com.ibm.icu.text.SearchIterator.ElementComparisonType;
import com.ibm.icu.text.StringSearch;
import com.ibm.icu.util.ULocale;
//inner class
static class SearchData {
- SearchData(String text, String pattern, String coll, int strength, String breaker,
- int[] offset, int[] size) {
+ SearchData(String text, String pattern,
+ String coll, int strength, ElementComparisonType cmpType, String breaker,
+ int[] offset, int[] size) {
this.text = text;
this.pattern = pattern;
this.collator = coll;
this.strength = strength;
+ this.cmpType = cmpType;
this.breaker = breaker;
this.offset = offset;
this.size = size;
String pattern;
String collator;
int strength;
+ ElementComparisonType cmpType;
String breaker;
int[] offset;
int[] size;
BreakIterator m_en_wordbreaker_;
BreakIterator m_en_characterbreaker_;
+ // Just calling SearchData constructor, to make the test data source code
+ // nice and short
+ private static SearchData SD(String text, String pattern, String coll, int strength,
+ ElementComparisonType cmpType, String breaker, int[] offset, int[] size) {
+ return new SearchData(text, pattern, coll, strength, cmpType, breaker, offset, size);
+ }
+
+ // Just returning int[], to make the test data nice and short
+ private static int[] IA(int... elements) {
+ return elements;
+ }
+
static SearchData[] BASIC = {
- new SearchData("xxxxxxxxxxxxxxxxxxxx", "fisher", null, Collator.TERTIARY, null, new int[] {-1}, new int[]{0}),
- new SearchData("silly spring string", "string", null, Collator.TERTIARY, null, new int[]{13, -1}, new int[]{6}),
- new SearchData("silly spring string string", "string", null, Collator.TERTIARY, null, new int[]{13, 20, -1}, new int[]{6, 6}),
- new SearchData("silly string spring string", "string", null, Collator.TERTIARY, null, new int[]{6, 20, -1}, new int[]{6, 6}),
- new SearchData("string spring string", "string", null, Collator.TERTIARY, null, new int[]{0, 14, -1}, new int[]{6, 6}),
- new SearchData("Scott Ganyo", "c", null, Collator.TERTIARY, null, new int[]{1, -1}, new int[]{1}),
- new SearchData("Scott Ganyo", " ", null, Collator.TERTIARY, null, new int[]{5, -1}, new int[]{1}),
- new SearchData("\u0300\u0325", "\u0300", null, Collator.TERTIARY, null, new int[]{-1}, new int[]{0}),
- new SearchData("a\u0300\u0325", "\u0300", null, Collator.TERTIARY, null, new int[]{-1}, new int[]{0}),
- new SearchData("a\u0300\u0325", "\u0300\u0325", null, Collator.TERTIARY, null, new int[]{1, -1}, new int[]{2}),
- new SearchData("a\u0300b", "\u0300", null, Collator.TERTIARY, null, new int[]{1, -1}, new int[]{1}),
- new SearchData("\u00c9", "e", null, Collator.PRIMARY, null, new int[]{0, -1}, new int[]{1}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[]{-1}, new int[]{0})
+ SD("xxxxxxxxxxxxxxxxxxxx", "fisher", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("silly spring string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(13, -1), IA(6)),
+ SD("silly spring string string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(13, 20, -1), IA(6, 6)),
+ SD("silly string spring string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(6, 20, -1), IA(6, 6)),
+ SD("string spring string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 14, -1), IA(6, 6)),
+ SD("Scott Ganyo", "c", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, -1), IA(1)),
+ SD("Scott Ganyo", " ", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(5, -1), IA(1)),
+ SD("\u0300\u0325", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0300\u0325", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300b", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u00c9", "e", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
};
SearchData BREAKITERATOREXACT[] = {
- new SearchData("foxy fox", "fox", null, Collator.TERTIARY, "characterbreaker", new int[] {0, 5, -1}, new int[] {3, 3}),
- new SearchData("foxy fox", "fox", null, Collator.TERTIARY, "wordbreaker", new int[] {5, -1}, new int[] {3}),
- new SearchData("This is a toe T\u00F6ne", "toe", "de", Collator.PRIMARY, "characterbreaker", new int[] {10, 14, -1}, new int[] {3, 2}),
- new SearchData("This is a toe T\u00F6ne", "toe", "de", Collator.PRIMARY, "wordbreaker", new int[] {10, -1}, new int[] {3}),
- new SearchData("Channel, another channel, more channels, and one last Channel", "Channel", "es", Collator.TERTIARY,
- "wordbreaker", new int[] {0, 54, -1}, new int[] {7, 7}),
+ SD("foxy fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(0, 5, -1), IA(3, 3)),
+ SD("foxy fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(5, -1), IA(3)),
+ SD("This is a toe T\u00F6ne", "toe", "de", PRIMARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(10, 14, -1), IA(3, 2)),
+ SD("This is a toe T\u00F6ne", "toe", "de", PRIMARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(10, -1), IA(3)),
+ SD("Channel, another channel, more channels, and one last Channel", "Channel", "es", TERTIARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(0, 54, -1), IA(7, 7)),
/* jitterbug 1745 */
- new SearchData("testing that \u00e9 does not match e", "e", null, Collator.TERTIARY,
- "characterbreaker", new int[] {1, 17, 30, -1}, new int[] {1, 1, 1}),
- new SearchData("testing that string ab\u00e9cd does not match e", "e", null, Collator.TERTIARY,
- "characterbreaker", new int[] {1, 28, 41, -1}, new int[] {1, 1, 1}),
- new SearchData("\u00c9", "e", "fr", Collator.PRIMARY, "characterbreaker", new int[]{0, -1}, new int[]{1}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("testing that \u00e9 does not match e", "e", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(1, 17, 30, -1), IA(1, 1, 1)),
+ SD("testing that string ab\u00e9cd does not match e", "e", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(1, 28, 41, -1), IA(1, 1, 1)),
+ SD("\u00c9", "e", "fr", PRIMARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(0, -1), IA(1)),
};
SearchData BREAKITERATORCANONICAL[] = {
- new SearchData("foxy fox", "fox", null, Collator.TERTIARY, "characterbreaker", new int[] {0, 5, -1}, new int[] {3, 3}),
- new SearchData("foxy fox", "fox", null, Collator.TERTIARY, "wordbreaker", new int[] {5, -1}, new int[] {3}),
- new SearchData("This is a toe T\u00F6ne", "toe", "de", Collator.PRIMARY, "characterbreaker", new int[] {10, 14, -1}, new int[] {3, 2}),
- new SearchData("This is a toe T\u00F6ne", "toe", "de", Collator.PRIMARY, "wordbreaker", new int[] {10, -1}, new int[] {3}),
- new SearchData("Channel, another channel, more channels, and one last Channel", "Channel", "es", Collator.TERTIARY, "wordbreaker",
- new int[] {0, 54, -1}, new int[] {7, 7}),
+ SD("foxy fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(0, 5, -1), IA(3, 3)),
+ SD("foxy fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(5, -1), IA(3)),
+ SD("This is a toe T\u00F6ne", "toe", "de", PRIMARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(10, 14, -1), IA(3, 2)),
+ SD("This is a toe T\u00F6ne", "toe", "de", PRIMARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(10, -1), IA(3)),
+ SD("Channel, another channel, more channels, and one last Channel", "Channel", "es", TERTIARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(0, 54, -1), IA(7, 7)),
/* jitterbug 1745 */
- new SearchData("testing that \u00e9 does not match e", "e", null, Collator.TERTIARY,
- "characterbreaker", new int[] {1, 17, 30, -1}, new int[] {1, 1, 1}),
- new SearchData("testing that string ab\u00e9cd does not match e", "e", null,
- Collator.TERTIARY, "characterbreaker", new int[] {1, 28, 41, -1}, new int[] {1, 1, 1}),
- new SearchData("\u00c9", "e", "fr", Collator.PRIMARY, "characterbreaker", new int[]{0, -1}, new int[]{1}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("testing that \u00e9 does not match e", "e", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(1, 17, 30, -1), IA(1, 1, 1)),
+ SD("testing that string ab\u00e9cd does not match e", "e", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(1, 28, 41, -1), IA(1, 1, 1)),
+ SD("\u00c9", "e", "fr", PRIMARY, STANDARD_ELEMENT_COMPARISON, "characterbreaker", IA(0, -1), IA(1)),
};
SearchData BASICCANONICAL[] = {
- new SearchData("xxxxxxxxxxxxxxxxxxxx", "fisher", null, Collator.TERTIARY, null, new int[] {-1}, new int [] {0}),
- new SearchData("silly spring string", "string", null, Collator.TERTIARY, null, new int[] {13, -1}, new int[] {6}),
- new SearchData("silly spring string string", "string", null, Collator.TERTIARY, null, new int[] {13, 20, -1}, new int[] {6, 6}),
- new SearchData("silly string spring string", "string", null, Collator.TERTIARY, null, new int[] {6, 20, -1}, new int[] {6, 6}),
- new SearchData("string spring string", "string", null, Collator.TERTIARY, null, new int[] {0, 14, -1}, new int[] {6, 6}),
- new SearchData("Scott Ganyo", "c", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {1}),
- new SearchData("Scott Ganyo", " ", null, Collator.TERTIARY, null, new int[] {5, -1}, new int[] {1}),
- new SearchData("\u0300\u0325", "\u0300", null, Collator.TERTIARY, null, new int [] {0, -1}, new int[] {2}),
- new SearchData("a\u0300\u0325", "\u0300", null, Collator.TERTIARY, null, new int [] {1, -1}, new int[] {2}),
- new SearchData("a\u0300\u0325", "\u0300\u0325", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[]{2}),
- new SearchData("a\u0300b", "\u0300", null, Collator.TERTIARY, null, new int[]{1, -1}, new int[] {1}),
- new SearchData("a\u0300\u0325b", "\u0300b", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {3}),
- new SearchData("\u0325\u0300A\u0325\u0300", "\u0300A\u0300", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {5}),
- new SearchData("\u0325\u0300A\u0325\u0300", "\u0325A\u0325", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {5}),
- new SearchData("a\u0300\u0325b\u0300\u0325c \u0325b\u0300 \u0300b\u0325", "\u0300b\u0325", null, Collator.TERTIARY, null,
- new int[] {1, 12, -1}, new int[] {5, 3}),
- new SearchData("\u00c4\u0323", "A\u0323\u0308", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("\u0308\u0323", "\u0323\u0308", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("xxxxxxxxxxxxxxxxxxxx", "fisher", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("silly spring string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(13, -1), IA(6)),
+ SD("silly spring string string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(13, 20, -1), IA(6, 6)),
+ SD("silly string spring string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(6, 20, -1), IA(6, 6)),
+ SD("string spring string", "string", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 14, -1), IA(6, 6)),
+ SD("Scott Ganyo", "c", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, -1), IA(1)),
+ SD("Scott Ganyo", " ", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(5, -1), IA(1)),
+
+ SD("\u0300\u0325", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0300\u0325", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300b", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325b", "\u0300b", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u0325\u0300A\u0325\u0300", "\u0300A\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u0325\u0300A\u0325\u0300", "\u0325A\u0325", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325b\u0300\u0325c \u0325b\u0300 \u0300b\u0325", "\u0300b\u0325", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u00c4\u0323", "A\u0323\u0308", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(2)),
+ SD("\u0308\u0323", "\u0323\u0308", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(2)),
};
SearchData COLLATOR[] = {
/* english */
- new SearchData("fox fpx", "fox", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {3}),
+ SD("fox fpx", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(3)),
/* tailored */
- new SearchData("fox fpx", "fox", null, Collator.PRIMARY, null, new int[] {0, 4, -1}, new int[] {3, 3}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("fox fpx", "fox", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 4, -1), IA(3, 3)),
};
String TESTCOLLATORRULE = "& o,O ; p,P";
String EXTRACOLLATIONRULE = " & ae ; \u00e4 & AE ; \u00c4 & oe ; \u00f6 & OE ; \u00d6 & ue ; \u00fc & UE ; \u00dc";
-
SearchData COLLATORCANONICAL[] = {
/* english */
- new SearchData("fox fpx", "fox", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {3}),
+ SD("fox fpx", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(3)),
/* tailored */
- new SearchData("fox fpx", "fox", null, Collator.PRIMARY, null, new int[] {0, 4, -1}, new int[] {3, 3}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("fox fpx", "fox", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 4, -1), IA(3, 3)),
};
SearchData COMPOSITEBOUNDARIES[] = {
- new SearchData("\u00C0", "A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("A\u00C0C", "A", null, Collator.TERTIARY, null, new int[] {0, 1, -1}, new int[] {1, 1}),
- new SearchData("\u00C0A", "A", null, Collator.TERTIARY, null, new int[] {0, 1, -1}, new int[] {1, 1}),
- new SearchData("B\u00C0", "A", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {1}),
- new SearchData("\u00C0B", "A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u00C0", "\u0300", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u0300\u00C0", "\u0300", null, Collator.TERTIARY, null, new int[] {0, 1, -1}, new int[] {1, 1}),
- new SearchData("\u00C0\u0300", "\u0300", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
+ SD("\u00C0", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("A\u00C0C", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+ SD("\u00C0A", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, -1), IA(1)),
+ SD("B\u00C0", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u00C0B", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u00C0", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ /* first one matches only because it's at the start of the text */
+ SD("\u0300\u00C0", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
+ /* \\u0300 blocked by \\u0300 */
+ SD("\u00C0\u0300", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
/* A + 030A + 0301 */
- new SearchData("\u01FA", "\u01FA", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u01FA", "\u030A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "A\u030A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "\u030AA", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "\u0301", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "A\u0301", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "\u0301A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "\u030A\u0301", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("A\u01FA", "A\u030A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FAA", "\u0301A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u0F73", "\u0F73", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u0F73", "\u0F71", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u0F73", "\u0F72", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u0F73", "\u0F71\u0F72", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("A\u0F73", "A\u0F71", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u0F73A", "\u0F72A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("\u01FA", "\u01FA", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+ SD("\u01FA", "A\u030A\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
+ SD("\u01FA", "\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u01FA", "A\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u01FA", "\u030AA", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u01FA", "\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ /* blocked accent */
+ SD("\u01FA", "A\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u01FA", "\u0301A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u01FA", "\u030A\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("A\u01FA", "A\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u01FAA", "\u0301A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u0F73", "\u0F73", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
+ SD("\u0F73", "\u0F71", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u0F73", "\u0F72", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u0F73", "\u0F71\u0F72", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
+ SD("A\u0F73", "A\u0F71", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u0F73A", "\u0F72A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u01FA A\u0301\u030A A\u030A\u0301 A\u030A \u01FA", "A\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(10, -1), IA(2)),
};
SearchData COMPOSITEBOUNDARIESCANONICAL[] = {
- new SearchData("\u00C0", "A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("A\u00C0C", "A", null, Collator.TERTIARY, null, new int[] {0, 1, -1}, new int[] {1, 1}),
- new SearchData("\u00C0A", "A", null, Collator.TERTIARY, null, new int[] {0, 1, -1}, new int[] {1, 1}),
- new SearchData("B\u00C0", "A", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {1}),
- new SearchData("\u00C0B", "A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u00C0", "\u0300", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u0300\u00C0", "\u0300", null, Collator.TERTIARY, null, new int[] {0, 1, -1}, new int[] {1, 1}),
+ SD("\u00C0", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("A\u00C0C", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+ SD("\u00C0A", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, -1), IA(1)),
+ SD("B\u00C0", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u00C0B", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u00C0", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ /* first one matches only because it's at the start of the text */
+ SD("\u0300\u00C0", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
/* \u0300 blocked by \u0300 */
- new SearchData("\u00C0\u0300", "\u0300", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
+ SD("\u00C0\u0300", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
/* A + 030A + 0301 */
- new SearchData("\u01FA", "\u01FA", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u01FA", "\u030A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u01FA", "A\u030A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u01FA", "\u030AA", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "\u0301", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
+ SD("\u01FA", "\u01FA", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+ SD("\u01FA", "A\u030A\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
+ SD("\u01FA", "\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u01FA", "A\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u01FA", "\u030AA", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u01FA", "\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
/* blocked accent */
- new SearchData("\u01FA", "A\u0301", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "\u0301A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("\u01FA", "\u030A\u0301", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("A\u01FA", "A\u030A", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {1}),
- new SearchData("\u01FAA", "\u0301A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("\u0F73", "\u0F73", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u0F73", "\u0F71", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u0F73", "\u0F72", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("\u0F73", "\u0F71\u0F72", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {1}),
- new SearchData("A\u0F73", "A\u0F71", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("\u0F73A", "\u0F72A", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("\u01FA A\u0301\u030A A\u030A\u0301 A\u030A \u01FA", "A\u030A",
- null, Collator.TERTIARY, null, new int[] {0, 6, 10, 13, -1}, new int[] {1, 3, 2, 1}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("\u01FA", "A\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u01FA", "\u0301A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u01FA", "\u030A\u0301", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("A\u01FA", "A\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u01FAA", "\u0301A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u0F73", "\u0F73", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
+ SD("\u0F73", "\u0F71", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u0F73", "\u0F72", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u0F73", "\u0F71\u0F72", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1)),
+
+ SD("A\u0F73", "A\u0F71", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u0F73A", "\u0F72A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("\u01FA A\u0301\u030A A\u030A\u0301 A\u030A \u01FA", "A\u030A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(10, -1), IA(2)),
};
SearchData SUPPLEMENTARY[] = {
- /* 012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */
- new SearchData("abc \uD800\uDC00 \uD800\uDC01 \uD801\uDC00 \uD800\uDC00abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00",
- "\uD800\uDC00", null, Collator.TERTIARY, null,
- new int[] {4, 13, 22, 26, 29, -1}, new int[] {2, 2, 2, 2, 2}),
- new SearchData("and\uD834\uDDB9this sentence", "\uD834\uDDB9", null,
- Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {2}),
- new SearchData("and \uD834\uDDB9 this sentence", " \uD834\uDDB9 ",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData("and-\uD834\uDDB9-this sentence", "-\uD834\uDDB9-",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData("and,\uD834\uDDB9,this sentence", ",\uD834\uDDB9,",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData("and?\uD834\uDDB9?this sentence", "?\uD834\uDDB9?",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("abc \uD800\uDC00 \uD800\uDC01 \uD801\uDC00 \uD800\uDC00abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00",
+ "\uD800\uDC00", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(4, 13, 22, 26, 29, -1), IA(2, 2, 2, 2, 2)),
+ SD("and\uD834\uDDB9this sentence", "\uD834\uDDB9", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(2)),
+ SD("and \uD834\uDDB9 this sentence", " \uD834\uDDB9 ", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
+ SD("and-\uD834\uDDB9-this sentence", "-\uD834\uDDB9-", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
+ SD("and,\uD834\uDDB9,this sentence", ",\uD834\uDDB9,", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
+ SD("and?\uD834\uDDB9?this sentence", "?\uD834\uDDB9?", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
};
String CONTRACTIONRULE = "&z = ab/c < AB < X\u0300 < ABC < X\u0300\u0315";
SearchData CONTRACTION[] = {
/* common discontiguous */
- new SearchData("A\u0300\u0315", "\u0300", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("A\u0300\u0315", "\u0300\u0315", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {2}),
+ SD("A\u0300\u0315", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("A\u0300\u0315", "\u0300\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
/* contraction prefix */
- new SearchData("AB\u0315C", "A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("AB\u0315C", "AB", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("AB\u0315C", "\u0315", null, Collator.TERTIARY, null, new int[] {2, -1}, new int[] {1}),
- /* discontiguous problem here for backwards iteration.
- accents not found because discontiguous stores all information */
- new SearchData("X\u0300\u0319\u0315", "\u0319", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- /* ends not with a contraction character */
- new SearchData("X\u0315\u0300D", "\u0300\u0315", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("X\u0315\u0300D", "X\u0300\u0315", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {3}),
- new SearchData("X\u0300\u031A\u0315D", "X\u0300", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
+ SD("AB\u0315C", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("AB\u0315C", "AB", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("AB\u0315C", "\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ /*
+ * discontiguous problem here for backwards iteration. accents not found because discontiguous stores all
+ * information
+ */
+ SD("X\u0300\u0319\u0315", "\u0319", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ /* ends not with a contraction character */
+ SD("X\u0315\u0300D", "\u0300\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("X\u0315\u0300D", "X\u0300\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(3)),
+ SD("X\u0300\u031A\u0315D", "X\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
/* blocked discontiguous */
- new SearchData("X\u0300\u031A\u0315D", "\u031A\u0315D", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("ab", "z", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("X\u0300\u031A\u0315D", "\u031A\u0315D", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ /*
+ * "ab" generates a contraction that's an expansion. The "z" matches the first CE of the expansion but the
+ * match fails because it ends in the middle of an expansion...
+ */
+ SD("ab", "z", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
};
SearchData CONTRACTIONCANONICAL[] = {
/* common discontiguous */
- new SearchData("A\u0300\u0315", "\u0300", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {2}),
- new SearchData("A\u0300\u0315", "\u0300\u0315", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {2}),
+ SD("A\u0300\u0315", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("A\u0300\u0315", "\u0300\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
/* contraction prefix */
- new SearchData("AB\u0315C", "A", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("AB\u0315C", "AB", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("AB\u0315C", "\u0315", null, Collator.TERTIARY, null, new int[] {2, -1}, new int[] {1}),
- /* discontiguous problem here for backwards iteration.
- forwards gives 0, 4 but backwards give 1, 3 */
- /* {"X\u0300\u0319\u0315", "\u0319", null, Collator.TERTIARY, null, {0, -1},
- {4}}, */
-
- /* ends not with a contraction character */
- new SearchData("X\u0315\u0300D", "\u0300\u0315", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData("X\u0315\u0300D", "X\u0300\u0315", null, Collator.TERTIARY, null,
- new int[] {0, -1}, new int[] {3}),
- new SearchData("X\u0300\u031A\u0315D", "X\u0300", null, Collator.TERTIARY, null,
- new int[] {0, -1}, new int[] {4}),
+ SD("AB\u0315C", "A", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ SD("AB\u0315C", "AB", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("AB\u0315C", "\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ /*
+ * discontiguous problem here for backwards iteration. forwards gives 0, 4 but backwards give 1, 3
+ */
+ /*
+         * {"X\u0300\u0319\u0315", "\u0319", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, {0, -1}, {4}},
+ */
+
+ /* ends not with a contraction character */
+ SD("X\u0315\u0300D", "\u0300\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("X\u0315\u0300D", "X\u0300\u0315", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(3)),
+
+ SD("X\u0300\u031A\u0315D", "X\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
/* blocked discontiguous */
- new SearchData("X\u0300\u031A\u0315D", "\u031A\u0315D", null, Collator.TERTIARY, null,
- new int[] {1, -1}, new int[] {4}),
- new SearchData("ab", "z", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("X\u0300\u031A\u0315D", "\u031A\u0315D", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+
+ /*
+ * "ab" generates a contraction that's an expansion. The "z" matches the first CE of the expansion but the
+ * match fails because it ends in the middle of an expansion...
+ */
+ SD("ab", "z", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(2)),
};
SearchData MATCH[] = {
- new SearchData("a busy bee is a very busy beeee", "bee", null, Collator.TERTIARY, null,
- new int[] {7, 26, -1}, new int[] {3, 3}),
- /* 012345678901234567890123456789012345678901234567890 */
- new SearchData("a busy bee is a very busy beeee with no bee life", "bee", null,
- Collator.TERTIARY, null, new int[] {7, 26, 40, -1}, new int[] {3, 3, 3}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("a busy bee is a very busy beeee", "bee", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(7, 26, -1), IA(3, 3)),
+ /* 012345678901234567890123456789012345678901234567890 */
+ SD("a busy bee is a very busy beeee with no bee life", "bee", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(7, 26, 40, -1), IA(3, 3, 3)),
};
String IGNORABLERULE = "&a = \u0300";
SearchData IGNORABLE[] = {
- new SearchData("\u0300\u0315 \u0300\u0315 ", "\u0300", null, Collator.PRIMARY, null,
- new int[] {0, 3, -1}, new int[] {2, 2}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ /*
+ * This isn't much of a test when matches have to be on grapheme boundiaries. The match at 0 only works because it's
+ * at the start of the text.
+ */
+ SD("\u0300\u0315 \u0300\u0315 ", "\u0300", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(2)),
};
-
+
SearchData DIACTRICMATCH[] = {
- new SearchData("\u0061\u0061\u00E1", "\u0061\u00E1", null, Collator.SECONDARY, null,
- new int[] {1, -1}, new int[] {2}),
- new SearchData("\u0020\u00C2\u0303\u0020\u0041\u0061\u1EAA\u0041\u0302\u0303\u00C2\u0303\u1EAB\u0061\u0302\u0303\u00E2\u0303\uD806\uDC01\u0300\u0020",
- "\u00C2\u0303", null, Collator.PRIMARY, null, new int[] {1, 4, 5, 6, 7, 10, 12, 13, 16,-1}, new int[] {2, 1, 1, 1, 3, 2, 1, 3, 2}),
- new SearchData("\u03BA\u03B1\u03B9\u0300\u0020\u03BA\u03B1\u1F76", "\u03BA\u03B1\u03B9", null, Collator.PRIMARY, null,
- new int[] {0, 5, -1}, new int[] {4, 3}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("\u0061\u0061\u00E1", "\u0061\u00E1", null, SECONDARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, -1), IA(2)),
+ SD("\u0020\u00C2\u0303\u0020\u0041\u0061\u1EAA\u0041\u0302\u0303\u00C2\u0303\u1EAB\u0061\u0302\u0303\u00E2\u0303\uD806\uDC01\u0300\u0020", "\u00C2\u0303",
+ null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, 4, 5, 6, 7, 10, 12, 13, 16, -1), IA(2, 1, 1, 1, 3, 2, 1, 3, 2)),
+ SD("\u03BA\u03B1\u03B9\u0300\u0020\u03BA\u03B1\u1F76", "\u03BA\u03B1\u03B9", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 5, -1), IA(4, 3)),
};
SearchData NORMCANONICAL[] = {
- new SearchData("\u0300\u0325", "\u0300", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("\u0300\u0325", "\u0325", null, Collator.TERTIARY, null, new int[] {0, -1}, new int[] {2}),
- new SearchData("a\u0300\u0325", "\u0325\u0300", null, Collator.TERTIARY, null, new int[] {1, -1},
- new int[] {2}),
- new SearchData("a\u0300\u0325", "\u0300\u0325", null, Collator.TERTIARY, null, new int[] {1, -1},
- new int[] {2}),
- new SearchData("a\u0300\u0325", "\u0325", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {2}),
- new SearchData("a\u0300\u0325", "\u0300", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {2}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("\u0300\u0325", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("\u0300\u0325", "\u0325", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0325\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0300\u0325", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0325", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("a\u0300\u0325", "\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
};
SearchData NORMEXACT[] = {
- new SearchData("a\u0300\u0325", "\u0325\u0300", null, Collator.TERTIARY, null, new int[] {1, -1}, new int[] {2}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("a\u0300\u0325", "a\u0325\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(3)),
};
SearchData NONNORMEXACT[] = {
- new SearchData("a\u0300\u0325", "\u0325\u0300", null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("a\u0300\u0325", "\u0325\u0300", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
};
SearchData OVERLAP[] = {
- new SearchData("abababab", "abab", null, Collator.TERTIARY, null, new int[] {0, 2, 4, -1}, new int[] {4, 4, 4}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("abababab", "abab", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 2, 4, -1), IA(4, 4, 4)),
};
SearchData NONOVERLAP[] = {
- new SearchData("abababab", "abab", null, Collator.TERTIARY, null, new int[] {0, 4, -1}, new int[] {4, 4}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("abababab", "abab", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 4, -1), IA(4, 4)),
};
SearchData OVERLAPCANONICAL[] = {
- new SearchData("abababab", "abab", null, Collator.TERTIARY, null, new int[] {0, 2, 4, -1},
- new int[] {4, 4, 4}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("abababab", "abab", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 2, 4, -1), IA(4, 4, 4)),
};
SearchData NONOVERLAPCANONICAL[] = {
- new SearchData("abababab", "abab", null, Collator.TERTIARY, null, new int[] {0, 4, -1}, new int[] {4, 4}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("abababab", "abab", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 4, -1), IA(4, 4)),
};
SearchData PATTERNCANONICAL[] = {
- new SearchData("The quick brown fox jumps over the lazy foxes", "the", null,
- Collator.PRIMARY, null, new int[] {0, 31, -1}, new int[] {3, 3}),
- new SearchData("The quick brown fox jumps over the lazy foxes", "fox", null,
- Collator.PRIMARY, null, new int[] {16, 40, -1}, new int[] {3, 3}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("The quick brown fox jumps over the lazy foxes", "the", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 31, -1), IA(3, 3)),
+ SD("The quick brown fox jumps over the lazy foxes", "fox", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(16, 40, -1), IA(3, 3)),
};
SearchData PATTERN[] = {
- new SearchData("The quick brown fox jumps over the lazy foxes", "the", null,
- Collator.PRIMARY, null, new int[] {0, 31, -1}, new int[] {3, 3}),
- new SearchData("The quick brown fox jumps over the lazy foxes", "fox", null,
- Collator.PRIMARY, null, new int[] {16, 40, -1}, new int[] {3, 3}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD("The quick brown fox jumps over the lazy foxes", "the", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 31, -1), IA(3, 3)),
+ SD("The quick brown fox jumps over the lazy foxes", "fox", null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(16, 40, -1), IA(3, 3)),
};
+ String PECHE_WITH_ACCENTS = "un p\u00E9ch\u00E9, "
+ + "\u00E7a p\u00E8che par, "
+ + "p\u00E9cher, "
+ + "une p\u00EAche, "
+ + "un p\u00EAcher, "
+ + "j\u2019ai p\u00EAch\u00E9, "
+ + "un p\u00E9cheur, "
+ + "\u201Cp\u00E9che\u201D, "
+ + "decomp peche\u0301, "
+ + "base peche";
+ // in the above, the interesting words and their offsets are:
+ // 3 pe<301>che<301>
+ // 13 pe<300>che
+ // 24 pe<301>cher
+ // 36 pe<302>che
+ // 46 pe<302>cher
+ // 59 pe<302>che<301>
+ // 69 pe<301>cheur
+ // 79 pe<301>che
+ // 94 peche<+301>
+ // 107 peche
+
SearchData STRENGTH[] = {
- /*012345678901234567890123456789012345678901234567890123456789*/
- new SearchData("The quick brown fox jumps over the lazy foxes", "fox", "en",
- Collator.PRIMARY, null, new int[] {16, 40, -1}, new int[] {3, 3}),
- new SearchData("The quick brown fox jumps over the lazy foxes", "fox", "en",
- Collator.PRIMARY, "wordbreaker", new int[] {16, -1}, new int[] {3}),
- new SearchData("blackbirds Pat p\u00E9ch\u00E9 p\u00EAche p\u00E9cher p\u00EAcher Tod T\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
- "peche", "fr", Collator.PRIMARY, null, new int[] {15, 21, 27, 34, -1}, new int[] {5, 5, 5, 5}),
- new SearchData("This is a toe T\u00F6ne", "toe", "de", Collator.PRIMARY, null,
- new int[] {10, 14, -1}, new int[] {3, 2}),
- new SearchData("A channel, another CHANNEL, more Channels, and one last channel...", "channel", "es",
- Collator.PRIMARY, null, new int[] {2, 19, 33, 56, -1}, new int[] {7, 7, 7, 7}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ /* 012345678901234567890123456789012345678901234567890123456789 */
+ SD("The quick brown fox jumps over the lazy foxes", "fox", "en", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(16, 40, -1), IA(3, 3)),
+ SD("The quick brown fox jumps over the lazy foxes", "fox", "en", PRIMARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(16, -1), IA(3)),
+ SD("blackbirds Pat p\u00E9ch\u00E9 p\u00EAche p\u00E9cher p\u00EAcher Tod T\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
+ "peche", "fr", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(15, 21, 27, 34, -1), IA(5, 5, 5, 5)),
+ SD("This is a toe T\u00F6ne", "toe", "de", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(10, 14, -1), IA(3, 2)),
+ SD("A channel, another CHANNEL, more Channels, and one last channel...", "channel", "es", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(2, 19, 33, 56, -1), IA(7, 7, 7, 7)),
+ SD("\u00c0 should match but not A", "A\u0300", "en", IDENTICAL, STANDARD_ELEMENT_COMPARISON, null, IA(0, -1), IA(1, 0)),
+
+ /* some tests for modified element comparison, ticket #7093 */
+ SD(PECHE_WITH_ACCENTS, "peche", "en", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "en", PRIMARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "en", SECONDARY, STANDARD_ELEMENT_COMPARISON, null, IA(107, -1), IA(5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "en", SECONDARY, STANDARD_ELEMENT_COMPARISON, null, IA(24, 69, 79, -1), IA(5, 5, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "en", SECONDARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(79, -1), IA(5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 24, 69, 79, -1), IA(5, 5, 5, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 79, -1), IA(5, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "en", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 24, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "en", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 79, 94, 107, -1), IA(5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "en", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "en", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "en", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "en", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "en", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+
+ /* more tests for modified element comparison (with fr), ticket #7093 */
+ SD(PECHE_WITH_ACCENTS, "peche", "fr", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "fr", PRIMARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "fr", SECONDARY, STANDARD_ELEMENT_COMPARISON, null, IA(107, -1), IA(5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "fr", SECONDARY, STANDARD_ELEMENT_COMPARISON, null, IA(24, 69, 79, -1), IA(5, 5, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "fr", SECONDARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(79, -1), IA(5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 24, 69, 79, -1), IA(5, 5, 5, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 79, -1), IA(5, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "fr", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 24, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "p\u00E9che", "fr", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 79, 94, 107, -1), IA(5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "fr", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "pech\u00E9", "fr", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "fr", SECONDARY, PATTERN_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 59, 94, -1), IA(5, 5, 6)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "fr", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, null, IA(3, 13, 24, 36, 46, 59, 69, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 5, 5, 5, 6, 5)),
+ SD(PECHE_WITH_ACCENTS, "peche\u0301", "fr", SECONDARY, ANY_BASE_WEIGHT_IS_WILDCARD, "wordbreaker", IA(3, 13, 36, 59, 79, 94, 107, -1), IA(5, 5, 5, 5, 5, 6, 5)),
+
};
SearchData STRENGTHCANONICAL[] = {
- /*012345678901234567890123456789012345678901234567890123456789 */
- new SearchData("The quick brown fox jumps over the lazy foxes", "fox", "en",
- Collator.PRIMARY, null, new int[] {16, 40, -1}, new int[] {3, 3}),
- new SearchData("The quick brown fox jumps over the lazy foxes", "fox", "en",
- Collator.PRIMARY, "wordbreaker", new int[] {16, -1}, new int[] {3}),
- new SearchData("blackbirds Pat p\u00E9ch\u00E9 p\u00EAche p\u00E9cher p\u00EAcher Tod T\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
- "peche", "fr", Collator.PRIMARY, null, new int[] {15, 21, 27, 34, -1}, new int[] {5, 5, 5, 5}),
- new SearchData("This is a toe T\u00F6ne", "toe", "de", Collator.PRIMARY, null,
- new int[] {10, 14, -1}, new int[] {3, 2}),
- new SearchData("A channel, another CHANNEL, more Channels, and one last channel...", "channel", "es",
- Collator.PRIMARY, null, new int[]{2, 19, 33, 56, -1}, new int[] {7, 7, 7, 7}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[]{0})
+ /* 012345678901234567890123456789012345678901234567890123456789 */
+ SD("The quick brown fox jumps over the lazy foxes", "fox", "en", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(16, 40, -1), IA(3, 3)),
+ SD("The quick brown fox jumps over the lazy foxes", "fox", "en", PRIMARY, STANDARD_ELEMENT_COMPARISON, "wordbreaker", IA(16, -1), IA(3)),
+ SD("blackbirds Pat p\u00E9ch\u00E9 p\u00EAche p\u00E9cher p\u00EAcher Tod T\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
+ "peche", "fr", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(15, 21, 27, 34, -1), IA(5, 5, 5, 5)),
+ SD("This is a toe T\u00F6ne", "toe", "de", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(10, 14, -1), IA(3, 2)),
+ SD("A channel, another CHANNEL, more Channels, and one last channel...", "channel", "es", PRIMARY, STANDARD_ELEMENT_COMPARISON, null, IA(2, 19, 33, 56, -1), IA(7, 7, 7, 7)),
};
SearchData SUPPLEMENTARYCANONICAL[] = {
- /*012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */
- new SearchData("abc \uD800\uDC00 \uD800\uDC01 \uD801\uDC00 \uD800\uDC00abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00",
- "\uD800\uDC00", null, Collator.TERTIARY, null, new int[] {4, 13, 22, 26, 29, -1},
- new int[] {2, 2, 2, 2, 2}),
- new SearchData("and\uD834\uDDB9this sentence", "\uD834\uDDB9", null,
- Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {2}),
- new SearchData("and \uD834\uDDB9 this sentence", " \uD834\uDDB9 ",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData("and-\uD834\uDDB9-this sentence", "-\uD834\uDDB9-",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData("and,\uD834\uDDB9,this sentence", ",\uD834\uDDB9,",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData("and?\uD834\uDDB9?this sentence", "?\uD834\uDDB9?",
- null, Collator.TERTIARY, null, new int[] {3, -1},
- new int[] {4}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ /* 012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */
+ SD("abc \uD800\uDC00 \uD800\uDC01 \uD801\uDC00 \uD800\uDC00abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00", "\uD800\uDC00",
+ null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(4, 13, 22, 26, 29, -1), IA(2, 2, 2, 2, 2)),
+ SD("and\uD834\uDDB9this sentence", "\uD834\uDDB9", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(2)),
+ SD("and \uD834\uDDB9 this sentence", " \uD834\uDDB9 ", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
+ SD("and-\uD834\uDDB9-this sentence", "-\uD834\uDDB9-", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
+ SD("and,\uD834\uDDB9,this sentence", ",\uD834\uDDB9,", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
+ SD("and?\uD834\uDDB9?this sentence", "?\uD834\uDDB9?", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(3, -1), IA(4)),
};
static SearchData VARIABLE[] = {
- /*012345678901234567890123456789012345678901234567890123456789*/
- new SearchData("blackbirds black blackbirds blackbird black-bird", "blackbird", null, Collator.TERTIARY, null,
- new int[] {0, 17, 28, 38, -1}, new int[] {9, 9, 9, 10}),
-
- /* to see that it doesn't go into an infinite loop if the start of text
- is a ignorable character */
- new SearchData(" on", "go", null, Collator.TERTIARY, null,
- new int[] {-1}, new int[]{0}),
- new SearchData("abcdefghijklmnopqrstuvwxyz", " ", null, Collator.PRIMARY, null,
- new int[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1},
- new int[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
+ /* 012345678901234567890123456789012345678901234567890123456789 */
+ SD("blackbirds black blackbirds blackbird black-bird", "blackbird", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(0, 17, 28, 38, -1), IA(9, 9, 9, 10)),
+
+ /*
+ * to see that it doesn't go into an infinite loop if the start of text is a ignorable character
+ */
+ SD(" on", "go", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
+ SD("abcdefghijklmnopqrstuvwxyz", " ",
+ null, PRIMARY, STANDARD_ELEMENT_COMPARISON, null,
+ IA(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1),
+ IA(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)),
/* testing tightest match */
- new SearchData(" abc a bc ab c a bc ab c", "abc", null, Collator.QUATERNARY, null,
- new int[]{1, -1}, new int[] {3}),
- /*012345678901234567890123456789012345678901234567890123456789 */
- new SearchData(" abc a bc ab c a bc ab c", "abc", null, Collator.SECONDARY, null,
- new int[] {1, 6, 13, 21, 31, -1}, new int[] {3, 4, 4, 5, 5}),
+ SD(" abc a bc ab c a bc ab c", "abc", null, QUATERNARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, -1), IA(3)),
+ /* 012345678901234567890123456789012345678901234567890123456789 */
+ SD(" abc a bc ab c a bc ab c", "abc", null, SECONDARY, STANDARD_ELEMENT_COMPARISON, null, IA(1, 6, 13, 21, 31, -1), IA(3, 4, 4, 5, 5)),
/* totally ignorable text */
- new SearchData(" ---------------", "abc", null, Collator.SECONDARY, null,
- new int[] {-1}, new int[] {0}),
- new SearchData(null, null, null, Collator.TERTIARY, null, new int[] {-1}, new int[] {0})
+ SD(" ---------------", "abc", null, SECONDARY, STANDARD_ELEMENT_COMPARISON, null, IA(-1), IA(0)),
};
static SearchData TEXTCANONICAL[] = {
- new SearchData("the foxy brown fox", "fox", null, Collator.TERTIARY, null,
- new int[] {4, 15, -1}, new int[] {3, 3}),
- new SearchData("the quick brown fox", "fox", null, Collator.TERTIARY, null,
- new int[] {16, -1}, new int[]{3}),
- new SearchData(null, null, null, Collator.TERTIARY,null, new int[] {-1}, new int[] {0})
+ SD("the foxy brown fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(4, 15, -1), IA(3, 3)),
+ SD("the quick brown fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(16, -1), IA(3)),
};
/**
breaker.setText(text);
}
collator.setStrength(search.strength);
+ collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
try {
strsrch = new StringSearch(pattern, new StringCharacterIterator(text), (RuleBasedCollator)collator, breaker);
+ strsrch.setElementComparisonType(search.cmpType);
strsrch.setCanonical(true);
} catch (Exception e) {
errln("Error opening string search" + e.getMessage());
}
if (!assertEqualWithStringSearch(strsrch, search)) {
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
+ collator.setDecomposition(Collator.NO_DECOMPOSITION);
return false;
}
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
+ collator.setDecomposition(Collator.NO_DECOMPOSITION);
return true;
}
collator.setStrength(search.strength);
try {
strsrch = new StringSearch(pattern, new StringCharacterIterator(text), (RuleBasedCollator)collator, breaker);
+ strsrch.setElementComparisonType(search.cmpType);
} catch (Exception e) {
errln("Error opening string search " + e.getMessage());
return false;
}
if (!assertEqualWithStringSearch(strsrch, search)) {
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
return false;
}
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
return true;
}
strsrch = new StringSearch(pattern, new StringCharacterIterator(text), (RuleBasedCollator)collator, breaker);
strsrch.setCanonical(canonical);
strsrch.setOverlapping(overlap);
+ strsrch.setElementComparisonType(search.cmpType);
} catch (Exception e) {
errln("Error opening string search " + e.getMessage());
return false;
}
if (!assertEqualWithStringSearch(strsrch, search)) {
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
return false;
}
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
return true;
}
}
public void TestBasic() {
- int count = 0;
- while (BASIC[count].text != null) {
+ for (int count = 0; count < BASIC.length; count++) {
if (!assertEqual(BASIC[count])) {
errln("Error at test number " + count);
}
- count ++;
}
}
errln("Error setting break iterator");
}
if (!assertEqualWithStringSearch(strsrch, search)) {
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
}
search = BREAKITERATOREXACT[count + 1];
breaker = getBreakIterator(search.breaker);
}
count += 2;
}
- count = 0;
- while (BREAKITERATOREXACT[count].text != null) {
+ for (count = 0; count < BREAKITERATOREXACT.length; count++) {
if (!assertEqual(BREAKITERATOREXACT[count])) {
errln("Error at test number " + count);
}
- count++;
}
}
return;
}
if (!assertEqualWithStringSearch(strsrch, search)) {
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
return;
}
search = BREAKITERATOREXACT[count + 1];
}
count += 2;
}
- count = 0;
- while (BREAKITERATORCANONICAL[count].text != null) {
+
+ for (count = 0; count < BREAKITERATORCANONICAL.length; count++) {
if (!assertEqual(BREAKITERATORCANONICAL[count])) {
errln("Error at test number " + count);
return;
}
- count++;
}
}
public void TestCanonical() {
- int count = 0;
- while (BASICCANONICAL[count].text != null) {
+ for (int count = 0; count < BASICCANONICAL.length; count++) {
if (!assertCanonicalEqual(BASICCANONICAL[count])) {
errln("Error at test number " + count);
}
- count ++;
}
}
}
public void TestCompositeBoundaries() {
- int count = 0;
- while (COMPOSITEBOUNDARIES[count].text != null) {
+ for (int count = 0; count < COMPOSITEBOUNDARIES.length; count++) {
// logln("composite " + count);
if (!assertEqual(COMPOSITEBOUNDARIES[count])) {
errln("Error at test number " + count);
}
- count++;
}
}
public void TestCompositeBoundariesCanonical() {
- int count = 0;
- while (COMPOSITEBOUNDARIESCANONICAL[count].text != null) {
+ for (int count = 0; count < COMPOSITEBOUNDARIESCANONICAL.length; count++) {
// logln("composite " + count);
if (!assertCanonicalEqual(COMPOSITEBOUNDARIESCANONICAL[count])) {
errln("Error at test number " + count);
}
- count++;
}
}
RuleBasedCollator collator = null;
try {
collator = new RuleBasedCollator(rules);
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
} catch (Exception e) {
errln("Error opening collator ");
errln("Error opening string search ");
}
- int count = 0;
- while (CONTRACTION[count].text != null) {
+ for (int count = 0; count< CONTRACTION.length; count++) {
text = CONTRACTION[count].text;
pattern = CONTRACTION[count].pattern;
strsrch.setTarget(new StringCharacterIterator(text));
if (!assertEqualWithStringSearch(strsrch, CONTRACTION[count])) {
errln("Error at test number " + count);
}
- count++;
}
}
RuleBasedCollator collator = null;
try {
collator = new RuleBasedCollator(rules);
- collator.setStrength(Collator.TERTIARY);
+ collator.setStrength(TERTIARY);
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
} catch (Exception e) {
errln("Error opening collator ");
errln("Error opening string search");
}
- int count = 0;
- while (CONTRACTIONCANONICAL[count].text != null) {
+ for (int count = 0; count < CONTRACTIONCANONICAL.length; count++) {
text = CONTRACTIONCANONICAL[count].text;
pattern = CONTRACTIONCANONICAL[count].pattern;
strsrch.setTarget(new StringCharacterIterator(text));
if (!assertEqualWithStringSearch(strsrch, CONTRACTIONCANONICAL[count])) {
errln("Error at test number " + count);
}
- count++;
}
}
errln("Error setting canonical match false");
}
+ if (strsrch.getElementComparisonType() != STANDARD_ELEMENT_COMPARISON) {
+ errln("Error default element comparison type should be STANDARD_ELEMENT_COMPARISON");
+ }
+ strsrch.setElementComparisonType(ElementComparisonType.PATTERN_BASE_WEIGHT_IS_WILDCARD);
+ if (strsrch.getElementComparisonType() != ElementComparisonType.PATTERN_BASE_WEIGHT_IS_WILDCARD) {
+ errln("Error setting element comparison type PATTERN_BASE_WEIGHT_IS_WILDCARD");
+ }
}
public void TestGetSetOffset() {
logln("PASS: strsrch.setIndex(128) failed as expected");
}
- int index = 0;
- while (BASIC[index].text != null) {
- SearchData search = BASIC[index ++];
+ for (int index = 0; index < BASIC.length; index++) {
+ SearchData search = BASIC[index];
text =search.text;
pattern = search.pattern;
return;
}
}
- strsrch.getCollator().setStrength(Collator.TERTIARY);
+ strsrch.getCollator().setStrength(TERTIARY);
}
public void TestGetSetOffsetCanonical() {
return;
}
strsrch.setCanonical(true);
+ //TODO: setCanonical is not sufficient for canonical match. See #10725
+ strsrch.getCollator().setDecomposition(Collator.CANONICAL_DECOMPOSITION);
/* testing out of bounds error */
try {
strsrch.setIndex(-1);
logln("PASS: strsrch.setIndex(128) failed as expected");
}
- int index = 0;
- while (BASICCANONICAL[index].text != null) {
- SearchData search = BASICCANONICAL[index ++];
- if (BASICCANONICAL[index].text == null) {
- // skip the last one
- break;
- }
-
+ for (int index = 0; index < BASICCANONICAL.length; index++) {
+ SearchData search = BASICCANONICAL[index];
text = search.text;
pattern = search.pattern;
strsrch.setTarget(new StringCharacterIterator(text));
return;
}
}
- strsrch.getCollator().setStrength(Collator.TERTIARY);
+ strsrch.getCollator().setStrength(TERTIARY);
+ strsrch.getCollator().setDecomposition(Collator.NO_DECOMPOSITION);
}
public void TestIgnorable() {
return;
}
- while (IGNORABLE[count].text != null) {
+ for (; count < IGNORABLE.length; count++) {
text = IGNORABLE[count].text;
pattern = IGNORABLE[count].pattern;
strsrch.setTarget(new StringCharacterIterator(text));
if (!assertEqualWithStringSearch(strsrch, IGNORABLE[count])) {
errln("Error at test number " + count);
}
- count++;
}
}
public void TestNormCanonical() {
m_en_us_.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
- int count = 0;
- while (NORMCANONICAL[count].text != null) {
+ for (int count = 0; count < NORMCANONICAL.length; count++) {
if (!assertCanonicalEqual(NORMCANONICAL[count])) {
errln("Error at test number " + count);
}
- count++;
}
m_en_us_.setDecomposition(Collator.NO_DECOMPOSITION);
}
public void TestNormExact() {
- int count = 0;
+ int count;
+
m_en_us_.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
- while (BASIC[count].text != null) {
+ for (count = 0; count < BASIC.length; count++) {
if (!assertEqual(BASIC[count])) {
errln("Error at test number " + count);
}
- count++;
}
- count = 0;
- while (NORMEXACT[count].text != null) {
+ for (count = 0; count < NORMEXACT.length; count++) {
if (!assertEqual(NORMEXACT[count])) {
errln("Error at test number " + count);
}
- count++;
}
m_en_us_.setDecomposition(Collator.NO_DECOMPOSITION);
- count = 0;
- while (NONNORMEXACT[count].text != null) {
+ for (count = 0; count < NONNORMEXACT.length; count++) {
if (!assertEqual(NONNORMEXACT[count])) {
errln("Error at test number " + count);
}
- count++;
}
}
}
public void TestOverlap() {
- int count = 0;
- while (OVERLAP[count].text != null) {
+ int count;
+
+ for (count = 0; count < OVERLAP.length; count++) {
if (!assertEqualWithAttribute(OVERLAP[count], false, true)) {
errln("Error at overlap test number " + count);
}
- count++;
}
- count = 0;
- while (NONOVERLAP[count].text != null) {
+
+ for (count = 0; count < NONOVERLAP.length; count++) {
if (!assertEqual(NONOVERLAP[count])) {
errln("Error at non overlap test number " + count);
}
- count++;
}
- count = 0;
- while (count < 1) {
+ for (count = 0; count < OVERLAP.length && count < NONOVERLAP.length; count++) {
SearchData search = (OVERLAP[count]);
String text = search.text;
String pattern = search.pattern;
if (!assertEqualWithStringSearch(strsrch, search)) {
errln("Error at test number " + count);
}
- count ++;
}
}
public void TestOverlapCanonical() {
- int count = 0;
- while (OVERLAPCANONICAL[count].text != null) {
- if (!assertEqualWithAttribute(OVERLAPCANONICAL[count], true,
- true)) {
+ int count;
+
+ for (count = 0; count < OVERLAPCANONICAL.length; count++) {
+ if (!assertEqualWithAttribute(OVERLAPCANONICAL[count], true, true)) {
errln("Error at overlap test number %d" + count);
}
- count ++;
}
- count = 0;
- while (NONOVERLAP[count].text != null) {
+
+ for (count = 0; count < NONOVERLAP.length; count++) {
if (!assertCanonicalEqual(NONOVERLAPCANONICAL[count])) {
errln("Error at non overlap test number %d" + count);
}
- count ++;
}
- count = 0;
- while (count < 1) {
- /* UChar temp[128];
- const SearchData *search = &(OVERLAPCANONICAL[count]);
- UErrorCode status = U_ZERO_ERROR;*/
+ for (count = 0; count < OVERLAPCANONICAL.length && count < NONOVERLAPCANONICAL.length; count++) {
SearchData search = OVERLAPCANONICAL[count];
-
- /*u_unescape(search.text, temp, 128);
- UnicodeString text;
- text.setTo(temp, u_strlen(temp));
- u_unescape(search.pattern, temp, 128);
- UnicodeString pattern;
- pattern.setTo(temp, u_strlen(temp));*/
RuleBasedCollator collator = getCollator(search.collator);
StringSearch strsrch = new StringSearch(search.pattern, new StringCharacterIterator(search.text), collator, null);
strsrch.setCanonical(true);
strsrch = null;
errln("Error at test number %d" + count);
}
-
- count ++;
- strsrch = null;
}
}
m_en_us_.setStrength(PATTERN[0].strength);
StringSearch strsrch = new StringSearch(PATTERN[0].pattern, new StringCharacterIterator(PATTERN[0].text), m_en_us_, null);
- /*if (U_FAILURE(status)) {
- errln("Error opening string search %s", u_errorName(status));
- m_en_us_.setStrength(getECollationStrength(UCOL_TERTIARY));
- if (strsrch != NULL) {
- delete strsrch;
- }
- return;
- }*/
-
if (strsrch.getPattern() != PATTERN[0].pattern) {
errln("Error setting pattern");
}
if (!assertEqualWithStringSearch(strsrch, PATTERN[0])) {
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
if (strsrch != null) {
strsrch = null;
}
strsrch.setPattern(PATTERN[1].pattern);
if (PATTERN[1].pattern != strsrch.getPattern()) {
errln("Error setting pattern");
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
if (strsrch != null) {
strsrch = null;
}
strsrch.reset();
if (!assertEqualWithStringSearch(strsrch, PATTERN[1])) {
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
if (strsrch != null) {
strsrch = null;
}
strsrch.setPattern(PATTERN[0].pattern);
if (PATTERN[0].pattern != strsrch.getPattern()) {
errln("Error setting pattern");
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
if (strsrch != null) {
strsrch = null;
}
strsrch.reset();
if (!assertEqualWithStringSearch(strsrch, PATTERN[0])) {
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
if (strsrch != null) {
strsrch = null;
}
errln("Error setting pattern with size 512");
}
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
if (strsrch != null) {
strsrch = null;
}
errln("Error setting pattern");
}
if (!assertEqualWithStringSearch(strsrch, PATTERNCANONICAL[0])) {
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
strsrch = null;
return;
}
strsrch.setPattern(PATTERNCANONICAL[1].pattern);
if (PATTERNCANONICAL[1].pattern != strsrch.getPattern()) {
errln("Error setting pattern");
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
strsrch = null;
return;
}
strsrch.setCanonical(true);
if (!assertEqualWithStringSearch(strsrch, PATTERNCANONICAL[1])) {
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
strsrch = null;
return;
}
strsrch.setPattern(PATTERNCANONICAL[0].pattern);
if (PATTERNCANONICAL[0].pattern != strsrch.getPattern()) {
errln("Error setting pattern");
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
strsrch = null;
return;
}
strsrch.reset();
strsrch.setCanonical(true);
if (!assertEqualWithStringSearch(strsrch, PATTERNCANONICAL[0])) {
- m_en_us_.setStrength(Collator.TERTIARY);
+ m_en_us_.setStrength(TERTIARY);
strsrch = null;
return;
}
}
public void TestSetMatch() {
- int count = 0;
- while (MATCH[count].text != null) {
+ for (int count = 0; count < MATCH.length; count++) {
SearchData search = MATCH[count];
StringSearch strsrch = new StringSearch(search.pattern, new StringCharacterIterator(search.text),
m_en_us_, null);
if (strsrch.preceding(0) != SearchIterator.DONE) {
errln("Error expecting out of bounds match");
}
- count ++;
- strsrch = null;
}
}
public void TestStrength() {
- int count = 0;
- while (STRENGTH[count].text != null) {
- if (count == 3) count ++;
+ for (int count = 0; count < STRENGTH.length; count++) {
if (!assertEqual(STRENGTH[count])) {
errln("Error at test number " + count);
}
- count ++;
}
}
public void TestStrengthCanonical() {
- int count = 0;
- while (STRENGTHCANONICAL[count].text != null) {
- if (count == 3) count ++;
+ for (int count = 0; count < STRENGTHCANONICAL.length; count++) {
if (!assertCanonicalEqual(STRENGTHCANONICAL[count])) {
errln("Error at test number" + count);
}
- count ++;
}
}
public void TestSupplementary() {
- if (logKnownIssue("8080", null)) {
- return;
- }
- int count = 0;
- while (SUPPLEMENTARY[count].text != null) {
+ for (int count = 0; count < SUPPLEMENTARY.length; count++) {
if (!assertEqual(SUPPLEMENTARY[count])) {
errln("Error at test number " + count);
}
- count ++;
}
}
public void TestSupplementaryCanonical() {
- if (logKnownIssue("8080", null)) {
- return;
- }
- int count = 0;
- while (SUPPLEMENTARYCANONICAL[count].text != null) {
+ for (int count = 0; count < SUPPLEMENTARYCANONICAL.length; count++) {
if (!assertCanonicalEqual(SUPPLEMENTARYCANONICAL[count])) {
errln("Error at test number" + count);
}
- count ++;
}
}
/**
 * Builds a StringSearch over inline test data ("fox" in two sample texts
 * with expected match offsets and lengths).
 */
public void TestText() {
    // Java-style array declaration instead of the C-style "SearchData TEXT[]".
    SearchData[] TEXT = {
        SD("the foxy brown fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(4, 15, -1), IA(3, 3)),
        SD("the quick brown fox", "fox", null, TERTIARY, STANDARD_ELEMENT_COMPARISON, null, IA(16, -1), IA(3))
    };
    StringCharacterIterator t = new StringCharacterIterator(TEXT[0].text);
    // NOTE(review): the search object is constructed but nothing is asserted
    // in this visible body -- this only smoke-tests construction; confirm
    // whether assertions against TEXT's expected offsets were intended here.
    StringSearch strsrch = new StringSearch(TEXT[0].pattern, t, m_en_us_, null);
}
public void TestVariable() {
- int count = 0;
m_en_us_.setAlternateHandlingShifted(true);
- while (VARIABLE[count].text != null) {
+ for (int count = 0; count < VARIABLE.length; count++) {
// logln("variable" + count);
if (!assertEqual(VARIABLE[count])) {
errln("Error at test number " + count);
}
- count ++;
}
m_en_us_.setAlternateHandlingShifted(false);
}
public void TestVariableCanonical() {
- int count = 0;
m_en_us_.setAlternateHandlingShifted(true);
- while (VARIABLE[count].text != null) {
+ for (int count = 0; count < VARIABLE.length; count++) {
// logln("variable " + count);
if (!assertCanonicalEqual(VARIABLE[count])) {
errln("Error at test number " + count);
}
- count ++;
}
m_en_us_.setAlternateHandlingShifted(false);
}
String pattern = "pattern";
String text = "text";
StringSearch strsrch = null;
- int count = 0;
try {
strsrch = new StringSearch(pattern, text);
} catch (Exception e) {
return;
}
- while (DIACTRICMATCH[count].text != null) {
+ for (int count = 0; count < DIACTRICMATCH.length; count++) {
strsrch.setCollator(getCollator(DIACTRICMATCH[count].collator));
strsrch.getCollator().setStrength(DIACTRICMATCH[count].strength);
strsrch.setBreakIterator(getBreakIterator(DIACTRICMATCH[count].breaker));
if (!assertEqualWithStringSearch(strsrch, DIACTRICMATCH[count])) {
errln("Error at test number " + count);
}
- count++;
}
}