m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
m_buffer_ = new StringBuilder();
m_utilSpecialBackUp_ = new Backup();
- if (collator.getDecomposition() != Collator.NO_DECOMPOSITION) {
- m_nfcImpl_.getFCDTrie(); // ensure the FCD data is initialized
- }
}
/**
m_FCDStart_ = offset - 1;
m_source_.setIndex(offset);
// trie access
- int fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch);
- if (fcd != 0 && Character.isHighSurrogate((char)ch)) {
- int c2 = m_source_.next();
- if (c2 < 0) {
- fcd = 0; // end of input
- } else if (Character.isLowSurrogate((char)c2)) {
- fcd = m_nfcImpl_.getFCD16(Character.toCodePoint((char)ch, (char)c2));
+ int fcd;
+ if (ch < 0x180) {
+ fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
+ } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
+ if (Character.isHighSurrogate((char)ch)) {
+ int c2 = m_source_.next();
+ if (c2 < 0) {
+ fcd = 0; // end of input
+ } else if (Character.isLowSurrogate((char)c2)) {
+ fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint((char)ch, (char)c2));
+ } else {
+ m_source_.moveIndex(-1);
+ fcd = 0;
+ }
} else {
- m_source_.moveIndex(-1);
- fcd = 0;
+ fcd = m_nfcImpl_.getFCD16FromNormData(ch);
}
+ } else {
+ fcd = 0;
}
int prevTrailCC = fcd & LAST_BYTE_MASK_;
int fcd;
m_FCDLimit_ = offset + 1;
m_source_.setIndex(offset);
- if (!UTF16.isSurrogate((char)ch)) {
- fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch);
+ if (ch < 0x180) {
+ fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
+ } else if (!Character.isLowSurrogate((char)ch)) {
+ if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
+ fcd = m_nfcImpl_.getFCD16FromNormData(ch);
+ } else {
+ fcd = 0;
+ }
} else {
- fcd = 0;
- if (!Normalizer2Impl.UTF16Plus.isSurrogateLead(ch)) {
- int c2 = m_source_.previous();
- if (c2 < 0) {
- // start of input
- } else if (Character.isHighSurrogate((char)c2)) {
- ch = Character.toCodePoint((char)c2, (char)ch);
- fcd = m_nfcImpl_.getFCD16(ch);
- --offset;
- } else {
- m_source_.moveIndex(1);
- }
+ int c2 = m_source_.previous();
+ if (c2 < 0) {
+ fcd = 0; // start of input
+ } else if (Character.isHighSurrogate((char)c2)) {
+ ch = Character.toCodePoint((char)c2, (char)ch);
+ fcd = m_nfcImpl_.getFCD16FromNormData(ch);
+ --offset;
+ } else {
+ m_source_.moveIndex(1);
+ fcd = 0;
}
}
* thrown when argument rules have an invalid syntax
*/
CollationParsedRuleBuilder(String rules) throws ParseException {
- m_nfcImpl_.getFCDTrie(); // initialize the optional FCD trie
m_parser_ = new CollationRuleParser(rules);
m_parser_.assembleTokenList();
m_utilColEIter_ = RuleBasedCollator.UCA_
}
if (!buildCMTabFlag) {
// check combining class
- int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters
+ int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters
if ((fcd & 0xff) == 0) {
// reset flag when current char is not combining mark.
containCombinMarks = false;
cm = new char[0x10000];
}
for (char c = 0; c < 0xffff; c++) {
- int fcd = m_nfcImpl_.getFCD16FromSingleLead(c); // TODO: review for handling supplementary characters
+ int fcd;
+ if (UTF16.isLeadSurrogate(c)) {
+ fcd = 0;
+ if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
+ int supp = Character.toCodePoint(c, (char)0xdc00);
+ int suppLimit = supp + 0x400;
+ while (supp < suppLimit) {
+ fcd |= m_nfcImpl_.getFCD16(supp++);
+ }
+ }
+ } else {
+ fcd = m_nfcImpl_.getFCD16(c);
+ }
+ // TODO: review for handling supplementary characters
if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
(UTF16.isLeadSurrogate(c) && fcd != 0)) {
// c is a leading surrogate with some FCD data
for (int j = 0; j < m_utilElement_.m_cPoints_.length()
- m_utilElement_.m_cPointsOffset_; j++) {
- int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters
+ int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters
if ((fcd & 0xff) == 0) {
baseChar = m_utilElement_.m_cPoints_.charAt(j);
} else {
}
CombinClassTable cmLookup = t.cmLookup;
int[] index = cmLookup.index;
- int cClass = m_nfcImpl_.getFCD16FromSingleLead(cMark) & 0xff; // TODO: review for handling supplementary characters
+ int cClass = m_nfcImpl_.getFCD16(cMark) & 0xff; // TODO: review for handling supplementary characters
int maxIndex = 0;
char[] precompCh = new char[256];
int[] precompClass = new int[256];
String comp = Normalizer.compose(decompBuf.toString(), false);
if (comp.length() == 1) {
precompCh[precompLen] = comp.charAt(0);
- precompClass[precompLen] = (m_nfcImpl_.getFCD16FromSingleLead(cmLookup.cPoints[i]) & 0xff); // TODO: review for handling supplementary characters
+ precompClass[precompLen] = m_nfcImpl_.getFCD16(cmLookup.cPoints[i]) & 0xff; // TODO: review for handling supplementary characters
precompLen++;
StringBuilder decomp = new StringBuilder();
for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) {
private final char getFCD(CharacterIterator str, int offset)
{
char ch = str.setIndex(offset);
- int result = m_nfcImpl_.getFCD16FromSingleLead(ch);
- if (result != 0 && Character.isHighSurrogate(ch)) {
- char c2 = str.next();
- if (Character.isLowSurrogate(c2)) {
- result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2));
+ if (ch < 0x180) { // below U+0180: answered by the dedicated below-0x180 lookup
+ return (char)m_nfcImpl_.getFCD16FromBelow180(ch);
+ } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { // when false, control falls through to the final return of 0
+ if (!Character.isHighSurrogate(ch)) {
+ return (char)m_nfcImpl_.getFCD16FromNormData(ch);
} else {
- result = 0;
+ char c2 = str.next(); // ch is a high surrogate: advances the iterator to read the following code unit
+ if (Character.isLowSurrogate(c2)) {
+ return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); // look up the combined supplementary code point
+ }
}
}
- return (char)result;
+ return 0; // pre-check was false, or a high surrogate was not followed by a low surrogate
}
/**
* Gets the FCD value for the code point before the input offset.
* @return FCD value for the character before offset
*/
private final int getFCDBefore(CharacterIterator iter, int offset) {
- int result;
iter.setIndex(offset);
char c = iter.previous();
- if (UTF16.isSurrogate(c)) {
- if (Normalizer2Impl.UTF16Plus.isSurrogateLead(c)) {
- result = 0;
- } else {
- char lead = iter.previous();
- if (Character.isHighSurrogate(lead)) {
- result = m_nfcImpl_.getFCD16(Character.toCodePoint(lead, c));
- } else {
- result = 0;
- }
+ if (c < 0x180) { // below U+0180: answered by the dedicated below-0x180 lookup
+ return (char)m_nfcImpl_.getFCD16FromBelow180(c);
+ } else if (!Character.isLowSurrogate(c)) { // any code unit that is not a trail surrogate is looked up by itself
+ if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) { // when false, control falls through to the final return of 0
+ return (char)m_nfcImpl_.getFCD16FromNormData(c);
}
} else {
- result = m_nfcImpl_.getFCD16FromSingleLead(c);
+ char lead = iter.previous(); // c is a low surrogate: step back once more to look for its lead
+ if (Character.isHighSurrogate(lead)) {
+ return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(lead, c)); // look up the combined supplementary code point
+ }
}
- return result;
+ return 0; // pre-check was false, or a low surrogate was not preceded by a high surrogate
}
/**
* Gets the fcd value for a character at the argument index.
private final char getFCD(String str, int offset)
{
char ch = str.charAt(offset);
- int result = m_nfcImpl_.getFCD16FromSingleLead(ch);
- if (result != 0 && Character.isHighSurrogate(ch)) {
- char c2;
- if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
- result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2));
+ if (ch < 0x180) { // below U+0180: answered by the dedicated below-0x180 lookup
+ return (char)m_nfcImpl_.getFCD16FromBelow180(ch);
+ } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { // when false, control falls through to the final return of 0
+ if (!Character.isHighSurrogate(ch)) {
+ return (char)m_nfcImpl_.getFCD16FromNormData(ch);
} else {
- result = 0;
+ char c2;
+ if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { // bounds check first, so a high surrogate at the end of str is never paired
+ return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); // look up the combined supplementary code point
+ }
}
}
- return (char)result;
+ return 0; // pre-check was false, or a high surrogate was not followed by a low surrogate
}
/**
*/
private final void initialize()
{
- m_nfcImpl_.getFCDTrie(); // ensure the FCD data is initialized
int expandlength = initializePattern();
if (m_pattern_.m_CELength_ > 0) {
char minlength = (char)(m_pattern_.m_CELength_ > expandlength
// iteration ends with reading CharacterIterator.DONE which has fcd==0
char c = text.setIndex(textoffset);
for (;;) {
- if ((m_nfcImpl_.getFCD16FromSingleLead(c) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
+ if (c < Normalizer2Impl.MIN_CCC_LCCC_CP || !m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
return textoffset;
}
char next = text.next();
if (Character.isSurrogatePair(c, next)) {
- int fcd = m_nfcImpl_.getFCD16(Character.toCodePoint(c, next));
+ int fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(c, next));
if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
return textoffset;
}
next = text.next();
textoffset += 2;
} else {
+ int fcd = m_nfcImpl_.getFCD16FromNormData(c);
+ if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
+ return textoffset;
+ }
++textoffset;
}
c = next;
* @return FCD normalizer
*/
public static Normalizer2 getFCDNormalizer2() {
- Norm2AllModes allModes=getNFCInstance();
- allModes.impl.getFCDTrie();
- return allModes.fcd;
+ return getNFCInstance().fcd; // the fcd member of the NFC instance, returned directly with no getFCDTrie() call
}
private static final class Norm2AllModesSingleton {
// low-level properties ------------------------------------------------ ***
public Trie2_16 getNormTrie() { return normTrie; }
- /**
- * Builds and returns the FCD trie based on the data used in this instance.
- * This is required before any of {@link #getFCD16(int)} or
- * {@link #getFCD16FromSingleLead(char)} are called,
- * or else they crash.
- * This method is called automatically by Normalizer2.getInstance(..., Mode.FCD).
- * @return The FCD trie for this instance's data.
- */
- public synchronized Trie2_16 getFCDTrie() {
- if(fcdTrie!=null) {
- return fcdTrie;
- }
- Trie2Writable newFCDTrie=new Trie2Writable(0, 0);
- Iterator<Trie2.Range> trieIterator=normTrie.iterator();
- Trie2.Range range;
- while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
- // Set the FCD value for a range of same-norm16 characters.
- if(range.value!=0) {
- setFCD16FromNorm16(range.startCodePoint, range.endCodePoint, range.value, newFCDTrie);
- }
- }
- for(char lead=0xd800; lead<0xdc00; ++lead) {
- // Collect (OR together) the FCD values for a range of supplementary characters,
- // for their lead surrogate code unit.
- int oredValue=newFCDTrie.get(lead);
- trieIterator=normTrie.iteratorForLeadSurrogate(lead);
- while(trieIterator.hasNext()) {
- oredValue|=trieIterator.next().value;
- }
- if(oredValue!=0) {
- // Set a "bad" value for makeFCD() to break the quick check loop
- // and look up the value for the supplementary code point.
- // If there is any lccc, then set the worst-case lccc of 1.
- // The ORed-together value's tccc is already the worst case.
- if(oredValue>0xff) {
- oredValue=0x100|(oredValue&0xff);
- }
- newFCDTrie.setForLeadSurrogateCodeUnit(lead, oredValue);
- }
- }
- return fcdTrie=newFCDTrie.toTrie2_16();
- }
+
+ // Note: Normalizer2Impl.java r30983 (2011-nov-27)
+ // still had getFCDTrie() which built and cached an FCD trie.
+ // That provided faster access to FCD data than getFCD16FromNormData()
+ // but required synchronization and consumed some 10kB of heap memory
+ // in any process that uses FCD (e.g., via collation).
+ // tccc180[] and smallFCD[] are intended to help with any loss of performance,
+ // at least for Latin & CJK.
/**
* Builds the canonical-iterator data for this instance.
/**
* Returns the FCD data for code point c.
- * <b>{@link #getFCDTrie()} must have been called before this method,
- * or else this method will crash.</b>
- * @param c A Unicode code point.
- * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
- */
- public int getFCD16(int c) { return fcdTrie.get(c); }
- /**
- * Returns the FCD data for the single-or-lead code unit c.
- * <b>{@link #getFCDTrie()} must have been called before this method,
- * or else this method will crash.</b>
* @param c A Unicode code point.
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
*/
- public int getFCD16FromSingleLead(char c) { return fcdTrie.getFromU16SingleLead(c); }
+ public int getFCD16(int c) {
+ if(c<0) { // negative input: reported as having no FCD data
+ return 0;
+ } else if(c<0x180) { // U+0000..U+017F: direct lookup in tccc180[]
+ return tccc180[c];
+ } else if(c<=0xffff) { // rest of the BMP: consult the smallFCD[] bit set before the full lookup
+ if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
+ }
+ return getFCD16FromNormData(c); // supplementary code points, and BMP code points that passed the pre-check
+ }
+ /** Returns the FCD data for U+0000<=c<U+0180. The range is not checked here: c is used directly as an index into tccc180[]. */
+ public int getFCD16FromBelow180(int c) { return tccc180[c]; }
+ /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
+ public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
+ // 0<=lead<=0xffff
+ byte bits=smallFCD[lead>>8]; // one byte of flags per block of 256 code units
+ if(bits==0) { return false; } // no flag set anywhere in this 256-code-unit block
+ return ((bits>>((lead>>5)&7))&1)!=0; // bits 7..5 of lead select one of the 8 flags, i.e. one flag per 32 code units
+ }
/** Gets the FCD value from the regular normalization data. */
public int getFCD16FromNormData(int c) {
}
}
- private void setFCD16FromNorm16(int start, int end, int norm16, Trie2Writable newFCDTrie) {
- // Only loops for 1:1 algorithmic mappings.
- for(;;) {
- if(norm16>=MIN_NORMAL_MAYBE_YES) {
- norm16&=0xff;
- norm16|=norm16<<8;
- } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
- // no decomposition or Hangul syllable, all zeros
- break;
- } else if(limitNoNo<=norm16) {
- int delta=norm16-(minMaybeYes-MAX_DELTA-1);
- if(start==end) {
- start+=delta;
- norm16=getNorm16(start);
- } else {
- // the same delta leads from different original characters to different mappings
- do {
- int c=start+delta;
- setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie);
- } while(++start<=end);
- break;
- }
- } else {
- // c decomposes, get everything from the variable-length extra data
- int firstUnit=extraData.charAt(norm16);
- if((firstUnit&MAPPING_LENGTH_MASK)==0) {
- // A character that is deleted (maps to an empty string) must
- // get the worst-case lccc and tccc values because arbitrary
- // characters on both sides will become adjacent.
- norm16=0x1ff;
- } else {
- int fcd16=firstUnit>>8; // tccc
- if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
- fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc
- }
- norm16=fcd16;
- }
- }
- newFCDTrie.setRange(start, end, norm16, true);
- break;
- }
- }
-
/**
* Gets the decomposition for one code point.
* @param c code point
if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
prevFCD16=~c;
++src;
- } else if((fcd16=fcdTrie.getFromU16SingleLead((char)c))<=0xff) {
- prevFCD16=fcd16;
+ } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
+ prevFCD16=0;
++src;
- } else if(!UTF16.isSurrogate((char)c)) {
- break;
} else {
- char c2;
- if(UTF16Plus.isSurrogateLead(c)) {
- if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
- c=Character.toCodePoint((char)c, c2);
- }
- } else /* trail surrogate */ {
- if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
- --src;
- c=Character.toCodePoint(c2, (char)c);
+ if(UTF16.isSurrogate((char)c)) {
+ char c2;
+ if(UTF16Plus.isSurrogateLead(c)) {
+ if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
+ c=Character.toCodePoint((char)c, c2);
+ }
+ } else /* trail surrogate */ {
+ if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
+ --src;
+ c=Character.toCodePoint(c2, (char)c);
+ }
}
}
- if((fcd16=getFCD16(c))<=0xff) {
+ if((fcd16=getFCD16FromNormData(c))<=0xff) {
prevFCD16=fcd16;
src+=Character.charCount(c);
} else {
// We know that the previous character's lccc==0.
if(prevFCD16<0) {
// Fetching the fcd16 value was deferred for this below-U+0300 code point.
- prevFCD16=getFCD16FromSingleLead((char)~prevFCD16);
+ int prev=~prevFCD16;
+ prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
if(prevFCD16>1) {
--prevBoundary;
}
--p;
// Need to fetch the previous character's FCD value because
// prevFCD16 was just for the trail surrogate code point.
- prevFCD16=getFCD16(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
+ prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
// Still known to have lccc==0 because its lead surrogate unit had lccc==0.
}
if(prevFCD16>1) {
while(p>0) {
int c=Character.codePointBefore(s, p);
p-=Character.charCount(c);
- if(fcdTrie.get(c)<=0xff) {
+ if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
break;
}
}
private int findNextFCDBoundary(CharSequence s, int p, int limit) {
while(p<limit) {
int c=Character.codePointAt(s, p);
- int fcd16=fcdTrie.get(c);
- if(fcd16<=0xff) {
+ if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
break;
}
p+=Character.charCount(c);
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
private int[] tccc180; // [0x180] tccc values for U+0000..U+017F
- private Trie2_16 fcdTrie;
private Trie2_32 canonIterData;
private ArrayList<UnicodeSet> canonStartSets;
new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
new CombiningClassIntProperty(SRC_NFC) { // LEAD_CANONICAL_COMBINING_CLASS
int getValue(int c) {
- return Norm2AllModes.getNFCInstance().impl.getFCDTrie().get(c)>>8;
+ return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8; // keeps the upper byte (bits 15..8) of the 16-bit FCD value
}
},
new CombiningClassIntProperty(SRC_NFC) { // TRAIL_CANONICAL_COMBINING_CLASS
int getValue(int c) {
- return Norm2AllModes.getNFCInstance().impl.getFCDTrie().get(c)&0xff;
+ return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff; // keeps the lower byte (bits 7..0) of the 16-bit FCD value
}
},
new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK