From: Markus Scherer Date: Mon, 28 Nov 2011 23:04:28 +0000 (+0000) Subject: ICU-8942 use smaller/simpler FCD data rather than building an FCD trie X-Git-Tag: milestone-59-0-1~4326 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c267b157f0f226f9d305ae039ad1f80ba208f95d;p=icu ICU-8942 use smaller/simpler FCD data rather than building an FCD trie X-SVN-Rev: 30986 --- diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationElementIterator.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationElementIterator.java index 88173078a4f..3d74fb6bea3 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationElementIterator.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationElementIterator.java @@ -601,9 +601,6 @@ public final class CollationElementIterator m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_]; m_buffer_ = new StringBuilder(); m_utilSpecialBackUp_ = new Backup(); - if (collator.getDecomposition() != Collator.NO_DECOMPOSITION) { - m_nfcImpl_.getFCDTrie(); // ensure the FCD data is initialized - } } /** @@ -1077,17 +1074,25 @@ public final class CollationElementIterator m_FCDStart_ = offset - 1; m_source_.setIndex(offset); // trie access - int fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch); - if (fcd != 0 && Character.isHighSurrogate((char)ch)) { - int c2 = m_source_.next(); - if (c2 < 0) { - fcd = 0; // end of input - } else if (Character.isLowSurrogate((char)c2)) { - fcd = m_nfcImpl_.getFCD16(Character.toCodePoint((char)ch, (char)c2)); + int fcd; + if (ch < 0x180) { + fcd = m_nfcImpl_.getFCD16FromBelow180(ch); + } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { + if (Character.isHighSurrogate((char)ch)) { + int c2 = m_source_.next(); + if (c2 < 0) { + fcd = 0; // end of input + } else if (Character.isLowSurrogate((char)c2)) { + fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint((char)ch, (char)c2)); + } else { + m_source_.moveIndex(-1); + fcd = 0; + } } else { - 
m_source_.moveIndex(-1); - fcd = 0; + fcd = m_nfcImpl_.getFCD16FromNormData(ch); } + } else { + fcd = 0; } int prevTrailCC = fcd & LAST_BYTE_MASK_; @@ -1216,21 +1221,25 @@ public final class CollationElementIterator int fcd; m_FCDLimit_ = offset + 1; m_source_.setIndex(offset); - if (!UTF16.isSurrogate((char)ch)) { - fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch); + if (ch < 0x180) { + fcd = m_nfcImpl_.getFCD16FromBelow180(ch); + } else if (!Character.isLowSurrogate((char)ch)) { + if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { + fcd = m_nfcImpl_.getFCD16FromNormData(ch); + } else { + fcd = 0; + } } else { - fcd = 0; - if (!Normalizer2Impl.UTF16Plus.isSurrogateLead(ch)) { - int c2 = m_source_.previous(); - if (c2 < 0) { - // start of input - } else if (Character.isHighSurrogate((char)c2)) { - ch = Character.toCodePoint((char)c2, (char)ch); - fcd = m_nfcImpl_.getFCD16(ch); - --offset; - } else { - m_source_.moveIndex(1); - } + int c2 = m_source_.previous(); + if (c2 < 0) { + fcd = 0; // start of input + } else if (Character.isHighSurrogate((char)c2)) { + ch = Character.toCodePoint((char)c2, (char)ch); + fcd = m_nfcImpl_.getFCD16FromNormData(ch); + --offset; + } else { + m_source_.moveIndex(1); + fcd = 0; } } diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java index 61fa7bc4b98..b654526e2e2 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java @@ -47,7 +47,6 @@ final class CollationParsedRuleBuilder { * thrown when argument rules have an invalid syntax */ CollationParsedRuleBuilder(String rules) throws ParseException { - m_nfcImpl_.getFCDTrie(); // initialize the optional FCD trie m_parser_ = new CollationRuleParser(rules); m_parser_.assembleTokenList(); m_utilColEIter_ = RuleBasedCollator.UCA_ @@ -1815,7 +1814,7 
@@ final class CollationParsedRuleBuilder { } if (!buildCMTabFlag) { // check combining class - int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters + int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters if ((fcd & 0xff) == 0) { // reset flag when current char is not combining mark. containCombinMarks = false; @@ -3809,7 +3808,20 @@ final class CollationParsedRuleBuilder { cm = new char[0x10000]; } for (char c = 0; c < 0xffff; c++) { - int fcd = m_nfcImpl_.getFCD16FromSingleLead(c); // TODO: review for handling supplementary characters + int fcd; + if (UTF16.isLeadSurrogate(c)) { + fcd = 0; + if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) { + int supp = Character.toCodePoint(c, (char)0xdc00); + int suppLimit = supp + 0x400; + while (supp < suppLimit) { + fcd |= m_nfcImpl_.getFCD16(supp++); + } + } + } else { + fcd = m_nfcImpl_.getFCD16(c); + } + // TODO: review for handling supplementary characters if (fcd >= 0x100 || // if the leading combining class(c) > 0 || (UTF16.isLeadSurrogate(c) && fcd != 0)) { // c is a leading surrogate with some FCD data @@ -3979,7 +3991,7 @@ final class CollationParsedRuleBuilder { for (int j = 0; j < m_utilElement_.m_cPoints_.length() - m_utilElement_.m_cPointsOffset_; j++) { - int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters + int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters if ((fcd & 0xff) == 0) { baseChar = m_utilElement_.m_cPoints_.charAt(j); } else { @@ -4008,7 +4020,7 @@ final class CollationParsedRuleBuilder { } CombinClassTable cmLookup = t.cmLookup; int[] index = cmLookup.index; - int cClass = m_nfcImpl_.getFCD16FromSingleLead(cMark) & 0xff; // TODO: review for handling supplementary characters + int cClass = 
m_nfcImpl_.getFCD16(cMark) & 0xff; // TODO: review for handling supplementary characters int maxIndex = 0; char[] precompCh = new char[256]; int[] precompClass = new int[256]; @@ -4024,7 +4036,7 @@ final class CollationParsedRuleBuilder { String comp = Normalizer.compose(decompBuf.toString(), false); if (comp.length() == 1) { precompCh[precompLen] = comp.charAt(0); - precompClass[precompLen] = (m_nfcImpl_.getFCD16FromSingleLead(cmLookup.cPoints[i]) & 0xff); // TODO: review for handling supplementary characters + precompClass[precompLen] = m_nfcImpl_.getFCD16(cmLookup.cPoints[i]) & 0xff; // TODO: review for handling supplementary characters precompLen++; StringBuilder decomp = new StringBuilder(); for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) { diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/StringSearch.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/StringSearch.java index 11265cd21ea..2be076c6e8a 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/StringSearch.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/StringSearch.java @@ -819,16 +819,19 @@ public final class StringSearch extends SearchIterator private final char getFCD(CharacterIterator str, int offset) { char ch = str.setIndex(offset); - int result = m_nfcImpl_.getFCD16FromSingleLead(ch); - if (result != 0 && Character.isHighSurrogate(ch)) { - char c2 = str.next(); - if (Character.isLowSurrogate(c2)) { - result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2)); + if (ch < 0x180) { + return (char)m_nfcImpl_.getFCD16FromBelow180(ch); + } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { + if (!Character.isHighSurrogate(ch)) { + return (char)m_nfcImpl_.getFCD16FromNormData(ch); } else { - result = 0; + char c2 = str.next(); + if (Character.isLowSurrogate(c2)) { + return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); + } } } - return (char)result; + return 0; } /** * Gets the FCD value for the code point before the input 
offset. @@ -838,24 +841,21 @@ public final class StringSearch extends SearchIterator * @return FCD value for the character before offset */ private final int getFCDBefore(CharacterIterator iter, int offset) { - int result; iter.setIndex(offset); char c = iter.previous(); - if (UTF16.isSurrogate(c)) { - if (Normalizer2Impl.UTF16Plus.isSurrogateLead(c)) { - result = 0; - } else { - char lead = iter.previous(); - if (Character.isHighSurrogate(lead)) { - result = m_nfcImpl_.getFCD16(Character.toCodePoint(lead, c)); - } else { - result = 0; - } + if (c < 0x180) { + return (char)m_nfcImpl_.getFCD16FromBelow180(c); + } else if (!Character.isLowSurrogate(c)) { + if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) { + return (char)m_nfcImpl_.getFCD16FromNormData(c); } } else { - result = m_nfcImpl_.getFCD16FromSingleLead(c); + char lead = iter.previous(); + if (Character.isHighSurrogate(lead)) { + return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(lead, c)); + } } - return result; + return 0; } /** * Gets the fcd value for a character at the argument index. 
@@ -867,16 +867,19 @@ public final class StringSearch extends SearchIterator private final char getFCD(String str, int offset) { char ch = str.charAt(offset); - int result = m_nfcImpl_.getFCD16FromSingleLead(ch); - if (result != 0 && Character.isHighSurrogate(ch)) { - char c2; - if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { - result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2)); + if (ch < 0x180) { + return (char)m_nfcImpl_.getFCD16FromBelow180(ch); + } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { + if (!Character.isHighSurrogate(ch)) { + return (char)m_nfcImpl_.getFCD16FromNormData(ch); } else { - result = 0; + char c2; + if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { + return (char)m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); + } } } - return (char)result; + return 0; } /** @@ -1058,7 +1061,6 @@ public final class StringSearch extends SearchIterator */ private final void initialize() { - m_nfcImpl_.getFCDTrie(); // ensure the FCD data is initialized int expandlength = initializePattern(); if (m_pattern_.m_CELength_ > 0) { char minlength = (char)(m_pattern_.m_CELength_ > expandlength @@ -1147,18 +1149,22 @@ public final class StringSearch extends SearchIterator // iteration ends with reading CharacterIterator.DONE which has fcd==0 char c = text.setIndex(textoffset); for (;;) { - if ((m_nfcImpl_.getFCD16FromSingleLead(c) >> SECOND_LAST_BYTE_SHIFT_) == 0) { + if (c < Normalizer2Impl.MIN_CCC_LCCC_CP || !m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) { return textoffset; } char next = text.next(); if (Character.isSurrogatePair(c, next)) { - int fcd = m_nfcImpl_.getFCD16(Character.toCodePoint(c, next)); + int fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint(c, next)); if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { return textoffset; } next = text.next(); textoffset += 2; } else { + int fcd = m_nfcImpl_.getFCD16FromNormData(c); + if ((fcd >> 
SECOND_LAST_BYTE_SHIFT_) == 0) { + return textoffset; + } ++textoffset; } c = next; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java index b60374260ec..a1302638204 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java @@ -357,9 +357,7 @@ public final class Norm2AllModes { * @return FCD normalizer */ public static Normalizer2 getFCDNormalizer2() { - Norm2AllModes allModes=getNFCInstance(); - allModes.impl.getFCDTrie(); - return allModes.fcd; + return getNFCInstance().fcd; } private static final class Norm2AllModesSingleton { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java index 0fc5c9237b2..8a70e035be5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java @@ -536,48 +536,14 @@ public final class Normalizer2Impl { // low-level properties ------------------------------------------------ *** public Trie2_16 getNormTrie() { return normTrie; } - /** - * Builds and returns the FCD trie based on the data used in this instance. - * This is required before any of {@link #getFCD16(int)} or - * {@link #getFCD16FromSingleLead(char)} are called, - * or else they crash. - * This method is called automatically by Normalizer2.getInstance(..., Mode.FCD). - * @return The FCD trie for this instance's data. - */ - public synchronized Trie2_16 getFCDTrie() { - if(fcdTrie!=null) { - return fcdTrie; - } - Trie2Writable newFCDTrie=new Trie2Writable(0, 0); - Iterator trieIterator=normTrie.iterator(); - Trie2.Range range; - while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { - // Set the FCD value for a range of same-norm16 characters. 
- if(range.value!=0) { - setFCD16FromNorm16(range.startCodePoint, range.endCodePoint, range.value, newFCDTrie); - } - } - for(char lead=0xd800; lead<0xdc00; ++lead) { - // Collect (OR together) the FCD values for a range of supplementary characters, - // for their lead surrogate code unit. - int oredValue=newFCDTrie.get(lead); - trieIterator=normTrie.iteratorForLeadSurrogate(lead); - while(trieIterator.hasNext()) { - oredValue|=trieIterator.next().value; - } - if(oredValue!=0) { - // Set a "bad" value for makeFCD() to break the quick check loop - // and look up the value for the supplementary code point. - // If there is any lccc, then set the worst-case lccc of 1. - // The ORed-together value's tccc is already the worst case. - if(oredValue>0xff) { - oredValue=0x100|(oredValue&0xff); - } - newFCDTrie.setForLeadSurrogateCodeUnit(lead, oredValue); - } - } - return fcdTrie=newFCDTrie.toTrie2_16(); - } + + // Note: Normalizer2Impl.java r30983 (2011-nov-27) + // still had getFCDTrie() which built and cached an FCD trie. + // That provided faster access to FCD data than getFCD16FromNormData() + // but required synchronization and consumed some 10kB of heap memory + // in any process that uses FCD (e.g., via collation). + // tccc180[] and smallFCD[] are intended to help with any loss of performance, + // at least for Latin & CJK. /** * Builds the canonical-iterator data for this instance. @@ -695,20 +661,28 @@ public final class Normalizer2Impl { /** * Returns the FCD data for code point c. - * {@link #getFCDTrie()} must have been called before this method, - * or else this method will crash. - * @param c A Unicode code point. - * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. - */ - public int getFCD16(int c) { return fcdTrie.get(c); } - /** - * Returns the FCD data for the single-or-lead code unit c. - * {@link #getFCDTrie()} must have been called before this method, - * or else this method will crash. * @param c A Unicode code point. 
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ - public int getFCD16FromSingleLead(char c) { return fcdTrie.getFromU16SingleLead(c); } + public int getFCD16(int c) { + if(c<0) { + return 0; + } else if(c<0x180) { + return tccc180[c]; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); + } + /** Returns the FCD data for U+0000<=c<U+0180. */ + public int getFCD16FromBelow180(int c) { return tccc180[c]; } + /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ + public boolean singleLeadMightHaveNonZeroFCD16(int lead) { + // 0<=lead<=0xffff + byte bits=smallFCD[lead>>8]; + if(bits==0) { return false; } + return ((bits>>((lead>>5)&7))&1)!=0; + } /** Gets the FCD value from the regular normalization data. */ public int getFCD16FromNormData(int c) { @@ -745,49 +719,6 @@ public final class Normalizer2Impl { } } - private void setFCD16FromNorm16(int start, int end, int norm16, Trie2Writable newFCDTrie) { - // Only loops for 1:1 algorithmic mappings. - for(;;) { - if(norm16>=MIN_NORMAL_MAYBE_YES) { - norm16&=0xff; - norm16|=norm16<<8; - } else if(norm16<=minYesNo || minMaybeYes<=norm16) { - // no decomposition or Hangul syllable, all zeros - break; - } else if(limitNoNo<=norm16) { - int delta=norm16-(minMaybeYes-MAX_DELTA-1); - if(start==end) { - start+=delta; - norm16=getNorm16(start); - } else { - // the same delta leads from different original characters to different mappings - do { - int c=start+delta; - setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie); - } while(++start<=end); - break; - } - } else { - // c decomposes, get everything from the variable-length extra data - int firstUnit=extraData.charAt(norm16); - if((firstUnit&MAPPING_LENGTH_MASK)==0) { - // A character that is deleted (maps to an empty string) must - // get the worst-case lccc and tccc values because arbitrary - // characters on both sides will become adjacent. 
- norm16=0x1ff; - } else { - int fcd16=firstUnit>>8; // tccc - if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { - fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc - } - norm16=fcd16; - } - } - newFCDTrie.setRange(start, end, norm16, true); - break; - } - } - /** * Gets the decomposition for one code point. * @param c code point @@ -1450,24 +1381,24 @@ public final class Normalizer2Impl { if((c=s.charAt(src))1) { --prevBoundary; } @@ -1499,7 +1431,7 @@ public final class Normalizer2Impl { --p; // Need to fetch the previous character's FCD value because // prevFCD16 was just for the trail surrogate code point. - prevFCD16=getFCD16(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); + prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); // Still known to have lccc==0 because its lead surrogate unit had lccc==0. } if(prevFCD16>1) { @@ -2158,7 +2090,7 @@ public final class Normalizer2Impl { while(p>0) { int c=Character.codePointBefore(s, p); p-=Character.charCount(c); - if(fcdTrie.get(c)<=0xff) { + if(c canonStartSets; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java index acb645874e6..4df1c9d9b56 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java @@ -549,12 +549,12 @@ public final class UCharacterProperty new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2), new CombiningClassIntProperty(SRC_NFC) { // LEAD_CANONICAL_COMBINING_CLASS int getValue(int c) { - return Norm2AllModes.getNFCInstance().impl.getFCDTrie().get(c)>>8; + return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8; } }, new CombiningClassIntProperty(SRC_NFC) { // TRAIL_CANONICAL_COMBINING_CLASS int getValue(int c) { - return Norm2AllModes.getNFCInstance().impl.getFCDTrie().get(c)&0xff; + return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff; } 
}, new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK