From: Andy Heninger Date: Tue, 19 Sep 2017 18:17:22 +0000 (+0000) Subject: ICU-9954 Break Iteration, remove reverse rules, add boundary caching. X-Git-Tag: release-60-rc~132 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4e1c4096a61036dfd2f6b3359a7e77b7ecf8e55b;p=icu ICU-9954 Break Iteration, remove reverse rules, add boundary caching. X-SVN-Rev: 40433 --- diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index bc91704e5d7..cf0799aed14 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -104,7 +104,7 @@ patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwr uscript.o uscript_props.o usc_impl.o unames.o \ utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \ uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o filteredbrk.o \ -rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \ +rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o rbbi_cache.o \ serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \ uidna.o usprep.o uts46.o punycode.o \ util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o dtintrv.o ucnvsel.o propsvec.o \ diff --git a/icu4c/source/common/brkeng.cpp b/icu4c/source/common/brkeng.cpp index a5900607a19..88024b2e621 100644 --- a/icu4c/source/common/brkeng.cpp +++ b/icu4c/source/common/brkeng.cpp @@ -11,9 +11,6 @@ #if !UCONFIG_NO_BREAK_ITERATION -#include "brkeng.h" -#include "cmemory.h" -#include "dictbe.h" #include "unicode/uchar.h" #include "unicode/uniset.h" #include "unicode/chariter.h" @@ -24,6 +21,10 @@ #include "unicode/uscript.h" #include "unicode/ucharstrie.h" #include "unicode/bytestrie.h" + +#include "brkeng.h" +#include "cmemory.h" +#include "dictbe.h" #include "charstr.h" #include "dictionarydata.h" #include "mutex.h" @@ -80,23 +81,15 @@ 
UnhandledEngine::handles(UChar32 c, int32_t breakType) const { int32_t UnhandledEngine::findBreaks( UText *text, - int32_t startPos, - int32_t endPos, - UBool reverse, - int32_t breakType, - UStack &/*foundBreaks*/ ) const { + int32_t /* startPos */, + int32_t endPos, + int32_t breakType, + UVector32 &/*foundBreaks*/ ) const { if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) { UChar32 c = utext_current32(text); - if (reverse) { - while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { - c = utext_previous32(text); - } - } - else { - while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { - utext_next32(text); // TODO: recast loop to work with post-increment operations. - c = utext_current32(text); - } + while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { + utext_next32(text); // TODO: recast loop to work with post-increment operations. + c = utext_current32(text); } } return 0; diff --git a/icu4c/source/common/brkeng.h b/icu4c/source/common/brkeng.h index 0335d11f69e..f59e7df5200 100644 --- a/icu4c/source/common/brkeng.h +++ b/icu4c/source/common/brkeng.h @@ -19,6 +19,7 @@ U_NAMESPACE_BEGIN class UnicodeSet; class UStack; +class UVector32; class DictionaryMatcher; /******************************************************************* @@ -67,18 +68,15 @@ class LanguageBreakEngine : public UMemory { * is capable of handling. * @param startPos The start of the run within the supplied text. * @param endPos The end of the run within the supplied text. - * @param reverse Whether the caller is looking for breaks in a reverse - * direction. * @param breakType The type of break desired, or -1. - * @param foundBreaks An allocated C array of the breaks found, if any + * @param foundBreaks A Vector of int32_t to receive the breaks. * @return The number of breaks found. 
*/ virtual int32_t findBreaks( UText *text, int32_t startPos, int32_t endPos, - UBool reverse, int32_t breakType, - UStack &foundBreaks ) const = 0; + UVector32 &foundBreaks ) const = 0; }; @@ -192,8 +190,6 @@ class UnhandledEngine : public LanguageBreakEngine { * is capable of handling. * @param startPos The start of the run within the supplied text. * @param endPos The end of the run within the supplied text. - * @param reverse Whether the caller is looking for breaks in a reverse - * direction. * @param breakType The type of break desired, or -1. * @param foundBreaks An allocated C array of the breaks found, if any * @return The number of breaks found. @@ -201,9 +197,8 @@ class UnhandledEngine : public LanguageBreakEngine { virtual int32_t findBreaks( UText *text, int32_t startPos, int32_t endPos, - UBool reverse, int32_t breakType, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; /** *

Tell the engine to handle a particular character and break type.

diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 3a36fcc563d..781be356ff6 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -268,6 +268,8 @@ + + @@ -573,6 +575,7 @@ + copy "%(FullPath)" ..\..\include\unicode diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 1bdeced6798..ac14b7e0923 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -97,6 +97,9 @@ break iteration + + break iteration + break iteration @@ -639,6 +642,9 @@ break iteration + + break iteration + break iteration diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 8ee76efbb8b..dc9d348ef8f 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -46,9 +46,9 @@ int32_t DictionaryBreakEngine::findBreaks( UText *text, int32_t startPos, int32_t endPos, - UBool reverse, int32_t breakType, - UStack &foundBreaks ) const { + UVector32 &foundBreaks ) const { + (void)startPos; // TODO: remove this param? int32_t result = 0; // Find the span of characters included in the set. 
@@ -60,34 +60,12 @@ DictionaryBreakEngine::findBreaks( UText *text, int32_t rangeStart; int32_t rangeEnd; UChar32 c = utext_current32(text); - if (reverse) { - UBool isDict = fSet.contains(c); - while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { - c = utext_previous32(text); - isDict = fSet.contains(c); - } - if (current < startPos) { - rangeStart = startPos; - } else { - rangeStart = current; - if (!isDict) { - utext_next32(text); - rangeStart = (int32_t)utext_getNativeIndex(text); - } - } - // rangeEnd = start + 1; - utext_setNativeIndex(text, start); - utext_next32(text); - rangeEnd = (int32_t)utext_getNativeIndex(text); - } - else { - while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { - utext_next32(text); // TODO: recast loop for postincrement - c = utext_current32(text); - } - rangeStart = start; - rangeEnd = current; + while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { + utext_next32(text); // TODO: recast loop for postincrement + c = utext_current32(text); } + rangeStart = start; + rangeEnd = current; if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); utext_setNativeIndex(text, current); @@ -248,7 +226,7 @@ int32_t ThaiBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const { + UVector32 &foundBreaks ) const { utext_setNativeIndex(text, rangeStart); utext_moveIndex32(text, THAI_MIN_WORD_SPAN); if (utext_getNativeIndex(text) >= rangeEnd) { @@ -487,7 +465,7 @@ int32_t LaoBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const { + UVector32 &foundBreaks ) const { if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { return 0; // Not enough characters for two words } @@ -680,7 +658,7 @@ int32_t BurmeseBreakEngine::divideUpDictionaryRange( 
UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const { + UVector32 &foundBreaks ) const { if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { return 0; // Not enough characters for two words } @@ -885,7 +863,7 @@ int32_t KhmerBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const { + UVector32 &foundBreaks ) const { if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { return 0; // Not enough characters for two words } @@ -1128,14 +1106,14 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) { * @param text A UText representing the text * @param rangeStart The start of the range of dictionary characters * @param rangeEnd The end of the range of dictionary characters - * @param foundBreaks Output of C array of int32_t break positions, or 0 + * @param foundBreaks vector to receive the break positions * @return The number of breaks found */ int32_t CjkBreakEngine::divideUpDictionaryRange( UText *inText, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const { + UVector32 &foundBreaks ) const { if (rangeStart >= rangeEnd) { return 0; } diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h index 242f475ca35..de1d49f6935 100644 --- a/icu4c/source/common/dictbe.h +++ b/icu4c/source/common/dictbe.h @@ -15,6 +15,7 @@ #include "unicode/utext.h" #include "brkeng.h" +#include "uvectr32.h" U_NAMESPACE_BEGIN @@ -84,21 +85,18 @@ class DictionaryBreakEngine : public LanguageBreakEngine { * * @param text A UText representing the text. The iterator is left at * the end of the run of characters which the engine is capable of handling - * that starts from the first (or last) character in the range. + * that starts from the first character in the range. * @param startPos The start of the run within the supplied text. * @param endPos The end of the run within the supplied text. 
- * @param reverse Whether the caller is looking for breaks in a reverse - * direction. * @param breakType The type of break desired, or -1. - * @param foundBreaks An allocated C array of the breaks found, if any + * @param foundBreaks vector of int32_t to receive the break positions * @return The number of breaks found. */ virtual int32_t findBreaks( UText *text, int32_t startPos, int32_t endPos, - UBool reverse, int32_t breakType, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; protected: @@ -128,7 +126,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const = 0; + UVector32 &foundBreaks ) const = 0; }; @@ -185,7 +183,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; }; @@ -241,7 +239,7 @@ class LaoBreakEngine : public DictionaryBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; }; @@ -297,7 +295,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; }; @@ -353,7 +351,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; }; @@ -417,7 +415,7 @@ class CjkBreakEngine : public DictionaryBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; }; diff --git a/icu4c/source/common/rbbi.cpp 
b/icu4c/source/common/rbbi.cpp index af18e7ac9e6..2c5e0e7092b 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -7,7 +7,7 @@ *************************************************************************** */ // -// file: rbbi.c Contains the implementation of the rule based break iterator +// file: rbbi.cpp Contains the implementation of the rule based break iterator // runtime engine and the API implementation for // class RuleBasedBreakIterator // @@ -23,15 +23,17 @@ #include "unicode/uchriter.h" #include "unicode/uclean.h" #include "unicode/udata.h" + #include "brkeng.h" +#include "ucln_cmn.h" #include "cmemory.h" #include "cstring.h" #include "rbbidata.h" +#include "rbbi_cache.h" #include "rbbirb.h" #include "uassert.h" -#include "ucln_cmn.h" #include "umutex.h" -#include "uvector.h" +#include "uvectr32.h" // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. #if U_LOCAL_SERVICE_HOOK @@ -39,16 +41,16 @@ #endif #ifdef RBBI_DEBUG -static UBool fTrace = FALSE; +static UBool gTrace = FALSE; #endif U_NAMESPACE_BEGIN // The state number of the starting state -#define START_STATE 1 +constexpr int32_t START_STATE = 1; // The state-transition value indicating "stop" -#define STOP_STATE 0 +constexpr int32_t STOP_STATE = 0; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) @@ -62,9 +64,8 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) * Constructs a RuleBasedBreakIterator that uses the already-created * tables object that is passed in as a parameter. 
*/ -RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) -{ - init(); +RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) { + init(status); fData = new RBBIDataWrapper(data, status); // status checked in constructor if (U_FAILURE(status)) {return;} if(fData == 0) { @@ -80,7 +81,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, uint32_t ruleLength, UErrorCode &status) { - init(); + init(status); if (U_FAILURE(status)) { return; } @@ -110,7 +111,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) { - init(); + init(status); fData = new RBBIDataWrapper(udm, status); // status checked in constructor if (U_FAILURE(status)) {return;} if(fData == 0) { @@ -130,7 +131,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, UParseError &parseError, UErrorCode &status) { - init(); + init(status); if (U_FAILURE(status)) {return;} RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); @@ -152,7 +153,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, // of rules. 
//------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator() { - init(); + UErrorCode status = U_ZERO_ERROR; + init(status); } @@ -165,7 +167,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() { RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) : BreakIterator(other) { - this->init(); + UErrorCode status = U_ZERO_ERROR; + this->init(status); *this = other; } @@ -180,7 +183,7 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() { } fCharIter = NULL; delete fSCharIter; - fCharIter = NULL; + fSCharIter = NULL; delete fDCharIter; fDCharIter = NULL; @@ -190,18 +193,17 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() { fData->removeReference(); fData = NULL; } - if (fCachedBreakPositions) { - uprv_free(fCachedBreakPositions); - fCachedBreakPositions = NULL; - } - if (fLanguageBreakEngines) { - delete fLanguageBreakEngines; - fLanguageBreakEngines = NULL; - } - if (fUnhandledBreakEngine) { - delete fUnhandledBreakEngine; - fUnhandledBreakEngine = NULL; - } + delete fBreakCache; + fBreakCache = NULL; + + delete fDictionaryCache; + fDictionaryCache = NULL; + + delete fLanguageBreakEngines; + fLanguageBreakEngines = NULL; + + delete fUnhandledBreakEngine; + fUnhandledBreakEngine = NULL; } /** @@ -215,7 +217,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { } BreakIterator::operator=(that); - reset(); // Delete break cache information fBreakType = that.fBreakType; if (fLanguageBreakEngines != NULL) { delete fLanguageBreakEngines; @@ -245,6 +246,17 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { fData = that.fData->addReference(); } + fPosition = that.fPosition; + fRuleStatusIndex = that.fRuleStatusIndex; + fDone = that.fDone; + + // TODO: both the dictionary and the main cache need to be copied. + // Current position could be within a dictionary range. 
Trying to continue + // the iteration without the caches present would go to the rules, with + // the assumption that the current position is on a rule boundary. + fBreakCache->reset(fPosition, fRuleStatusIndex); + fDictionaryCache->reset(); + return *this; } @@ -256,33 +268,43 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { // Initializes all fields, leaving the object in a consistent state. // //----------------------------------------------------------------------------- -void RuleBasedBreakIterator::init() { - UErrorCode status = U_ZERO_ERROR; - fText = utext_openUChars(NULL, NULL, 0, &status); +void RuleBasedBreakIterator::init(UErrorCode &status) { + fText = NULL; fCharIter = NULL; fSCharIter = NULL; fDCharIter = NULL; fData = NULL; - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; + fPosition = 0; + fRuleStatusIndex = 0; + fDone = false; fDictionaryCharCount = 0; fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable // dictionary behavior for Break Iterators that are // built from rules. Even better would be the ability to // declare the type in the rules. 
- fCachedBreakPositions = NULL; - fLanguageBreakEngines = NULL; - fUnhandledBreakEngine = NULL; - fNumCachedBreakPositions = 0; - fPositionInCache = 0; + fLanguageBreakEngines = NULL; + fUnhandledBreakEngine = NULL; + fBreakCache = NULL; + fDictionaryCache = NULL; + + if (U_FAILURE(status)) { + return; + } + + fText = utext_openUChars(NULL, NULL, 0, &status); + fDictionaryCache = new DictionaryCache(this, status); + fBreakCache = new BreakCache(this, status); + if (U_SUCCESS(status) && (fText == NULL || fDictionaryCache == NULL || fBreakCache == NULL)) { + status = U_MEMORY_ALLOCATION_ERROR; + } #ifdef RBBI_DEBUG static UBool debugInitDone = FALSE; if (debugInitDone == FALSE) { char *debugEnv = getenv("U_RBBIDEBUG"); if (debugEnv && uprv_strstr(debugEnv, "trace")) { - fTrace = TRUE; + gTrace = TRUE; } debugInitDone = TRUE; } @@ -312,6 +334,9 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { if (typeid(*this) != typeid(that)) { return FALSE; } + if (this == &that) { + return TRUE; + } // The base class BreakIterator carries no state that participates in equality, // and does not implement an equality function that would otherwise be @@ -326,6 +351,12 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { return FALSE; }; + if (!(fPosition == that2.fPosition && + fRuleStatusIndex == that2.fRuleStatusIndex && + fDone == that2.fDone)) { + return FALSE; + } + if (that2.fData == fData || (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { // The two break iterators are using the same rules. 
@@ -352,7 +383,8 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { if (U_FAILURE(status)) { return; } - reset(); + fBreakCache->reset(); + fDictionaryCache->reset(); fText = utext_clone(fText, ut, FALSE, TRUE, &status); // Set up a dummy CharacterIterator to be returned if anyone @@ -413,7 +445,8 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { fCharIter = newText; UErrorCode status = U_ZERO_ERROR; - reset(); + fBreakCache->reset(); + fDictionaryCache->reset(); if (newText==NULL || newText->startIndex() != 0) { // startIndex !=0 wants to be an error, but there's no way to report it. // Make the iterator text be an empty string. @@ -432,7 +465,8 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { void RuleBasedBreakIterator::setText(const UnicodeString& newText) { UErrorCode status = U_ZERO_ERROR; - reset(); + fBreakCache->reset(); + fDictionaryCache->reset(); fText = utext_openConstUnicodeString(fText, &newText, &status); // Set up a character iterator on the string. @@ -492,13 +526,12 @@ RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, U * @return The new iterator position, which is zero. */ int32_t RuleBasedBreakIterator::first(void) { - reset(); - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - //if (fText == NULL) - // return BreakIterator::DONE; - - utext_setNativeIndex(fText, 0); + UErrorCode status = U_ZERO_ERROR; + if (!fBreakCache->seek(0)) { + fBreakCache->populateNear(0, status); + } + fBreakCache->current(); + U_ASSERT(fPosition == 0); return 0; } @@ -507,17 +540,12 @@ int32_t RuleBasedBreakIterator::first(void) { * @return The text's past-the-end offset. 
*/ int32_t RuleBasedBreakIterator::last(void) { - reset(); - if (fText == NULL) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - return BreakIterator::DONE; - } - - fLastStatusIndexValid = FALSE; - int32_t pos = (int32_t)utext_nativeLength(fText); - utext_setNativeIndex(fText, pos); - return pos; + int32_t endPos = (int32_t)utext_nativeLength(fText); + UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. + (void)endShouldBeBoundary; + U_ASSERT(endShouldBeBoundary); + U_ASSERT(fPosition == endPos); + return endPos; } /** @@ -530,14 +558,17 @@ int32_t RuleBasedBreakIterator::last(void) { * the current one. */ int32_t RuleBasedBreakIterator::next(int32_t n) { - int32_t result = current(); - while (n > 0) { - result = next(); - --n; - } - while (n < 0) { - result = previous(); - ++n; + int32_t result = 0; + if (n > 0) { + for (; n > 0 && result != UBRK_DONE; --n) { + result = next(); + } + } else if (n < 0) { + for (; n < 0 && result != UBRK_DONE; ++n) { + result = previous(); + } + } else { + result = current(); } return result; } @@ -547,396 +578,120 @@ int32_t RuleBasedBreakIterator::next(int32_t n) { * @return The position of the first boundary after this one. */ int32_t RuleBasedBreakIterator::next(void) { - // if we have cached break positions and we're still in the range - // covered by them, just move one step forward in the cache - if (fCachedBreakPositions != NULL) { - if (fPositionInCache < fNumCachedBreakPositions - 1) { - ++fPositionInCache; - int32_t pos = fCachedBreakPositions[fPositionInCache]; - utext_setNativeIndex(fText, pos); - return pos; - } - else { - reset(); - } - } - - int32_t startPos = current(); - fDictionaryCharCount = 0; - int32_t result = handleNext(fData->fForwardTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(startPos, result, FALSE); - } - return result; + fBreakCache->next(); + return fDone ? 
UBRK_DONE : fPosition; } /** - * Advances the iterator backwards, to the last boundary preceding this one. - * @return The position of the last boundary position preceding this one. + * Move the iterator backwards, to the boundary preceding the current one. + * + * Starts from the current position within fText. + * Starting position need not be on a boundary. + * + * @return The position of the boundary position immediately preceding the starting position. */ int32_t RuleBasedBreakIterator::previous(void) { - int32_t result; - int32_t startPos; - - // if we have cached break positions and we're still in the range - // covered by them, just move one step backward in the cache - if (fCachedBreakPositions != NULL) { - if (fPositionInCache > 0) { - --fPositionInCache; - // If we're at the beginning of the cache, need to reevaluate the - // rule status - if (fPositionInCache <= 0) { - fLastStatusIndexValid = FALSE; - } - int32_t pos = fCachedBreakPositions[fPositionInCache]; - utext_setNativeIndex(fText, pos); - return pos; - } - else { - reset(); - } - } - - // if we're already sitting at the beginning of the text, return DONE - if (fText == NULL || (startPos = current()) == 0) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - return BreakIterator::DONE; - } - - if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { - result = handlePrevious(fData->fReverseTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(result, startPos, TRUE); - } - return result; - } - - // old rule syntax - // set things up. 
handlePrevious() will back us up to some valid - // break position before the current position (we back our internal - // iterator up one step to prevent handlePrevious() from returning - // the current position), but not necessarily the last one before - // where we started - - int32_t start = current(); - - (void)UTEXT_PREVIOUS32(fText); - int32_t lastResult = handlePrevious(fData->fReverseTable); - if (lastResult == UBRK_DONE) { - lastResult = 0; - utext_setNativeIndex(fText, 0); - } - result = lastResult; - int32_t lastTag = 0; - UBool breakTagValid = FALSE; - - // iterate forward from the known break position until we pass our - // starting point. The last break position before the starting - // point is our return value - - for (;;) { - result = next(); - if (result == BreakIterator::DONE || result >= start) { - break; - } - lastResult = result; - lastTag = fLastRuleStatusIndex; - breakTagValid = TRUE; - } - - // fLastBreakTag wants to have the value for section of text preceding - // the result position that we are to return (in lastResult.) If - // the backwards rules overshot and the above loop had to do two or more - // next()s to move up to the desired return position, we will have a valid - // tag value. But, if handlePrevious() took us to exactly the correct result position, - // we wont have a tag value for that position, which is only set by handleNext(). - - // Set the current iteration position to be the last break position - // before where we started, and then return that value. - utext_setNativeIndex(fText, lastResult); - fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() - fLastStatusIndexValid = breakTagValid; - - // No need to check the dictionary; it will have been handled by - // next() - - return lastResult; + UErrorCode status = U_ZERO_ERROR; + fBreakCache->previous(status); + return fDone ? UBRK_DONE : fPosition; } /** * Sets the iterator to refer to the first boundary position following * the specified position. 
- * @offset The position from which to begin searching for a break position. + * @param startPos The position from which to begin searching for a break position. * @return The position of the first break after the current position. */ -int32_t RuleBasedBreakIterator::following(int32_t offset) { - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the +int32_t RuleBasedBreakIterator::following(int32_t startPos) { + // if the supplied position is before the beginning, return the // text's starting offset - if (fText == NULL || offset >= utext_nativeLength(fText)) { - last(); - return next(); - } - else if (offset < 0) { + if (startPos < 0) { return first(); } // Move requested offset to a code point start. It might be on a trail surrogate, - // or on a trail byte if the input is UTF-8. - utext_setNativeIndex(fText, offset); - offset = (int32_t)utext_getNativeIndex(fText); - - // if we have cached break positions and offset is in the range - // covered by them, use them - // TODO: could use binary search - // TODO: what if offset is outside range, but break is not? - if (fCachedBreakPositions != NULL) { - if (offset >= fCachedBreakPositions[0] - && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { - fPositionInCache = 0; - // We are guaranteed not to leave the array due to range test above - while (offset >= fCachedBreakPositions[fPositionInCache]) { - ++fPositionInCache; - } - int32_t pos = fCachedBreakPositions[fPositionInCache]; - utext_setNativeIndex(fText, pos); - return pos; - } - else { - reset(); - } - } + // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text. + utext_setNativeIndex(fText, startPos); + startPos = (int32_t)utext_getNativeIndex(fText); - // Set our internal iteration position (temporarily) - // to the position passed in. 
If this is the _beginning_ position, - // then we can just use next() to get our return value - - int32_t result = 0; - - if (fData->fSafeRevTable != NULL) { - // new rule syntax - utext_setNativeIndex(fText, offset); - // move forward one codepoint to prepare for moving back to a - // safe point. - // this handles offset being between a supplementary character - // TODO: is this still needed, with move to code point boundary handled above? - (void)UTEXT_NEXT32(fText); - // handlePrevious will move most of the time to < 1 boundary away - handlePrevious(fData->fSafeRevTable); - int32_t result = next(); - while (result <= offset) { - result = next(); - } - return result; - } - if (fData->fSafeFwdTable != NULL) { - // backup plan if forward safe table is not available - utext_setNativeIndex(fText, offset); - (void)UTEXT_PREVIOUS32(fText); - // handle next will give result >= offset - handleNext(fData->fSafeFwdTable); - // previous will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int32_t oldresult = previous(); - while (oldresult > offset) { - int32_t result = previous(); - if (result <= offset) { - return oldresult; - } - oldresult = result; - } - int32_t result = next(); - if (result <= offset) { - return next(); - } - return result; - } - // otherwise, we have to sync up first. Use handlePrevious() to back - // up to a known break position before the specified position (if - // we can determine that the specified position is a break position, - // we don't back up at all). This may or may not be the last break - // position at or before our starting position. Advance forward - // from here until we've passed the starting position. The position - // we stop on will be the first break position after the specified one. 
- // old rule syntax - - utext_setNativeIndex(fText, offset); - if (offset==0 || - (offset==1 && utext_getNativeIndex(fText)==0)) { - return next(); - } - result = previous(); - - while (result != BreakIterator::DONE && result <= offset) { - result = next(); - } - - return result; + UErrorCode status = U_ZERO_ERROR; + fBreakCache->following(startPos, status); + return fDone ? UBRK_DONE : fPosition; } /** * Sets the iterator to refer to the last boundary position before the * specified position. - * @offset The position to begin searching for a break from. + * @param offset The position to begin searching for a break from. * @return The position of the last boundary before the starting position. */ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset if (fText == NULL || offset > utext_nativeLength(fText)) { return last(); } - else if (offset < 0) { - return first(); - } // Move requested offset to a code point start. It might be on a trail surrogate, // or on a trail byte if the input is UTF-8. - utext_setNativeIndex(fText, offset); - offset = (int32_t)utext_getNativeIndex(fText); - - // if we have cached break positions and offset is in the range - // covered by them, use them - if (fCachedBreakPositions != NULL) { - // TODO: binary search? - // TODO: What if offset is outside range, but break is not? 
- if (offset > fCachedBreakPositions[0] - && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { - fPositionInCache = 0; - while (fPositionInCache < fNumCachedBreakPositions - && offset > fCachedBreakPositions[fPositionInCache]) - ++fPositionInCache; - --fPositionInCache; - // If we're at the beginning of the cache, need to reevaluate the - // rule status - if (fPositionInCache <= 0) { - fLastStatusIndexValid = FALSE; - } - utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); - return fCachedBreakPositions[fPositionInCache]; - } - else { - reset(); - } - } - - // if we start by updating the current iteration position to the - // position specified by the caller, we can just use previous() - // to carry out this operation - - if (fData->fSafeFwdTable != NULL) { - // new rule syntax - utext_setNativeIndex(fText, offset); - int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); - if (newOffset != offset) { - // Will come here if specified offset was not a code point boundary AND - // the underlying implmentation is using UText, which snaps any non-code-point-boundary - // indices to the containing code point. - // For breakitereator::preceding only, these non-code-point indices need to be moved - // up to refer to the following codepoint. - (void)UTEXT_NEXT32(fText); - offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); - } - // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, - // rather than adjusting the position unconditionally? - // (Change would interact with safe rules.) - // TODO: change RBBI behavior for off-boundary indices to match that of UText? - // affects only preceding(), seems cleaner, but is slightly different. 
- (void)UTEXT_PREVIOUS32(fText); - handleNext(fData->fSafeFwdTable); - int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); - while (result >= offset) { - result = previous(); - } - return result; - } - if (fData->fSafeRevTable != NULL) { - // backup plan if forward safe table is not available - // TODO: check whether this path can be discarded - // It's probably OK to say that rules must supply both safe tables - // if they use safe tables at all. We have certainly never described - // to anyone how to work with just one safe table. - utext_setNativeIndex(fText, offset); - (void)UTEXT_NEXT32(fText); - - // handle previous will give result <= offset - handlePrevious(fData->fSafeRevTable); - - // next will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int32_t oldresult = next(); - while (oldresult < offset) { - int32_t result = next(); - if (result >= offset) { - return oldresult; - } - oldresult = result; - } - int32_t result = previous(); - if (result >= offset) { - return previous(); - } - return result; - } - - // old rule syntax utext_setNativeIndex(fText, offset); - return previous(); + int32_t adjustedOffset = utext_getNativeIndex(fText); + + UErrorCode status = U_ZERO_ERROR; + fBreakCache->preceding(adjustedOffset, status); + return fDone ? UBRK_DONE : fPosition; } /** * Returns true if the specfied position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". + * * @param offset the offset to check. * @return True if "offset" is a boundary position. */ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { - // the beginning index of the iterator is always a boundary position by definition - if (offset == 0) { - first(); // For side effects on current position, tag values. - return TRUE; - } - - if (offset == (int32_t)utext_nativeLength(fText)) { - last(); // For side effects on current position, tag values. 
- return TRUE; - } - // out-of-range indexes are never boundary positions if (offset < 0) { first(); // For side effects on current position, tag values. return FALSE; } - if (offset > utext_nativeLength(fText)) { - last(); // For side effects on current position, tag values. - return FALSE; + // Adjust offset to be on a code point boundary and not beyond the end of the text. + // Note that isBoundary() is always false for offsets that are not on code point boundaries. + // But we still need the side effect of leaving iteration at the following boundary. + + utext_setNativeIndex(fText, offset); + int32_t adjustedOffset = utext_getNativeIndex(fText); + + bool result = false; + UErrorCode status = U_ZERO_ERROR; + if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) { + result = (fBreakCache->current() == offset); } - // otherwise, we can use following() on the position before the specified - // one and return true if the position we get back is the one the user - // specified - utext_previous32From(fText, offset); - int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); - UBool result = following(backOne) == offset; + if (result && adjustedOffset < offset && utext_char32At(fText, offset) == U_SENTINEL) { + // Original offset is beyond the end of the text. Return FALSE, it's not a boundary, + // but the iteration position remains set to the end of the text, which is a boundary. + return FALSE; + } + if (!result) { + // Not on a boundary. isBoundary() must leave iterator on the following boundary. + // Cache->seek(), above, left us on the preceding boundary, so advance one. + next(); + } return result; } + /** * Returns the current iteration position. * @return The current iteration position. 
*/ int32_t RuleBasedBreakIterator::current(void) const { - int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); - return pos; + return fPosition; } + //======================================================================= // implementation //======================================================================= @@ -1003,15 +758,11 @@ struct LookAheadResults { //----------------------------------------------------------------------------------- // -// handleNext(stateTable) -// This method is the actual implementation of the rbbi next() method. -// This method initializes the state machine to state 1 -// and advances through the text character by character until we reach the end -// of the text or the state machine transitions to state 0. We update our return -// value every time the state machine passes through an accepting state. +// handleNext() +// Run the state machine to find a boundary // //----------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { +int32_t RuleBasedBreakIterator::handleNext() { int32_t state; uint16_t category = 0; RBBIRunMode mode; @@ -1021,25 +772,29 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { LookAheadResults lookAheadMatches; int32_t result = 0; int32_t initialPosition = 0; + const RBBIStateTable *statetable = fData->fForwardTable; const char *tableData = statetable->fTableData; uint32_t tableRowLen = statetable->fRowLen; - #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPuts("Handle Next pos char state category"); } #endif - // No matter what, handleNext alway correctly sets the break tag value. - fLastStatusIndexValid = TRUE; - fLastRuleStatusIndex = 0; + // handleNext alway sets the break tag value. + // Set the default for it. + fRuleStatusIndex = 0; + + fDictionaryCharCount = 0; // if we're already at the end of the text, return DONE. 
- initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); + initialPosition = fPosition; + UTEXT_SETNATIVEINDEX(fText, initialPosition); result = initialPosition; c = UTEXT_NEXT32(fText); if (fData == NULL || c==U_SENTINEL) { - return BreakIterator::DONE; + fDone = TRUE; + return UBRK_DONE; } // Set the initial state for the state machine @@ -1086,7 +841,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { category = UTRIE2_GET16(fData->fTrie, c); // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators (subclasses). + // Counter is only used by dictionary based iteration. // Chars that need to be handled by a dictionary have a flag bit set // in their category values. // @@ -1098,7 +853,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { } #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); if (0x20<=c && c<0x7f) { RBBIDebugPrintf("\"%c\" ", c); @@ -1127,7 +882,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { if (mode != RBBI_START) { result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } - fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. + fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. } int16_t completedRule = row->fAccepting; @@ -1135,8 +890,8 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // Lookahead match is completed. int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); if (lookaheadResult >= 0) { - fLastRuleStatusIndex = row->fTagIdx; - UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + fRuleStatusIndex = row->fTagIdx; + fPosition = lookaheadResult; return lookaheadResult; } } @@ -1165,8 +920,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { mode = RBBI_RUN; } } - - } // The state machine is done. 
Check whether it found a match... @@ -1175,15 +928,16 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { - UTEXT_SETNATIVEINDEX(fText, initialPosition); - UTEXT_NEXT32(fText); - result = (int32_t)UTEXT_GETNATIVEINDEX(fText); + utext_setNativeIndex(fText, initialPosition); + utext_next32(fText); + result = (int32_t)utext_getNativeIndex(fText); + fRuleStatusIndex = 0; } // Leave the iterator at our result position. - UTEXT_SETNATIVEINDEX(fText, result); + fPosition = result; #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPrintf("result = %d\n\n", result); } #endif @@ -1196,13 +950,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // // handlePrevious() // -// Iterate backwards, according to the logic of the reverse rules. -// This version handles the exact style backwards rules. -// +// Iterate backwards using the safe reverse rules. // The logic of this function is very similar to handleNext(), above. // //----------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { +int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) { int32_t state; uint16_t category = 0; RBBIRunMode mode; @@ -1212,19 +964,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) int32_t result = 0; int32_t initialPosition = 0; + const RBBIStateTable *stateTable = fData->fSafeRevTable; + UTEXT_SETNATIVEINDEX(fText, fromPosition); #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPuts("Handle Previous pos char state category"); } #endif - // handlePrevious() never gets the rule status. 
- // Flag the status as invalid; if the user ever asks for status, we will need - // to back up, then re-find the break position using handleNext(), which does - // get the status value. - fLastStatusIndexValid = FALSE; - fLastRuleStatusIndex = 0; - // if we're already at the start of the text, return DONE. if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { return BreakIterator::DONE; @@ -1238,10 +985,10 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // Set the initial state for the state machine state = START_STATE; row = (RBBIStateTableRow *) - (statetable->fTableData + (statetable->fRowLen * state)); + (stateTable->fTableData + (stateTable->fRowLen * state)); category = 3; mode = RBBI_RUN; - if (statetable->fFlags & RBBI_BOF_REQUIRED) { + if (stateTable->fFlags & RBBI_BOF_REQUIRED) { category = 2; mode = RBBI_START; } @@ -1256,12 +1003,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (result == initialPosition) { - // Ran off start, no match found. - // move one index one (towards the start, since we are doing a previous()) - UTEXT_SETNATIVEINDEX(fText, initialPosition); - (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. - } break; } // Run the loop one last time with the fake end-of-input character category. @@ -1280,22 +1021,13 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, // not the size of the character going in, which is a UChar32. // + // And off the dictionary flag bit. For reverse iteration it is not used. category = UTRIE2_GET16(fData->fTrie, c); - - // Check the dictionary bit in the character's category. 
- // Counter is only used by dictionary based iterators (subclasses). - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~0x4000; - } + category &= ~0x4000; } #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); if (0x20<=c && c<0x7f) { RBBIDebugPrintf("\"%c\" ", c); @@ -1315,7 +1047,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) U_ASSERT(category<fData->fHeader->fCatCount); state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) - (statetable->fTableData + (statetable->fRowLen * state)); + (stateTable->fTableData + (stateTable->fRowLen * state)); if (row->fAccepting == -1) { // Match found, common case. @@ -1369,10 +1101,8 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } - // Leave the iterator at our result position. - UTEXT_SETNATIVEINDEX(fText, result); #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPrintf("result = %d\n\n", result); } #endif @@ -1380,20 +1110,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) } -void -RuleBasedBreakIterator::reset() -{ - if (fCachedBreakPositions) { - uprv_free(fCachedBreakPositions); - } - fCachedBreakPositions = NULL; - fNumCachedBreakPositions = 0; - fDictionaryCharCount = 0; - fPositionInCache = 0; -} - - - //------------------------------------------------------------------------------- // // getRuleStatus() Return the break rule tag associated with the current @@ -1401,64 +1117,27 @@ RuleBasedBreakIterator::reset() // position by iterating forwards, the value will have been // cached by the handleNext() function. 
// -// If no cached status value is available, the status is -// found by doing a previous() followed by a next(), which -// leaves the iterator where it started, and computes the -// status while doing the next(). -// //------------------------------------------------------------------------------- -void RuleBasedBreakIterator::makeRuleStatusValid() { - if (fLastStatusIndexValid == FALSE) { - // No cached status is available. - if (fText == NULL || current() == 0) { - // At start of text, or there is no text. Status is always zero. - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - } else { - // Not at start of text. Find status the tedious way. - int32_t pa = current(); - previous(); - if (fNumCachedBreakPositions > 0) { - reset(); // Blow off the dictionary cache - } - int32_t pb = next(); - if (pa != pb) { - // note: the if (pa != pb) test is here only to eliminate warnings for - // unused local variables on gcc. Logically, it isn't needed. - U_ASSERT(pa == pb); - } - } - } - U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); -} - int32_t RuleBasedBreakIterator::getRuleStatus() const { - RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; - nonConstThis->makeRuleStatusValid(); // fLastRuleStatusIndex indexes to the start of the appropriate status record // (the number of status values.) // This function returns the last (largest) of the array of status values. 
- int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; + int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex]; int32_t tagVal = fData->fRuleStatusTable[idx]; return tagVal; } - - int32_t RuleBasedBreakIterator::getRuleStatusVec( - int32_t *fillInVec, int32_t capacity, UErrorCode &status) -{ + int32_t *fillInVec, int32_t capacity, UErrorCode &status) { if (U_FAILURE(status)) { return 0; } - RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; - nonConstThis->makeRuleStatusValid(); - int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; + int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex]; int32_t numValsToCopy = numVals; if (numVals > capacity) { status = U_BUFFER_OVERFLOW_ERROR; @@ -1466,7 +1145,7 @@ int32_t RuleBasedBreakIterator::getRuleStatusVec( } int i; for (i=0; i<numValsToCopy; i++) { - fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; + fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1]; } return numVals; } @@ -1514,169 +1193,6 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer* return (RuleBasedBreakIterator *)clonedBI; } - -//------------------------------------------------------------------------------- -// -// checkDictionary This function handles all processing of characters in -// the "dictionary" set. It will determine the appropriate -// course of action, and possibly set up a cache in the -// process. -// -//------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, - int32_t endPos, - UBool reverse) { - // Reset the old break cache first. - reset(); - - // note: code segment below assumes that dictionary chars are in the - // startPos-endPos range - // value returned should be next character in sequence - if ((endPos - startPos) <= 1) { - return (reverse ? 
startPos : endPos); - } - - // Starting from the starting point, scan towards the proposed result, - // looking for the first dictionary character (which may be the one - // we're on, if we're starting in the middle of a range). - utext_setNativeIndex(fText, reverse ? endPos : startPos); - if (reverse) { - UTEXT_PREVIOUS32(fText); - } - - int32_t rangeStart = startPos; - int32_t rangeEnd = endPos; - - uint16_t category; - int32_t current; - UErrorCode status = U_ZERO_ERROR; - UStack breaks(status); - int32_t foundBreakCount = 0; - UChar32 c = utext_current32(fText); - - category = UTRIE2_GET16(fData->fTrie, c); - - // Is the character we're starting on a dictionary character? If so, we - // need to back up to include the entire run; otherwise the results of - // the break algorithm will differ depending on where we start. Since - // the result is cached and there is typically a non-dictionary break - // within a small number of words, there should be little performance impact. - if (category & 0x4000) { - if (reverse) { - do { - utext_next32(fText); // TODO: recast to work directly with postincrement. - c = utext_current32(fText); - category = UTRIE2_GET16(fData->fTrie, c); - } while (c != U_SENTINEL && (category & 0x4000)); - // Back up to the last dictionary character - rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); - if (c == U_SENTINEL) { - // c = fText->last32(); - // TODO: why was this if needed? 
- c = UTEXT_PREVIOUS32(fText); - } - else { - c = UTEXT_PREVIOUS32(fText); - } - } - else { - do { - c = UTEXT_PREVIOUS32(fText); - category = UTRIE2_GET16(fData->fTrie, c); - } - while (c != U_SENTINEL && (category & 0x4000)); - // Back up to the last dictionary character - if (c == U_SENTINEL) { - // c = fText->first32(); - c = utext_current32(fText); - } - else { - utext_next32(fText); - c = utext_current32(fText); - } - rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; - } - category = UTRIE2_GET16(fData->fTrie, c); - } - - // Loop through the text, looking for ranges of dictionary characters. - // For each span, find the appropriate break engine, and ask it to find - // any breaks within the span. - // Note: we always do this in the forward direction, so that the break - // cache is built in the right order. - if (reverse) { - utext_setNativeIndex(fText, rangeStart); - c = utext_current32(fText); - category = UTRIE2_GET16(fData->fTrie, c); - } - while(U_SUCCESS(status)) { - while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { - utext_next32(fText); // TODO: tweak for post-increment operation - c = utext_current32(fText); - category = UTRIE2_GET16(fData->fTrie, c); - } - if (current >= rangeEnd) { - break; - } - - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); - - // Ask the language object if there are any breaks. It will leave the text - // pointer on the other side of its range, ready to search for the next one. - if (lbe != NULL) { - foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); - } - - // Reload the loop variables for the next go-round - c = utext_current32(fText); - category = UTRIE2_GET16(fData->fTrie, c); - } - - // If we found breaks, build a new break cache. The first and last entries must - // be the original starting and ending position. 
- if (foundBreakCount > 0) { - U_ASSERT(foundBreakCount == breaks.size()); - int32_t totalBreaks = foundBreakCount; - if (startPos < breaks.elementAti(0)) { - totalBreaks += 1; - } - if (endPos > breaks.peeki()) { - totalBreaks += 1; - } - fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); - if (fCachedBreakPositions != NULL) { - int32_t out = 0; - fNumCachedBreakPositions = totalBreaks; - if (startPos < breaks.elementAti(0)) { - fCachedBreakPositions[out++] = startPos; - } - for (int32_t i = 0; i < foundBreakCount; ++i) { - fCachedBreakPositions[out++] = breaks.elementAti(i); - } - if (endPos > fCachedBreakPositions[out-1]) { - fCachedBreakPositions[out] = endPos; - } - // If there are breaks, then by definition, we are replacing the original - // proposed break by one of the breaks we found. Use following() and - // preceding() to do the work. They should never recurse in this case. - if (reverse) { - return preceding(endPos); - } - else { - return following(startPos); - } - } - // If the allocation failed, just fall through to the "no breaks found" case. - } - - // If we get here, there were no language-based breaks. Set the text pointer - // to the original proposed break. - utext_setNativeIndex(fText, reverse ? startPos : endPos); - return (reverse ? startPos : endPos); -} - U_NAMESPACE_END @@ -1824,9 +1340,11 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { void RuleBasedBreakIterator::setBreakType(int32_t type) { fBreakType = type; - reset(); } +void RuleBasedBreakIterator::dumpCache() { + fBreakCache->dumpCache(); +} /** * Returns the description used to create this iterator diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp new file mode 100644 index 00000000000..0a1e1a7151f --- /dev/null +++ b/icu4c/source/common/rbbi_cache.cpp @@ -0,0 +1,622 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +// file: rbbi_cache.cpp + +#include "unicode/utypes.h" +#include "unicode/ubrk.h" +#include "unicode/rbbi.h" + +#include "rbbi_cache.h" + +#include "brkeng.h" +#include "cmemory.h" +#include "rbbidata.h" +#include "uassert.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +/* + * DictionaryCache implementation + */ + +RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : + fBI(bi), fBreaks(NULL), fPositionInCache(-1), + fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { + fBreaks = new UVector32(status); +} + +RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { + delete fBreaks; + fBreaks = NULL; +} + +void RuleBasedBreakIterator::DictionaryCache::reset() { + fPositionInCache = -1; + fStart = 0; + fLimit = 0; + fFirstRuleStatusIndex = 0; + fOtherRuleStatusIndex = 0; + fBreaks->removeAllElements(); +} + +UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { + if (fromPos >= fLimit || fromPos < fStart) { + fPositionInCache = -1; + return FALSE; + } + + // Sequential iteration, move from previous boundary to the following + + int32_t r = 0; + if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) { + ++fPositionInCache; + if (fPositionInCache >= fBreaks->size()) { + fPositionInCache = -1; + return FALSE; + } + r = fBreaks->elementAti(fPositionInCache); + U_ASSERT(r > fromPos); + *result = r; + *statusIndex = fOtherRuleStatusIndex; + return TRUE; + } + + // Random indexing. Linear search for the boundary following the given position. 
+ + for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) { + r= fBreaks->elementAti(fPositionInCache); + if (r > fromPos) { + *result = r; + *statusIndex = fOtherRuleStatusIndex; + return TRUE; + } + } + U_ASSERT(FALSE); + fPositionInCache = -1; + return FALSE; +} + + +UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { + if (fromPos <= fStart || fromPos > fLimit) { + fPositionInCache = -1; + return FALSE; + } + + if (fromPos == fLimit) { + fPositionInCache = fBreaks->size() - 1; + if (fPositionInCache >= 0) { + U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos); + } + } + + int32_t r; + if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) { + --fPositionInCache; + r = fBreaks->elementAti(fPositionInCache); + U_ASSERT(r < fromPos); + *result = r; + *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return TRUE; + } + + if (fPositionInCache == 0) { + fPositionInCache = -1; + return FALSE; + } + + for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) { + r = fBreaks->elementAti(fPositionInCache); + if (r < fromPos) { + *result = r; + *statusIndex = ( r == fStart) ? 
fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return TRUE; + } + } + U_ASSERT(FALSE); + fPositionInCache = -1; + return FALSE; +} + +void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, + int32_t firstRuleStatus, int32_t otherRuleStatus) { + if ((endPos - startPos) <= 1) { + return; + } + + reset(); + fFirstRuleStatusIndex = firstRuleStatus; + fOtherRuleStatusIndex = otherRuleStatus; + + int32_t rangeStart = startPos; + int32_t rangeEnd = endPos; + + uint16_t category; + int32_t current; + UErrorCode status = U_ZERO_ERROR; + int32_t foundBreakCount = 0; + UText *text = fBI->fText; + + // Loop through the text, looking for ranges of dictionary characters. + // For each span, find the appropriate break engine, and ask it to find + // any breaks within the span. + + utext_setNativeIndex(text, rangeStart); + UChar32 c = utext_current32(text); + category = UTRIE2_GET16(fBI->fData->fTrie, c); + + while(U_SUCCESS(status)) { + while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) { + utext_next32(text); // TODO: cleaner loop structure. + c = utext_current32(text); + category = UTRIE2_GET16(fBI->fData->fTrie, c); + } + if (current >= rangeEnd) { + break; + } + + // We now have a dictionary character. Get the appropriate language object + // to deal with it. + const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); + + // Ask the language object if there are any breaks. It will add them to the cache and + // leave the text pointer on the other side of its range, ready to search for the next one. + if (lbe != NULL) { + foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks); + } + + // Reload the loop variables for the next go-round + c = utext_current32(text); + category = UTRIE2_GET16(fBI->fData->fTrie, c); + } + + // If we found breaks, ensure that the first and last entries are + // the original starting and ending position. 
And initialize the + // cache iteration position to the first entry. + + // printf("foundBreakCount = %d\n", foundBreakCount); + if (foundBreakCount > 0) { + U_ASSERT(foundBreakCount == fBreaks->size()); + if (startPos < fBreaks->elementAti(0)) { + // The dictionary did not place a boundary at the start of the segment of text. + // Add one now. This should not commonly happen, but it would be easy for interactions + // of the rules for dictionary segments and the break engine implementations to + // inadvertently cause it. Cover it here, just in case. + fBreaks->insertElementAt(startPos, 0, status); + } + if (endPos > fBreaks->peeki()) { + fBreaks->push(endPos, status); + } + fPositionInCache = 0; + // Note: Dictionary matching may extend beyond the original limit. + fStart = fBreaks->elementAti(0); + fLimit = fBreaks->peeki(); + } else { + // there were no language-based breaks, even though the segment contained + // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache + // for this range will fail, and the calling code will fall back to the rule based boundaries. 
+ } +} + + +/* + * BreakCache implementation + */ + +RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : + fBI(bi), fSideBuffer(status) { + reset(); +} + + +RuleBasedBreakIterator::BreakCache::~BreakCache() { +} + + +void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { + fStartBufIdx = 0; + fEndBufIdx = 0; + fTextIdx = pos; + fBufIdx = 0; + fBoundaries[0] = pos; + fStatuses[0] = (uint16_t)ruleStatus; +} + + +int32_t RuleBasedBreakIterator::BreakCache::current() { + fBI->fPosition = fTextIdx; + fBI->fRuleStatusIndex = fStatuses[fBufIdx]; + fBI->fDone = FALSE; + return fTextIdx; +} + + +void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { + // startPos is in the cache. Do a next() from that position. + // TODO: an awkward set of interactions with bi->fDone + // seek() does not clear it; it can't because of interactions with populateNear(). + // next() does not clear it in the fast-path case, where everything matters. Maybe it should. + // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. + fBI->fDone = false; + next(); + } + return; +} + + +void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { + if (startPos == fTextIdx) { + previous(status); + } else { + // seek() leaves the BreakCache positioned at the preceding boundary + // if the requested position is between two boundaries. + // current() pushes the BreakCache position out to the BreakIterator itself. + U_ASSERT(startPos > fTextIdx); + current(); + } + } + return; +} + + +/* + * Out-of-line code for BreakCache::next(). 
+ * Cache does not already contain the boundary + */ +void RuleBasedBreakIterator::BreakCache::nextOL() { + fBI->fDone = !populateFollowing(); + fBI->fPosition = fTextIdx; + fBI->fRuleStatusIndex = fStatuses[fBufIdx]; + return; +} + + +void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + int32_t initialBufIdx = fBufIdx; + if (fBufIdx == fStartBufIdx) { + // At start of cache. Prepend to it. + populatePreceding(status); + } else { + // Cache already holds the next boundary + fBufIdx = modChunkSize(fBufIdx - 1); + fTextIdx = fBoundaries[fBufIdx]; + } + fBI->fDone = (fBufIdx == initialBufIdx); + fBI->fPosition = fTextIdx; + fBI->fRuleStatusIndex = fStatuses[fBufIdx]; + return; +} + + +UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { + if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { + return FALSE; + } + if (pos == fBoundaries[fStartBufIdx]) { + // Common case: seek(0), from BreakIterator::first() + fBufIdx = fStartBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return TRUE; + } + if (pos == fBoundaries[fEndBufIdx]) { + fBufIdx = fEndBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return TRUE; + } + + int32_t min = fStartBufIdx; + int32_t max = fEndBufIdx; + while (min != max) { + int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; + probe = modChunkSize(probe); + if (fBoundaries[probe] > pos) { + max = probe; + } else { + min = modChunkSize(probe + 1); + } + } + U_ASSERT(fBoundaries[max] > pos); + fBufIdx = modChunkSize(max - 1); + fTextIdx = fBoundaries[fBufIdx]; + U_ASSERT(fTextIdx <= pos); + return TRUE; +} + + +UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); + + // Find a boundary somewhere in the vicinity of the requested position. 
+ // Depending on the safe rules and the text data, it could be either before, at, or after + // the requested position. + + + // If the requested position is not near already cached positions, clear the existing cache, + // find a near-by boundary and begin new cache contents there. + + if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { + int32_t aBoundary = 0; + int32_t ruleStatusIndex = 0; + // TODO: check for position == length of text. Although may still need to back up to get rule status. + if (position > 20) { + int32_t backupPos = fBI->handlePrevious(position); + fBI->fPosition = backupPos; + aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary. + ruleStatusIndex = fBI->fRuleStatusIndex; + } + reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. + } + + // Fill in boundaries between existing cache content and the new requested position. + + if (fBoundaries[fEndBufIdx] < position) { + // The last position in the cache precedes the requested position. + // Add following position(s) to the cache. + while (fBoundaries[fEndBufIdx] < position) { + if (!populateFollowing()) { + U_ASSERT(false); + return false; + } + } + fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. + fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. + while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. + previous(status); + } + return true; + } + + if (fBoundaries[fStartBufIdx] > position) { + // The first position in the cache is beyond the requested position. + // back up more until we get a boundary <= the requested position. + while (fBoundaries[fStartBufIdx] > position) { + populatePreceding(status); + } + fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 
+ fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. + while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. + next(); + } + if (fTextIdx > position) { + // If position is not itself a boundary, the next() loop above will overshoot. + // Back up one, leaving cache position at the boundary preceding the requested position. + previous(status); + } + return true; + } + + U_ASSERT(fTextIdx == position); + return true; +} + + + +UBool RuleBasedBreakIterator::BreakCache::populateFollowing() { + int32_t fromPosition = fBoundaries[fEndBufIdx]; + int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; + int32_t pos = 0; + int32_t ruleStatusIdx = 0; + + if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { + addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + return TRUE; + } + + fBI->fPosition = fromPosition; + pos = fBI->handleNext(); + if (pos == UBRK_DONE) { + return FALSE; + } + + ruleStatusIdx = fBI->fRuleStatusIndex; + if (fBI->fDictionaryCharCount > 0) { + // The text segment obtained from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); + if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { + addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + return TRUE; + // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. + // But be careful with interactions with populateNear(). + } + } + + // Rule based segment did not include dictionary characters. + // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, + // meaning that we didn't take the return, above. + // Add its end point to the cache. 
+ addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + + // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. + // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. + // + for (int count=0; count<6; ++count) { + pos = fBI->handleNext(); + if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { + break; + } + addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); + } + + return TRUE; +} + + +UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + + int32_t fromPosition = fBoundaries[fStartBufIdx]; + if (fromPosition == 0) { + return FALSE; + } + + int32_t position = 0; + int32_t positionStatusIdx = 0; + + if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { + addPreceding(position, positionStatusIdx, UpdateCachePosition); + return TRUE; + } + + int32_t backupPosition = fromPosition; + + // Find a boundary somewhere preceding the first already-cached boundary + do { + backupPosition = backupPosition - 30; + if (backupPosition <= 0) { + backupPosition = 0; + } else { + backupPosition = fBI->handlePrevious(backupPosition); + } + if (backupPosition == UBRK_DONE || backupPosition == 0) { + position = 0; + positionStatusIdx = 0; + } else { + fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way. + position = fBI->handleNext(); + positionStatusIdx = fBI->fRuleStatusIndex; + + } + } while (position >= fromPosition); + + // Find boundaries between the one we just located and the first already-cached boundary + // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. 
+ + fSideBuffer.removeAllElements(); + fSideBuffer.addElement(position, status); + fSideBuffer.addElement(positionStatusIdx, status); + + do { + int32_t prevPosition = fBI->fPosition = position; + int32_t prevStatusIdx = positionStatusIdx; + position = fBI->handleNext(); + positionStatusIdx = fBI->fRuleStatusIndex; + if (position == UBRK_DONE) { + break; + } + + UBool segmentHandledByDictionary = FALSE; + if (fBI->fDictionaryCharCount != 0) { + // Segment from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + int32_t dictSegEndPosition = position; + fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); + while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { + segmentHandledByDictionary = true; + U_ASSERT(position > prevPosition); + if (position >= fromPosition) { + break; + } + U_ASSERT(position <= dictSegEndPosition); + fSideBuffer.addElement(position, status); + fSideBuffer.addElement(positionStatusIdx, status); + prevPosition = position; + } + U_ASSERT(position==dictSegEndPosition || position>=fromPosition); + } + + if (!segmentHandledByDictionary && position < fromPosition) { + fSideBuffer.addElement(position, status); + fSideBuffer.addElement(positionStatusIdx, status); + } + } while (position < fromPosition); + + // Move boundaries from the side buffer to the main circular buffer. 
+ UBool success = FALSE; + if (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.popi(); + position = fSideBuffer.popi(); + addPreceding(position, positionStatusIdx, UpdateCachePosition); + success = TRUE; + } + + while (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.popi(); + position = fSideBuffer.popi(); + if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { + // No space in circular buffer to hold a new preceding result while + // also retaining the current cache (iteration) position. + // Bailing out is safe; the cache will refill again if needed. + break; + } + } + + return success; +} + + +void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { + U_ASSERT(position > fBoundaries[fEndBufIdx]); + U_ASSERT(ruleStatusIdx <= UINT16_MAX); + int32_t nextIdx = modChunkSize(fEndBufIdx + 1); + if (nextIdx == fStartBufIdx) { + fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = ruleStatusIdx; + fEndBufIdx = nextIdx; + if (update == UpdateCachePosition) { + // Set current position to the newly added boundary. + fBufIdx = nextIdx; + fTextIdx = position; + } else { + // Retaining the original cache position. + // Check if the added boundary wraps around the buffer, and would over-write the original position. + // It's the responsibility of callers of this function to not add too many. + U_ASSERT(nextIdx != fBufIdx); + } +} + +bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { + U_ASSERT(position < fBoundaries[fStartBufIdx]); + U_ASSERT(ruleStatusIdx <= UINT16_MAX); + int32_t nextIdx = modChunkSize(fStartBufIdx - 1); + if (nextIdx == fEndBufIdx) { + if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { + // Failure. 
The insertion of the new boundary would claim the buffer position that is the + // current iteration position. And we also want to retain the current iteration position. + // (The buffer is already completely full of entries that precede the iteration position.) + return false; + } + fEndBufIdx = modChunkSize(fEndBufIdx - 1); + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = ruleStatusIdx; + fStartBufIdx = nextIdx; + if (update == UpdateCachePosition) { + fBufIdx = nextIdx; + fTextIdx = position; + } + return true; +} + + +void RuleBasedBreakIterator::BreakCache::dumpCache() { + printf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); + for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { + printf("%d %d\n", i, fBoundaries[i]); + if (i == fEndBufIdx) { + break; + } + } +} + +U_NAMESPACE_END diff --git a/icu4c/source/common/rbbi_cache.h b/icu4c/source/common/rbbi_cache.h new file mode 100644 index 00000000000..72576e55a2a --- /dev/null +++ b/icu4c/source/common/rbbi_cache.h @@ -0,0 +1,199 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// file: rbbi_cache.h +// +#ifndef RBBI_CACHE_H +#define RBBI_CACHE_H + +#include "unicode/utypes.h" + +#include "unicode/rbbi.h" +#include "unicode/uobject.h" + +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +/* DictionaryCache stores the boundaries obtained from a run of dictionary characters. + * Dictionary boundaries are moved first to this cache, then from here + * to the main BreakCache, where they may inter-leave with non-dictionary + * boundaries. The public BreakIterator API always fetches directly + * from the main BreakCache, not from here. + * + * In common situations, the number of boundaries in a single dictionary run + * should be quite small, it will be terminated by punctuation, spaces, + * or any other non-dictionary characters. The main BreakCache may end + * up with boundaries from multiple dictionary based runs. 
+ * + * The boundaries are stored in a simple ArrayList (vector), with the + * assumption that they will be accessed sequentially. + */ +class RuleBasedBreakIterator::DictionaryCache: public UMemory { + public: + DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status); + ~DictionaryCache(); + + void reset(); + + UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex); + UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex); + + /** + * Populate the cache with the dictionary based boundaries within a region of text. + * @param startPos The start position of a range of text + * @param endPos The end position of a range of text + * @param firstRuleStatus The rule status index that applies to the break at startPos + * @param otherRuleStatus The rule status index that applies to boundaries other than startPos + * @internal + */ + void populateDictionary(int32_t startPos, int32_t endPos, + int32_t firstRuleStatus, int32_t otherRuleStatus); + + + + RuleBasedBreakIterator *fBI; + + UVector32 *fBreaks; // A vector containing the boundaries. + int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following() + // or preceding(). Optimizes sequential access. + int32_t fStart; // Text position of first boundary in cache. + int32_t fLimit; // Last boundary in cache. Which is the limit of the + // text segment being handled by the dictionary. + int32_t fFirstRuleStatusIndex; // Rule status info for first boundary. + int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. +}; + + +/* + * class BreakCache + * + * Cache of break boundary positions and rule status values. + * Break iterator API functions, next(), previous(), etc., will use cached results + * when possible, and otherwise cache new results as they are obtained. + * + * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. + * + * The cache is implemented as a single circular buffer. 
+ */ + +/* + * size of the circular cache buffer. + */ + +class RuleBasedBreakIterator::BreakCache: public UMemory { + public: + BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status); + virtual ~BreakCache(); + void reset(int32_t pos = 0, int32_t ruleStatus = 0); + void next() { if (fBufIdx == fEndBufIdx) { + nextOL(); + } else { + fBufIdx = modChunkSize(fBufIdx + 1); + fTextIdx = fBI->fPosition = fBoundaries[fBufIdx]; + fBI->fRuleStatusIndex = fStatuses[fBufIdx]; + } + }; + + + void nextOL(); + void previous(UErrorCode &status); + + // Move the iteration state to the position following the startPosition. + // Input position must be pinned to the input length. + void following(int32_t startPosition, UErrorCode &status); + + void preceding(int32_t startPosition, UErrorCode &status); + + /* + * Update the state of the public BreakIterator (fBI) to reflect the + * current state of the break iterator cache (this). + */ + int32_t current(); + + /** + * Add boundaries to the cache near the specified position. + * The given position need not be a boundary itself. + * The input position must be within the range of the text, and + * on a code point boundary. + * If the requested position is a break boundary, leave the iteration + * position on it. + * If the requested position is not a boundary, leave the iteration + * position on the preceding boundary and include both the + * preceding and following boundaries in the cache. + * Additional boundaries, either preceding or following, may be added + * to the cache as a side effect. + * + * Return FALSE if the operation failed. + */ + UBool populateNear(int32_t position, UErrorCode &status); + + /** + * Add boundary(s) to the cache following the current last boundary. + * Return FALSE if at the end of the text, and no more boundaries can be added. + * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
+ */ + UBool populateFollowing(); + + /** + * Add one or more boundaries to the cache preceding the first currently cached boundary. + * Leave the iteration position on the first added boundary. + * Return false if no boundaries could be added (if at the start of the text.) + */ + UBool populatePreceding(UErrorCode &status); + + enum UpdatePositionValues { + RetainCachePosition = 0, + UpdateCachePosition = 1 + }; + + /* + * Add the boundary following the current position. + * The current position can be left as it was, or changed to the newly added boundary, + * as specified by the update parameter. + */ + void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); + + + /* + * Add the boundary preceding the current position. + * The current position can be left as it was, or changed to the newly added boundary, + * as specified by the update parameter. + */ + bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); + + /** + * Set the cache position to the specified position, or, if the position + * falls between two cached boundaries, to the preceding boundary. + * Fails if the requested position is outside of the range of boundaries currently held by the cache. + * The startPosition must be on a code point boundary. + * + * Return TRUE if successful, FALSE if the specified position is after + * the last cached boundary or before the first.
+ */ + UBool seek(int32_t startPosition); + + void dumpCache(); + + private: + static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; + + static constexpr int32_t CACHE_SIZE = 128; + static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); + + RuleBasedBreakIterator *fBI; + int32_t fStartBufIdx; + int32_t fEndBufIdx; // inclusive + + int32_t fTextIdx; + int32_t fBufIdx; + + int32_t fBoundaries[CACHE_SIZE]; + uint16_t fStatuses[CACHE_SIZE]; + + UVector32 fSideBuffer; +}; + +U_NAMESPACE_END + +#endif // RBBI_CACHE_H diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index 9850be12dfc..33405708c06 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -14,7 +14,7 @@ #include "unicode/utypes.h" #include "rbbidata.h" #include "rbbirb.h" -#include "utrie.h" +#include "utrie2.h" #include "udatamem.h" #include "cmemory.h" #include "cstring.h" @@ -83,11 +83,11 @@ void RBBIDataWrapper::init0() { fReverseTable = NULL; fSafeFwdTable = NULL; fSafeRevTable = NULL; - fRuleSource = NULL; + fRuleSource = NULL; fRuleStatusTable = NULL; - fTrie = NULL; - fUDataMem = NULL; - fRefCount = 0; + fTrie = NULL; + fUDataMem = NULL; + fRefCount = 0; fDontFreeData = TRUE; } @@ -118,6 +118,14 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); } + // Rule Compatibility Hacks + // If a rule set includes reverse rules but does not explicitly include safe reverse rules, + // the reverse rules are to be treated as safe reverse rules. 
+ + if (fSafeRevTable == NULL && fReverseTable != NULL) { + fSafeRevTable = fReverseTable; + fReverseTable = NULL; + } fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, (uint8_t *)data + fHeader->fTrie, diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index bd703f9029f..bd25e06d201 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -184,11 +184,11 @@ public: /* number of int32_t values in the rule status table. Used to sanity check indexing */ int32_t fStatusMaxIdx; - UTrie2 *fTrie; + UTrie2 *fTrie; private: u_atomic_int32_t fRefCount; - UDataMemory *fUDataMem; + UDataMemory *fUDataMem; UnicodeString fRuleString; UBool fDontFreeData; diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 84f9974204b..72447d88f00 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -24,16 +24,16 @@ #include "unicode/uchriter.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" + #include "cmemory.h" #include "cstring.h" - #include "rbbirb.h" #include "rbbinode.h" - #include "rbbiscan.h" #include "rbbisetb.h" #include "rbbitblb.h" #include "rbbidata.h" +#include "uassert.h" U_NAMESPACE_BEGIN @@ -164,8 +164,13 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); - int32_t totalSize = headerSize + forwardTableSize + reverseTableSize - + safeFwdTableSize + safeRevTableSize + (void)safeFwdTableSize; + + int32_t totalSize = headerSize + + forwardTableSize + + /* reverseTableSize */ 0 + + /* safeFwdTableSize */ 0 + + (safeRevTableSize ? 
safeRevTableSize : reverseTableSize) + statusTableSize + trieSize + rulesSize; RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); @@ -184,16 +189,38 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { data->fLength = totalSize; data->fCatCount = fSetBuilder->getNumCharCategories(); + // Only save the forward table and the safe reverse table, + // because these are the only ones used at run-time. + // + // For the moment, we still build the other tables if they are present in the rule source files, + // for backwards compatibility. Old rule files need to work, and this is the simplest approach. + // + // Additional backwards compatibility consideration: if no safe rules are provided, consider the + // reverse rules to actually be the safe reverse rules. + data->fFTable = headerSize; data->fFTableLen = forwardTableSize; + + // Do not save Reverse Table. data->fRTable = data->fFTable + forwardTableSize; - data->fRTableLen = reverseTableSize; - data->fSFTable = data->fRTable + reverseTableSize; - data->fSFTableLen = safeFwdTableSize; - data->fSRTable = data->fSFTable + safeFwdTableSize; - data->fSRTableLen = safeRevTableSize; + data->fRTableLen = 0; + + // Do not save the Safe Forward table. + data->fSFTable = data->fRTable + 0; + data->fSFTableLen = 0; + + data->fSRTable = data->fSFTable + 0; + if (safeRevTableSize > 0) { + data->fSRTableLen = safeRevTableSize; + } else if (reverseTableSize > 0) { + data->fSRTableLen = reverseTableSize; + } else { + U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table + // before reaching this point. 
+ } + - data->fTrie = data->fSRTable + safeRevTableSize; + data->fTrie = data->fSRTable + data->fSRTableLen; data->fTrieLen = fSetBuilder->getTrieSize(); data->fStatusTable = data->fTrie + trieSize; data->fStatusTableLen= statusTableSize; @@ -203,9 +230,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); fForwardTables->exportTable((uint8_t *)data + data->fFTable); - fReverseTables->exportTable((uint8_t *)data + data->fRTable); - fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); - fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); + // fReverseTables->exportTable((uint8_t *)data + data->fRTable); + // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); + if (safeRevTableSize > 0) { + fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); + } else { + fReverseTables->exportTable((uint8_t *)data + data->fSRTable); + } + fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index e68e0529d02..db0d7614238 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -47,6 +47,7 @@ // //------------------------------------------------------------------------------ static const UChar gRuleSet_rule_char_pattern[] = { + // Characters that may appear as literals in patterns without escaping or quoting. 
// [ ^ [ \ p { Z } \ u 0 0 2 0 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, // - \ u 0 0 7 f ] - [ \ p @@ -558,6 +559,10 @@ UBool RBBIRuleScanner::doParseActions(int32_t action) fRB->fDefaultTree = &fRB->fSafeRevTree; } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { fRB->fLookAheadHardBreak = TRUE; + } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) { + fRuleSets[kRuleSet_rule_char-128].clear(); + } else if (opt == UNICODE_STRING("unquoted_literals", 17)) { + fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus); } else { error(U_BRK_UNRECOGNIZED_OPTION); } diff --git a/icu4c/source/common/rbbisetb.cpp b/icu4c/source/common/rbbisetb.cpp index f473b16974a..e97eba8d14d 100644 --- a/icu4c/source/common/rbbisetb.cpp +++ b/icu4c/source/common/rbbisetb.cpp @@ -250,12 +250,17 @@ void RBBISetBuilder::build() { // Build the Trie table for mapping UChar32 values to the corresponding // range group number // - fTrie = utrie2_open(0, // Initial value for all code points - 0, // errorValue + fTrie = utrie2_open(0, // Initial value for all code points. + 0, // Error value for out-of-range input. fStatus); - for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { - utrie2_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar, rlRange->fNum, TRUE, fStatus); + for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) { + utrie2_setRange32(fTrie, + rlRange->fStartChar, // Range start + rlRange->fEndChar, // Range end (inclusive) + rlRange->fNum, // value for range + TRUE, // Overwrite previously written values + fStatus); } } @@ -265,7 +270,10 @@ void RBBISetBuilder::build() { // getTrieSize() Return the size that will be required to serialize the Trie. 
// //----------------------------------------------------------------------------------- -int32_t RBBISetBuilder::getTrieSize() /*const*/ { +int32_t RBBISetBuilder::getTrieSize() { + if (U_FAILURE(*fStatus)) { + return 0; + } utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus); fTrieSize = utrie2_serialize(fTrie, NULL, // Buffer diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h index 49ecb0024a5..7cedb45b335 100644 --- a/icu4c/source/common/rbbisetb.h +++ b/icu4c/source/common/rbbisetb.h @@ -111,8 +111,8 @@ private: RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors - UTrie2 *fTrie; // The mapping TRIE that is the end result of processing - uint32_t fTrieSize; // the Unicode Sets. + UTrie2 *fTrie; // The mapping TRIE that is the end result of processing + uint32_t fTrieSize; // the Unicode Sets. // Groups correspond to character categories - // groups of ranges that are in the same original UnicodeSets. diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index eb80b9b665e..302d9ffc651 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -31,21 +31,14 @@ #include "unicode/schriter.h" #include "unicode/uchriter.h" - U_NAMESPACE_BEGIN /** @internal */ +class LanguageBreakEngine; struct RBBIDataHeader; -class RuleBasedBreakIteratorTables; -class BreakIterator; class RBBIDataWrapper; -class UStack; -class LanguageBreakEngine; class UnhandledEngine; -struct RBBIStateTable; - - - +class UStack; /** * @@ -94,47 +87,49 @@ private: */ RBBIDataWrapper *fData; - /** Index of the Rule {tag} values for the most recent match. + /** + * The iteration state - current position, rule status for the current position, + * and whether the iterator ran off the end, yielding UBRK_DONE. + * Current position is pinned to be 0 < position <= text.length. + * Current position is always set to a boundary. 
* @internal */ - int32_t fLastRuleStatusIndex; + /** + * The current position of the iterator. Pinned, 0 <= fPosition <= text.length. + * Never has the value UBRK_DONE (-1). + */ + int32_t fPosition; /** - * Rule tag value valid flag. - * Some iterator operations don't intrinsically set the correct tag value. - * This flag lets us lazily compute the value if we are ever asked for it. - * @internal - */ - UBool fLastStatusIndexValid; + * TODO: + */ + int32_t fRuleStatusIndex; /** - * Counter for the number of characters encountered with the "dictionary" - * flag set. - * @internal - */ - uint32_t fDictionaryCharCount; + * True when iteration has run off the end, and iterator functions should return UBRK_DONE. + */ + UBool fDone; /** - * When a range of characters is divided up using the dictionary, the break - * positions that are discovered are stored here, preventing us from having - * to use either the dictionary or the state table again until the iterator - * leaves this range of text. Has the most impact for line breaking. - * @internal + * Cache of previously determined boundary positions. */ - int32_t* fCachedBreakPositions; - + public: // TODO: debug, return to private. + class BreakCache; + BreakCache *fBreakCache; + private: /** - * The number of elements in fCachedBreakPositions + * Counter for the number of characters encountered with the "dictionary" + * flag set. * @internal */ - int32_t fNumCachedBreakPositions; + uint32_t fDictionaryCharCount; /** - * if fCachedBreakPositions is not null, this indicates which item in the - * cache the current iteration position refers to - * @internal + * Cache of boundary positions within a region of text that has been + * sub-divided by dictionary based breaking.
*/ - int32_t fPositionInCache; + class DictionaryCache; + DictionaryCache *fDictionaryCache; /** * @@ -177,13 +172,11 @@ private: */ RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); - + /** @internal */ friend class RBBIRuleBuilder; /** @internal */ friend class BreakIterator; - - public: /** Default constructor. Creates an empty shell of an iterator, with no @@ -467,7 +460,10 @@ public: virtual UBool isBoundary(int32_t offset); /** - * Returns the current iteration position. + * Returns the current iteration position. Note that UBRK_DONE is never + * returned from this function; if iteration has run to the end of a + * string, current() will return the length of the string while + * next() will return UBRK_DONE). * @return The current iteration position. * @stable ICU 2.0 */ @@ -499,6 +495,7 @@ public: * Note: this function is not thread safe. It should not have been * declared const, and the const remains only for compatibility * reasons. (The function is logically const, but not bit-wise const). + * TODO: check this. Probably thread safe now. *

* @return the status from the break rule that determined the most recently * returned break position. @@ -658,46 +655,31 @@ private: * Common initialization function, used by constructors and bufferClone. * @internal */ - void init(); + void init(UErrorCode &status); /** - * This method backs the iterator back up to a "safe position" in the text. - * This is a position that we know, without any context, must be a break position. - * The various calling methods then iterate forward from this safe position to - * the appropriate position to return. (For more information, see the description - * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) - * @param statetable state table used of moving backwards + * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. + * This locates a "Safe Position" from which the forward break rules + * will operate correctly. A Safe Position is not necessarily a boundary itself. + * + * @param fromPosition the position in the input text to begin the iteration. * @internal */ - int32_t handlePrevious(const RBBIStateTable *statetable); + int32_t handlePrevious(int32_t fromPosition); /** - * This method is the actual implementation of the next() method. All iteration - * vectors through here. This method initializes the state machine to state 1 - * and advances through the text character by character until we reach the end - * of the text or the state machine transitions to state 0. We update our return - * value every time the state machine passes through a possible end state. - * @param statetable state table used of moving forwards - * @internal - */ - int32_t handleNext(const RBBIStateTable *statetable); - - - /** - * This is the function that actually implements dictionary-based - * breaking. Covering at least the range from startPos to endPos, - * it checks for dictionary characters, and if it finds them determines - * the appropriate object to deal with them. 
It may cache found breaks in - * fCachedBreakPositions as it goes. It may well also look at text outside - * the range startPos to endPos. - * If going forward, endPos is the normal Unicode break result, and - * if goind in reverse, startPos is the normal Unicode break result - * @param startPos The start position of a range of text - * @param endPos The end position of a range of text - * @param reverse The call is for the reverse direction + * Find a rule-based boundary by running the state machine. + * Input + * fPosition, the position in the text to begin from. + * Output + * fPosition: the boundary following the starting position. + * fDictionaryCharCount the number of dictionary characters encountered. + * If > 0, the segment will be further subdivided + * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. + * * @internal */ - int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse); + int32_t handleNext(); /** @@ -708,11 +690,12 @@ private: */ const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c); + public: /** - * @internal + * Debugging function only. + * @internal */ - void makeRuleStatusValid(); - + void dumpCache(); }; //------------------------------------------------------------------------------ diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt index 77572f5cd68..ad070638aa3 100644 --- a/icu4c/source/data/brkitr/rules/char.txt +++ b/icu4c/source/data/brkitr/rules/char.txt @@ -1,4 +1,4 @@ -# +# # Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html # Copyright (C) 2002-2016, International Business Machines Corporation and others. 
@@ -12,6 +12,8 @@ # Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088 # Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html +!!quoted_literals_only; + # # Character Class Definitions. # @@ -78,42 +80,6 @@ $Prepend [^$Control $CR $LF]; ## ------------------------------------------------- -!!reverse; -$LF $CR; -($L | $V | $LV | $LVT) $L; -($V | $T) ($LV | $V); -$T ($LVT | $T); - -# GB 9 -($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed. - -# GB 9a -$SpacingMark [^$Control $CR $LF]; - -# GB 9b -[^$Control $CR $LF] $Prepend; - -# GB 10 -$E_Modifier $Extend* ($E_Base | $E_Base_GAZ); - -# GB 11 Don't break between ZWJ and Glue_After_ZWJ -($Extended_Pict | $EmojiNRK) $ZWJ $Extend* ($Extended_Pict | $EmojiNRK); - -# GB 12-13. Going backwards, we must scan through any number of regional indicators as pairs. -# -[{bof} $Extend $ZWJ $SpacingMark] $Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)+ [{eof}[^$Regional_Indicator]]; -[{bof} $Extend $ZWJ $SpacingMark] $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)+ [{eof}[^$Regional_Indicator]]; -$Regional_Indicator $Regional_Indicator; -$Regional_Indicator $Prepend; - -## ------------------------------------------------- - !!safe_reverse; $Regional_Indicator $Regional_Indicator; ($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .; - -## ------------------------------------------------- - -!!safe_forward; -$Regional_Indicator $Regional_Indicator; -($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .; diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index f03995cf28a..902ca8bfea6 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. 
# License & terms of use: http://www.unicode.org/copyright.html # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. @@ -25,6 +25,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -334,209 +335,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. -# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. - - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. 
-^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 21 -($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # - -# LB21a -[^$CB] $CM* ($HY | $BA) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -$IN $CM* $IN; -$IN $CM* $NU; - -# LB 23 -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -($ALPlus | $HL) $CM* ($PR | $PO); -($PR | $PO) $CM* ($ALPlus | $HL); - - -# LB 25 -($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". -$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - ## ------------------------------------------------- @@ -544,7 +342,6 @@ $EM $CM* $EB; # LB 9 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -571,19 +368,3 @@ $CM* ($HY | $BA) $CM* $HL; # For dictionary-based break $dictionary $dictionary; -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. 
-# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt index 23d16f1f317..73cc9edbd01 100644 --- a/icu4c/source/data/brkitr/rules/line_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_fi.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. @@ -30,6 +30,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -343,220 +344,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. -# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. 
- - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. -^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 20.09 added rule for Finnish tailoring -$AL ($HY | $HH) / $SP; - -# LB 21 -($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . 
-[^$CB] $CM* $BB; # - -# LB21a -[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -$IN $CM* $IN; -$IN $CM* $NU; - -# LB 23 -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -($ALPlus | $HL) $CM* ($PR | $PO); -($PR | $PO) $CM* ($ALPlus | $HL); - - -# LB 25 -($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
-$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - - ## ------------------------------------------------- !!safe_reverse; # LB 9 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -582,20 +375,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL; # For dictionary-based break $dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. -# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index d57a0222f7b..0e39e9cc3ff 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -1,5 +1,6 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html +# # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # @@ -32,6 +33,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -345,212 +347,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. 
-# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. - - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. -^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? 
$AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -# Don't include $NSX here -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 21 -# Don't include $NSX here -($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # - -# LB21a -[^$CB] $CM* ($HY | $BA) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -# $IN $CM* $IN; # delete this rule for CSS loose -$IN $CM* $NU; - -# LB 23 -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -($ALPlus | $HL) $CM* ($PR | $PO); -($PR | $PO) $CM* ($ALPlus | $HL); - - -# LB 25 -($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -# Line Loose tailoring: Don't include NSX here. 
-[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". -$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - ## ------------------------------------------------- @@ -558,7 +354,6 @@ $EM $CM* $EB; # LB 9 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -584,20 +379,3 @@ $CM* ($HY | $BA) $CM* $HL; # For dictionary-based break $dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. -# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index c265799dbf1..4b452de1f5b 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. 
# License & terms of use: http://www.unicode.org/copyright.html # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. @@ -39,6 +39,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -360,226 +361,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. -# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. - - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. 
-^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -# Do not include $EXX here -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -# Don't include $NSX here -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 21 -# Don't include $BAX or $NSX here -($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # - -# LB21a Don't break after Hebrew + Hyphen. -([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -# $IN $CM* $IN; # delete this rule for CSS loose -$IN $CM* $NU; - -# LB 23 -# Do not include $POX here -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -# Do not include $PRX here -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -# Do not include $PRX here -($ALPlus | $HL) $CM* ($PR | $PO | $POX); -($PR | $PO | $POX) $CM* ($ALPlus | $HL); - - -# LB 25 -# Here do not include $POX at the beginning or $PRX at the end -($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO | $POX))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -# Do not include $POX or $PRX here -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -# Line Loose tailoring: Don't include NSX here. -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". -$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - - ## ------------------------------------------------- !!safe_reverse; # LB 9 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -605,20 +392,3 @@ $CM* ($HY | $BA | $BAX) $CM* $HL; # For dictionary-based break $dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. 
-# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt index 7d80941a648..f68057bfebf 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. @@ -28,6 +28,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -345,215 +346,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. -# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. 
- - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. -^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -# Don't include $NSX here -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 20.09 added rule for Finnish tailoring -$AL ($HY | $HH) / $SP; - -# LB 21 -# Don't include $NSX here -($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . 
-[^$CB] $CM* $BB; # - -# LB21a -[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -# $IN $CM* $IN; # delete this rule for CSS loose -$IN $CM* $NU; - -# LB 23 -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -($ALPlus | $HL) $CM* ($PR | $PO); -($PR | $PO) $CM* ($ALPlus | $HL); - - -# LB 25 -($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -# Line Loose tailoring: Don't include NSX here. -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
-$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - ## ------------------------------------------------- @@ -561,7 +353,6 @@ $EM $CM* $EB; # LB 9 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -587,20 +378,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL; # For dictionary-based break $dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. -# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index 1150d50cf28..594cb165475 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. @@ -29,6 +29,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -338,217 +339,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. 
-# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. - - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. -^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? 
$AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 21 -($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # - -# LB21a -[^$CB] $CM* ($HY | $BA) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -$IN $CM* $IN; -$IN $CM* $NU; - -# LB 23 -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -($ALPlus | $HL) $CM* ($PR | $PO); -($PR | $PO) $CM* ($ALPlus | $HL); - - -# LB 25 -($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. 
Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". -$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - - ## ------------------------------------------------- !!safe_reverse; # LB 9 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -574,20 +370,3 @@ $CM* ($HY | $BA) $CM* $HL; # For dictionary-based break $dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. -# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index a3a2f3e38bc..59c706d2562 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. 
# License & terms of use: http://www.unicode.org/copyright.html # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. @@ -30,6 +30,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -344,219 +345,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. -# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. - - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. 
-^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -# Don't include $NSX here -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 21 -# Don't include $BAX or $NSX here -($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # - -# LB21a Don't break after Hebrew + Hyphen. -([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -$IN $CM* $IN; -$IN $CM* $NU; - -# LB 23 -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -($ALPlus | $HL) $CM* ($PR | $PO); -($PR | $PO) $CM* ($ALPlus | $HL); - - -# LB 25 -($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". -$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - - ## ------------------------------------------------- !!safe_reverse; # LB 9 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -582,20 +376,3 @@ $CM* ($HY | $BA | $BAX) $CM* $HL; # For dictionary-based break $dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. 
-# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt index bb299c8691c..b6e10b90be4 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. +# Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html # Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. @@ -28,6 +28,7 @@ # !!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; @@ -341,213 +342,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; -# -# Reverse Rules. -# -## ------------------------------------------------- - -!!reverse; - -# LB 9 Combining Marks. -# Stick together any combining sequences that don't match other rules. - -^$CM+ $CAN_CM?; - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . - # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. 
- - -# LB 4, 5, 6 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; - -# LB 8 ZW SP* -# TODO: to implement this, we need more than one look-ahead hard break in play at a time. -# Requires an engine enhancement. -# / $SP* $ZW - -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) -# -($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. -^$CM+ $CAN_CM; - - -# LB 11 -# -$WJ $CM* $CAN_CM; -$WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CAN_CM $CM* $WJ; - -# LB 12a -# [^SP BA HY] x GL -# -$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; - -# LB 12 -# GL x -# -$CANT_CM $CM* $GL; -$CAN_CM $CM* $GL; - - -# LB 13 -$CL $CM+ $CAN_CM; -$CP $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; - -$CL [$LB8NonBreaks-$CM]; -$CP [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; - - -# LB 14 OP SP* x -# -. $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP - - -# LB 15 -$OP $SP* $CM* $QU; - -# LB 16 -$NS $SP* $CM* ($CL | $CP); - -# LB 17 -$B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$QU $CM* $CAN_CM; # . x QU -$QU $LB18NonBreaks; - - -$CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. -# - -# LB 20.09 added rule for Finnish tailoring -$AL ($HY | $HH) / $SP; - -# LB 21 -($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -[$LB20NonBreaks-$CM] $CM* $BB; # BB x . 
-[^$CB] $CM* $BB; # - -# LB21a -[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; - -# LB21b (reverse) -$HL $CM* $SY; - -# LB 22 -$IN $CM* ($ALPlus | $HL); -$IN $CM* $EX; -$IN $CM* ($ID | $EB | $EM); -$IN $CM* $IN; -$IN $CM* $NU; - -# LB 23 -$NU $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* $NU; - -# LB23a -($ID | $EB | $EM) $CM* $PR; -$PO $CM* ($ID | $EB | $EM); - -# LB 24 -($ALPlus | $HL) $CM* ($PR | $PO); -($PR | $PO) $CM* ($ALPlus | $HL); - - -# LB 25 -($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?; - -# LB 26 -($H3 | $H2 | $JV | $JL) $CM* $JL; -($JT | $JV) $CM* ($H2 | $JV); -$JT $CM* ($H3 | $JT); - -# LB 27 -$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); - ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -($ALPlus | $HL) $CM* ($ALPlus | $HL); - - -# LB 29 -($ALPlus | $HL) $CM* $IS; - -# LB 30 -$OP $CM* ($ALPlus | $HL | $NU); -($ALPlus | $HL | $NU) $CM* $CP; - -# LB 30a -# Pairs of Regional Indicators. -# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, -# the second with an even number. Stripping away the cruft they look like -# [^RI] RI / (RI RI)+ ^RI; -# [^RI] RI RI / (RI RI)+ ^RI; -# -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; -[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; - -# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. -$RI $CM* $RI; - -# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
-$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); - - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EM $CM* $EB; - - ## ------------------------------------------------- !!safe_reverse; @@ -580,20 +374,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL; # For dictionary-based break $dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. -# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; -$dictionary $dictionary; - diff --git a/icu4c/source/data/brkitr/rules/sent.txt b/icu4c/source/data/brkitr/rules/sent.txt index 49970ee3aeb..95e6f030ff6 100644 --- a/icu4c/source/data/brkitr/rules/sent.txt +++ b/icu4c/source/data/brkitr/rules/sent.txt @@ -1,6 +1,5 @@ -# -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html#License +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html # # Copyright (C) 2002-2015, International Business Machines Corporation and others. # All Rights Reserved. @@ -12,6 +11,7 @@ # These rules are based on UAX #29 Revision 26 for Unicode Version 8.0 # +!!quoted_literals_only; # # Character categories as defined in TR 29 @@ -85,22 +85,13 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; ## ------------------------------------------------- -!!reverse; +!!safe_reverse; $SpEx_R = ($Extend | $Format)* $Sp; $ATermEx_R = ($Extend | $Format)* $ATerm; $STermEx_R = ($Extend | $Format)* $STerm; $CloseEx_R = ($Extend | $Format)* $Close; -# -# Reverse rules. 
-# For now, use the old style inexact reverse rules, which are easier -# to write, but less efficient. -# TODO: exact reverse rules. It appears that exact reverse rules -# may require improving support for look-ahead breaks in the -# builder. Needs more investigation. -# - [{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; #.*; @@ -112,9 +103,9 @@ $CloseEx_R = ($Extend | $Format)* $Close; # The preceding $Sep, which will be the second one that the rule matches. # Any immediately preceding STerm or ATerm sequences. We need to see these # to get the correct rule status when moving forwards again. -# +# # [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match -# the entire string. +# the entire string. TODO: can bof be replaced with ^ # # (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be # at the beginning of the string at this point, and we don't want to fail. diff --git a/icu4c/source/data/brkitr/rules/sent_el.txt b/icu4c/source/data/brkitr/rules/sent_el.txt index 7a0c984003d..fec60ed76c3 100644 --- a/icu4c/source/data/brkitr/rules/sent_el.txt +++ b/icu4c/source/data/brkitr/rules/sent_el.txt @@ -1,6 +1,6 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html # -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html#License # # Copyright (C) 2002-2015, International Business Machines Corporation and others. # All Rights Reserved. 
@@ -12,6 +12,7 @@ # These rules are based on UAX #29 Revision 26 for Unicode Version 8.0 # +!!quoted_literals_only; # # Character categories as defined in TR 29 @@ -85,7 +86,7 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; ## ------------------------------------------------- -!!reverse; +!!safe_reverse; $SpEx_R = ($Extend | $Format)* $Sp; $ATermEx_R = ($Extend | $Format)* $ATerm; @@ -102,7 +103,6 @@ $CloseEx_R = ($Extend | $Format)* $Close; # [{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; -#.*; # Explanation for this rule: # @@ -112,7 +112,7 @@ $CloseEx_R = ($Extend | $Format)* $Close; # The preceding $Sep, which will be the second one that the rule matches. # Any immediately preceding STerm or ATerm sequences. We need to see these # to get the correct rule status when moving forwards again. -# +# # [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match # the entire string. # diff --git a/icu4c/source/data/brkitr/rules/title.txt b/icu4c/source/data/brkitr/rules/title.txt index a21cb496716..0634a9ee15a 100644 --- a/icu4c/source/data/brkitr/rules/title.txt +++ b/icu4c/source/data/brkitr/rules/title.txt @@ -1,5 +1,5 @@ # Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html#License +# License & terms of use: http://www.unicode.org/copyright.html # # Copyright (c) 2002-2015, International Business Machines Corporation and # others. All Rights Reserved. 
@@ -7,6 +7,7 @@ # Title Casing Break Rules # +!!quoted_literals_only; $CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019]; $Cased = [[:Upper_Case:][:Lower_Case:][:Lt:] - $CaseIgnorable]; @@ -27,19 +28,6 @@ $NotCased = [[^ $Cased] - $CaseIgnorable]; $Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*; -# Reverse Rules -!!reverse; - -# Normal Rule, will work nearly universally, so long as there is a -# start-of-word preceding the current iteration position. - -($NotCased | $CaseIgnorable)* ($Cased | $CaseIgnorable)* $Cased; - -# Short rule, will be effective only when moving to the start of text, -# with no word (cased character) preceding the current iteration position. - -($NotCased | $CaseIgnorable)*; - !!safe_reverse; # Safe Reverse: the exact forward rule must not start in the middle @@ -47,10 +35,3 @@ $Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*; # leaving it just before the start of a word. ($Cased | $CaseIgnorable)*; - -!!safe_forward; - -# Safe Forward, nothing needs to be done, the exact Reverse rules will -# always find valid boundaries from any starting position. -# Still, some rule is needed, so '.', a one character movement. -.; diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt index 742d8f8fe31..7c4e9a39a64 100644 --- a/icu4c/source/data/brkitr/rules/word.txt +++ b/icu4c/source/data/brkitr/rules/word.txt @@ -1,7 +1,7 @@ -# +# # Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html -# Copyright (C) 2002-2016, International Business Machines Corporation +# Copyright (C) 2002-2016, International Business Machines Corporation # and others. All Rights Reserved. 
# # file: word.txt @@ -22,6 +22,7 @@ ############################################################################## !!chain; +!!quoted_literals_only; # @@ -194,95 +195,6 @@ $HangulSyllable $HangulSyllable {200}; $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found -## ------------------------------------------------- - -!!reverse; - -$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter; -$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus; -$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote; -$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote; -$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet; -$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric; -$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum; -$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter; -$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana; -$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana; -$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet; -$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator; - -# rule 3 -$LF $CR; - -# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. 
-# -($Extended_Pict | $EmojiNRK) $ZWJ; - -# rule 4 -($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?; - -# rule 5 - -($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 6 and 7 - -($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 7a -$BackSingle_QuoteEx $BackHebrew_LetterEx; - -# Rule 7b and 7c -$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx; - -# rule 8 - -$BackNumericEx $BackNumericEx; - -# rule 9 - -$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 10 - -($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx; - -# rule 11 and 12 - -$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx; - -# rule 13 - -$BackKatakanaEx $BackKatakanaEx; - -# rules 13 a/b -# -$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); -($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; - -# special handling for CJK characters: chain for later dictionary segmentation -$HangulSyllable $HangulSyllable; -$KanaKanji $KanaKanji; #different rule status if both kanji and kana found - -# rule 14 - -$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG); - -# rule 15 - 17 -# Pairs of Regional Indicators stay together. 
- -^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; -^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; - -($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; -($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; - - - ## ------------------------------------------------- !!safe_reverse; @@ -291,39 +203,17 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG); ($Extend | $Format | $ZWJ)+ .?; # rule 6 -($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx); +($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus); # rule 7b -$Double_Quote $BackHebrew_LetterEx; - +$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter; -# rule 11 -($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx; - -# rule 13c -$BackRegional_IndicatorEx*; - -# For dictionary-based break -$dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# rule 4 -($Extend | $Format | $ZWJ)+ .?; - -# rule 6 -($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); - -# rule 7b -$Double_QuoteEx $Hebrew_LetterEx; # rule 11 -($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; +($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric; # rule 13c -$Regional_IndicatorEx*; +$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator; # For dictionary-based break 
$dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt index 85a976cbf3d..ec46da62915 100644 --- a/icu4c/source/data/brkitr/rules/word_POSIX.txt +++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt @@ -1,7 +1,7 @@ -# +# # Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html -# Copyright (C) 2002-2016, International Business Machines Corporation +# Copyright (C) 2002-2016, International Business Machines Corporation # and others. All Rights Reserved. # # file: word_POSIX.txt @@ -22,6 +22,7 @@ ############################################################################## !!chain; +!!quoted_literals_only; # @@ -62,7 +63,7 @@ $Hiragana = [:Hiragana:]; # 5.0 or later as the definition of Complex_Context was corrected to include all # characters requiring dictionary break. -$Control = [\p{Grapheme_Cluster_Break = Control}]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; $HangulSyllable = [\uac00-\ud7a3]; $ComplexContext = [:LineBreak = Complex_Context:]; $KanaKanji = [$Han $Hiragana $Katakana]; @@ -74,7 +75,7 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; # -# Rules 4 Ignore Format and Extend characters, +# Rules 4 Ignore Format and Extend characters, # except when they appear at the beginning of a region of text. 
# # TODO: check if handling of katakana in dictionary makes rules incorrect/void @@ -154,7 +155,7 @@ $NumericEx $NumericEx {100}; $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; -# rule 11 and 12 +# rule 11 and 12 $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; @@ -191,96 +192,7 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b) # special handling for CJK characters: chain for later dictionary segmentation $HangulSyllable $HangulSyllable {200}; -$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found - - -## ------------------------------------------------- - -!!reverse; - -$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter; -$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus; -$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote; -$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote; -$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet; -$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric; -$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum; -$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter; -$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana; -$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana; -$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet; -$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator; - -# rule 3 -$LF $CR; - -# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. 
-# -($Extended_Pict | $EmojiNRK) $ZWJ; - -# rule 4 -($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?; - -# rule 5 - -($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 6 and 7 - -($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 7a -$BackSingle_QuoteEx $BackHebrew_LetterEx; - -# Rule 7b and 7c -$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx; - -# rule 8 - -$BackNumericEx $BackNumericEx; - -# rule 9 - -$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx); - -# rule 10 - -($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx; - -# rule 11 and 12 - -$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx; - -# rule 13 - -$BackKatakanaEx $BackKatakanaEx; - -# rules 13 a/b -# -$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); -($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; - -# special handling for CJK characters: chain for later dictionary segmentation -$HangulSyllable $HangulSyllable; -$KanaKanji $KanaKanji; #different rule status if both kanji and kana found - -# rule 14 - -$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG); - -# rule 15 - 17 -# Pairs of Regional Indicators stay together. 
- -^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; -^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; - -($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; -($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* - ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; - +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found ## ------------------------------------------------- @@ -291,39 +203,17 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG); ($Extend | $Format | $ZWJ)+ .?; # rule 6 -($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx); +($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus); # rule 7b -$Double_Quote $BackHebrew_LetterEx; +$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter; # rule 11 -($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx; - -# rule 13c -$BackRegional_IndicatorEx*; - -# For dictionary-based break -$dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# rule 4 -($Extend | $Format | $ZWJ)+ .?; - -# rule 6 -($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); - -# rule 7b -$Double_QuoteEx $Hebrew_LetterEx; - -# rule 11 -($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; +($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric; # rule 13c -$Regional_IndicatorEx*; +$Regional_Indicator ($Format | 
$Extend | $ZWJ)* $Regional_Indicator; # For dictionary-based break $dictionary $dictionary; diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp index ea1055f5de5..1a872907595 100644 --- a/icu4c/source/test/intltest/rbbiapts.cpp +++ b/icu4c/source/test/intltest/rbbiapts.cpp @@ -473,27 +473,27 @@ void RBBIAPITest::TestIteration() bi->first(); i = bi->current(); if (i != 0) { - errln("%s:%d Incorrect value from bi->previous(). Expected 0, got %d", __FILE__, __LINE__, i); + errln("%s:%d Incorrect value from bi->current(). Expected 0, got %d", __FILE__, __LINE__, i); } bi->next(); i = bi->current(); if (i != 1) { - errln("%s:%d Incorrect value from bi->previous(). Expected 1, got %d", __FILE__, __LINE__, i); + errln("%s:%d Incorrect value from bi->current(). Expected 1, got %d", __FILE__, __LINE__, i); } bi->last(); bi->next(); i = bi->current(); if (i != 10) { - errln("%s:%d Incorrect value from bi->previous(). Expected 10, got %d", __FILE__, __LINE__, i); + errln("%s:%d Incorrect value from bi->current(). Expected 10, got %d", __FILE__, __LINE__, i); } bi->first(); bi->previous(); i = bi->current(); if (i != 0) { - errln("%s:%d Incorrect value from bi->previous(). Expected 0, got %d", __FILE__, __LINE__, i); + errln("%s:%d Incorrect value from bi->current(). 
Expected 0, got %d", __FILE__, __LINE__, i); } diff --git a/icu4c/source/test/intltest/rbbimonkeytest.cpp b/icu4c/source/test/intltest/rbbimonkeytest.cpp index e5eeeac2eb9..30755b36484 100644 --- a/icu4c/source/test/intltest/rbbimonkeytest.cpp +++ b/icu4c/source/test/intltest/rbbimonkeytest.cpp @@ -667,6 +667,7 @@ void RBBIMonkeyImpl::runTest() { testFollowing(status); testPreceding(status); testIsBoundary(status); + testIsBoundaryRandom(status); if (fLoopCount < 0 && loopCount % 100 == 0) { fprintf(stderr, "."); @@ -802,6 +803,29 @@ void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) { checkResults("testForwards", FORWARD, status); } +void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + fBI->setText(fTestData->fString); + + int stringLen = fTestData->fString.length(); + for (int i=stringLen; i>=0; --i) { + int strIdx = fRandomGenerator() % stringLen; + if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) { + IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. 
Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", + __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed); + if (fVerbose) { + fTestData->dump(strIdx); /* strIdx is the index reported in the failure message; i is only the loop counter */ + } + status = U_INVALID_STATE_ERROR; + break; + } + } +} + + + void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) { if (U_FAILURE(status)) { return; diff --git a/icu4c/source/test/intltest/rbbimonkeytest.h b/icu4c/source/test/intltest/rbbimonkeytest.h index baad854d6ea..83b16f3c6cd 100644 --- a/icu4c/source/test/intltest/rbbimonkeytest.h +++ b/icu4c/source/test/intltest/rbbimonkeytest.h @@ -194,6 +194,7 @@ class RBBIMonkeyImpl: public UObject { void testFollowing(UErrorCode &status); void testPreceding(UErrorCode &status); void testIsBoundary(UErrorCode &status); + void testIsBoundaryRandom(UErrorCode &status); void checkResults(const char *msg, CheckDirection dir, UErrorCode &status); class RBBIMonkeyThread: public SimpleThread { diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 20ed7e012d0..7333a79544d 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -974,7 +974,8 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) { // Run the iterator backwards, verify that the same breaks are found. // prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. - for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { + bp = t->bi->last(); + while (bp != BreakIterator::DONE) { if (prevBP == bp) { // Fail for lack of progress. errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", @@ -1012,6 +1013,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) { } prevBP = bp; + bp = t->bi->previous(); } // Verify that there were no missed breaks prior to the last one found @@ -1465,6 +1467,7 @@ void RBBITest::TestExtended() { // Reached end of test file.
Raise an error if parseState indicates that we are // within a block that should have been terminated. + if (parseState == PARSE_RULES) { errln("rbbitst.txt:%d block beginning at line %d is not closed.", lineNum, rulesFirstLine); diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 0757bdf7dbc..1450a98d7be 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1386,8 +1386,7 @@ Bangkok)• -#TODO: uncomment this line when quoted_literals_only is implemented. -#!!quoted_literals_only; +!!quoted_literals_only; !!forward; 'Hello World'; !!reverse; @@ -1395,3 +1394,83 @@ Bangkok)• •Hello World• + +# Test for circular buffer overflow during reverse iteration with inefficient reverse rules, +# Too many boundaries between safe back up position and current position. + + +!!forward; +.; +!!reverse; +.*; + +•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•
a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a• + +# +# Dictionary regression check +# Intended to find unexpected behavior changes when changing dictionary implementation code, +# but may also be fragile, failing with intended improvements to dictionary breaking. +# + + +•Unicode<200> •คือ<200>อะไร<200>?• +•Unicode<200> •กำหนด<200>หมายเลข<200>เฉพาะ<200>สำหรับ<200>ทุก<200>อักขระ<200> +•โดย<200>ไม่<200>สนใจ<200>ว่า<200>เป็น<200>แพ<200>ล็ต<200>ฟอร์ม<200>ใด<200> +•ไม่<200>ขึ้น<200>กับ<200>ว่า<200>จะ<200>เป็น<200>โปรแกรม<200>ใด<200> +•และ<200>ไม่<200>ว่า<200>จะ<200>เป็น<200>ภาษา<200>ใด<200> +• +•โดย<200>พื้น<200>ฐาน<200>แล้ว<200>,• •คอมพิวเตอร์<200>จะ<200>เกี่ยวข้อง<200>กับ<200>เรื่อง<200>ของ<200>ตัวเลข<200>.• •คอมพิวเตอร์<200>จัด<200>เก็บ<200>ตัว<200>อักษร<200>และ<200>อักข<200>ระ<200>อื่นๆ<200> •โดย<200>การ<200>กำหนด<200>หมายเลข<200>ให้<200>สำหรับ<200>แต่ละ<200>ตัว<200>.• •ก่อน<200>หน้า<200>ที่๊<200> •Unicode<200> •จะ<200>ถูก<200>สร้าง<200>ขึ้น<200>,• •ได้<200>มี<200>ระบบ<200> •encoding<200> •อยู่<200>หลาย<200>ร้อย<200>ระบบ<200>สำหรับ<200>การ<200>กำหนด<200>หมายเลข<200>เหล่า<200>นี้<200>.• •ไม่มี<200> •encoding<200> •ใด<200>ที่<200>มี<200>จำนวน<200>ตัว<200>อักขระ<200>มาก<200>เพียง<200>พอ<200>:• •ยก<200>ตัวอย่าง<200>เช่น<200>,• •เฉพาะ<200>ใน<200>กลุ่ม<200>สหภาพ<200>ยุโรป<200>เพียง<200>แห่ง<200>เดียว<200> •ก็<200>ต้องการ<200>หลาย<200> •encoding<200> •ใน<200>การ<200>ครอบคลุม<200>ทุก<200>ภาษา<200>ใน<200>กลุ่ม<200>.• •หรือ<200>แม้แต่<200>ใน<200>ภาษา<200>เดี่ยว<200> •เช่น<200> •ภาษา<200>อังกฤษ<200> •ก็<200>ไม่มี<200> •encoding<200> 
•ใด<200>ที่<200>เพียง<200>พอ<200>สำหรับ<200>ทุก<200>ตัว<200>อักษร<200>,• •เครื่องหมาย<200>วรรค<200>ตอน<200> •และ<200>สัญลักษณ์<200>ทาง<200>เทคนิค<200>ที่<200>ใช้<200>กัน<200>อยู่<200>ทั่วไป<200>.• +• +•ระบบ<200> •encoding<200> •เหล่า<200>นี้<200>ยัง<200>ขัด<200>แย้ง<200>ซึ่ง<200>กัน<200>และ<200>กัน<200>.• •นั่น<200>ก็<200>คือ<200>,• •ใน<200>สอง<200> •encoding<200> •สามารถ<200>ใช้<200>หมายเลข<200>เดียวกัน<200>สำหรับ<200>ตัว<200>อักขระ<200>สอง<200>ตัว<200>ที่<200>แตก<200>ต่าง<200>กัน<200>,•หรือ<200>ใช้<200>หมายเลข<200>ต่าง<200>กัน<200>สำหรับ<200>อักขระ<200>ตัว<200>เดียวกัน<200>.• •ใน<200>ระบบ<200>คอมพิวเตอร์<200> •(•โดย<200>เฉพาะ<200>เซิร์ฟเวอร์<200>)• •ต้อง<200>มี<200>การ<200>สนับสนุน<200>หลาย<200> •encoding<200>;• •และ<200>เมื่อ<200>ข้อมูล<200>ที่<200>ผ่าน<200>ไป<200>มาระ<200>หว่าง<200>การ<200>เข้า<200>รหัส<200>หรือ<200>แพ<200>ล็ต<200>ฟอร์ม<200>ที่<200>ต่าง<200>กัน<200>,• •ข้อมูล<200>นั้น<200>จะ<200>เสี่ยง<200>ต่อ<200>การ<200>ผิด<200>พลาด<200>เสีย<200>หาย<200>.• +• +•Unicode<200> •จะ<200>เปลี่ยนแปลง<200>สิ่ง<200>เหล่า<200>นั้น<200>ทั้งหมด<200>!• +• +•Unicode<200> •กำหนด<200>หมายเลข<200>เฉพาะ<200>สำหรับ<200>แต่ละ<200>อักขระ<200>,• •โดย<200>ไม่<200>สนใจ<200>ว่า<200>เป็น<200>แพ<200>ล็ต<200>ฟอร์ม<200>ใด<200>,• •ไม่<200>ขึ้น<200>กับ<200>ว่า<200>จะ<200>เป็น<200>โปรแกรม<200>ใด<200>และ<200>ไม่<200>ว่า<200>จะ<200>เป็น<200>ภาษา<200>ใด<200>.• •มาตรฐาน<200> •Unicode<200> •ได้<200>ถูก<200>นำ<200>ไป<200>ใช้<200>โดย<200>ผู้นำ<200>ใน<200>อุตสาหกรรม<200> •เช่น<200> •Apple<200>,• •HP<200>,• •IBM<200>,• •JustSystem<200>,• •Microsoft<200>,• •Oracle<200>,• •SAP<200>,• •Sun<200>,• •Sybase<200>,• •Unisys<200> •และ<200>อื่นๆ<200> •อีก<200>มาก<200>.• •Unicode<200> •เป็น<200>สิ่ง<200>ที่<200>จำเป็น<200>สำหรับ<200>มาตร<200>ฐาน<200>ใหม่ๆ<200> •เช่น<200> •XML<200>,• •Java<200>,• •ECMAScript<200> •(•JavaScript<200>)•,• •LDAP<200>,• •CORBA<200> •3.0<100>,• •WML<200> •ฯลฯ<200>.•,• •และ<200>เป็น<200>แนวทาง<200>อย่าง<200>เป็น<200>ทางการ<200>ใน<200>การ<200>ทำ<200> •ISO<200>/•IEC<200> •10646<100>.• 
•Unicode<200> •ได้<200>รับ<200>การ<200>สนับสนุน<200>ใน<200>ระบบ<200>ปฏิบัติ<200>การ<200>จำนวน<200>มาก<200>,• •บราวเซอร์<200>ใหม่ๆ<200> •ทก<200>ตัว<200>,• •และ<200>ผลิต<200>ภัณฑ์<200>อื่นๆ<200> •อีก<200>มาก<200>.• •การ<200>เกิด<200>ขึ้น<200>ของ<200> •Unicode<200> •Standard<200> •และ<200>ทูล<200>ส์<200>ต่างๆ<200> •ที่<200>มี<200>ใน<200>การ<200>สนับสนุน<200> •Unicode<200>,• •เป็น<200>หนึ่ง<200>ใน<200>แนว<200>โน้ม<200>ทาง<200>เทคโนโลยี<200>ซอฟต์แวร์<200>ระดับ<200>โลก<200>ที่<200>มี<200>ความ<200>สำคัญ<200>ที่สุด<200>.• +• +•การ<200>รวม<200> •Unicode<200> •เข้าไป<200>ใน<200>ระบบ<200>ไคลเอ็นต์<200>-•เซิร์ฟเวอร์<200> •หรือ<200>แอ็พ<200>พลิ<200>เค<200>ชัน<200>แบบ<200> •multi<200>-•tiered<200> •และ<200>เว็บไซต์<200> •จะ<200>ทำให้<200>เกิด<200>การ<200>ประหยัด<200>ค่า<200>ใช้<200>จ่าย<200>มากกว่า<200>การ<200>ใช้<200>ชุด<200>อักขระ<200>แบบ<200>เดิม<200>.• •Unicode<200> •ทำให้<200>ผลิตภัณฑ์<200>ซอฟต์แวร์<200>หนึ่ง<200>เดียว<200> •หรือ<200>เว็บไซต์<200>แห่ง<200>เดียว<200> •รองรับ<200>ได้<200>หลาย<200>แพ<200>ล็ต<200>ฟอร์ม<200>,• •หลาย<200>ภาษา<200>และ<200>หลาย<200>ประเทศ<200>โดย<200>ไม่<200>ต้อง<200>ทำการ<200>รื้อ<200>ปรับ<200>ระบบ<200>.• •Unicode<200> •ยัง<200>ทำให้<200>ข้อมูล<200>สามารถ<200>เคลื่อน<200>ย้าย<200>ไป<200>มา<200>ใน<200>หลายๆ<200> •ระบบ<200>โดย<200>ไม่<200>เกิด<200>ความ<200>ผิด<200>พลาด<200>เสีย<200>หาย<200>.• +• +•เกี่ยว<200>กับ<200> •Unicode<200> •Consortium<200> +• +•Unicode<200> •Consortium<200> •เป็น<200>องค์กร<200>ไม่<200>แสวงหา<200>กำไร<200>ที่<200>ก่อ<200>ตั้ง<200>ขึ้น<200>เพื่อ<200>พัฒนา<200>,• •ขยาย<200>และ<200>ส่ง<200>เสริม<200>การ<200>ใช้<200> •Unicode<200> •Standard<200>,• •ซึ่ง<200>กำหนด<200>รูป<200>แบบ<200>การ<200>แทน<200>ค่า<200>ของ<200>ข้อความ<200>ใน<200>ผลิตภัณฑ์<200>ซอฟต์แวร์<200>และ<200>มาตร<200>ฐาน<200>ใหม่ๆ<200>.• •สมาชิก<200>ของ<200>สมาคม<200>เป็น<200>ตัวแทน<200>จาก<200>บริษัท<200>และ<200>องค์กร<200>ใน<200>อุตสาหกรรม<200>คอมพิวเตอร์<200>และ<200>การ<200>ประมวล<200>ผล<200>สารสนเทศ<200>.• 
•สมาคม<200>ได้<200>รับ<200>การ<200>สนับสนุน<200>ทางการ<200>เงิน<200>ผ่าน<200>ทาง<200>ค่า<200>ธรรมเนียม<200>ของ<200>การ<200>เป็น<200>สมาชิก<200>เท่านั้น<200>.• •สมาชิก<200>ภาพ<200>ของ<200> •Unicode<200> •Consortium<200> •เปิด<200>กว้าง<200>สำหรับ<200>องค์กร<200>หรือ<200>บุคคล<200>ใดๆ<200> •ใน<200>โลก<200>ที่<200>ต้องการ<200>สนับสนุน<200> •Unicode<200> •Standard<200> •และ<200>ช่วย<200>เหลือ<200>การ<200>ขยาย<200>ตัว<200>และ<200>การนำ<200> •Unicode<200> •ไป<200>ใช้<200>งาน<200>.• +• +•สำหรับ<200>ข้อมูล<200>เพิ่ม<200>เติม<200>,• •ให้<200>ดู<200>ที่<200> •Glossary<200>,• •Sample<200> •Unicode<200>-•Enabled<200> •Products<200>,• •Technical<200> •Introduction<200> •และ<200> •Useful<200> •Resources<200>.• + + +# Burmese +•အ<200>လော<200>င္<200>မ<200>င္<200>တရား<200> +• • • • • •မဟာ<200>ဓမ္မရာဇာ<200>မိ<200>ပတိ<200>လ<200>က္<200>ထ<200>က္<200>တ္<200>ဝ<200>င္<200> •အ<200>င္<200>ဝ<200>နေ<200>ပ္<200>ရ<200>ည္<200>တော္<200>က္<200>ရီး<200>သ<200>ည္<200> •မ<200>င္<200>ရိ<200>မ္<200>မ<200>သ<200>က္<200>ဖ္<200>ရ<200>စ္<200>နေ<200>သ<200>ည္<200>။• •မဏိ<200>ပူ<200>ရ<200> •က<200>သ<200>ည္<200>မ္<200>ယား<200>က<200> •အ<200>င္<200>ဝ<200>နေ<200>ပ္<200>ရ<200>ည္<200>တော္<200>၏• •မ္<200>ရော<200>က္<200>ဘ<200>က္<200>တ<200>လ္<200>ဝ္<200>ဟား<200>ကုိ<200> •တုိ<200>က္<200>ခုိ<200>က္<200>ဖ္<200>ယ<200>က္<200>ဆီး<200>သ<200>ည္<200>။• •အော<200>က္<200>မ္<200>ရ<200>န္<200>မာ<200>နုိ<200>င္<200>ငံ<200> •ဟံ<200>သာ<200>ဝ<200>တီ<200>သား<200>တုိ့<200>က<200>လ<200>ည္<200> •ပု<200>န္<200>က<200>န္<200>သ<200>ည္<200>။• •မတ္တ<200>ရာ<200>အု<200>တ္<200>ဖုိ<200>ရ္<200>ဟိ<200> •က္<200>ဝေ့<200>ရ္<200>ဟ<200>မ္<200>မ္<200>ယား<200>က<200>လ<200>ည္<200> •ထ<200>က္<200>ရ္<200>ဝ<200>သ<200>ည္<200>။• +• +• • • • •ထုိ<200>အ<200>ခ္<200>ယိ<200>န္<200>တ္<200>ဝ<200>င္<200> •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>သူ<200>က္<200>ရီး<200> •အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •မိမိ<200>၏•ရ္<200>ဝာ<200>ကုိ<200> •လုံ<200>ခ္<200>ရုံ<200>အော<200>င္<200>ထ<200>န္<200>လုံး<200>တ<200>ပ္<200>မ္<200>ယား<200>ကာ<200>ရ<200>သ<200>ည္<200>။• 
•အနီး<200>အ<200>ပား<200> •က္<200>ယေး<200>ရ္<200>ဝာ<200> •လေး<200>ဆ<200>ယ့္<200>ခ္<200>ရော<200>က္<200>ရ္<200>ဝာ<200>ကုိ<200> •သိ<200>မ္း<200>သ္<200>ဝ<200>င္<200>ထား<200>သ<200>ည္<200>။• •မ<200>က္<200>ရာ<200>မီ<200>ပ<200>င္<200> •အ<200>င္<200>ဝ<200>နေ<200>ပ္<200>ရ<200>ည္<200>တော္<200>က္<200>ရီး<200>သ<200>ည္<200> •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>မ္<200>ယား<200> •လ<200>က္<200>တ္<200>ဝ<200>င္<200>သ<200>က္<200>ဆ<200>င္<200>ရ<200>တော့<200>သ<200>ည္<200>။• +• +• • • • •အ<200>င္<200>ဝ<200>ကုိ<200> •သိ<200>မ္<200>ပုိ<200>က္<200>ပ္<200>ရီး<200>သော<200> •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>မ္<200>ယား<200>သ<200>ည္<200> •မ္<200>ရော<200>က္<200>ဘ<200>က္<200>တ<200>လ္<200>ဝ္<200>ဟား<200>က္<200>ယေး<200>ရ္<200>ဝာ<200>မ္<200>ယား<200>ကုိ<200> •သစ္စာ<200>ခံ<200>ခုိ<200>င္<200>ရ<200>န္<200> •လာ<200>က္<200>ရ<200>ရာ<200> •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>သုိ့<200> •ရော<200>က္<200>ရ္<200>ဟိ<200>လာ<200>သ<200>ည္<200>။• •တ<200>ခ္<200>ယိ<200>န္<200>တ<200>ည္<200>မ္<200>ဟာ<200>ပ<200>င္<200> •က္<200>ဝေ့<200>ရ္<200>ဟ<200>မ္<200>မ္<200>ယား<200>က<200>လ<200>ည္<200> •သစ္စာ<200>ခံ<200>ခုိ<200>င္<200>ရ<200>န္<200> •ရော<200>က္<200>ရ္<200>ဟိ<200>လာ<200>သ<200>ည္<200>။• •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •အ<200>ဖ္<200>ဝဲ့<200>န္<200>ဟ<200>စ္<200>ဖ္<200>ဝဲ့<200>ကုိ<200> •ခ္<200>ရေ<200>ငံ<200>စ္<200>ဝာ<200> •ဆ<200>က္<200>ဆံ<200>သ<200>ည္<200>။• •မ<200>ည္<200>သူ့<200>သ<200>စ္<200>စာ<200>ကုိ<200>မ္<200>ယ္<200>ဟ<200> •ခံ<200>ယူ<200>ခ္<200>ရ<200>င္<200>မ<200>ပ္<200>ရု<200>ပေ<200>။• •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>ဖ္<200>ဝဲ့<200>ကုိ<200> •အ<200>ပ္<200>ရ<200>န္<200>ခ<200>ရီး<200>တ္<200>ဝ<200>င္<200> •လ<200>မ္<200>မ္<200>ဟ<200>ဖ္<200>ရ<200>တ္<200>၍• •တုိ<200>က္<200>ခုိ<200>က္<200>သ<200>ည္<200>။• •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>ဖ္<200>ဝဲ့<200>မ္<200>ယား<200> •အထိ<200>အ<200>ခုိ<200>က္<200>အ<200>က္<200>ယ<200>အ<200>ဆုံး<200>မ္<200>ယား<200>စ္<200>ဝာ<200>ဖ္<200>ရ<200>င္<200> •ပ္<200>ရ<200>န္<200>ရ<200>သ<200>ည္<200>။• +• +• • • • 
•ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>မ္<200>ယား<200>သ<200>ည္<200> •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>ကုိ<200> •လာ<200>ရော<200>က္<200>တုိ<200>က္<200>ခုိ<200>က္<200>က္<200>ရ<200>ပ္<200>ရ<200>န္<200>သ<200>ည္<200>။• •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •သ္<200>ဝေး<200>သော<200>က္<200>ရဲ<200>ဘော္<200> •ခ္<200>ရော<200>က္<200>က္<200>ယိ<200>ပ္<200>ရ္<200>ဟ<200>စ္<200>ယော<200>က္<200>န္<200>ဟ<200>င္<200>အတူ<200> •ဦးစီး<200>ကာ<200>အော<200>င္<200>မ္<200>ရ<200>င္<200>စ္<200>ဝာ<200>ခု<200>ခံ<200>တ္<200>ဝ<200>န္<200>လ္<200>ဟ<200>န္<200>နုိ<200>င္<200>ခဲ့<200>သ<200>ည္<200>။• •ထုိ့<200>နော<200>က္<200> •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •မ္<200>ရော<200>က္<200>ဘ<200>က္<200>တ<200>လ္<200>ဝ္<200>ဟား<200>ရ္<200>ဟိ<200> •ရ္<200>ဟ<200>မ္<200>မ္<200>ယား<200>န္<200>ဟ<200>င္<200> •မ္<200>ရ<200>န္<200>မာ<200>မ္<200>ယား<200>ကုိ<200>လ<200>ည္<200> •ဆ<200>က္<200>သ္<200>ဝ<200>ယ္<200>စ<200>ည္<200>ရုံး<200>နုိ<200>င္<200>ခဲ့<200>သ<200>ည္<200>။• •ဤ<200>သုိ့<200>ဖ္<200>ရ<200>င္<200> •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>၏• •အ<200>ရ္<200>ဟိ<200>န္<200>အ<200>ဝာ<200> •မ္<200>ရ<200>င္<200>မား<200>လာ<200>လေ<200>သ<200>ည္<200>။• +• +• • • • •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •အ<200>လော<200>င္<200>မ<200>င္<200>တ<200>ရား<200>ဘ္<200>ဝဲ့<200>ကုိ<200> •ခံယူ<200>ကာ<200> •ကု<200>န္<200>ဘော<200>င္<200>မ<200>င္<200>ဆ<200>က္<200>ကုိ<200>စ<200>တ<200>င္<200>တ<200>ည္<200>ထော<200>င္<200>သ<200>ည္<200>။• •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>ကုိ<200> •ရ္<200>ဝ္<200>ဟ<200>ဝေ<200>ဘုိ<200>ဟု<200> •သ<200>မု<200>တ္<200>ကာ<200> •မ္<200>ရုိ့<200>န<200>န္<200>တ<200>ည္<200>သ<200>ည္<200>။• •န<200>န္<200>တ<200>ည္<200>သ<200>က္<200>က<200>ရာ<200>ဇ္<200>ဖ္<200>ရ<200>စ္<200>သော<200> •၁၁၁၅<100> •ခု<200>ကုိ<200> •ဥ<200>ဩ<200>အော္<200>မ္<200>ရ<200>ည္<200> •ကု<200>န္<200>ဘော<200>င္<200>တ<200>ည္<200>ဟု<200> •အ<200>မ္<200>ဟ<200>တ္<200>အ<200>သား<200>ပ္<200>ရု<200>က္<200>ရ<200>သ<200>ည္<200>။• +• +• • • • •အ<200>လော<200>င္<200>မ<200>င္<200>တရား<200>သ<200>ည္<200> 
•ဧရာ<200>ဝ<200>တီ<200>န္<200>ဟ<200>င္<200>ခ္<200>ယ<200>င္<200>တ္<200>ဝ<200>င္<200> •မ္<200>ရ<200>စ္<200>န္<200>ဟ<200>စ္<200>သ္<200>ဝ<200>ယ္<200>အ<200>က္<200>ရား<200> •ဒေ<200>သ<200>မ္<200>ယား<200>ကုိ<200>အ<200>ခုိ<200>င္<200>အ<200>မာ<200> •စု<200>စ<200>ည္<200>ပ္<200>ရီး<200>နော<200>က္<200> •အ<200>င္<200>ဝ<200>ကုိ<200> •တုိ<200>က္<200>ခုိ<200>က္<200>အော<200>င္<200>မ္<200>ရ<200>င္<200>သ<200>ည္<200>။• •ထုိ<200>နော<200>က္<200>တ္<200>ဝ<200>င္<200>ပ္<200>ရ<200>ည္<200>၊• •လ္<200>ဝ<200>န္<200>ဆေး<200>၊• •ဒ<200>ဂုံ<200>မ္<200>ရုိ့<200>မ္<200>ယား<200>ကုိ<200> •သိ<200>မ္<200>ပုိ<200>က္<200>သ<200>ည္<200>။• •လ္<200>ဝ<200>န္<200>ဆေး<200> •ကုိ<200>မ္<200>ရ<200>န္<200>အော<200>င္<200>ဟူ<200>၍• •သ<200>မု<200>တ္<200>သ<200>ည္<200>။• •ဒ<200>ဂုံ<200>ကုိ<200>ရ<200>န္<200>ကု<200>န္<200>ဟူ<200>၍• •သ<200>မု<200>တ္<200>ထ<200>သ<200>ည္<200>။• + + +# japanese +•ユニ<400>コード<400>と<400>は<400>何<400>か<400>?• +•ユニ<400>コード<400>は<400>、•すべて<400>の<400>文字<400>に<400>固有<400>の<400>番号<400>を<400>付与<400>し<400>ます<400> +•プラットフォーム<400>に<400>は<400>依存<400>しま<400>せん<400> +•プログラム<400>に<400>も<400>依存<400>しま<400>せん<400> +•言語<400>に<400>も<400>依存<400>しま<400>せん<400> +• 
+•コンピューター<400>は<400>、•本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。•コンピューター<400>は<400>、•文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>よう<400>にし<400>ます<400>。•ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、•これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。•どの<400>一つ<400>を<400>とっても<400>、•十分<400>な<400>文字<400>を<400>含<400>んで<400>は<400>いま<400>せん<400>で<400>した<400>。•例えば<400>、•欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、•その<400>すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、•いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>が<400>必要<400>で<400>した<400>。•英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、•一つ<400>だけ<400>の<400>符号<400>化<400>の<400>仕組み<400>では<400>、•一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、•句読点<400>、•技術<400>的<400>な<400>記号<400>など<400>を<400>扱う<400>に<400>は<400>不十分<400>で<400>した<400>。• +• +•これらの<400>符号<400>化<400>の<400>仕組み<400>は<400>、•相互<400>に<400>矛盾<400>する<400>もの<400>でも<400>ありま<400>した<400>。•二つ<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>が<400>、•二つ<400>の<400>異なる<400>文字<400>に<400>同一<400>の<400>番号<400>を<400>付ける<400>こと<400>も<400>できる<400>し<400>、•同じ<400>文字<400>に<400>異なる<400>番号<400>を<400>付ける<400>こと<400>も<400>できる<400>の<400>です<400>。•どの<400>よう<400>な<400>コンピューター<400>も<400>(•特に<400>サーバー<400>は<400>)•多く<400>の<400>異<400>な<400>っ<400>た<400>符号<400>化<400>の<400>仕組み<400>を<400>サポート<400>する<400>必要<400>が<400>あり<400>ます<400>。•たとえ<400>データ<400>が<400>異なる<400>符号<400>化<400>の<400>仕組み<400>や<400>プラットフォーム<400>を<400>通過<400>し<400>て<400>も<400>、•いつ<400>どこ<400>で<400>データ<400>が<400>乱れる<400>か<400>分<400>から<400>ない<400>危険<400>を<400>冒す<400>こと<400>の<400>なる<400>の<400>です<400>。• +• +•ユニ<400>コード<400>は<400>すべて<400>を<400>変<400>え<400>ます<400> +• 
+•ユニ<400>コード<400>は<400>、•プラットフォーム<400>に<400>係<400>わら<400>ず<400>、•プログラム<400>に<400>係<400>わら<400>ず<400>、•言語<400>に<400>係<400>わら<400>ず<400>、•すべて<400>の<400>文字<400>に<400>独立<400>した<400>番号<400>を<400>与<400>え<400>ます<400>。•ユニ<400>コード<400>標準<400>は<400>、•アップル<400>、•ヒュー<400>レット<400>パッ<400>カード<400>、•IBM<200>、•ジャスト<400>システム<400>、•マイクロ<400>ソフト<400>、•オラクル<400>、•SAP<200>、•サン<400>、•サイ<400>ベース<400>など<400>の<400>産業<400>界<400>の<400>主導<400>的<400>企業<400>と<400>他の<400>多く<400>の<400>企業<400>に<400>採用<400>さ<400>れ<400>てい<400>ます<400>。•ユニ<400>コード<400>は<400>、•XML<200>、•Java<200>、•ECMAScript<200>(•JavaScript<200>)•、•LDAP<200>、•CORBA<200> •3.0<100>など<400>の<400>最先端<400>の<400>標準<400>の<400>前提<400>と<400>な<400>って<400>おり<400>、•ユニ<400>コード<400>を<400>実装<400>す<400>れ<400>ば<400>、•ISO<200>/•IEC<200> •10646<100>に<400>適合<400>する<400>ことに<400>なり<400>ます<400>。•ユニ<400>コード<400>は<400>、•多く<400>の<400>オペレーティングシステム<400>と<400>すべて<400>の<400>最新<400>の<400>ブラウザー<400>と<400>他の<400>多く<400>の<400>製品<400>で<400>サポート<400>さ<400>れ<400>てい<400>ます<400>。•ユニ<400>コード<400>標準<400>の<400>出現<400>と<400>ユニ<400>コード<400>を<400>サポート<400>する<400>ツール<400>類<400>は<400>、•昨今<400>顕著<400>に<400>な<400>って<400>いる<400>ソフトウエア<400>技術<400>の<400>グローバル<400>化<400>の<400>流れ<400>に対して<400>、•特に<400>役<400>に<400>立<400>って<400>い<400>ます<400>。• +• +•ユニ<400>コード<400>を<400>ク<400>ライアン<400>ト<400>サーバー<400>型<400>の<400>アプリケーション<400>や<400>、•多層<400>構造<400>を<400>持つ<400>アプリケーション<400>、•ウェブサイト<400>など<400>に<400>組み込む<400>こと<400>で<400>、•従来<400>の<400>文字<400>コードセット<400>を<400>用いる<400>より<400>も<400>明らか<400>な<400>コスト<400>削減<400>が<400>可能<400>です<400>。•ユニ<400>コード<400>は<400>、•単一<400>の<400>ソフトウエア<400>製品<400>、•単一<400>の<400>ウェブサイト<400>に<400>、•何ら<400>手<400>を<400>加える<400>こと<400>なく<400>、•複数<400>の<400>プラットフォーム<400>、•複数<400>の<400>言語<400>、•複数<400>の<400>国<400>を<400>カバー<400>する<400>こと<400>が<400>出来る<400>の<400>です<400>。•ユニ<400>コード<400>は<400>、•データ<400>が<400>多く<400>の<400>異なる<400>システム<400>の<400>間<400>を<400>、•何<400>の<400>乱れ<400>も<400>なし<400>に<400>転送<400>する<400>こと<400>を<400>可能<400>と<400>する<400>の<400>です<400>。• +• 
+•ユニ<400>コード<400>コンソーシアム<400>について<400> +• +•ユニ<400>コード<400>コンソーシアム<400>は<400>、•最新<400>の<400>ソフトウエア<400>製品<400>と<400>標準<400>において<400>テキスト<400>を<400>表現<400>する<400>こと<400>を<400>意味<400>する<400>“•ユニ<400>コード<400>標準<400>”•の<400>構築<400>、•発展<400>、•普及<400>、•利用<400>促進<400>を<400>目的<400>として<400>設立<400>さ<400>れ<400>た<400>非<400>営利<400>組織<400>です<400>。•同<400>コンソーシアム<400>の<400>会員<400>は<400>、•コンピューター<400>と<400>情報処理<400>に<400>係わる<400>広汎<400>な<400>企業<400>や<400>組織<400>から<400>構成<400>さ<400>れ<400>てい<400>ます<400>。•同<400>コンソーシアム<400>は<400>、•財政<400>的<400>に<400>は<400>、•純粋<400>に<400>会費<400>のみ<400>によって<400>運営<400>さ<400>れ<400>てい<400>ます<400>。•ユニ<400>コード<400>標準<400>を<400>支持<400>し<400>、•その<400>拡張<400>と<400>実装<400>を<400>支援<400>する<400>世界中<400>の<400>組織<400>や<400>個人<400>は<400>、•だれ<400>も<400>が<400>ユニ<400>コード<400>コンソーシアム<400>の<400>会員<400>なる<400>こと<400>が<400>でき<400>ます<400>。• +• +•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。• +• diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java index 99ed238fd8e..ddd4a56fc53 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java @@ -89,11 +89,18 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine { * For internal use only. * @internal */ - static class DequeI { + static class DequeI implements Cloneable { private int[] data = new int[50]; private int lastIdx = 4; // or base of stack. Index of element. private int firstIdx = 4; // or Top of Stack. Index of element + 1. 
+ @Override + public Object clone() throws CloneNotSupportedException { + DequeI result = (DequeI)super.clone(); + result.data = data.clone(); /* give the clone its own backing array; super.clone() only copies the reference */ + return result; + } + int size() { return firstIdx - lastIdx; } @@ -150,6 +157,15 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine { } return false; } + + int elementAt(int i) { + assert i < size(); + return data[lastIdx + i]; + } + + void removeAllElements() { + lastIdx = firstIdx = 4; + } } UnicodeSet fSet = new UnicodeSet(); @@ -173,8 +189,8 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine { @Override public int findBreaks(CharacterIterator text, int startPos, int endPos, - boolean reverse, int breakType, DequeI foundBreaks) { - int result = 0; + int breakType, DequeI foundBreaks) { + int result = 0; // Find the span of characters included in the set. // The span to break begins at the current position int the text, and @@ -185,24 +201,15 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine { int rangeStart; int rangeEnd; int c = CharacterIteration.current32(text); - if (reverse) { - boolean isDict = fSet.contains(c); - while ((current = text.getIndex()) > startPos && isDict) { - c = CharacterIteration.previous32(text); - isDict = fSet.contains(c); - } - rangeStart = (current < startPos) ? startPos : - current + (isDict ? 0 : 1); - rangeEnd = start + 1; - } else { - while ((current = text.getIndex()) < endPos && fSet.contains(c)) { - CharacterIteration.next32(text); - c = CharacterIteration.current32(text); - } - rangeStart = start; - rangeEnd = current; + while ((current = text.getIndex()) < endPos && fSet.contains(c)) { + CharacterIteration.next32(text); + c = CharacterIteration.current32(text); } + rangeStart = start; + rangeEnd = current; + // if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { + // TODO: Why does icu4c have this?
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); text.setIndex(current); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java index b24f113680c..649b28bd39d 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java @@ -11,7 +11,7 @@ package com.ibm.icu.text; import java.text.CharacterIterator; /** - * The LanguageBreakEngine interface is to be used to implement any + * The LanguageBreakEngine interface is to be used to implement any * language-specific logic for break iteration. */ interface LanguageBreakEngine { @@ -24,21 +24,20 @@ interface LanguageBreakEngine { boolean handles(int c, int breakType); /** - * Implements the actual breaking logic. - * @param text The text to break over - * @param startPos The index of the beginning of our range + * Implements the actual breaking logic. Find any breaks within a run in the supplied text. + * @param text The text to break over. The iterator is left at + * the end of the run of characters which the engine has handled. + * @param startPos The index of the beginning of the range * @param endPos The index of the possible end of our range. It is possible, - * however, that our range ends earlier - * @param reverse true iff we are iterating backwards (in a call to - * previous(), for example) + * however, that the range ends earlier * @param breakType The kind of break iterator that is wanting to make use * of this engine - character, word, line, sentence - * @param foundBreaks A Stack that the breaks found will be added to - * @return the number of words found + * @param foundBreaks A data structure to receive the break positions. 
+ * @return the number of breaks found */ int findBreaks(CharacterIterator text, int startPos, int endPos, - boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks); + int breakType, DictionaryBreakEngine.DequeI foundBreaks); } - - - + + + diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java index 7e330e60935..977ba6f579f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java @@ -249,6 +249,15 @@ final class RBBIDataWrapper { pos += This.fHeader.fSRTableLen; } + // Rule Compatibility Hacks + // If a rule set includes reverse rules but does not explicitly include safe reverse rules, + // the reverse rules are to be treated as safe reverse rules. + + if (This.fSRTable == null && This.fRTable != null) { + This.fSRTable = This.fRTable; + This.fRTable = null; + } + // // Unserialize the Character categories TRIE // Because we can't be absolutely certain where the Trie deserialize will @@ -370,7 +379,7 @@ final class RBBIDataWrapper { ///CLOVER:OFF /** Dump a state table. (A full set of RBBI rules has 4 state tables.) 
*/ private void dumpTable(java.io.PrintStream out, short table[]) { - if (table == null) { + if (table == null || table.length == 0) { out.println(" -- null -- "); } else { int n; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index 961a23c1f8b..6236a308c16 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -176,14 +176,18 @@ class RBBIRuleBuilder { int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader)); int forwardTableSize = align8(fForwardTables.getTableSize()); int reverseTableSize = align8(fReverseTables.getTableSize()); - int safeFwdTableSize = align8(fSafeFwdTables.getTableSize()); + // int safeFwdTableSize = align8(fSafeFwdTables.getTableSize()); int safeRevTableSize = align8(fSafeRevTables.getTableSize()); int trieSize = align8(fSetBuilder.getTrieSize()); int statusTableSize = align8(fRuleStatusVals.size() * 4); int rulesSize = align8((strippedRules.length()) * 2); - int totalSize = headerSize + forwardTableSize + reverseTableSize - + safeFwdTableSize + safeRevTableSize - + statusTableSize + trieSize + rulesSize; + + int totalSize = headerSize + + forwardTableSize + + /* reverseTableSize */ 0 + + /* safeFwdTableSize */ 0 + + (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize) + + statusTableSize + trieSize + rulesSize; int outputPos = 0; // Track stream position, starting from RBBIDataHeader. // @@ -199,18 +203,40 @@ class RBBIRuleBuilder { header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION; header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections. header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount. + + // Only save the forward table and the safe reverse table, + // because these are the only ones used at run-time. 
+ // + // For the moment, we still build the other tables if they are present in the rule source files, + // for backwards compatibility. Old rule files need to work, and this is the simplest approach. + // + // Additional backwards compatibility consideration: if no safe rules are provided, consider the + // reverse rules to actually be the safe reverse rules. + header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen + + // Do not save Reverse Table. header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable - header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen + header[RBBIDataWrapper.DH_RTABLELEN] = 0; // fRTableLen + + // Do not save the Safe Forward table. header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE] - + reverseTableSize; // fSTable - header[RBBIDataWrapper.DH_SFTABLELEN] = safeFwdTableSize; // fSTableLen + + 0; // fSTable + header[RBBIDataWrapper.DH_SFTABLELEN] = 0; // fSTableLen + + // Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse. 
header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE] - + safeFwdTableSize; // fSRTable - header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize; // fSRTableLen + + 0; // fSRTable + if (safeRevTableSize > 0) { + header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize; + } else { + assert reverseTableSize > 0; + header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize; + } + header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE] - + safeRevTableSize; // fTrie + + header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE] + header[RBBIDataWrapper.DH_TRIELEN]; @@ -232,22 +258,32 @@ class RBBIRuleBuilder { outputPos += 2; } + /* do not write the reverse table tableData = fReverseTables.exportTable(); Assert.assrt(outputPos == header[6]); for (i = 0; i < tableData.length; i++) { dos.writeShort(tableData[i]); outputPos += 2; } + */ + /* do not write safe forwards table Assert.assrt(outputPos == header[8]); tableData = fSafeFwdTables.exportTable(); for (i = 0; i < tableData.length; i++) { dos.writeShort(tableData[i]); outputPos += 2; } + */ + // Write the safe reverse table. 
+ // If not present, write the plain reverse table (old style rule compatibility) Assert.assrt(outputPos == header[10]); - tableData = fSafeRevTables.exportTable(); + if (safeRevTableSize > 0) { + tableData = fSafeRevTables.exportTable(); + } else { + tableData = fReverseTables.exportTable(); + } for (i = 0; i < tableData.length; i++) { dos.writeShort(tableData[i]); outputPos += 2; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java index e16927c967c..1fb61332399 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java @@ -79,7 +79,7 @@ class RBBIRuleScanner { UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during // the scanning of RBBI rules. The - // indicies for these are assigned by the + // Indices for these are assigned by the // perl script that builds the state tables. // See rbbirpt.h. @@ -89,7 +89,7 @@ class RBBIRuleScanner { // keyword, while being scanned. - + // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting. 
static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; static private String gRuleSet_digit_char_pattern = "[0-9]"; @@ -447,6 +447,10 @@ class RBBIRuleScanner { fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; } else if (opt.equals("lookAheadHardBreak")) { fRB.fLookAheadHardBreak = true; + } else if (opt.equals("quoted_literals_only")) { + fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear(); + } else if (opt.equals("unquoted_literals")) { + fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern); } else { error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); } @@ -957,7 +961,7 @@ class RBBIRuleScanner { } } - + // If there are no forward rules throw an error. // if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java index 88eb5bd1926..44352cb9674 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java @@ -281,11 +281,16 @@ class RBBISetBuilder { if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();} if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();} - fTrie = new Trie2Writable(0, // Initial value for all code points - 0); // Error value. + fTrie = new Trie2Writable(0, // Initial value for all code points. + 0); // Error value for out-of-range input. 
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { - fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar, rlRange.fNum, true); + fTrie.setRange( + rlRange.fStartChar, // Range start + rlRange.fEndChar, // Range end (inclusive) + rlRange.fNum, // value for range + true // Overwrite previously written values + ); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 7b8dce8b29b..2ddfffa04f0 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -23,7 +23,6 @@ import java.text.CharacterIterator; import java.util.ArrayList; import java.util.List; -import com.ibm.icu.impl.Assert; import com.ibm.icu.impl.CharacterIteration; import com.ibm.icu.impl.ICUBinary; import com.ibm.icu.impl.ICUDebug; @@ -47,7 +46,6 @@ public class RuleBasedBreakIterator extends BreakIterator { * private constructor */ private RuleBasedBreakIterator() { - fLastStatusIndexValid = true; fDictionaryCharCount = 0; synchronized(gAllBreakEngines) { fBreakEngines = new ArrayList(gAllBreakEngines); @@ -131,9 +129,9 @@ public class RuleBasedBreakIterator extends BreakIterator { * @stable ICU 2.0 */ @Override - public Object clone() - { - RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone(); + public Object clone() { + RuleBasedBreakIterator result; + result = (RuleBasedBreakIterator)super.clone(); if (fText != null) { result.fText = (CharacterIterator)(fText.clone()); } @@ -141,12 +139,12 @@ public class RuleBasedBreakIterator extends BreakIterator { result.fBreakEngines = new ArrayList(gAllBreakEngines); } result.fLookAheadMatches = new LookAheadResults(); - if (fCachedBreakPositions != null) { - result.fCachedBreakPositions = fCachedBreakPositions.clone(); - } + result.fBreakCache = result.new BreakCache(fBreakCache); + result.fDictionaryCache = 
result.new DictionaryCache(fDictionaryCache); return result; } + /** * Returns true if both BreakIterators are of the same class, have the same * rules, and iterate over the same text. @@ -172,10 +170,10 @@ public class RuleBasedBreakIterator extends BreakIterator { if (fText == null && other.fText == null) { return true; } - if (fText == null || other.fText == null) { + if (fText == null || other.fText == null || !fText.equals(other.fText)) { return false; } - return fText.equals(other.fText); + return fPosition == other.fPosition; } catch(ClassCastException e) { return false; @@ -228,17 +226,32 @@ public class RuleBasedBreakIterator extends BreakIterator { */ RBBIDataWrapper fRData; - /* + /** + * The iteration state - current position, rule status for the current position, + * and whether the iterator ran off the end, yielding UBRK_DONE. + * Current position is pinned to be 0 < position <= text.length. + * Current position is always set to a boundary. + * + * The current position of the iterator. Pinned, 0 < fPosition <= text.length. + * Never has the value UBRK_DONE (-1). + */ + private int fPosition; + + /** * Index of the Rule {tag} values for the most recent match. */ - private int fLastRuleStatusIndex; + private int fRuleStatusIndex; - /* - * Rule tag value valid flag. - * Some iterator operations don't intrinsically set the correct tag value. - * This flag lets us lazily compute the value if we are ever asked for it. + /** + * True when iteration has run off the end, and iterator functions should return UBRK_DONE. + */ + private boolean fDone; + + /** + * Cache of previously determined boundary positions. 
*/ - private boolean fLastStatusIndexValid; + private BreakCache fBreakCache = new BreakCache(); + /** * Counter for the number of characters encountered with the "dictionary" @@ -249,6 +262,8 @@ public class RuleBasedBreakIterator extends BreakIterator { */ private int fDictionaryCharCount; + private DictionaryCache fDictionaryCache = new DictionaryCache(); + /* * ICU debug argument name for RBBI */ @@ -261,10 +276,11 @@ public class RuleBasedBreakIterator extends BreakIterator { && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; /** - * What kind of break iterator this is. Set to KIND_LINE by default, - * since this produces sensible output. + * What kind of break iterator this is. + * Defaulting BreakType to word gives reasonable dictionary behavior for + * Break Iterators that are built from rules. */ - private int fBreakType = KIND_LINE; + private int fBreakType = KIND_WORD; /** * The "default" break engine - just skips over ranges of dictionary words, @@ -297,31 +313,6 @@ public class RuleBasedBreakIterator extends BreakIterator { */ private List fBreakEngines; - /** - * when a range of characters is divided up using the dictionary, the break - * positions that are discovered are stored here, preventing us from having - * to use either the dictionary or the state table again until the iterator - * leaves this range of text - */ - private int[] fCachedBreakPositions; - - /** - * if fCachedBreakPositions is not null, this indicates which item in the - * cache the current iteration position refers to - */ - private int fPositionInCache; - - /** - * Dumps caches and performs other actions associated with a complete change - * in text or iteration position. - */ - private void reset() { - fCachedBreakPositions = null; - // fNumCachedBreakPositions = 0; - fDictionaryCharCount = 0; - fPositionInCache = 0; - - } /** * Dump the contents of the state table and character classes for this break iterator. * For debugging only. 
@@ -367,16 +358,17 @@ public class RuleBasedBreakIterator extends BreakIterator { */ @Override public int first() { - fCachedBreakPositions = null; - fDictionaryCharCount = 0; - fPositionInCache = 0; - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; if (fText == null) { return BreakIterator.DONE; } fText.first(); - return fText.getIndex(); + int start = fText.getIndex(); + if (!fBreakCache.seek(start)) { + fBreakCache.populateNear(start); + } + fBreakCache.current(); + assert(fPosition == start); + return fPosition; } /** @@ -387,24 +379,16 @@ public class RuleBasedBreakIterator extends BreakIterator { */ @Override public int last() { - fCachedBreakPositions = null; - fDictionaryCharCount = 0; - fPositionInCache = 0; - if (fText == null) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; return BreakIterator.DONE; } - - // t.last() returns the offset of the last character, - // rather than the past-the-end offset - // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ... - // will work correctly. - fLastStatusIndexValid = false; - int pos = fText.getEndIndex(); - fText.setIndex(pos); - return pos; + int endPos = fText.getEndIndex(); + boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 
+ assert(endShouldBeBoundary); + if (fPosition != endPos) { + assert(fPosition == endPos); + } + return endPos; } /** @@ -419,14 +403,17 @@ public class RuleBasedBreakIterator extends BreakIterator { */ @Override public int next(int n) { - int result = current(); - while (n > 0) { - result = next(); - --n; - } - while (n < 0) { - result = previous(); - ++n; + int result = 0; + if (n > 0) { + for (; n > 0 && result != DONE; --n) { + result = next(); + } + } else if (n < 0) { + for (; n < 0 && result != DONE; ++n) { + result = previous(); + } + } else { + result = current(); } return result; } @@ -438,401 +425,44 @@ public class RuleBasedBreakIterator extends BreakIterator { */ @Override public int next() { - // if we have cached break positions and we're still in the range - // covered by them, just move one step forward in the cache - if (fCachedBreakPositions != null) { - if (fPositionInCache < fCachedBreakPositions.length - 1) { - ++fPositionInCache; - int pos = fCachedBreakPositions[fPositionInCache]; - fText.setIndex(pos); - return pos; - } - else { - reset(); - } - } - - int startPos = current(); - fDictionaryCharCount = 0; - int result = handleNext(fRData.fFTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(startPos, result, false); - } - return result; + fBreakCache.next(); + return fDone ? DONE : fPosition; } /** - * checkDictionary This function handles all processing of characters in - * the "dictionary" set. It will determine the appropriate - * course of action, and possibly set up a cache in the - * process. - */ - private int checkDictionary(int startPos, int endPos, boolean reverse) { - - // Reset the old break cache first. - reset(); - - // note: code segment below assumes that dictionary chars are in the - // startPos-endPos range - // value returned should be next character in sequence - if ((endPos - startPos) <= 1) { - return (reverse ? 
startPos : endPos); - } - - // Starting from the starting point, scan towards the proposed result, - // looking for the first dictionary character (which may be the one - // we're on, if we're starting in the middle of a range). - fText.setIndex(reverse ? endPos : startPos); - if (reverse) { - CharacterIteration.previous32(fText); - } - - int rangeStart = startPos; - int rangeEnd = endPos; - - int category; - int current; - DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI(); - int foundBreakCount = 0; - int c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.get(c); - - // Is the character we're starting on a dictionary character? If so, we - // need to back up to include the entire run; otherwise the results of - // the break algorithm will differ depending on where we start. Since - // the result is cached and there is typically a non-dictionary break - // within a small number of words, there should be little performance impact. - if ((category & 0x4000) != 0) { - if (reverse) { - do { - CharacterIteration.next32(fText); - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.get(c); - } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0); - - // Back up to the last dictionary character - rangeEnd = fText.getIndex(); - if (c == CharacterIteration.DONE32) { - // c = fText->last32(); - // TODO: why was this if needed? 
- c = CharacterIteration.previous32(fText); - } - else { - c = CharacterIteration.previous32(fText); - } - } - else { - do { - c = CharacterIteration.previous32(fText); - category = (short)fRData.fTrie.get(c); - } - while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0)); - // Back up to the last dictionary character - if (c == CharacterIteration.DONE32) { - // c = fText->first32(); - c = CharacterIteration.current32(fText); - } - else { - CharacterIteration.next32(fText); - c = CharacterIteration.current32(fText); - } - rangeStart = fText.getIndex(); - } - category = (short)fRData.fTrie.get(c); - } - - - // Loop through the text, looking for ranges of dictionary characters. - // For each span, find the appropriate break engine, and ask it to find - // any breaks within the span. - // Note: we always do this in the forward direction, so that the break - // cache is built in the right order. - if (reverse) { - fText.setIndex(rangeStart); - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.get(c); - } - LanguageBreakEngine lbe = null; - while(true) { - while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { - CharacterIteration.next32(fText); - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.get(c); - } - if (current >= rangeEnd) { - break; - } - - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - lbe = getLanguageBreakEngine(c); - - // Ask the language object if there are any breaks. It will leave the text - // pointer on the other side of its range, ready to search for the next one. 
- if (lbe != null) { - int startingIdx = fText.getIndex(); - foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks); - assert fText.getIndex() > startingIdx; - } - - // Reload the loop variables for the next go-round - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.get(c); - } - - // If we found breaks, build a new break cache. The first and last entries must - // be the original starting and ending position. - if (foundBreakCount > 0) { - if (foundBreakCount != breaks.size()) { - System.out.println("oops, foundBreakCount != breaks.size(). LBE = " + lbe.getClass()); - } - assert foundBreakCount == breaks.size(); - if (startPos < breaks.peekLast()) { - breaks.offer(startPos); - } - if (endPos > breaks.peek()) { - breaks.push(endPos); - } - - // TODO: get rid of this array, use results from the deque directly - fCachedBreakPositions = new int[breaks.size()]; - - int i = 0; - while (breaks.size() > 0) { - fCachedBreakPositions[i++] = breaks.pollLast(); - } - - // If there are breaks, then by definition, we are replacing the original - // proposed break by one of the breaks we found. Use following() and - // preceding() to do the work. They should never recurse in this case. - if (reverse) { - return preceding(endPos); - } - else { - return following(startPos); - } - } - - // If we get here, there were no language-based breaks. Set the text pointer - // to the original proposed break. - fText.setIndex(reverse ? startPos : endPos); - return (reverse ? startPos : endPos); - - } - - - /** - * Moves the iterator backwards, to the last boundary preceding this one. - * @return The position of the last boundary position preceding this one. + * Moves the iterator backwards, to the boundary preceding the current one. + * @return The position of the boundary position immediately preceding the starting position. 
* @stable ICU 2.0 */ @Override public int previous() { - int result; - int startPos; - - CharacterIterator text = getText(); - - fLastStatusIndexValid = false; - - // if we have cached break positions and we're still in the range - // covered by them, just move one step backward in the cache - if (fCachedBreakPositions != null) { - if (fPositionInCache > 0) { - --fPositionInCache; - // If we're at the beginning of the cache, need to reevaluate the - // rule status - if (fPositionInCache <= 0) { - fLastStatusIndexValid = false; - } - int pos = fCachedBreakPositions[fPositionInCache]; - text.setIndex(pos); - return pos; - } else { - reset(); - } - } - - // if we're already sitting at the beginning of the text, return DONE - startPos = current(); - if (fText == null || startPos == fText.getBeginIndex()) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; - return BreakIterator.DONE; - } - - // Rules with an exact reverse table are handled here. - if (fRData.fSRTable != null || fRData.fSFTable != null) { - result = handlePrevious(fRData.fRTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(result, startPos, true); - } - return result; - } - - // old rule syntax - // set things up. handlePrevious() will back us up to some valid - // break position before the current position (we back our internal - // iterator up one step to prevent handlePrevious() from returning - // the current position), but not necessarily the last one before - // where we started - - int start = current(); - - previous32(fText); - int lastResult = handlePrevious(fRData.fRTable); - if (lastResult == BreakIterator.DONE) { - lastResult = fText.getBeginIndex(); - fText.setIndex(lastResult); - } - result = lastResult; - int lastTag = 0; - boolean breakTagValid = false; - - // iterate forward from the known break position until we pass our - // starting point. 
The last break position before the starting - // point is our return value - - for (;;) { - result = next(); - if (result == BreakIterator.DONE || result >= start) { - break; - } - lastResult = result; - lastTag = fLastRuleStatusIndex; - breakTagValid = true; - } - - // fLastBreakTag wants to have the value for section of text preceding - // the result position that we are to return (in lastResult.) If - // the backwards rules overshot and the above loop had to do two or more - // handleNext()s to move up to the desired return position, we will have a valid - // tag value. But, if handlePrevious() took us to exactly the correct result position, - // we wont have a tag value for that position, which is only set by handleNext(). - - // Set the current iteration position to be the last break position - // before where we started, and then return that value. - fText.setIndex(lastResult); - fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() - fLastStatusIndexValid = breakTagValid; - return lastResult; + fBreakCache.previous(); + return fDone ? DONE : fPosition; } /** * Sets the iterator to refer to the first boundary position following * the specified position. - * @param offset The position from which to begin searching for a break position. + * @param startPos The position from which to begin searching for a break position. * @return The position of the first break after the current position. * @stable ICU 2.0 */ @Override - public int following(int offset) { - CharacterIterator text = getText(); - - // if we have no cached break positions, or if "offset" is outside the - // range covered by the cache, then dump the cache and call our - // inherited following() method. This will call other methods in this - // class that may refresh the cache. 
- if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] || - offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) { - fCachedBreakPositions = null; - return rulesFollowing(offset); - } - - // on the other hand, if "offset" is within the range covered by the - // cache, then just search the cache for the first break position - // after "offset" - else { - fPositionInCache = 0; - while (fPositionInCache < fCachedBreakPositions.length - && offset >= fCachedBreakPositions[fPositionInCache]) - ++fPositionInCache; - text.setIndex(fCachedBreakPositions[fPositionInCache]); - return text.getIndex(); - } - } - - private int rulesFollowing(int offset) { - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the + public int following(int startPos) { + // if the supplied position is before the beginning, return the // text's starting offset - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; - if (fText == null || offset >= fText.getEndIndex()) { - last(); - return next(); - } - else if (offset < fText.getBeginIndex()) { + if (startPos < fText.getBeginIndex()) { return first(); } - // otherwise, set our internal iteration position (temporarily) - // to the position passed in. If this is the _beginning_ position, - // then we can just use next() to get our return value - - int result = 0; - - if (fRData.fSRTable != null) { - // Safe Point Reverse rules exist. - // This allows us to use the optimum algorithm. - fText.setIndex(offset); - // move forward one codepoint to prepare for moving back to a - // safe point. 
- // this handles offset being between a supplementary character - next32(fText); - // handlePrevious will move most of the time to < 1 boundary away - handlePrevious(fRData.fSRTable); - result = next(); - while (result <= offset) { - result = next(); - } - return result; - } - if (fRData.fSFTable != null) { - // No Safe point reverse table, but there is a safe pt forward table. - // - fText.setIndex(offset); - previous32(fText); - // handle next will give result >= offset - handleNext(fRData.fSFTable); - // previous will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int oldresult = previous(); - while (oldresult > offset) { - result = previous(); - if (result <= offset) { - return oldresult; - } - oldresult = result; - } - result = next(); - if (result <= offset) { - return next(); - } - return result; - } - // otherwise, we have to sync up first. Use handlePrevious() to back - // us up to a known break position before the specified position (if - // we can determine that the specified position is a break position, - // we don't back up at all). This may or may not be the last break - // position at or before our starting position. Advance forward - // from here until we've passed the starting position. The position - // we stop on will be the first break position after the specified one. - // old rule syntax - - fText.setIndex(offset); - if (offset == fText.getBeginIndex()) { - return next(); - } - result = previous(); + // Move requested offset to a code point start. It might be on a trail surrogate. + // Or it may be beyond the end of the text. + startPos = CISetIndex32(fText, startPos); + fBreakCache.following(startPos); + return fDone ? DONE : fPosition; + } - while (result != BreakIterator.DONE && result <= offset) { - result = next(); - } - return result; - } /** * Sets the iterator to refer to the last boundary position before the * specified position. 
@@ -842,95 +472,21 @@ public class RuleBasedBreakIterator extends BreakIterator { */ @Override public int preceding(int offset) { - CharacterIterator text = getText(); - - // if we have no cached break positions, or "offset" is outside the - // range covered by the cache, we can just call the inherited routine - // (which will eventually call other routines in this class that may - // refresh the cache) - if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] || - offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) { - fCachedBreakPositions = null; - return rulesPreceding(offset); - } - - // on the other hand, if "offset" is within the range covered by the cache, - // then all we have to do is search the cache for the last break position - // before "offset" - else { - fPositionInCache = 0; - while (fPositionInCache < fCachedBreakPositions.length - && offset > fCachedBreakPositions[fPositionInCache]) - ++fPositionInCache; - --fPositionInCache; - text.setIndex(fCachedBreakPositions[fPositionInCache]); - return text.getIndex(); - } - } - - private int rulesPreceding(int offset) { - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - - // text's starting offset if (fText == null || offset > fText.getEndIndex()) { - // return BreakIterator::DONE; return last(); - } - else if (offset < fText.getBeginIndex()) { + } else if (offset < fText.getBeginIndex()) { return first(); } - // if we start by updating the current iteration position to the - // position specified by the caller, we can just use previous() - // to carry out this operation - - int result; - if (fRData.fSFTable != null) { - /// todo synwee - // new rule syntax - fText.setIndex(offset); - // move backwards one codepoint to prepare for moving forwards to a - // safe point. 
- // this handles offset being between a supplementary character - previous32(fText); - handleNext(fRData.fSFTable); - result = previous(); - while (result >= offset) { - result = previous(); - } - return result; - } - if (fRData.fSRTable != null) { - // backup plan if forward safe table is not available - fText.setIndex(offset); - next32(fText); - // handle previous will give result <= offset - handlePrevious(fRData.fSRTable); - - // next will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int oldresult = next(); - while (oldresult < offset) { - result = next(); - if (result >= offset) { - return oldresult; - } - oldresult = result; - } - result = previous(); - if (result >= offset) { - return previous(); - } - return result; - } + // Move requested offset to a code point start. It might be on a trail surrogate. + // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. + int adjustedOffset = offset; + fBreakCache.preceding(adjustedOffset); + return fDone ? DONE : fPosition; - // old rule syntax - fText.setIndex(offset); - return previous(); } + /** * Throw IllegalArgumentException unless begin <= offset < end. * @stable ICU 2.0 @@ -952,65 +508,42 @@ public class RuleBasedBreakIterator extends BreakIterator { */ @Override public boolean isBoundary(int offset) { + // TODO: behavior difference with ICU4C, which considers out-of-range offsets + // to not be boundaries, and to not be errors. checkOffset(offset, fText); - // the beginning index of the iterator is always a boundary position by definition - if (offset == fText.getBeginIndex()) { - first(); // For side effects on current position, tag values. - return true; - } + // Adjust offset to be on a code point boundary and not beyond the end of the text. + // Note that isBoundary() is always be false for offsets that are not on code point boundaries. + // But we still need the side effect of leaving iteration at the following boundary. 
+ int adjustedOffset = CISetIndex32(fText, offset); - if (offset == fText.getEndIndex()) { - last(); // For side effects on current position, tag values. - return true; + boolean result = false; + if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { + result = (fBreakCache.current() == offset); } - // otherwise, we can use following() on the position before the specified - // one and return true if the position we get back is the one the user - // specified - - // return following(offset - 1) == offset; - // TODO: check whether it is safe to revert to the simpler offset-1 code - // The safe rules may take care of unpaired surrogates ok. - fText.setIndex(offset); - previous32(fText); - int pos = fText.getIndex(); - boolean result = following(pos) == offset; + if (!result) { + // Not on a boundary. isBoundary() must leave iterator on the following boundary. + // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. + next(); + } return result; + } /** - * Returns the current iteration position. + * Returns the current iteration position. Note that UBRK_DONE is never + * returned from this function; if iteration has run to the end of a + * string, current() will return the length of the string while + * next() will return BreakIterator.DONE). * @return The current iteration position. * @stable ICU 2.0 */ @Override public int current() { - return (fText != null) ? fText.getIndex() : BreakIterator.DONE; + return (fText != null) ? fPosition : BreakIterator.DONE; } - private void makeRuleStatusValid() { - if (fLastStatusIndexValid == false) { - // No cached status is available. - int curr = current(); - if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) { - // At start of text, or there is no text. Status is always zero. - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; - } else { - // Not at start of text. Find status the tedious way. 
- int pa = fText.getIndex(); - first(); - int pb = current(); - while (fText.getIndex() < pa) { - pb = next(); - } - Assert.assrt(pa == pb); - } - Assert.assrt(fLastStatusIndexValid == true); - Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length); - } - } /** * Return the status tag from the break rule that determined the most recently @@ -1019,7 +552,7 @@ public class RuleBasedBreakIterator extends BreakIterator { * status, a default value of 0 is returned. If more than one rule applies, * the numerically largest of the possible status values is returned. *

- * Of the standard types of ICU break iterators, only the word break + * Of the standard types of ICU break iterators, only the word and line break * iterator provides status values. The values are defined in * class RuleBasedBreakIterator, and allow distinguishing between words * that contain alphabetic letters, "words" that appear to be numbers, @@ -1031,13 +564,11 @@ public class RuleBasedBreakIterator extends BreakIterator { * @return the status from the break rule that determined the most recently * returned break position. * - * @draft ICU 3.0 (retain) - * @provisional This is a draft API and might change in a future release of ICU. + * @stable ICU 60 */ @Override public int getRuleStatus() { - makeRuleStatusValid(); // Status records have this form: // Count N <-- fLastRuleStatusIndex points here. // Status val 0 @@ -1046,7 +577,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // Status val N-1 <-- the value we need to return // The status values are sorted in ascending order. // This function returns the last (largest) of the array of status values. - int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; + int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; int tagVal = fRData.fStatusTable[idx]; return tagVal; } @@ -1070,17 +601,15 @@ public class RuleBasedBreakIterator extends BreakIterator { * In the event that the array is too small, the return value * is the total number of status values that were available, * not the reduced number that were actually returned. - * @draft ICU 3.0 (retain) - * @provisional This is a draft API and might change in a future release of ICU. 
+ * @stable ICU 60 */ @Override public int getRuleStatusVec(int[] fillInArray) { - makeRuleStatusValid(); - int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; + int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; if (fillInArray != null) { int numToCopy = Math.min(numStatusVals, fillInArray.length); for (int i=0; i 0, the segment will be further subdivided + * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. + * * @return the new iterator position * * A note on supplementary characters and the position of underlying @@ -1274,29 +815,34 @@ public class RuleBasedBreakIterator extends BreakIterator { * This is different from everywhere else, where an iterator always * points at the lead surrogate of a supplementary. */ - private int handleNext(short stateTable[]) { + private int handleNext() { if (TRACE) { System.out.println("Handle Next pos char state category"); } - // No matter what, handleNext alway correctly sets the break tag value. - fLastStatusIndexValid = true; - fLastRuleStatusIndex = 0; + // handleNext always sets the break tag value. + // Set the default for it. + fRuleStatusIndex = 0; + fDictionaryCharCount = 0; // caches for quicker access CharacterIterator text = fText; Trie2 trie = fRData.fTrie; + short[] stateTable = fRData.fFTable; + int initialPosition = fPosition; + text.setIndex(initialPosition); + int result = initialPosition; + // Set up the starting char - int c = text.current(); + int c = text.current(); if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { c = nextTrail32(text, c); if (c == DONE32) { + fDone = true; return BreakIterator.DONE; } } - int initialPosition = text.getIndex(); - int result = initialPosition; // Set the initial state for the state machine int state = START_STATE; @@ -1383,7 +929,7 @@ public class RuleBasedBreakIterator extends BreakIterator { } // Remember the break status (tag) values. 
- fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; + fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; } int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; @@ -1391,8 +937,8 @@ public class RuleBasedBreakIterator extends BreakIterator { // Lookahead match is completed int lookaheadResult = fLookAheadMatches.getPosition(completedRule); if (lookaheadResult >= 0) { - fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; - text.setIndex(lookaheadResult); + fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; + fPosition = lookaheadResult; return lookaheadResult; } } @@ -1425,21 +971,32 @@ public class RuleBasedBreakIterator extends BreakIterator { text.setIndex(initialPosition); next32(text); result = text.getIndex(); + fRuleStatusIndex = 0; } - else { - // Leave the iterator at our result position. - // (we may have advanced beyond the last accepting position chasing after - // longer matches that never completed.) - text.setIndex(result); - } + + // Leave the iterator at our result position. + // (we may have advanced beyond the last accepting position chasing after + // longer matches that never completed.) + fPosition = result; + if (TRACE) { System.out.println("result = " + result); } return result; } - private int handlePrevious(short stateTable[]) { - if (fText == null || stateTable == null) { + /** + * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. + * This locates a "Safe Position" from which the forward break rules + * will operate correctly. A Safe Position is not necessarily a boundary itself. + * + * The logic of this function is very similar to handleNext(), above. + * + * @param fromPosition the position in the input text to begin the iteration. 
+ * @internal + */ + private int handlePrevious(int fromPosition) { + if (fText == null) { return 0; } @@ -1449,18 +1006,15 @@ public class RuleBasedBreakIterator extends BreakIterator { int row; int c; int result = 0; - int initialPosition = 0; + int initialPosition = fromPosition; fLookAheadMatches.reset(); - - // handlePrevious() never gets the rule status. - // Flag the status as invalid; if the user ever asks for status, we will need - // to back up, then re-find the break position using handleNext(), which does - // get the status value. - fLastStatusIndexValid = false; - fLastRuleStatusIndex = 0; + short[] stateTable = fRData.fSRTable; + CISetIndex32(fText, fromPosition); + if (fromPosition == fText.getBeginIndex()) { + return BreakIterator.DONE; + } // set up the starting char - initialPosition = fText.getIndex(); result = initialPosition; c = previous32(fText); @@ -1486,12 +1040,6 @@ public class RuleBasedBreakIterator extends BreakIterator { if (mode == RBBI_END) { // We have already done the {eof} iteration. Now is the time // to unconditionally bail out. - if (result == initialPosition) { - // Ran off start, no match found. - // Move one position (towards the start, since we are doing previous.) - fText.setIndex(initialPosition); - previous32(fText); - } break mainLoop; } mode = RBBI_END; @@ -1502,21 +1050,11 @@ public class RuleBasedBreakIterator extends BreakIterator { // look up the current character's category, which tells us // which column in the state table to look at. // + // And off the dictionary flag bit. For reverse iteration it is not used. category = (short) fRData.fTrie.get(c); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators (subclasses). - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. 
- category &= ~0x4000; - } + category &= ~0x4000; } - if (TRACE) { System.out.print(" " + fText.getIndex() + " "); if (0x20 <= c && c < 0x7f) { @@ -1575,21 +1113,774 @@ public class RuleBasedBreakIterator extends BreakIterator { // The state machine is done. Check whether it found a match... // - // If the iterator failed to advance in the match engine, force it ahead by one. + // If the iterator failed to move in the match engine, force it back by one code point. // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { - result = fText.setIndex(initialPosition); + CISetIndex32(fText, initialPosition); previous32(fText); result = fText.getIndex(); } - fText.setIndex(result); if (TRACE) { System.out.println("Result = " + result); } return result; } + + /** + * Set the index of a CharacterIterator. + * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. + * If the index points to a trail surrogate of a supplementary character, adjust it + * to the start (lead surrogate) index. + * + * @param ci A CharacterIterator to set + * @param index the index to set + * @return the resulting index, possibly pinned or adjusted. + */ + private static int CISetIndex32(CharacterIterator ci, int index) { + if (index <= ci.getBeginIndex()) { + ci.first(); + } else if (index >= ci.getEndIndex()) { + ci.setIndex(ci.getEndIndex()); + } else if (Character.isLowSurrogate(ci.setIndex(index))) { + if (!Character.isHighSurrogate(ci.previous())) { + ci.next(); + } + } + return ci.getIndex(); + } + + /* DictionaryCache stores the boundaries obtained from a run of dictionary characters. + * Dictionary boundaries are moved first to this cache, then from here + * to the main BreakCache, where they may inter-leave with non-dictionary + * boundaries. The public BreakIterator API always fetches directly + * from the main BreakCache, not from here. 
+ * + * In common situations, the number of boundaries in a single dictionary run + * should be quite small, it will be terminated by punctuation, spaces, + * or any other non-dictionary characters. The main BreakCache may end + * up with boundaries from multiple dictionary based runs. + * + * The boundaries are stored in a simple ArrayList (vector), with the + * assumption that they will be accessed sequentially. + */ + class DictionaryCache { + + void reset() { + fPositionInCache = -1; + fStart = 0; + fLimit = 0; + fFirstRuleStatusIndex = 0; + fOtherRuleStatusIndex = 0; + fBreaks.removeAllElements(); + }; + + boolean following(int fromPos) { + if (fromPos >= fLimit || fromPos < fStart) { + fPositionInCache = -1; + return false; + } + + // Sequential iteration, move from previous boundary to the following + + int r = 0; + if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { + ++fPositionInCache; + if (fPositionInCache >= fBreaks.size()) { + fPositionInCache = -1; + return false; + } + r = fBreaks.elementAt(fPositionInCache); + assert(r > fromPos); + fBoundary = r; + fStatusIndex = fOtherRuleStatusIndex; + return true; + } + + // Random indexing. Linear search for the boundary following the given position. + + for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { + r= fBreaks.elementAt(fPositionInCache); + if (r > fromPos) { + fBoundary = r; + fStatusIndex = fOtherRuleStatusIndex; + return true; + } + } + + // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 
+ assert(false); + fPositionInCache = -1; + return false; + }; + + boolean preceding(int fromPos) { + if (fromPos <= fStart || fromPos > fLimit) { + fPositionInCache = -1; + return false; + } + + if (fromPos == fLimit) { + fPositionInCache = fBreaks.size() - 1; + if (fPositionInCache >= 0) { + assert(fBreaks.elementAt(fPositionInCache) == fromPos); + } + } + + int r; + if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { + --fPositionInCache; + r = fBreaks.elementAt(fPositionInCache); + assert(r < fromPos); + fBoundary = r; + fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return true; + } + + if (fPositionInCache == 0) { + fPositionInCache = -1; + return false; + } + + for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { + r = fBreaks.elementAt(fPositionInCache); + if (r < fromPos) { + fBoundary = r; + fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return true; + } + } + assert(false); + fPositionInCache = -1; + return false; + }; + + /** + * Populate the cache with the dictionary based boundaries within a region of text. + * @param startPos The start position of a range of text + * @param endPos The end position of a range of text + * @param firstRuleStatus The rule status index that applies to the break at startPos + * @param otherRuleStatus The rule status index that applies to boundaries other than startPos + * @internal + */ + void populateDictionary(int startPos, int endPos, + int firstRuleStatus, int otherRuleStatus) { + if ((endPos - startPos) <= 1) { + return; + } + + reset(); + fFirstRuleStatusIndex = firstRuleStatus; + fOtherRuleStatusIndex = otherRuleStatus; + + int rangeStart = startPos; + int rangeEnd = endPos; + + int category; + int current; + int foundBreakCount = 0; + + // Loop through the text, looking for ranges of dictionary characters. 
+ // For each span, find the appropriate break engine, and ask it to find + // any breaks within the span. + + fText.setIndex(rangeStart); + int c = CharacterIteration.current32(fText); + category = (short)fRData.fTrie.get(c); + + while(true) { + while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { + c = CharacterIteration.next32(fText); // pre-increment + category = (short)fRData.fTrie.get(c); + } + if (current >= rangeEnd) { + break; + } + + // We now have a dictionary character. Get the appropriate language object + // to deal with it. + LanguageBreakEngine lbe = getLanguageBreakEngine(c); + + // Ask the language object if there are any breaks. It will add them to the cache and + // leave the text pointer on the other side of its range, ready to search for the next one. + if (lbe != null) { + foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks); + } + + // Reload the loop variables for the next go-round + c = CharacterIteration.current32(fText); + category = (short)fRData.fTrie.get(c); + } + + // If we found breaks, ensure that the first and last entries are + // the original starting and ending position. And initialize the + // cache iteration position to the first entry. + + // System.out.printf("foundBreakCount = %d\n", foundBreakCount); + if (foundBreakCount > 0) { + assert(foundBreakCount == fBreaks.size()); + if (startPos < fBreaks.elementAt(0)) { + // The dictionary did not place a boundary at the start of the segment of text. + // Add one now. This should not commonly happen, but it would be easy for interactions + // of the rules for dictionary segments and the break engine implementations to + // inadvertently cause it. Cover it here, just in case. + fBreaks.offer(startPos); + } + if (endPos > fBreaks.peek()) { + fBreaks.push(endPos); + } + fPositionInCache = 0; + // Note: Dictionary matching may extend beyond the original limit. 
+ fStart = fBreaks.elementAt(0); + fLimit = fBreaks.peek(); + } else { + // there were no language-based breaks, even though the segment contained + // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache + // for this range will fail, and the calling code will fall back to the rule based boundaries. + } + + }; + + + DictionaryCache() { + fPositionInCache = -1; + fBreaks = new DictionaryBreakEngine.DequeI(); + } + + /** + * copy constructor. Used by RuleBasedBreakIterator.clone(). + * + * @param src the source object to be copied. + */ + DictionaryCache(DictionaryCache src) { + try { + fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); + } + catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + fPositionInCache = src.fPositionInCache; + fStart = src.fStart; + fLimit = src.fLimit; + fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; + fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; + fBoundary = src.fBoundary; + fStatusIndex = src.fStatusIndex; + } + + // A data structure containing the boundaries themselves. Essentially a vector of raw ints. + DictionaryBreakEngine.DequeI fBreaks; + int fPositionInCache; // Index in fBreaks of last boundary returned by following() + // // or preceding(). Optimizes sequential access. + int fStart; // Text position of first boundary in cache. + int fLimit; // Last boundary in cache. Which is the limit of the + // // text segment being handled by the dictionary. + int fFirstRuleStatusIndex; // Rule status info for first boundary. + int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. + int fBoundary; // Current boundary. Set by preceding(), following(). + int fStatusIndex; // Current rule status index. Set by preceding, following(). + }; + + + + +/* + * class BreakCache + * + * Cache of break boundary positions and rule status values. 
+ * Break iterator API functions, next(), previous(), etc., will use cached results + * when possible, and otherwise cache new results as they are obtained. + * + * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. + * + * The cache is implemented as a single circular buffer. + */ + +/* + * size of the circular cache buffer. + */ + +class BreakCache { + + BreakCache() { + reset(); + }; + + void reset(int pos, int ruleStatus) { + fStartBufIdx = 0; + fEndBufIdx = 0; + fTextIdx = pos; + fBufIdx = 0; + fBoundaries[0] = pos; + fStatuses[0] = (short)ruleStatus; + } + + void reset() {reset(0, 0); }; + + void next() { + if (fBufIdx == fEndBufIdx) { + fDone = !populateFollowing(); + fPosition = fTextIdx; + fRuleStatusIndex = fStatuses[fBufIdx]; + } else { + fBufIdx = modChunkSize(fBufIdx + 1); + fTextIdx = fPosition = fBoundaries[fBufIdx]; + fRuleStatusIndex = fStatuses[fBufIdx]; + } + }; + + void previous() { + int initialBufIdx = fBufIdx; + if (fBufIdx == fStartBufIdx) { + // At start of cache. Prepend to it. + populatePreceding(); + } else { + // Cache already holds the next boundary + fBufIdx = modChunkSize(fBufIdx - 1); + fTextIdx = fBoundaries[fBufIdx]; + } + fDone = (fBufIdx == initialBufIdx); + fPosition = fTextIdx; + fRuleStatusIndex = fStatuses[fBufIdx]; + return; + }; + + // Move the iteration state to the position following the startPosition. + // Input position must be pinned to the input length. + void following(int startPos) { + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { + // startPos is in the cache. Do a next() from that position. + // TODO: an awkward set of interactions with bi->fDone + // seek() does not clear it; it can't because of interactions with populateNear(). + // next() does not clear it in the fast-path case, where everything matters. Maybe it should. + // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 
+ fDone = false; + next(); + } + + }; + + void preceding(int startPos) { + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { + if (startPos == fTextIdx) { + previous(); + } else { + // seek() leaves the BreakCache positioned at the preceding boundary + // if the requested position is between two bounaries. + // current() pushes the BreakCache position out to the BreakIterator itself. + assert(startPos > fTextIdx); + current(); + } + } + return; + }; + + /* + * Update the state of the public BreakIterator (fBI) to reflect the + * current state of the break iterator cache (this). + */ + int current() { + fPosition = fTextIdx; + fRuleStatusIndex = fStatuses[fBufIdx]; + fDone = false; + return fTextIdx; + }; + + /** + * Add boundaries to the cache near the specified position. + * The given position need not be a boundary itself. + * The input position must be within the range of the text, and + * on a code point boundary. + * If the requested position is a break boundary, leave the iteration + * position on it. + * If the requested position is not a boundary, leave the iteration + * position on the preceding boundary and include both the the + * preceding and following boundaries in the cache. + * Additional boundaries, either preceding or following, may be added + * to the cache as a side effect. + * + * Return false if the operation failed. + */ + boolean populateNear(int position) { + assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); + + // Find a boundary somewhere in the vicinity of the requested position. + // Depending on the safe rules and the text data, it could be either before, at, or after + // the requested position. + + + // If the requested position is not near already cached positions, clear the existing cache, + // find a near-by boundary and begin new cache contents there. 
+ + if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { + int aBoundary = fText.getBeginIndex(); + int ruleStatusIndex = 0; + // TODO: check for position == length of text. Although may still need to back up to get rule status. + if (position > aBoundary + 20) { + int backupPos = handlePrevious(position); + fPosition = backupPos; + aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary. + ruleStatusIndex = fRuleStatusIndex; + } + reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. + } + + // Fill in boundaries between existing cache content and the new requested position. + + if (fBoundaries[fEndBufIdx] < position) { + // The last position in the cache precedes the requested position. + // Add following position(s) to the cache. + while (fBoundaries[fEndBufIdx] < position) { + if (!populateFollowing()) { + assert false; + return false; + } + } + fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. + fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. + while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. + previous(); + } + return true; + } + + if (fBoundaries[fStartBufIdx] > position) { + // The first position in the cache is beyond the requested position. + // back up more until we get a boundary <= the requested position. + while (fBoundaries[fStartBufIdx] > position) { + populatePreceding(); + } + fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. + fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. + while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. + next(); + } + if (fTextIdx > position) { + // If position is not itself a boundary, the next() loop above will overshoot. 
+ // Back up one, leaving cache position at the boundary preceding the requested position. + previous(); + } + return true; + } + + assert fTextIdx == position; + return true; + + }; + + /** + * Add boundary(s) to the cache following the current last boundary. + * Return false if at the end of the text, and no more boundaries can be added. + * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. + */ + boolean populateFollowing() { + int fromPosition = fBoundaries[fEndBufIdx]; + int fromRuleStatusIdx = fStatuses[fEndBufIdx]; + int pos = 0; + int ruleStatusIdx = 0; + + if (fDictionaryCache.following(fromPosition)) { + addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); + return true; + } + + fPosition = fromPosition; + pos = handleNext(); + if (pos == BreakIterator.DONE) { + return false; + } + + ruleStatusIdx = fRuleStatusIndex; + if (fDictionaryCharCount > 0) { + // The text segment obtained from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); + if (fDictionaryCache.following(fromPosition)) { + addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); + return true; + // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. + // But be careful with interactions with populateNear(). + } + } + + // Rule based segment did not include dictionary characters. + // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, + // meaning that we didn't take the return, above. + // Add its end point to the cache. + addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + + // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 
+ // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. + // + for (int count=0; count<6; ++count) { + pos = handleNext(); + if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { + break; + } + addFollowing(pos, fRuleStatusIndex, RetainCachePosition); + } + return true; + }; + + /** + * Add one or more boundaries to the cache preceding the first currently cached boundary. + * Leave the iteration position on the first added boundary. + * Return false if no boundaries could be added (if at the start of the text.) + */ + boolean populatePreceding() { + int textBegin = fText.getBeginIndex(); + int fromPosition = fBoundaries[fStartBufIdx]; + if (fromPosition == textBegin) { + return false; + } + + int position = textBegin; + int positionStatusIdx = 0; + + if (fDictionaryCache.preceding(fromPosition)) { + addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); + return true; + } + + int backupPosition = fromPosition; + + // Find a boundary somewhere preceding the first already-cached boundary + do { + backupPosition = backupPosition - 30; + if (backupPosition <= textBegin) { + backupPosition = textBegin; + } else { + backupPosition = handlePrevious(backupPosition); + } + if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { + position = textBegin; + positionStatusIdx = 0; + } else { + fPosition = backupPosition; // TODO: pass starting position in a clearer way. + position = handleNext(); + positionStatusIdx = fRuleStatusIndex; + + } + } while (position >= fromPosition); + + // Find boundaries between the one we just located and the first already-cached boundary + // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. 
+ + fSideBuffer.removeAllElements(); + fSideBuffer.push(position); + fSideBuffer.push(positionStatusIdx); + + do { + int prevPosition = fPosition = position; + int prevStatusIdx = positionStatusIdx; + position = handleNext(); + positionStatusIdx = fRuleStatusIndex; + if (position == BreakIterator.DONE) { + break; + } + + boolean segmentHandledByDictionary = false; + if (fDictionaryCharCount != 0) { + // Segment from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + int dictSegEndPosition = position; + fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); + while (fDictionaryCache.following(prevPosition)) { + position = fDictionaryCache.fBoundary; + positionStatusIdx = fDictionaryCache.fStatusIndex; + segmentHandledByDictionary = true; + assert(position > prevPosition); + if (position >= fromPosition) { + break; + } + assert(position <= dictSegEndPosition); + fSideBuffer.push(position); + fSideBuffer.push(positionStatusIdx); + prevPosition = position; + } + assert(position==dictSegEndPosition || position>=fromPosition); + } + + if (!segmentHandledByDictionary && position < fromPosition) { + fSideBuffer.push(position); + fSideBuffer.push(positionStatusIdx); + } + } while (position < fromPosition); + + // Move boundaries from the side buffer to the main circular buffer. + boolean success = false; + if (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.pop(); + position = fSideBuffer.pop(); + addPreceding(position, positionStatusIdx, UpdateCachePosition); + success = true; + } + + while (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.pop(); + position = fSideBuffer.pop(); + if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { + // No space in circular buffer to hold a new preceding result while + // also retaining the current cache (iteration) position. 
+ // Bailing out is safe; the cache will refill again if needed. + break; + } + } + return success; + }; + + + static final boolean RetainCachePosition = false; + static final boolean UpdateCachePosition = true; + + /* + * Add the boundary following the current position. + * The current position can be left as it was, or changed to the newly added boundary, + * as specified by the update parameter. + */ + void addFollowing(int position, int ruleStatusIdx, boolean update) { + assert(position > fBoundaries[fEndBufIdx]); + assert(ruleStatusIdx <= Short.MAX_VALUE); + int nextIdx = modChunkSize(fEndBufIdx + 1); + if (nextIdx == fStartBufIdx) { + fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = (short)ruleStatusIdx; + fEndBufIdx = nextIdx; + if (update == UpdateCachePosition) { + // Set current position to the newly added boundary. + fBufIdx = nextIdx; + fTextIdx = position; + } else { + // Retaining the original cache position. + // Check if the added boundary wraps around the buffer, and would over-write the original position. + // It's the responsibility of callers of this function to not add too many. + assert(nextIdx != fBufIdx); + } + + }; + + + /* + * Add the boundary preceding the current position. + * The current position can be left as it was, or changed to the newly added boundary, + * as specified by the update parameter. + */ + boolean addPreceding(int position, int ruleStatusIdx, boolean update) { + assert(position < fBoundaries[fStartBufIdx]); + assert(ruleStatusIdx <= Short.MAX_VALUE); + int nextIdx = modChunkSize(fStartBufIdx - 1); + if (nextIdx == fEndBufIdx) { + if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { + // Failure. The insertion of the new boundary would claim the buffer position that is the + // current iteration position. And we also want to retain the current iteration position. 
+ // (The buffer is already completely full of entries that precede the iteration position.) + return false; + } + fEndBufIdx = modChunkSize(fEndBufIdx - 1); + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = (short)ruleStatusIdx; + fStartBufIdx = nextIdx; + if (update == UpdateCachePosition) { + fBufIdx = nextIdx; + fTextIdx = position; + } + return true; + }; + + /** + * Set the cache position to the specified position, or, if the position + * falls between to cached boundaries, to the preceding boundary. + * Fails if the requested position is outside of the range of boundaries currently held by the cache. + * The startPosition must be on a code point boundary. + * + * Return true if successful, false if the specified position is after + * the last cached boundary or before the first. + */ + boolean seek(int pos) { + if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { + return false; + } + if (pos == fBoundaries[fStartBufIdx]) { + // Common case: seek(0), from BreakIterator::first() + fBufIdx = fStartBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return true; + } + if (pos == fBoundaries[fEndBufIdx]) { + fBufIdx = fEndBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return true; + } + + int min = fStartBufIdx; + int max = fEndBufIdx; + while (min != max) { + int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; + probe = modChunkSize(probe); + if (fBoundaries[probe] > pos) { + max = probe; + } else { + min = modChunkSize(probe + 1); + } + } + assert(fBoundaries[max] > pos); + fBufIdx = modChunkSize(max - 1); + fTextIdx = fBoundaries[fBufIdx]; + assert(fTextIdx <= pos); + return true; + + }; + + + /** + * copy constructor, used from RuleBasedBreakIterator.clone(). 
+ * + * @param src + */ + BreakCache(BreakCache src) { + fStartBufIdx = src.fStartBufIdx; + fEndBufIdx = src.fEndBufIdx; + fTextIdx = src.fTextIdx; + fBufIdx = src.fBufIdx; + fBoundaries = src.fBoundaries.clone(); + fStatuses = src.fStatuses.clone(); + fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. + } + + void dumpCache() { + System.out.printf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); + for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { + System.out.printf("%d %d\n", i, fBoundaries[i]); + if (i == fEndBufIdx) { + break; + } + } + }; + + private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; + + static final int CACHE_SIZE = 128; + // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); + + int fStartBufIdx; + int fEndBufIdx; // inclusive + + int fTextIdx; + int fBufIdx; + + int[] fBoundaries = new int[CACHE_SIZE]; + short[] fStatuses = new short[CACHE_SIZE]; + + DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); +}; + + + + } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java index 642d0965cca..3da163623dc 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java @@ -50,20 +50,13 @@ final class UnhandledBreakEngine implements LanguageBreakEngine { @Override public int findBreaks(CharacterIterator text, int startPos, int endPos, - boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) { + int breakType, DictionaryBreakEngine.DequeI foundBreaks) { if (breakType >= 0 && breakType < fHandled.length()) { UnicodeSet uniset = fHandled.get(breakType); int c = CharacterIteration.current32(text); - if (reverse) { - while (text.getIndex() > startPos && uniset.contains(c)) { - CharacterIteration.previous32(text); - c = 
CharacterIteration.current32(text); - } - } else { - while (text.getIndex() < endPos && uniset.contains(c)) { - CharacterIteration.next32(text); - c = CharacterIteration.current32(text); - } + while (text.getIndex() < endPos && uniset.contains(c)) { + CharacterIteration.next32(text); + c = CharacterIteration.current32(text); } } return 0; diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 0beadd2cdc2..c1cd48305f8 100755 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8f0af1c8a7b33d8ff22dea3143ecde79291218f0cea9b18181d3fdbd462c2fb -size 12226267 +oid sha256:c147e785f0d4400d571af2fd76930e9a94b87d63b27dd5039e385e92969dd296 +size 12197459 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index bfed0fe603e..c086d155c5f 100755 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acb00f0af330fbce209ffc15f9af2368edda40ced40f1b3367f040063f6e268a +oid sha256:bf9291b5ec8a8bd2be603c4aaadcf9c5fbb3f59fd3b0498bbe357e0fbd5fc5e5 size 92486 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java index 05ede18660b..2bcb32a0c49 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java @@ -298,7 +298,7 @@ public class RBBIAPITest extends TestFmwk { } /** - * Testing the methods lastt(), previous(), and preceding() of RuleBasedBreakIterator + * Testing the methods last(), previous(), and preceding() of RuleBasedBreakIterator **/ @Test public void TestLastPreviousPreceding() { @@ -306,11 +306,11 @@ public class RBBIAPITest extends TestFmwk { String testString = "This is a word break. Isn't it? 
2.25 dollars"; logln("Testing last(),previous(), preceding() with custom rules"); logln("testing word iteration for string \"" + testString + "\""); - RuleBasedBreakIterator wordIter1 = (RuleBasedBreakIterator) BreakIterator.getWordInstance(Locale.getDefault()); + RuleBasedBreakIterator wordIter1 = (RuleBasedBreakIterator) BreakIterator.getWordInstance(Locale.ENGLISH); wordIter1.setText(testString); p = wordIter1.last(); if (p != testString.length()) { - errln("ERROR: first() returned" + p + "instead of" + testString.length()); + errln("ERROR: last() returned" + p + "instead of" + testString.length()); } q = wordIter1.previous(); doTest(testString, p, q, 37, "dollars"); @@ -379,11 +379,11 @@ public class RBBIAPITest extends TestFmwk { @Test public void TestIsBoundary() { String testString1 = "Write here. \u092d\u0301\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 a\u0301u"; - RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) BreakIterator.getCharacterInstance(Locale.getDefault()); + RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) BreakIterator.getCharacterInstance(Locale.ENGLISH); charIter1.setText(testString1); int bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26}; doBoundaryTest(charIter1, testString1, bounds1); - RuleBasedBreakIterator wordIter2 = (RuleBasedBreakIterator) BreakIterator.getWordInstance(Locale.getDefault()); + RuleBasedBreakIterator wordIter2 = (RuleBasedBreakIterator) BreakIterator.getWordInstance(Locale.ENGLISH); wordIter2.setText(testString1); int bounds2[] = {0, 5, 6, 10, 11, 12, 16, 17, 22, 23, 26}; doBoundaryTest(wordIter2, testString1, bounds2); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java index 8be70120ced..a24d4ee26a3 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java +++ 
b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java @@ -43,9 +43,9 @@ public RBBITestExtended() { static class TestParams { BreakIterator bi; StringBuilder dataToBreak = new StringBuilder(); - int[] expectedBreaks = new int[1000]; - int[] srcLine = new int[1000]; - int[] srcCol = new int[1000]; + int[] expectedBreaks = new int[4000]; + int[] srcLine = new int[4000]; + int[] srcCol = new int[4000]; ULocale currentLocale = new ULocale("en_US"); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 0757bdf7dbc..1450a98d7be 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1386,8 +1386,7 @@ Bangkok)• -#TODO: uncomment this line when quoted_literals_only is implemented. -#!!quoted_literals_only; +!!quoted_literals_only; !!forward; 'Hello World'; !!reverse; @@ -1395,3 +1394,83 @@ Bangkok)• •Hello World• + +# Test for circular buffer overflow during reverse iteration with inefficient reverse rules, +# Too many boundaries between safe back up position and current position. 
+ + +!!forward; +.; +!!reverse; +.*; + +•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a• + +# +# Dictionary regression check +# Intended to find unexpected behavior changes when changing dictionary implementation code, +# but may also be fragile, failing with intended improvements to dictionary breaking. 
+# + + +•Unicode<200> •คือ<200>อะไร<200>?• +•Unicode<200> •กำหนด<200>หมายเลข<200>เฉพาะ<200>สำหรับ<200>ทุก<200>อักขระ<200> +•โดย<200>ไม่<200>สนใจ<200>ว่า<200>เป็น<200>แพ<200>ล็ต<200>ฟอร์ม<200>ใด<200> +•ไม่<200>ขึ้น<200>กับ<200>ว่า<200>จะ<200>เป็น<200>โปรแกรม<200>ใด<200> +•และ<200>ไม่<200>ว่า<200>จะ<200>เป็น<200>ภาษา<200>ใด<200> +• +•โดย<200>พื้น<200>ฐาน<200>แล้ว<200>,• •คอมพิวเตอร์<200>จะ<200>เกี่ยวข้อง<200>กับ<200>เรื่อง<200>ของ<200>ตัวเลข<200>.• •คอมพิวเตอร์<200>จัด<200>เก็บ<200>ตัว<200>อักษร<200>และ<200>อักข<200>ระ<200>อื่นๆ<200> •โดย<200>การ<200>กำหนด<200>หมายเลข<200>ให้<200>สำหรับ<200>แต่ละ<200>ตัว<200>.• •ก่อน<200>หน้า<200>ที่๊<200> •Unicode<200> •จะ<200>ถูก<200>สร้าง<200>ขึ้น<200>,• •ได้<200>มี<200>ระบบ<200> •encoding<200> •อยู่<200>หลาย<200>ร้อย<200>ระบบ<200>สำหรับ<200>การ<200>กำหนด<200>หมายเลข<200>เหล่า<200>นี้<200>.• •ไม่มี<200> •encoding<200> •ใด<200>ที่<200>มี<200>จำนวน<200>ตัว<200>อักขระ<200>มาก<200>เพียง<200>พอ<200>:• •ยก<200>ตัวอย่าง<200>เช่น<200>,• •เฉพาะ<200>ใน<200>กลุ่ม<200>สหภาพ<200>ยุโรป<200>เพียง<200>แห่ง<200>เดียว<200> •ก็<200>ต้องการ<200>หลาย<200> •encoding<200> •ใน<200>การ<200>ครอบคลุม<200>ทุก<200>ภาษา<200>ใน<200>กลุ่ม<200>.• •หรือ<200>แม้แต่<200>ใน<200>ภาษา<200>เดี่ยว<200> •เช่น<200> •ภาษา<200>อังกฤษ<200> •ก็<200>ไม่มี<200> •encoding<200> •ใด<200>ที่<200>เพียง<200>พอ<200>สำหรับ<200>ทุก<200>ตัว<200>อักษร<200>,• •เครื่องหมาย<200>วรรค<200>ตอน<200> •และ<200>สัญลักษณ์<200>ทาง<200>เทคนิค<200>ที่<200>ใช้<200>กัน<200>อยู่<200>ทั่วไป<200>.• +• +•ระบบ<200> •encoding<200> •เหล่า<200>นี้<200>ยัง<200>ขัด<200>แย้ง<200>ซึ่ง<200>กัน<200>และ<200>กัน<200>.• •นั่น<200>ก็<200>คือ<200>,• •ใน<200>สอง<200> •encoding<200> •สามารถ<200>ใช้<200>หมายเลข<200>เดียวกัน<200>สำหรับ<200>ตัว<200>อักขระ<200>สอง<200>ตัว<200>ที่<200>แตก<200>ต่าง<200>กัน<200>,•หรือ<200>ใช้<200>หมายเลข<200>ต่าง<200>กัน<200>สำหรับ<200>อักขระ<200>ตัว<200>เดียวกัน<200>.• •ใน<200>ระบบ<200>คอมพิวเตอร์<200> •(•โดย<200>เฉพาะ<200>เซิร์ฟเวอร์<200>)• •ต้อง<200>มี<200>การ<200>สนับสนุน<200>หลาย<200> 
•encoding<200>;• •และ<200>เมื่อ<200>ข้อมูล<200>ที่<200>ผ่าน<200>ไป<200>มาระ<200>หว่าง<200>การ<200>เข้า<200>รหัส<200>หรือ<200>แพ<200>ล็ต<200>ฟอร์ม<200>ที่<200>ต่าง<200>กัน<200>,• •ข้อมูล<200>นั้น<200>จะ<200>เสี่ยง<200>ต่อ<200>การ<200>ผิด<200>พลาด<200>เสีย<200>หาย<200>.• +• +•Unicode<200> •จะ<200>เปลี่ยนแปลง<200>สิ่ง<200>เหล่า<200>นั้น<200>ทั้งหมด<200>!• +• +•Unicode<200> •กำหนด<200>หมายเลข<200>เฉพาะ<200>สำหรับ<200>แต่ละ<200>อักขระ<200>,• •โดย<200>ไม่<200>สนใจ<200>ว่า<200>เป็น<200>แพ<200>ล็ต<200>ฟอร์ม<200>ใด<200>,• •ไม่<200>ขึ้น<200>กับ<200>ว่า<200>จะ<200>เป็น<200>โปรแกรม<200>ใด<200>และ<200>ไม่<200>ว่า<200>จะ<200>เป็น<200>ภาษา<200>ใด<200>.• •มาตรฐาน<200> •Unicode<200> •ได้<200>ถูก<200>นำ<200>ไป<200>ใช้<200>โดย<200>ผู้นำ<200>ใน<200>อุตสาหกรรม<200> •เช่น<200> •Apple<200>,• •HP<200>,• •IBM<200>,• •JustSystem<200>,• •Microsoft<200>,• •Oracle<200>,• •SAP<200>,• •Sun<200>,• •Sybase<200>,• •Unisys<200> •และ<200>อื่นๆ<200> •อีก<200>มาก<200>.• •Unicode<200> •เป็น<200>สิ่ง<200>ที่<200>จำเป็น<200>สำหรับ<200>มาตร<200>ฐาน<200>ใหม่ๆ<200> •เช่น<200> •XML<200>,• •Java<200>,• •ECMAScript<200> •(•JavaScript<200>)•,• •LDAP<200>,• •CORBA<200> •3.0<100>,• •WML<200> •ฯลฯ<200>.•,• •และ<200>เป็น<200>แนวทาง<200>อย่าง<200>เป็น<200>ทางการ<200>ใน<200>การ<200>ทำ<200> •ISO<200>/•IEC<200> •10646<100>.• •Unicode<200> •ได้<200>รับ<200>การ<200>สนับสนุน<200>ใน<200>ระบบ<200>ปฏิบัติ<200>การ<200>จำนวน<200>มาก<200>,• •บราวเซอร์<200>ใหม่ๆ<200> •ทก<200>ตัว<200>,• •และ<200>ผลิต<200>ภัณฑ์<200>อื่นๆ<200> •อีก<200>มาก<200>.• •การ<200>เกิด<200>ขึ้น<200>ของ<200> •Unicode<200> •Standard<200> •และ<200>ทูล<200>ส์<200>ต่างๆ<200> •ที่<200>มี<200>ใน<200>การ<200>สนับสนุน<200> •Unicode<200>,• •เป็น<200>หนึ่ง<200>ใน<200>แนว<200>โน้ม<200>ทาง<200>เทคโนโลยี<200>ซอฟต์แวร์<200>ระดับ<200>โลก<200>ที่<200>มี<200>ความ<200>สำคัญ<200>ที่สุด<200>.• +• +•การ<200>รวม<200> •Unicode<200> •เข้าไป<200>ใน<200>ระบบ<200>ไคลเอ็นต์<200>-•เซิร์ฟเวอร์<200> •หรือ<200>แอ็พ<200>พลิ<200>เค<200>ชัน<200>แบบ<200> •multi<200>-•tiered<200> 
•และ<200>เว็บไซต์<200> •จะ<200>ทำให้<200>เกิด<200>การ<200>ประหยัด<200>ค่า<200>ใช้<200>จ่าย<200>มากกว่า<200>การ<200>ใช้<200>ชุด<200>อักขระ<200>แบบ<200>เดิม<200>.• •Unicode<200> •ทำให้<200>ผลิตภัณฑ์<200>ซอฟต์แวร์<200>หนึ่ง<200>เดียว<200> •หรือ<200>เว็บไซต์<200>แห่ง<200>เดียว<200> •รองรับ<200>ได้<200>หลาย<200>แพ<200>ล็ต<200>ฟอร์ม<200>,• •หลาย<200>ภาษา<200>และ<200>หลาย<200>ประเทศ<200>โดย<200>ไม่<200>ต้อง<200>ทำการ<200>รื้อ<200>ปรับ<200>ระบบ<200>.• •Unicode<200> •ยัง<200>ทำให้<200>ข้อมูล<200>สามารถ<200>เคลื่อน<200>ย้าย<200>ไป<200>มา<200>ใน<200>หลายๆ<200> •ระบบ<200>โดย<200>ไม่<200>เกิด<200>ความ<200>ผิด<200>พลาด<200>เสีย<200>หาย<200>.• +• +•เกี่ยว<200>กับ<200> •Unicode<200> •Consortium<200> +• +•Unicode<200> •Consortium<200> •เป็น<200>องค์กร<200>ไม่<200>แสวงหา<200>กำไร<200>ที่<200>ก่อ<200>ตั้ง<200>ขึ้น<200>เพื่อ<200>พัฒนา<200>,• •ขยาย<200>และ<200>ส่ง<200>เสริม<200>การ<200>ใช้<200> •Unicode<200> •Standard<200>,• •ซึ่ง<200>กำหนด<200>รูป<200>แบบ<200>การ<200>แทน<200>ค่า<200>ของ<200>ข้อความ<200>ใน<200>ผลิตภัณฑ์<200>ซอฟต์แวร์<200>และ<200>มาตร<200>ฐาน<200>ใหม่ๆ<200>.• •สมาชิก<200>ของ<200>สมาคม<200>เป็น<200>ตัวแทน<200>จาก<200>บริษัท<200>และ<200>องค์กร<200>ใน<200>อุตสาหกรรม<200>คอมพิวเตอร์<200>และ<200>การ<200>ประมวล<200>ผล<200>สารสนเทศ<200>.• •สมาคม<200>ได้<200>รับ<200>การ<200>สนับสนุน<200>ทางการ<200>เงิน<200>ผ่าน<200>ทาง<200>ค่า<200>ธรรมเนียม<200>ของ<200>การ<200>เป็น<200>สมาชิก<200>เท่านั้น<200>.• •สมาชิก<200>ภาพ<200>ของ<200> •Unicode<200> •Consortium<200> •เปิด<200>กว้าง<200>สำหรับ<200>องค์กร<200>หรือ<200>บุคคล<200>ใดๆ<200> •ใน<200>โลก<200>ที่<200>ต้องการ<200>สนับสนุน<200> •Unicode<200> •Standard<200> •และ<200>ช่วย<200>เหลือ<200>การ<200>ขยาย<200>ตัว<200>และ<200>การนำ<200> •Unicode<200> •ไป<200>ใช้<200>งาน<200>.• +• +•สำหรับ<200>ข้อมูล<200>เพิ่ม<200>เติม<200>,• •ให้<200>ดู<200>ที่<200> •Glossary<200>,• •Sample<200> •Unicode<200>-•Enabled<200> •Products<200>,• •Technical<200> •Introduction<200> •และ<200> •Useful<200> •Resources<200>.• + + +# Burmese 
+•အ<200>လော<200>င္<200>မ<200>င္<200>တရား<200> +• • • • • •မဟာ<200>ဓမ္မရာဇာ<200>မိ<200>ပတိ<200>လ<200>က္<200>ထ<200>က္<200>တ္<200>ဝ<200>င္<200> •အ<200>င္<200>ဝ<200>နေ<200>ပ္<200>ရ<200>ည္<200>တော္<200>က္<200>ရီး<200>သ<200>ည္<200> •မ<200>င္<200>ရိ<200>မ္<200>မ<200>သ<200>က္<200>ဖ္<200>ရ<200>စ္<200>နေ<200>သ<200>ည္<200>။• •မဏိ<200>ပူ<200>ရ<200> •က<200>သ<200>ည္<200>မ္<200>ယား<200>က<200> •အ<200>င္<200>ဝ<200>နေ<200>ပ္<200>ရ<200>ည္<200>တော္<200>၏• •မ္<200>ရော<200>က္<200>ဘ<200>က္<200>တ<200>လ္<200>ဝ္<200>ဟား<200>ကုိ<200> •တုိ<200>က္<200>ခုိ<200>က္<200>ဖ္<200>ယ<200>က္<200>ဆီး<200>သ<200>ည္<200>။• •အော<200>က္<200>မ္<200>ရ<200>န္<200>မာ<200>နုိ<200>င္<200>ငံ<200> •ဟံ<200>သာ<200>ဝ<200>တီ<200>သား<200>တုိ့<200>က<200>လ<200>ည္<200> •ပု<200>န္<200>က<200>န္<200>သ<200>ည္<200>။• •မတ္တ<200>ရာ<200>အု<200>တ္<200>ဖုိ<200>ရ္<200>ဟိ<200> •က္<200>ဝေ့<200>ရ္<200>ဟ<200>မ္<200>မ္<200>ယား<200>က<200>လ<200>ည္<200> •ထ<200>က္<200>ရ္<200>ဝ<200>သ<200>ည္<200>။• +• +• • • • •ထုိ<200>အ<200>ခ္<200>ယိ<200>န္<200>တ္<200>ဝ<200>င္<200> •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>သူ<200>က္<200>ရီး<200> •အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •မိမိ<200>၏•ရ္<200>ဝာ<200>ကုိ<200> •လုံ<200>ခ္<200>ရုံ<200>အော<200>င္<200>ထ<200>န္<200>လုံး<200>တ<200>ပ္<200>မ္<200>ယား<200>ကာ<200>ရ<200>သ<200>ည္<200>။• •အနီး<200>အ<200>ပား<200> •က္<200>ယေး<200>ရ္<200>ဝာ<200> •လေး<200>ဆ<200>ယ့္<200>ခ္<200>ရော<200>က္<200>ရ္<200>ဝာ<200>ကုိ<200> •သိ<200>မ္း<200>သ္<200>ဝ<200>င္<200>ထား<200>သ<200>ည္<200>။• •မ<200>က္<200>ရာ<200>မီ<200>ပ<200>င္<200> •အ<200>င္<200>ဝ<200>နေ<200>ပ္<200>ရ<200>ည္<200>တော္<200>က္<200>ရီး<200>သ<200>ည္<200> •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>မ္<200>ယား<200> •လ<200>က္<200>တ္<200>ဝ<200>င္<200>သ<200>က္<200>ဆ<200>င္<200>ရ<200>တော့<200>သ<200>ည္<200>။• +• +• • • • •အ<200>င္<200>ဝ<200>ကုိ<200> •သိ<200>မ္<200>ပုိ<200>က္<200>ပ္<200>ရီး<200>သော<200> •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>မ္<200>ယား<200>သ<200>ည္<200> 
•မ္<200>ရော<200>က္<200>ဘ<200>က္<200>တ<200>လ္<200>ဝ္<200>ဟား<200>က္<200>ယေး<200>ရ္<200>ဝာ<200>မ္<200>ယား<200>ကုိ<200> •သစ္စာ<200>ခံ<200>ခုိ<200>င္<200>ရ<200>န္<200> •လာ<200>က္<200>ရ<200>ရာ<200> •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>သုိ့<200> •ရော<200>က္<200>ရ္<200>ဟိ<200>လာ<200>သ<200>ည္<200>။• •တ<200>ခ္<200>ယိ<200>န္<200>တ<200>ည္<200>မ္<200>ဟာ<200>ပ<200>င္<200> •က္<200>ဝေ့<200>ရ္<200>ဟ<200>မ္<200>မ္<200>ယား<200>က<200>လ<200>ည္<200> •သစ္စာ<200>ခံ<200>ခုိ<200>င္<200>ရ<200>န္<200> •ရော<200>က္<200>ရ္<200>ဟိ<200>လာ<200>သ<200>ည္<200>။• •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •အ<200>ဖ္<200>ဝဲ့<200>န္<200>ဟ<200>စ္<200>ဖ္<200>ဝဲ့<200>ကုိ<200> •ခ္<200>ရေ<200>ငံ<200>စ္<200>ဝာ<200> •ဆ<200>က္<200>ဆံ<200>သ<200>ည္<200>။• •မ<200>ည္<200>သူ့<200>သ<200>စ္<200>စာ<200>ကုိ<200>မ္<200>ယ္<200>ဟ<200> •ခံ<200>ယူ<200>ခ္<200>ရ<200>င္<200>မ<200>ပ္<200>ရု<200>ပေ<200>။• •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>ဖ္<200>ဝဲ့<200>ကုိ<200> •အ<200>ပ္<200>ရ<200>န္<200>ခ<200>ရီး<200>တ္<200>ဝ<200>င္<200> •လ<200>မ္<200>မ္<200>ဟ<200>ဖ္<200>ရ<200>တ္<200>၍• •တုိ<200>က္<200>ခုိ<200>က္<200>သ<200>ည္<200>။• •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>ဖ္<200>ဝဲ့<200>မ္<200>ယား<200> •အထိ<200>အ<200>ခုိ<200>က္<200>အ<200>က္<200>ယ<200>အ<200>ဆုံး<200>မ္<200>ယား<200>စ္<200>ဝာ<200>ဖ္<200>ရ<200>င္<200> •ပ္<200>ရ<200>န္<200>ရ<200>သ<200>ည္<200>။• +• +• • • • •ဟံ<200>သာ<200>ဝ<200>တီ<200>တ<200>ပ္<200>မ္<200>ယား<200>သ<200>ည္<200> •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>ကုိ<200> •လာ<200>ရော<200>က္<200>တုိ<200>က္<200>ခုိ<200>က္<200>က္<200>ရ<200>ပ္<200>ရ<200>န္<200>သ<200>ည္<200>။• •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •သ္<200>ဝေး<200>သော<200>က္<200>ရဲ<200>ဘော္<200> •ခ္<200>ရော<200>က္<200>က္<200>ယိ<200>ပ္<200>ရ္<200>ဟ<200>စ္<200>ယော<200>က္<200>န္<200>ဟ<200>င္<200>အတူ<200> •ဦးစီး<200>ကာ<200>အော<200>င္<200>မ္<200>ရ<200>င္<200>စ္<200>ဝာ<200>ခု<200>ခံ<200>တ္<200>ဝ<200>န္<200>လ္<200>ဟ<200>န္<200>နုိ<200>င္<200>ခဲ့<200>သ<200>ည္<200>။• •ထုိ့<200>နော<200>က္<200> 
•ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •မ္<200>ရော<200>က္<200>ဘ<200>က္<200>တ<200>လ္<200>ဝ္<200>ဟား<200>ရ္<200>ဟိ<200> •ရ္<200>ဟ<200>မ္<200>မ္<200>ယား<200>န္<200>ဟ<200>င္<200> •မ္<200>ရ<200>န္<200>မာ<200>မ္<200>ယား<200>ကုိ<200>လ<200>ည္<200> •ဆ<200>က္<200>သ္<200>ဝ<200>ယ္<200>စ<200>ည္<200>ရုံး<200>နုိ<200>င္<200>ခဲ့<200>သ<200>ည္<200>။• •ဤ<200>သုိ့<200>ဖ္<200>ရ<200>င္<200> •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>၏• •အ<200>ရ္<200>ဟိ<200>န္<200>အ<200>ဝာ<200> •မ္<200>ရ<200>င္<200>မား<200>လာ<200>လေ<200>သ<200>ည္<200>။• +• +• • • • •ဦး<200>အော<200>င္<200>ဇေ<200>ယ္<200>ယ<200>သ<200>ည္<200> •အ<200>လော<200>င္<200>မ<200>င္<200>တ<200>ရား<200>ဘ္<200>ဝဲ့<200>ကုိ<200> •ခံယူ<200>ကာ<200> •ကု<200>န္<200>ဘော<200>င္<200>မ<200>င္<200>ဆ<200>က္<200>ကုိ<200>စ<200>တ<200>င္<200>တ<200>ည္<200>ထော<200>င္<200>သ<200>ည္<200>။• •မု<200>ဆုိး<200>ဖုိ<200>ရ္<200>ဝာ<200>ကုိ<200> •ရ္<200>ဝ္<200>ဟ<200>ဝေ<200>ဘုိ<200>ဟု<200> •သ<200>မု<200>တ္<200>ကာ<200> •မ္<200>ရုိ့<200>န<200>န္<200>တ<200>ည္<200>သ<200>ည္<200>။• •န<200>န္<200>တ<200>ည္<200>သ<200>က္<200>က<200>ရာ<200>ဇ္<200>ဖ္<200>ရ<200>စ္<200>သော<200> •၁၁၁၅<100> •ခု<200>ကုိ<200> •ဥ<200>ဩ<200>အော္<200>မ္<200>ရ<200>ည္<200> •ကု<200>န္<200>ဘော<200>င္<200>တ<200>ည္<200>ဟု<200> •အ<200>မ္<200>ဟ<200>တ္<200>အ<200>သား<200>ပ္<200>ရု<200>က္<200>ရ<200>သ<200>ည္<200>။• +• +• • • • •အ<200>လော<200>င္<200>မ<200>င္<200>တရား<200>သ<200>ည္<200> •ဧရာ<200>ဝ<200>တီ<200>န္<200>ဟ<200>င္<200>ခ္<200>ယ<200>င္<200>တ္<200>ဝ<200>င္<200> •မ္<200>ရ<200>စ္<200>န္<200>ဟ<200>စ္<200>သ္<200>ဝ<200>ယ္<200>အ<200>က္<200>ရား<200> •ဒေ<200>သ<200>မ္<200>ယား<200>ကုိ<200>အ<200>ခုိ<200>င္<200>အ<200>မာ<200> •စု<200>စ<200>ည္<200>ပ္<200>ရီး<200>နော<200>က္<200> •အ<200>င္<200>ဝ<200>ကုိ<200> •တုိ<200>က္<200>ခုိ<200>က္<200>အော<200>င္<200>မ္<200>ရ<200>င္<200>သ<200>ည္<200>။• •ထုိ<200>နော<200>က္<200>တ္<200>ဝ<200>င္<200>ပ္<200>ရ<200>ည္<200>၊• •လ္<200>ဝ<200>န္<200>ဆေး<200>၊• •ဒ<200>ဂုံ<200>မ္<200>ရုိ့<200>မ္<200>ယား<200>ကုိ<200> •သိ<200>မ္<200>ပုိ<200>က္<200>သ<200>ည္<200>။• •လ္<200>ဝ<200>န္<200>ဆေး<200> 
•ကုိ<200>မ္<200>ရ<200>န္<200>အော<200>င္<200>ဟူ<200>၍• •သ<200>မု<200>တ္<200>သ<200>ည္<200>။• •ဒ<200>ဂုံ<200>ကုိ<200>ရ<200>န္<200>ကု<200>န္<200>ဟူ<200>၍• •သ<200>မု<200>တ္<200>ထ<200>သ<200>ည္<200>။• + + +# japanese +•ユニ<400>コード<400>と<400>は<400>何<400>か<400>?• +•ユニ<400>コード<400>は<400>、•すべて<400>の<400>文字<400>に<400>固有<400>の<400>番号<400>を<400>付与<400>し<400>ます<400> +•プラットフォーム<400>に<400>は<400>依存<400>しま<400>せん<400> +•プログラム<400>に<400>も<400>依存<400>しま<400>せん<400> +•言語<400>に<400>も<400>依存<400>しま<400>せん<400> +• +•コンピューター<400>は<400>、•本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。•コンピューター<400>は<400>、•文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>よう<400>にし<400>ます<400>。•ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、•これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。•どの<400>一つ<400>を<400>とっても<400>、•十分<400>な<400>文字<400>を<400>含<400>んで<400>は<400>いま<400>せん<400>で<400>した<400>。•例えば<400>、•欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、•その<400>すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、•いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>が<400>必要<400>で<400>した<400>。•英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、•一つ<400>だけ<400>の<400>符号<400>化<400>の<400>仕組み<400>では<400>、•一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、•句読点<400>、•技術<400>的<400>な<400>記号<400>など<400>を<400>扱う<400>に<400>は<400>不十分<400>で<400>した<400>。• +• 
+•これらの<400>符号<400>化<400>の<400>仕組み<400>は<400>、•相互<400>に<400>矛盾<400>する<400>もの<400>でも<400>ありま<400>した<400>。•二つ<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>が<400>、•二つ<400>の<400>異なる<400>文字<400>に<400>同一<400>の<400>番号<400>を<400>付ける<400>こと<400>も<400>できる<400>し<400>、•同じ<400>文字<400>に<400>異なる<400>番号<400>を<400>付ける<400>こと<400>も<400>できる<400>の<400>です<400>。•どの<400>よう<400>な<400>コンピューター<400>も<400>(•特に<400>サーバー<400>は<400>)•多く<400>の<400>異<400>な<400>っ<400>た<400>符号<400>化<400>の<400>仕組み<400>を<400>サポート<400>する<400>必要<400>が<400>あり<400>ます<400>。•たとえ<400>データ<400>が<400>異なる<400>符号<400>化<400>の<400>仕組み<400>や<400>プラットフォーム<400>を<400>通過<400>し<400>て<400>も<400>、•いつ<400>どこ<400>で<400>データ<400>が<400>乱れる<400>か<400>分<400>から<400>ない<400>危険<400>を<400>冒す<400>こと<400>の<400>なる<400>の<400>です<400>。• +• +•ユニ<400>コード<400>は<400>すべて<400>を<400>変<400>え<400>ます<400> +• +•ユニ<400>コード<400>は<400>、•プラットフォーム<400>に<400>係<400>わら<400>ず<400>、•プログラム<400>に<400>係<400>わら<400>ず<400>、•言語<400>に<400>係<400>わら<400>ず<400>、•すべて<400>の<400>文字<400>に<400>独立<400>した<400>番号<400>を<400>与<400>え<400>ます<400>。•ユニ<400>コード<400>標準<400>は<400>、•アップル<400>、•ヒュー<400>レット<400>パッ<400>カード<400>、•IBM<200>、•ジャスト<400>システム<400>、•マイクロ<400>ソフト<400>、•オラクル<400>、•SAP<200>、•サン<400>、•サイ<400>ベース<400>など<400>の<400>産業<400>界<400>の<400>主導<400>的<400>企業<400>と<400>他の<400>多く<400>の<400>企業<400>に<400>採用<400>さ<400>れ<400>てい<400>ます<400>。•ユニ<400>コード<400>は<400>、•XML<200>、•Java<200>、•ECMAScript<200>(•JavaScript<200>)•、•LDAP<200>、•CORBA<200> •3.0<100>など<400>の<400>最先端<400>の<400>標準<400>の<400>前提<400>と<400>な<400>って<400>おり<400>、•ユニ<400>コード<400>を<400>実装<400>す<400>れ<400>ば<400>、•ISO<200>/•IEC<200> 
•10646<100>に<400>適合<400>する<400>ことに<400>なり<400>ます<400>。•ユニ<400>コード<400>は<400>、•多く<400>の<400>オペレーティングシステム<400>と<400>すべて<400>の<400>最新<400>の<400>ブラウザー<400>と<400>他の<400>多く<400>の<400>製品<400>で<400>サポート<400>さ<400>れ<400>てい<400>ます<400>。•ユニ<400>コード<400>標準<400>の<400>出現<400>と<400>ユニ<400>コード<400>を<400>サポート<400>する<400>ツール<400>類<400>は<400>、•昨今<400>顕著<400>に<400>な<400>って<400>いる<400>ソフトウエア<400>技術<400>の<400>グローバル<400>化<400>の<400>流れ<400>に対して<400>、•特に<400>役<400>に<400>立<400>って<400>い<400>ます<400>。• +• +•ユニ<400>コード<400>を<400>ク<400>ライアン<400>ト<400>サーバー<400>型<400>の<400>アプリケーション<400>や<400>、•多層<400>構造<400>を<400>持つ<400>アプリケーション<400>、•ウェブサイト<400>など<400>に<400>組み込む<400>こと<400>で<400>、•従来<400>の<400>文字<400>コードセット<400>を<400>用いる<400>より<400>も<400>明らか<400>な<400>コスト<400>削減<400>が<400>可能<400>です<400>。•ユニ<400>コード<400>は<400>、•単一<400>の<400>ソフトウエア<400>製品<400>、•単一<400>の<400>ウェブサイト<400>に<400>、•何ら<400>手<400>を<400>加える<400>こと<400>なく<400>、•複数<400>の<400>プラットフォーム<400>、•複数<400>の<400>言語<400>、•複数<400>の<400>国<400>を<400>カバー<400>する<400>こと<400>が<400>出来る<400>の<400>です<400>。•ユニ<400>コード<400>は<400>、•データ<400>が<400>多く<400>の<400>異なる<400>システム<400>の<400>間<400>を<400>、•何<400>の<400>乱れ<400>も<400>なし<400>に<400>転送<400>する<400>こと<400>を<400>可能<400>と<400>する<400>の<400>です<400>。• +• +•ユニ<400>コード<400>コンソーシアム<400>について<400> +• 
+•ユニ<400>コード<400>コンソーシアム<400>は<400>、•最新<400>の<400>ソフトウエア<400>製品<400>と<400>標準<400>において<400>テキスト<400>を<400>表現<400>する<400>こと<400>を<400>意味<400>する<400>“•ユニ<400>コード<400>標準<400>”•の<400>構築<400>、•発展<400>、•普及<400>、•利用<400>促進<400>を<400>目的<400>として<400>設立<400>さ<400>れ<400>た<400>非<400>営利<400>組織<400>です<400>。•同<400>コンソーシアム<400>の<400>会員<400>は<400>、•コンピューター<400>と<400>情報処理<400>に<400>係わる<400>広汎<400>な<400>企業<400>や<400>組織<400>から<400>構成<400>さ<400>れ<400>てい<400>ます<400>。•同<400>コンソーシアム<400>は<400>、•財政<400>的<400>に<400>は<400>、•純粋<400>に<400>会費<400>のみ<400>によって<400>運営<400>さ<400>れ<400>てい<400>ます<400>。•ユニ<400>コード<400>標準<400>を<400>支持<400>し<400>、•その<400>拡張<400>と<400>実装<400>を<400>支援<400>する<400>世界中<400>の<400>組織<400>や<400>個人<400>は<400>、•だれ<400>も<400>が<400>ユニ<400>コード<400>コンソーシアム<400>の<400>会員<400>なる<400>こと<400>が<400>でき<400>ます<400>。• +• +•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。• +•