For identifying text that needs to be handled by a word dictionary for Break Iteration,
change from using a bit in the character category to sorting all dictionary categories
together, and recording the boundary between the non-dictionary and dictionary ranges.
This is internal to the implementation. It does not affect behavior.
It does increase the number of character categories that can be handled using a
compact 8 bit Trie, from 127 to 255.
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
if (statetable->fFlags & RBBI_8BITS_ROWS) {
if (use8BitsTrie) {
- return handleNext<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>();
+ return handleNext<RBBIStateTableRow8, TrieFunc8>();
} else {
- return handleNext<RBBIStateTableRow8, TrieFunc16, kDictBit>();
+ return handleNext<RBBIStateTableRow8, TrieFunc16>();
}
} else {
if (use8BitsTrie) {
- return handleNext<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>();
+ return handleNext<RBBIStateTableRow16, TrieFunc8>();
} else {
- return handleNext<RBBIStateTableRow16, TrieFunc16, kDictBit>();
+ return handleNext<RBBIStateTableRow16, TrieFunc16>();
}
}
}
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
if (statetable->fFlags & RBBI_8BITS_ROWS) {
if (use8BitsTrie) {
- return handleSafePrevious<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
+ return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
} else {
- return handleSafePrevious<RBBIStateTableRow8, TrieFunc16, kDictBit>(fromPosition);
+ return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
}
} else {
if (use8BitsTrie) {
- return handleSafePrevious<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
+ return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
} else {
- return handleSafePrevious<RBBIStateTableRow16, TrieFunc16, kDictBit>(fromPosition);
+ return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
}
}
}
// Run the state machine to find a boundary
//
//-----------------------------------------------------------------------------------
-template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
+template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
int32_t RuleBasedBreakIterator::handleNext() {
int32_t state;
uint16_t category = 0;
const RBBIStateTable *statetable = fData->fForwardTable;
const char *tableData = statetable->fTableData;
uint32_t tableRowLen = statetable->fRowLen;
+ uint32_t dictStart = statetable->fDictCategoriesStart;
#ifdef RBBI_DEBUG
if (gTrace) {
RBBIDebugPuts("Handle Next pos char state category");
// look up the current character's character category, which tells us
// which column in the state table to look at.
category = trieFunc(fData->fTrie, c);
-
- // Check the dictionary bit in the character's category.
- // Counter is only used by dictionary based iteration.
- // Chars that need to be handled by a dictionary have a flag bit set
- // in their category values.
- //
- if ((category & dictMask) != 0) {
- fDictionaryCharCount++;
- // And off the dictionary flag bit.
- category &= ~dictMask;
- }
+ fDictionaryCharCount += (category >= dictStart);
}
#ifdef RBBI_DEBUG
// because the safe table does not require as many options.
//
//-----------------------------------------------------------------------------------
-template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
+template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
int32_t state;
//
// Off the dictionary flag bit. For reverse iteration it is not used.
category = trieFunc(fData->fTrie, c);
- category &= ~dictMask;
#ifdef RBBI_DEBUG
if (gTrace) {
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
int32_t firstRuleStatus, int32_t otherRuleStatus) {
- uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ?
- kDictBitFor8BitsTrie : kDictBit;
if ((endPos - startPos) <= 1) {
return;
}
utext_setNativeIndex(text, rangeStart);
UChar32 c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
+ uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
while(U_SUCCESS(status)) {
- while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) {
+ while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
+ && (category < dictStart)) {
utext_next32(text); // TODO: cleaner loop structure.
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
// Value 0: not an accepting state.
// 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
// >1: Look-ahead match has completed.
- // Actual boundary position happened earlier
+ // Actual boundary position happened earlier.
// Value here == fLookAhead in earlier
- // state, at actual boundary pos.
+ // state, at actual boundary pos.
T fLookAhead; // Non-zero if this row is for a state that
// corresponds to a '/' in the rule source.
// Value is the same as the fAccepting
- // value for the rule (which will appear
- // in a different state.
+ // value for the rule (which will appear
+ // in a different state.
T fTagsIdx; // Non-zero if this row covers a {tagged} position
- // from a rule. Value is the index in the
- // StatusTable of the set of matching
- // tags (rule status values)
+ // from a rule. Value is the index in the
+ // StatusTable of the set of matching
+ // tags (rule status values)
T fNextState[1]; // Next State, indexed by char category.
// Variable-length array declared with length 1
// to disable bounds checkers.
};
struct RBBIStateTable {
- uint32_t fNumStates; /* Number of states. */
- uint32_t fRowLen; /* Length of a state table row, in bytes. */
- uint32_t fFlags; /* Option Flags for this state table */
- char fTableData[1]; /* First RBBIStateTableRow begins here. */
- /* Variable-length array declared with length 1 */
- /* to disable bounds checkers. */
- /* (making it char[] simplifies ugly address */
- /* arithmetic for indexing variable length rows.) */
+ uint32_t fNumStates; // Number of states.
+ uint32_t fRowLen; // Length of a state table row, in bytes.
+ uint32_t fDictCategoriesStart; // Char category number of the first dictionary
+ // char class, or the largest category number + 1
+ // if there are no dictionary categories.
+ uint32_t fFlags; // Option Flags for this state table.
+ char fTableData[1]; // First RBBIStateTableRow begins here.
+ // Variable-length array declared with length 1
+ // to disable bounds checkers.
+ // (making it char[] simplifies ugly address
+ // arithmetic for indexing variable length rows.)
};
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
//
// UnicodeSet processing.
- // Munge the Unicode Sets to create a set of character categories.
- // Generate the mapping tables (TRIE) from input code points to
- // the character categories.
+ // Munge the Unicode Sets to create an initial set of character categories.
//
fSetBuilder->buildRanges();
}
fForwardTable->buildForwardTable();
+
+ // State table and character category optimization.
+ // Merge equivalent rows and columns.
+ // Note that this process alters the initial set of character categories,
+ // causing the representation of UnicodeSets in the parse tree to become invalid.
+
optimizeTables();
fForwardTable->buildSafeReverseTable(status);
}
#endif
+ // Generate the mapping tables (TRIE) from input code points to
+ // the character categories.
+ //
fSetBuilder->buildTrie();
//
// by the RBBI rules.
// - compute a set of non-overlapping character ranges
// with all characters within a range belonging to the same
-// set of input uniocde sets.
+// set of input unicode sets.
// - Derive a set of non-overlapping UnicodeSet (like things)
// that will correspond to columns in the state table for
// the RBBI execution engine. All characters within one
U_NAMESPACE_BEGIN
-const int32_t kMaxCharCategoriesFor8BitsTrie = 127;
+const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
//------------------------------------------------------------------------
//
// Constructor
{
fRB = rb;
fStatus = rb->fStatus;
- fRangeList = 0;
+ fRangeList = nullptr;
fMutableTrie = nullptr;
fTrie = nullptr;
fTrieSize = 0;
fGroupCount = 0;
- fSawBOF = FALSE;
+ fSawBOF = false;
}
//
// Numbering: # 0 (state table column 0) is unused.
// # 1 is reserved - table column 1 is for end-of-input
- // # 2 is reserved - table column 2 is for beginning-in-input
+ // # 2 is reserved - table column 2 is for beginning-of-input
// # 3 is the first range list.
//
RangeDescriptor *rlSearchRange;
- for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
+ int32_t dictGroupCount = 0;
+
+ for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
rlRange->fNum = rlSearchRange->fNum;
+ rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
break;
}
}
if (rlRange->fNum == 0) {
- fGroupCount ++;
- rlRange->fNum = fGroupCount+2;
- rlRange->setDictionaryFlag();
- addValToSets(rlRange->fIncludesSets, fGroupCount+2);
+ rlRange->fFirstInGroup = true;
+ if (rlRange->isDictionaryRange()) {
+ rlRange->fNum = ++dictGroupCount;
+ rlRange->fIncludesDict = true;
+ } else {
+ fGroupCount++;
+ rlRange->fNum = fGroupCount+2;
+ addValToSets(rlRange->fIncludesSets, rlRange->fNum);
+ }
+ }
+ }
+
+ // Move the character category numbers for any dictionary ranges up, so that they
+ // immediately follow the non-dictionary ranges.
+
+ fDictCategoriesStart = fGroupCount + 3;
+ for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
+ if (rlRange->fIncludesDict) {
+ rlRange->fNum += fDictCategoriesStart - 1;
+ if (rlRange->fFirstInGroup) {
+ addValToSets(rlRange->fIncludesSets, rlRange->fNum);
+ }
}
}
+ fGroupCount += dictGroupCount;
+
// Handle input sets that contain the special string {eof}.
// Column 1 of the state table is reserved for EOF on input.
// references to {bof}.)
// Add this column value (1 or 2) to the equivalent expression
// subtree for each UnicodeSet that contains the string {eof}
- // Because {bof} and {eof} are not a characters in the normal sense,
- // they doesn't affect the computation of ranges or TRIE.
- static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
- static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
+ // Because {bof} and {eof} are not characters in the normal sense,
+ // they don't affect the computation of the ranges or TRIE.
- UnicodeString eofString(eofUString);
- UnicodeString bofString(bofUString);
+ UnicodeString eofString(u"eof");
+ UnicodeString bofString(u"bof");
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
if (usetNode==NULL) {
// range group number.
//
void RBBISetBuilder::buildTrie() {
- RangeDescriptor *rlRange;
-
fMutableTrie = umutablecptrie_open(
0, // Initial value for all code points.
0, // Error value for out-of-range input.
fStatus);
- bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
- for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
- uint32_t value = rlRange->fNum;
- if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) {
- U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0);
- value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit);
- }
+ for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) {
umutablecptrie_setRange(fMutableTrie,
- rlRange->fStartChar, // Range start
- rlRange->fEndChar, // Range end (inclusive)
- value, // value for range
+ range->fStartChar, // Range start
+ range->fEndChar, // Range end (inclusive)
+ range->fNum, // value for range
fStatus);
}
}
void RBBISetBuilder::mergeCategories(IntPair categories) {
U_ASSERT(categories.first >= 1);
U_ASSERT(categories.second > categories.first);
+ U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
+ (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
+
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
- int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit;
- int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit;
+ int32_t rangeNum = rd->fNum;
if (rangeNum == categories.second) {
- rd->fNum = categories.first | rangeDict;
+ rd->fNum = categories.first;
} else if (rangeNum > categories.second) {
rd->fNum--;
}
}
--fGroupCount;
+ if (categories.second <= fDictCategoriesStart) {
+ --fDictCategoriesStart;
+ }
}
}
+//------------------------------------------------------------------------
+//
+// getDictCategoriesStart
+//
+//------------------------------------------------------------------------
+int32_t RBBISetBuilder::getDictCategoriesStart() const {
+ return fDictCategoriesStart;
+}
+
+
//------------------------------------------------------------------------
//
// sawBOF
UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
RangeDescriptor *rlRange;
UChar32 retVal = (UChar32)-1;
- for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
+ for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
if (rlRange->fNum == category) {
retVal = rlRange->fStartChar;
break;
}
-
//------------------------------------------------------------------------
//
// printRanges A debugging function.
int i;
RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
- for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
- RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
+ for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
+ RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar);
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
- UnicodeString setName = UNICODE_STRING("anon", 4);
+ UnicodeString setName {u"anon"};
RBBINode *setRef = usetNode->fParent;
- if (setRef != NULL) {
+ if (setRef != nullptr) {
RBBINode *varRef = setRef->fParent;
- if (varRef != NULL && varRef->fType == RBBINode::varRef) {
+ if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
}
}
//------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBISetBuilder::printRangeGroups() {
- RangeDescriptor *rlRange;
- RangeDescriptor *tRange;
int i;
- int lastPrintedGroupNum = 0;
RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
- for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
- int groupNum = rlRange->fNum & 0xbfff;
- if (groupNum > lastPrintedGroupNum) {
- lastPrintedGroupNum = groupNum;
+ for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
+ if (rlRange->fFirstInGroup) {
+ int groupNum = rlRange->fNum;
RBBIDebugPrintf("%2i ", groupNum);
- if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" <DICT> ");}
+ if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
}
i = 0;
- for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
+ for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
if (tRange->fNum == rlRange->fNum) {
if (i++ % 5 == 0) {
RBBIDebugPrintf("\n ");
//
//-------------------------------------------------------------------------------------
-RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
- int i;
-
- this->fStartChar = other.fStartChar;
- this->fEndChar = other.fEndChar;
- this->fNum = other.fNum;
- this->fNext = NULL;
- UErrorCode oldstatus = status;
- this->fIncludesSets = new UVector(status);
- if (U_FAILURE(oldstatus)) {
- status = oldstatus;
- }
+RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
+ fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum},
+ fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} {
+
if (U_FAILURE(status)) {
return;
}
- /* test for NULL */
- if (this->fIncludesSets == 0) {
+ fIncludesSets = new UVector(status);
+ if (this->fIncludesSets == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
+ }
+ if (U_FAILURE(status)) {
return;
}
- for (i=0; i<other.fIncludesSets->size(); i++) {
+ for (int32_t i=0; i<other.fIncludesSets->size(); i++) {
this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
}
}
//
//-------------------------------------------------------------------------------------
RangeDescriptor::RangeDescriptor(UErrorCode &status) {
- this->fStartChar = 0;
- this->fEndChar = 0;
- this->fNum = 0;
- this->fNext = NULL;
- UErrorCode oldstatus = status;
- this->fIncludesSets = new UVector(status);
- if (U_FAILURE(oldstatus)) {
- status = oldstatus;
- }
if (U_FAILURE(status)) {
return;
}
- /* test for NULL */
- if(this->fIncludesSets == 0) {
+ fIncludesSets = new UVector(status);
+ if (fIncludesSets == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
- return;
}
-
}
//-------------------------------------------------------------------------------------
RangeDescriptor::~RangeDescriptor() {
delete fIncludesSets;
- fIncludesSets = NULL;
+ fIncludesSets = nullptr;
}
//-------------------------------------------------------------------------------------
void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
U_ASSERT(where>fStartChar && where<=fEndChar);
RangeDescriptor *nr = new RangeDescriptor(*this, status);
- if(nr == 0) {
+ if(nr == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
//-------------------------------------------------------------------------------------
//
-// RangeDescriptor::setDictionaryFlag
+// RangeDescriptor::isDictionaryRange
//
-// Character Category Numbers that include characters from
-// the original Unicode Set named "dictionary" have bit 14
-// set to 1. The RBBI runtime engine uses this to trigger
-// use of the word dictionary.
+// Test whether this range includes characters from
+// the original Unicode Set named "dictionary".
//
-// This function looks through the Unicode Sets that it
-// (the range) includes, and sets the bit in fNum when
-// "dictionary" is among them.
+// This function looks through the Unicode Sets that
+// the range includes, checking for one named "dictionary"
//
// TODO: a faster way would be to find the set node for
// "dictionary" just once, rather than looking it
// up by name every time.
//
//-------------------------------------------------------------------------------------
-void RangeDescriptor::setDictionaryFlag() {
- int i;
-
+bool RangeDescriptor::isDictionaryRange() {
static const char16_t *dictionary = u"dictionary";
- for (i=0; i<fIncludesSets->size(); i++) {
+ for (int32_t i=0; i<fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
RBBINode *setRef = usetNode->fParent;
if (setRef != nullptr) {
if (varRef && varRef->fType == RBBINode::varRef) {
const UnicodeString *setName = &varRef->fText;
if (setName->compare(dictionary, -1) == 0) {
- fNum |= RuleBasedBreakIterator::kDictBit;
- break;
+ return true;
}
}
}
}
+ return false;
}
-
-
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
//
class RangeDescriptor : public UMemory {
public:
- UChar32 fStartChar; // Start of range, unicode 32 bit value.
- UChar32 fEndChar; // End of range, unicode 32 bit value.
- int32_t fNum; // runtime-mapped input value for this range.
- UVector *fIncludesSets; // vector of the the original
- // Unicode sets that include this range.
- // (Contains ptrs to uset nodes)
- RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
+ UChar32 fStartChar {}; // Start of range, unicode 32 bit value.
+ UChar32 fEndChar {}; // End of range, unicode 32 bit value.
+ int32_t fNum {0}; // runtime-mapped input value for this range.
+ bool fIncludesDict {false}; // True if the range includes $dictionary.
+ bool fFirstInGroup {false}; // True if first range in a group with the same fNum.
+ UVector *fIncludesSets {nullptr}; // vector of the the original
+ // Unicode sets that include this range.
+ // (Contains ptrs to uset nodes)
+ RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list.
RangeDescriptor(UErrorCode &status);
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
~RangeDescriptor();
void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
// where appearing in the second (higher) part.
- void setDictionaryFlag(); // Check whether this range appears as part of
+ bool isDictionaryRange(); // Check whether this range appears as part of
// the Unicode set named "dictionary"
-private:
- RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
- RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
+ RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
+ RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
};
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
// runtime state machine, which are the same as
// columns in the DFA state table
+ int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or
+ // last category + 1 if there are no dictionary categories.
int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
void serializeTrie(uint8_t *where); // write out the serialized Trie.
UChar32 getFirstChar(int32_t val) const;
#endif
private:
- void numberSets();
-
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
UErrorCode *fStatus;
UCPTrie *fTrie; // the Unicode Sets.
uint32_t fTrieSize;
- // Groups correspond to character categories -
- // groups of ranges that are in the same original UnicodeSets.
- // fGroupCount is the index of the last used group.
- // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
- // State table column 0 is not used. Column 1 is for end-of-input.
- // column 2 is for group 0. Funny counting.
+ // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
int32_t fGroupCount;
+ // The number of the first dictionary char category.
+ // If there are no Dictionary categories, set to the last category + 1.
+ int32_t fDictCategoriesStart;
+
UBool fSawBOF;
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
for (; categories->first < numCols-1; categories->first++) {
- for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
+ // Note: dictionary & non-dictionary columns cannot be merged.
+ // The limitSecond value prevents considering mixed pairs.
+ // Dictionary categories are >= DictCategoriesStart.
+ // Non dict categories are < DictCategoriesStart.
+ int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
+ fRB->fSetBuilder->getDictCategoriesStart() : numCols;
+ for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
// Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
uint16_t table_base = 0;
uint16_t table_dupl = 1;
}
table->fNumStates = fDStates->size();
+ table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
table->fFlags = 0;
if (use8BitsForTable()) {
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
RBBIDebugPrintf(" | Acc LA Tag");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
- RBBIDebugPrintf(" %2d", c);
+ RBBIDebugPrintf(" %3d", c);
}
RBBIDebugPrintf("\n");
RBBIDebugPrintf(" |---------------");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
- RBBIDebugPrintf("---");
+ RBBIDebugPrintf("----");
}
RBBIDebugPrintf("\n");
RBBIDebugPrintf(" %3d | " , n);
RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
- RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
+ RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
}
RBBIDebugPrintf("\n");
}
typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
- template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
+ template<typename RowType, PTrieFunc trieFunc>
int32_t handleSafePrevious(int32_t fromPosition);
- template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
+ template<typename RowType, PTrieFunc trieFunc>
int32_t handleNext();
* @internal
*/
void dumpTables();
-
- /**
- * Bit for dictionary based category
- */
- static constexpr int32_t kDictBit = 0x4000;
-
- /**
- * Bit for dictionary based category in 8bits trie
- */
- static constexpr int32_t kDictBitFor8BitsTrie = 0x0080;
-
#endif /* U_HIDE_INTERNAL_API */
};
}
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
for (int c1=1; c1<numCharClasses; c1++) {
- for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+ int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
+ for (int c2 = c1+1; c2 < limit; c2++) {
if (columns.at(c1) == columns.at(c2)) {
errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
goto out;
}
void RBBITest::Test8BitsTrieWith8BitStateTable() {
- testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
+ testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
}
void RBBITest::Test16BitsTrieWith8BitStateTable() {
- testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
+ testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
}
void RBBITest::Test16BitsTrieWith16BitStateTable() {
- testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
+ testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
}
void RBBITest::Test8BitsTrieWith16BitStateTable() {
* Length of a table row in bytes. Note mismatch with table data, which is short[].
*/
public int fRowLen;
+ /**
+ * Char category number of the first dictionary char class,
+ * or the largest category number + 1 if there are no dictionary categories.
+ */
+ public int fDictCategoriesStart;
/**
* Option Flags for this state table.
*/
public int fFlags;
+ /**
+ * Length in bytes of the state table header, of all the int32 fields
+ * preceding fTable in the serialized form.
+ */
+ public static int fHeaderSize = 16;
/**
* Linear array of next state values, accessed as short[state, char_class]
*/
if (length == 0) {
return null;
}
- if (length < 12) {
+ if (length < fHeaderSize) {
throw new IOException("Invalid RBBI state table length.");
}
RBBIStateTable This = new RBBIStateTable();
This.fNumStates = bytes.getInt();
This.fRowLen = bytes.getInt();
+ This.fDictCategoriesStart = bytes.getInt();
This.fFlags = bytes.getInt();
- int lengthOfTable = length - 12; // length in bytes.
+ int lengthOfTable = length - fHeaderSize; // length in bytes.
boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
if (use8Bits) {
This.fTable = new char[lengthOfTable];
public int put(DataOutputStream bytes) throws IOException {
bytes.writeInt(fNumStates);
bytes.writeInt(fRowLen);
+ bytes.writeInt(fDictCategoriesStart);
bytes.writeInt(fFlags);
if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
bytes.writeChar(fTable[i]);
}
}
- int bytesWritten = 12 + fRowLen * fNumStates; // total bytes written,
- // including 12 for the header.
+ int bytesWritten = fHeaderSize + fRowLen * fNumStates; // total bytes written,
+ // including the header.
while (bytesWritten % 8 != 0) {
bytes.writeByte(0);
++bytesWritten;
RBBIStateTable otherST = (RBBIStateTable)other;
if (fNumStates != otherST.fNumStates) return false;
if (fRowLen != otherST.fRowLen) return false;
+ if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false;
if (fFlags != otherST.fFlags) return false;
return Arrays.equals(fTable, otherST.fTable);
}
public final static int RBBI_BOF_REQUIRED = 2;
public final static int RBBI_8BITS_ROWS = 4;
- public final static int DICT_BIT = 0x4000;
- public final static int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
-
/**
* Data Header. A struct-like class with the fields from the RBBI data file header.
* Not intended for public use, declared public for testing purposes only.
int char32;
int category;
int lastNewline[] = new int[n+1];
- int dictMask = fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? DICT_BIT_FOR_8BITS_TRIE : DICT_BIT;
for (category = 0; category <= fHeader.fCatCount; category ++) {
catStrings[category] = "";
out.println("--------------------");
for (char32 = 0; char32<=0x10ffff; char32++) {
category = fTrie.get(char32);
- category &= ~dictMask; // Mask off dictionary bit.
if (category < 0 || category > fHeader.fCatCount) {
out.println("Error, bad category " + Integer.toHexString(category) +
" for char " + Integer.toHexString(char32));
//
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
//
- Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far.
+ Map<Set<Integer>, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far.
// Map Key is the set of values.
// Map Value is the runtime array index.
ICUDebug.value("rbbi") : null;
fRules = rules;
fStrippedRules = new StringBuilder(rules);
- fUSetNodes = new ArrayList<RBBINode>();
- fRuleStatusVals = new ArrayList<Integer>();
+ fUSetNodes = new ArrayList<>();
+ fRuleStatusVals = new ArrayList<>();
fScanner = new RBBIRuleScanner(this);
fSetBuilder = new RBBISetBuilder(this);
}
//
// UnicodeSet processing.
- // Munge the Unicode Sets to create a set of character categories.
- // Generate the mapping tables (TRIE) from input code points to
- // the character categories.
+ // Munge the Unicode Sets to create an initial set of character categories.
//
fSetBuilder.buildRanges();
//
fForwardTable = new RBBITableBuilder(this, fForwardTree);
fForwardTable.buildForwardTable();
+ // State table and character category optimization.
+ // Merge equivalent rows and columns.
+ // Note that this process alters the initial set of character categories,
+ // causing the representation of UnicodeSets in the parse tree to become invalid.
optimizeTables();
fForwardTable.buildSafeReverseTable();
fForwardTable.printRuleStatusTable();
fForwardTable.printReverseTable();
}
-
+ // Generate the mapping tables (TRIE) from input code points to
+ // the character categories.
+ //
fSetBuilder.buildTrie();
//
// Package up the compiled data, writing it to an output stream
// by the RBBI rules.
// - compute a set of non-overlapping character ranges
// with all characters within a range belonging to the same
-// set of input uniocde sets.
+// set of input unicode sets.
// - Derive a set of non-overlapping UnicodeSet (like things)
// that will correspond to columns in the state table for
// the RBBI execution engine. All characters within one
//
class RBBISetBuilder {
static class RangeDescriptor {
- int fStartChar; // Start of range, unicode 32 bit value.
- int fEndChar; // End of range, unicode 32 bit value.
- int fNum; // runtime-mapped input value for this range.
- List<RBBINode> fIncludesSets; // vector of the the original
- // Unicode sets that include this range.
- // (Contains ptrs to uset nodes)
- RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
+ int fStartChar = 0; // Start of range, unicode 32 bit value.
+ int fEndChar = 0; // End of range, unicode 32 bit value.
+ int fNum = 0; // runtime-mapped input value for this range.
+ boolean fIncludesDict = false; // True if the range includes $dictionary.
+ boolean fFirstInGroup = false; // True if first range in a group with the same fNum.
+ List<RBBINode> fIncludesSets; // vector of the original
+ // Unicode sets that include this range.
+ // (Contains ptrs to uset nodes)
+ RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
RangeDescriptor() {
- fIncludesSets = new ArrayList<RBBINode>();
+ fIncludesSets = new ArrayList<>();
}
RangeDescriptor(RangeDescriptor other) {
fStartChar = other.fStartChar;
fEndChar = other.fEndChar;
fNum = other.fNum;
- fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
+ fIncludesDict = other.fIncludesDict;
+ fFirstInGroup = other.fFirstInGroup;
+ fIncludesSets = new ArrayList<>(other.fIncludesSets);
}
//-------------------------------------------------------------------------------------
}
- //-------------------------------------------------------------------------------------
- //
- // RangeDescriptor::setDictionaryFlag
- //
- // Character Category Numbers that include characters from
- // the original Unicode Set named "dictionary" have bit 14
- // set to 1. The RBBI runtime engine uses this to trigger
- // use of the word dictionary.
- //
- // This function looks through the Unicode Sets that it
- // (the range) includes, and sets the bit in fNum when
- // "dictionary" is among them.
- //
+ /**
+ * Test whether this range includes characters from the original Unicode Set named "dictionary".
+ *
+ * This function looks through the Unicode Sets that
+ * the range includes, checking for one named "dictionary".
+ */
// TODO: a faster way would be to find the set node for
// "dictionary" just once, rather than looking it
// up by name every time.
//
- // -------------------------------------------------------------------------------------
- void setDictionaryFlag() {
- int i;
-
- for (i=0; i<this.fIncludesSets.size(); i++) {
+ boolean isDictionaryRange() {
+ for (int i=0; i<this.fIncludesSets.size(); i++) {
RBBINode usetNode = fIncludesSets.get(i);
String setName = "";
RBBINode setRef = usetNode.fParent;
}
}
if (setName.equals("dictionary")) {
- this.fNum |= DICT_BIT;
- break;
+ return true;
}
}
-
+ return false;
}
}
// the Unicode Sets.
CodePointTrie fFrozenTrie;
- // Groups correspond to character categories -
- // groups of ranges that are in the same original UnicodeSets.
- // fGroupCount is the index of the last used group.
- // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
- // State table column 0 is not used. Column 1 is for end-of-input.
- // column 2 is for group 0. Funny counting.
+ /**
+ * Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
+ */
int fGroupCount;
+ /**
+ * The number of the first dictionary char category.
+ * If there are no Dictionary categories, set to the last category + 1.
+ */
+ int fDictCategoriesStart;
boolean fSawBOF;
- static final int DICT_BIT = 0x4000;
- static final int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
-
//------------------------------------------------------------------------
//
//
// Numbering: # 0 (state table column 0) is unused.
// # 1 is reserved - table column 1 is for end-of-input
- // # 2 is reserved - table column 2 is for beginning-in-input
+ // # 2 is reserved - table column 2 is for beginning-of-input
// # 3 is the first range list.
//
RangeDescriptor rlSearchRange;
+ int dictGroupCount = 0;
+
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
rlRange.fNum = rlSearchRange.fNum;
+ rlRange.fIncludesDict = rlSearchRange.fIncludesDict;
break;
}
}
if (rlRange.fNum == 0) {
- fGroupCount ++;
- rlRange.fNum = fGroupCount+2;
- rlRange.setDictionaryFlag();
- addValToSets(rlRange.fIncludesSets, fGroupCount+2);
+ rlRange.fFirstInGroup = true;
+ if (rlRange.isDictionaryRange()) {
+ rlRange.fNum = ++dictGroupCount;
+ rlRange.fIncludesDict = true;
+ } else {
+ fGroupCount++;
+ rlRange.fNum = fGroupCount + 2;
+ addValToSets(rlRange.fIncludesSets, fGroupCount + 2);
+ }
}
}
+ // Move the character category numbers for any dictionary ranges up, so that they
+ // immediately follow the non-dictionary ranges.
+
+ fDictCategoriesStart = fGroupCount + 3;
+ for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
+ if (rlRange.fIncludesDict) {
+ rlRange.fNum += fDictCategoriesStart - 1;
+ if (rlRange.fFirstInGroup) {
+ addValToSets(rlRange.fIncludesSets, rlRange.fNum);
+ }
+ }
+ }
+ fGroupCount += dictGroupCount;
+
+
+
// Handle input sets that contain the special string {eof}.
// Column 1 of the state table is reserved for EOF on input.
// Column 2 is reserved for before-the-start-input.
}
- private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 127;
+ private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 255;
/**
* Build the Trie table for mapping UChar32 values to the corresponding
* range group number.
*/
void buildTrie() {
- boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
- RangeDescriptor rlRange;
-
fTrie = new MutableCodePointTrie(0, // Initial value for all code points.
0); // Error value for out-of-range input.
- for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
- int value = rlRange.fNum;
- if (use8Bits && ((value & DICT_BIT) != 0)) {
- assert((value & DICT_BIT_FOR_8BITS_TRIE) == 0);
- // switch to the bit from DICT_BIT to DICT_BIT_FOR_8BITS_TRIE
- value = DICT_BIT_FOR_8BITS_TRIE | (value & ~DICT_BIT);
- }
- fTrie.setRange(
- rlRange.fStartChar, // Range start
- rlRange.fEndChar, // Range end (inclusive)
- value // value for range
- );
+ for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
+ fTrie.setRange(rlRange.fStartChar, // Range start
+ rlRange.fEndChar, // Range end (inclusive)
+ rlRange.fNum // value for range
+ );
}
}
void mergeCategories(IntPair categories) {
assert(categories.first >= 1);
assert(categories.second > categories.first);
+ assert((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
+ (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) {
- int rangeNum = rd.fNum & ~DICT_BIT;
- int rangeDict = rd.fNum & DICT_BIT;
+ int rangeNum = rd.fNum;
if (rangeNum == categories.second) {
- rd.fNum = categories.first | rangeDict;
+ rd.fNum = categories.first;
} else if (rangeNum > categories.second) {
rd.fNum--;
}
}
--fGroupCount;
+ if (categories.second <= fDictCategoriesStart) {
+ --fDictCategoriesStart;
+ }
}
//-----------------------------------------------------------------------------------
}
+ //------------------------------------------------------------------------
+ //
+ // getDictCategoriesStart
+ //
+ //------------------------------------------------------------------------
+ int getDictCategoriesStart() {
+ return fDictCategoriesStart;
+ }
+
+
//------------------------------------------------------------------------
//
// sawBOF
}
-
//------------------------------------------------------------------------
//
// printRanges A debugging function.
System.out.print("\n\n Nonoverlapping Ranges ...\n");
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
- System.out.print(" " + rlRange.fNum + " " + rlRange.fStartChar + "-" + rlRange.fEndChar);
+ System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar);
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
RBBINode usetNode = rlRange.fIncludesSets.get(i);
//------------------------------------------------------------------------
///CLOVER:OFF
void printRangeGroups() {
- RangeDescriptor rlRange;
- RangeDescriptor tRange;
int i;
- int lastPrintedGroupNum = 0;
System.out.print("\nRanges grouped by Unicode Set Membership...\n");
- for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
- int groupNum = rlRange.fNum & 0xbfff;
- if (groupNum > lastPrintedGroupNum) {
- lastPrintedGroupNum = groupNum;
+ for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
+ if (rlRange.fFirstInGroup) {
+ int groupNum = rlRange.fNum;
if (groupNum<10) {System.out.print(" ");}
System.out.print(groupNum + " ");
- if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" <DICT> ");}
+ if (groupNum >= fDictCategoriesStart) { System.out.print(" <DICT> ");}
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
RBBINode usetNode = rlRange.fIncludesSets.get(i);
}
i = 0;
- for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
+ for (RangeDescriptor tRange = rlRange; tRange != null; tRange = tRange.fNext) {
if (tRange.fNum == rlRange.fNum) {
if (i++ % 5 == 0) {
System.out.print("\n ");
int table_base = 0;
int table_dupl = 0;
for (; categories.first < numCols-1; ++categories.first) {
- for (categories.second=categories.first+1; categories.second < numCols; ++categories.second) {
+ // Note: dictionary & non-dictionary columns cannot be merged.
+ // The limitSecond value prevents considering mixed pairs.
+ // Dictionary categories are >= DictCategoriesStart.
+ // Non dict categories are < DictCategoriesStart.
+ int limitSecond = categories.first < fRB.fSetBuilder.getDictCategoriesStart() ?
+ fRB.fSetBuilder.getDictCategoriesStart() : numCols;
+ for (categories.second=categories.first+1; categories.second < limitSecond; ++categories.second) {
for (int state=0; state<numStates; state++) {
RBBIStateDescriptor sd = fDStates.get(state);
table_base = sd.fDtran[categories.first];
if (fRB.fTreeRoots[fRootIx] == null) {
return 0;
}
- int size = 12; // The header of 4 ints, with no rows to the table.
+ int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
int numRows = fDStates.size();
int numCols = fRB.fSetBuilder.getNumCharCategories();
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
fDStates.size() < 0x7fff);
table.fNumStates = fDStates.size();
+ table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
// Size of table size in shorts.
int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
int tableSize;
if (use8Bits) {
- tableSize = (getTableSize() - 12); // fTable length in bytes.
+ tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
table.fTable = new char[tableSize];
table.fRowLen = rowLen; // Row length in bytes.
} else {
- tableSize = (getTableSize() - 12) / 2; // fTable length in shorts.
+ tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize) / 2; // fTable length in shorts.
table.fTable = new char[tableSize];
table.fRowLen = rowLen * 2; // Row length in bytes.
}
if (fSafeTable == null) {
return 0;
}
- int size = 12; // The header of 4 ints, with no rows to the table.
+ int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
int numRows = fSafeTable.size();
int numCols = fSafeTable.get(0).length;
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
// TODO: tableSize is basically numStates * numCharCategories,
// except for alignment padding. Clean up here, and in main exportTable().
- int tableSize = (getSafeTableSize() - 12); // fTable length in bytes.
+ int tableSize = (getSafeTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
if (use8Bits) {
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
table.fTable = new char[tableSize];
System.out.print("state | i n p u t s y m b o l s \n");
System.out.print(" | Acc LA Tag");
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
- RBBINode.printInt(c, 3);
+ RBBINode.printInt(c, 4);
}
System.out.print("\n");
System.out.print(" |---------------");
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
- System.out.print("---");
+ System.out.print("----");
}
System.out.print("\n");
RBBINode.printInt(sd.fTagsIdx, 6);
System.out.print(" ");
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
- RBBINode.printInt(sd.fDtran[c], 3);
+ RBBINode.printInt(sd.fDtran[c], 4);
}
System.out.print("\n");
}
int row = fRData.getRowIndex(state);
short category = 3;
int flagsState = fRData.fFTable.fFlags;
+ int dictStart = fRData.fFTable.fDictCategoriesStart;
int mode = RBBI_RUN;
- int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
- RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
mode = RBBI_START;
//
category = (short) trie.get(c);
- // Check the dictionary bit in the character's category.
- // Counter is only used by dictionary based iterators (subclasses).
- // Chars that need to be handled by a dictionary have a flag bit set
- // in their category values.
- //
- if ((category & dictMask) != 0) {
+ // Check for categories that require word dictionary handling.
+ if (category >= dictStart) {
fDictionaryCharCount++;
- // And off the dictionary flag bit.
- category &= ~dictMask;
}
if (TRACE) {
CharacterIterator text = fText;
CodePointTrie trie = fRData.fTrie;
char[] stateTable = fRData.fRTable.fTable;
- int flagsState = fRData.fRTable.fFlags;
- int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
- RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
CISetIndex32(text, fromPosition);
if (TRACE) {
//
- // And off the dictionary flag bit. For reverse iteration it is not used.
+ // Dictionary categories need no special handling for reverse iteration.
category = (short) trie.get(c);
- category &= ~dictMask;
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
int category;
int current;
int foundBreakCount = 0;
- int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
- RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
fText.setIndex(rangeStart);
int c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.get(c);
+ int dictStart = fRData.fFTable.fDictCategoriesStart;
while(true) {
- while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) {
+ while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
c = CharacterIteration.next32(fText); // pre-increment
category = (short)fRData.fTrie.get(c);
}
version https://git-lfs.github.com/spec/v1
-oid sha256:bdf00a19b05bc52e17c2aea74e87cc1872a824d5a9cced226078c46a194a8799
-size 13141762
+oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c
+size 13142859
version https://git-lfs.github.com/spec/v1
-oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9
+oid sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7
size 94529
}
}
- List<Thread> threads = new ArrayList<Thread>();
+ List<Thread> threads = new ArrayList<>();
for (int n = 0; n<4; ++n) {
threads.add(new Thread(new WorkerThread()));
}
}
private static final BreakIterator BREAK_ITERATOR_CACHE = BreakIterator.getWordInstance(ULocale.ROOT);
public static List<Integer> getBoundary(String toParse) {
- List<Integer> retVal = new ArrayList<Integer>();
+ List<Integer> retVal = new ArrayList<>();
BreakIterator bi = (BreakIterator) BREAK_ITERATOR_CACHE.clone();
bi.setText(toParse);
for (int boundary=bi.first(); boundary != BreakIterator.DONE; boundary = bi.next()) {
int numCharClasses = dw.fHeader.fCatCount;
// Check for duplicate columns (character categories)
- List<String> columns = new ArrayList<String>();
+ List<String> columns = new ArrayList<>();
for (int column=0; column<numCharClasses; column++) {
StringBuilder s = new StringBuilder();
for (int r = 1; r < fwtbl.fNumStates; r++) {
int row = dw.getRowIndex(r);
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
- s.append((char)tableVal);
+ s.append(tableVal);
}
columns.add(s.toString());
}
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
for (int c1=1; c1<numCharClasses; c1++) {
- for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+ int limit = c1 < fwtbl.fDictCategoriesStart ? fwtbl.fDictCategoriesStart : numCharClasses;
+ for (int c2 = c1+1; c2 < limit; c2++) {
assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
// if (columns.get(c1).equals(columns.get(c2))) {
// System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
}
// Check for duplicate states.
- List<String> rows = new ArrayList<String>();
+ List<String> rows = new ArrayList<>();
for (int r=0; r<fwtbl.fNumStates; r++) {
StringBuilder s = new StringBuilder();
int row = dw.getRowIndex(r);
public void TestTableRebuild() {
// Test to verify that rebuilding the state tables from rule source for the standard
// break iterator types yields the same tables as are imported from ICU4C as part of the default data.
- List<RuleBasedBreakIterator> breakIterators = new ArrayList<RuleBasedBreakIterator>();
+ List<RuleBasedBreakIterator> breakIterators = new ArrayList<>();
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getCharacterInstance(Locale.ENGLISH));
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getWordInstance(Locale.ENGLISH));
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getSentenceInstance(Locale.ENGLISH));
@Test
public void Test8BitsTrieWith8BitStateTable() {
- testTrieStateTable(123, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
+ testTrieStateTable(251, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
}
@Test
public void Test16BitsTrieWith8BitStateTable() {
- testTrieStateTable(124, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
+ testTrieStateTable(252, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
}
@Test
public void Test16BitsTrieWith16BitStateTable() {
- testTrieStateTable(255, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
+ testTrieStateTable(253, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
}
@Test