From: Andy Heninger Date: Wed, 27 May 2020 23:36:22 +0000 (-0700) Subject: ICU-13565 RBBI, make all state table row data be unsigned. X-Git-Tag: cldr/2020-09-22~189 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f0ad4546912b8b929ae8cc59b20a995424aa448b;p=icu ICU-13565 RBBI, make all state table row data be unsigned. --- diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 4182b6e0700..87f5f714107 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -714,11 +714,11 @@ static const int32_t kMaxLookaheads = 8; struct LookAheadResults { int32_t fUsedSlotLimit; int32_t fPositions[8]; - int16_t fKeys[8]; + uint16_t fKeys[8]; LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {} - int32_t getPosition(int16_t key) { + int32_t getPosition(uint16_t key) { for (int32_t i=0; ifAccepting == -1) { + uint16_t accepting = row->fAccepting; + if (accepting == ACCEPTING_UNCONDITIONAL) { // Match found, common case. if (mode != RBBI_START) { result = (int32_t)UTEXT_GETNATIVEINDEX(&fText); } - fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. - } - - int16_t completedRule = row->fAccepting; - if (completedRule > 0) { + fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values. + } else if (accepting > ACCEPTING_UNCONDITIONAL) { // Lookahead match is completed. - int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + int32_t lookaheadResult = lookAheadMatches.getPosition(accepting); if (lookaheadResult >= 0) { - fRuleStatusIndex = row->fTagIdx; + fRuleStatusIndex = row->fTagsIdx; fPosition = lookaheadResult; return lookaheadResult; } @@ -937,7 +935,7 @@ int32_t RuleBasedBreakIterator::handleNext() { // This would enable hard-break rules with no following context. // But there are line break test failures when trying this. Investigate. // Issue ICU-20837 - int16_t rule = row->fLookAhead; + uint16_t rule = row->fLookAhead; if (rule != 0) { int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText); lookAheadMatches.setPosition(rule, pos); diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index 7b624d90df2..970fa8197bd 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -253,12 +253,12 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab RBBIStateTableRow *row = (RBBIStateTableRow *) (table->fTableData + (table->fRowLen * s)); if (use8Bits) { - RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagIdx); + RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx); for (c=0; cfCatCount; c++) { RBBIDebugPrintf("%3d ", row->r8.fNextState[c]); } } else { - RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagIdx); + RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx); for (c=0; cfCatCount; c++) { RBBIDebugPrintf("%3d ", row->r16.fNextState[c]); } diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index 2458f902faa..963050d58e3 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -95,34 +95,36 @@ struct RBBIDataHeader { -template +template struct RBBIStateTableRowT { - ST fAccepting; /* Non-zero if this row is for an accepting state. */ - /* Value 0: not an accepting state. */ - /* -1: Unconditional Accepting state. */ - /* positive: Look-ahead match has completed. */ - /* Actual boundary position happened earlier */ - /* Value here == fLookAhead in earlier */ - /* state, at actual boundary pos. */ - ST fLookAhead; /* Non-zero if this row is for a state that */ - /* corresponds to a '/' in the rule source. */ - /* Value is the same as the fAccepting */ - /* value for the rule (which will appear */ - /* in a different state. */ - ST fTagIdx; /* Non-zero if this row covers a {tagged} position */ - /* from a rule. Value is the index in the */ - /* StatusTable of the set of matching */ - /* tags (rule status values) */ - UT fNextState[1]; /* Next State, indexed by char category. */ - /* Variable-length array declared with length 1 */ - /* to disable bounds checkers. */ - /* Array Size is actually fData->fHeader->fCatCount*/ - /* CAUTION: see RBBITableBuilder::getTableSize() */ - /* before changing anything here. */ + T fAccepting; // Non-zero if this row is for an accepting state. + // Value 0: not an accepting state. + // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. + // >1: Look-ahead match has completed. + // Actual boundary position happened earlier + // Value here == fLookAhead in earlier + // state, at actual boundary pos. + T fLookAhead; // Non-zero if this row is for a state that + // corresponds to a '/' in the rule source. + // Value is the same as the fAccepting + // value for the rule (which will appear + // in a different state. + T fTagsIdx; // Non-zero if this row covers a {tagged} position + // from a rule. Value is the index in the + // StatusTable of the set of matching + // tags (rule status values) + T fNextState[1]; // Next State, indexed by char category. + // Variable-length array declared with length 1 + // to disable bounds checkers. + // Array Size is actually fData->fHeader->fCatCount + // CAUTION: see RBBITableBuilder::getTableSize() + // before changing anything here. }; -typedef RBBIStateTableRowT RBBIStateTableRow8; -typedef RBBIStateTableRowT RBBIStateTableRow16; +typedef RBBIStateTableRowT RBBIStateTableRow8; +typedef RBBIStateTableRowT RBBIStateTableRow16; + +constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting union RBBIStateTableRow { RBBIStateTableRow16 r16; diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index ce6e637f813..ebf1f858c56 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -714,7 +714,12 @@ void RBBITableBuilder::mapLookAheadRules() { return; } fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1); - int32_t laSlotsInUse = 0; + + // Counter used when assigning lookahead rule numbers. + // Contains the last look-ahead number already in use. + // The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved + // for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. + int32_t laSlotsInUse = ACCEPTING_UNCONDITIONAL; for (int32_t n=0; nsize(); n++) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); @@ -807,23 +812,23 @@ void RBBITableBuilder::flagAcceptingStates() { if (sd->fPositions->indexOf(endMarker) >= 0) { // Any non-zero value for fAccepting means this is an accepting node. // The value is what will be returned to the user as the break status. - // If no other value was specified, force it to -1. + // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1). if (sd->fAccepting==0) { // State hasn't been marked as accepting yet. Do it now. sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal); if (sd->fAccepting == 0) { - sd->fAccepting = -1; + sd->fAccepting = ACCEPTING_UNCONDITIONAL; } } - if (sd->fAccepting==-1 && endMarker->fVal != 0) { + if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) { // Both lookahead and non-lookahead accepting for this state. // Favor the look-ahead, because a look-ahead match needs to // immediately stop the run-time engine. First match, not longest. sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal); } // implicit else: - // if sd->fAccepting already had a value other than 0 or -1, leave it be. + // if sd->fAccepting already had a value other than 0 or 1, leave it be. } } } @@ -857,7 +862,7 @@ void RBBITableBuilder::flagLookAheadStates() { int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode); if (positionsIdx >= 0) { U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx)); - int32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal); + uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal); U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot); // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) { // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n", @@ -1392,22 +1397,23 @@ void RBBITableBuilder::exportTable(void *where) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); if (use8BitsForTable()) { - U_ASSERT (-128 < sd->fAccepting && sd->fAccepting <= 127); - U_ASSERT (-128 < sd->fLookAhead && sd->fLookAhead <= 127); - U_ASSERT (-128 < sd->fTagsIdx && sd->fTagsIdx <= 127); - row->r8.fAccepting = (int8_t)sd->fAccepting; - row->r8.fLookAhead = (int8_t)sd->fLookAhead; - row->r8.fTagIdx = (int8_t)sd->fTagsIdx; + U_ASSERT (sd->fAccepting <= 255); + U_ASSERT (sd->fLookAhead <= 255); + U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255); + row->r8.fAccepting = sd->fAccepting; + row->r8.fLookAhead = sd->fLookAhead; + row->r8.fTagsIdx = sd->fTagsIdx; for (col=0; colfDtran->elementAti(col) <= kMaxStateFor8BitsTable); row->r8.fNextState[col] = sd->fDtran->elementAti(col); } } else { - U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); - U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); - row->r16.fAccepting = (int16_t)sd->fAccepting; - row->r16.fLookAhead = (int16_t)sd->fLookAhead; - row->r16.fTagIdx = (int16_t)sd->fTagsIdx; + U_ASSERT (sd->fAccepting <= 0xffff); + U_ASSERT (sd->fLookAhead <= 0xffff); + U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff); + row->r16.fAccepting = sd->fAccepting; + row->r16.fLookAhead = sd->fLookAhead; + row->r16.fTagsIdx = sd->fTagsIdx; for (col=0; colr16.fNextState[col] = sd->fDtran->elementAti(col); } @@ -1597,7 +1603,7 @@ void RBBITableBuilder::exportSafeTable(void *where) { if (use8BitsForSafeTable()) { row->r8.fAccepting = 0; row->r8.fLookAhead = 0; - row->r8.fTagIdx = 0; + row->r8.fTagsIdx = 0; for (col=0; colcharAt(col) <= kMaxStateFor8BitsTable); row->r8.fNextState[col] = rowString->charAt(col); @@ -1605,7 +1611,7 @@ void RBBITableBuilder::exportSafeTable(void *where) { } else { row->r16.fAccepting = 0; row->r16.fLookAhead = 0; - row->r16.fTagIdx = 0; + row->r16.fTagsIdx = 0; for (col=0; colr16.fNextState[col] = rowString->charAt(col); } diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index 9bd73a56f83..bab4ff595bf 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -195,8 +195,8 @@ private: class RBBIStateDescriptor : public UMemory { public: UBool fMarked; - int32_t fAccepting; - int32_t fLookAhead; + uint32_t fAccepting; + uint32_t fLookAhead; UVector *fTagVals; int32_t fTagsIdx; UVector *fPositions; // Set of parse tree positions associated diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 484b58994bc..002ab94893e 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -4672,18 +4672,16 @@ void RBBITest::TestTableRedundancies() { UnicodeString s; RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); if (in8Bits) { - assertTrue(WHERE, row->r8.fAccepting >= -1); - s.append(row->r8.fAccepting + 1); // values of -1 are expected. + s.append(row->r8.fAccepting); s.append(row->r8.fLookAhead); - s.append(row->r8.fTagIdx); + s.append(row->r8.fTagsIdx); for (int32_t column = 0; column < numCharClasses; column++) { s.append(row->r8.fNextState[column]); } } else { - assertTrue(WHERE, row->r16.fAccepting >= -1); - s.append(row->r16.fAccepting + 1); // values of -1 are expected. + s.append(row->r16.fAccepting); s.append(row->r16.fLookAhead); - s.append(row->r16.fTagIdx); + s.append(row->r16.fTagsIdx); for (int32_t column = 0; column < numCharClasses; column++) { s.append(row->r16.fNextState[column]); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java index 4e66170f066..c0375bdb4f6 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java @@ -70,11 +70,7 @@ public final class RBBIDataWrapper { This.fTable = new char[lengthOfTable]; for (int i = 0; i < lengthOfTable; i++) { byte b = bytes.get(); - if (i % This.fRowLen < NEXTSTATES) { - This.fTable[i] = (char) b; // Treat b as signed. - } else { - This.fTable[i] = (char)(0xff & b); // Treat b as unsigned. - } + This.fTable[i] = (char)(0xff & b); // Treat b as unsigned. } ICUBinary.skipBytes(bytes, lengthOfTable & 1); } else { @@ -202,12 +198,17 @@ public final class RBBIDataWrapper { /** * offset to the "tagIndex" field in a state table row. */ - public final static int TAGIDX = 2; + public final static int TAGSIDX = 2; /** * offset to the start of the next states array in a state table row. */ public final static int NEXTSTATES = 3; + /** + * value constant for the ACCEPTING field of a state table row. + */ + public final static int ACCEPTING_UNCONDITIONAL = 1; + // Bit selectors for the "FLAGS" field of the state table header // enum RBBIStateTableFlags in the C version. // @@ -477,7 +478,7 @@ public final class RBBIDataWrapper { } else { dest.append(" "); } - dest.append(intToString(table.fTable[row+TAGIDX], 5)); + dest.append(intToString(table.fTable[row+TAGSIDX], 5)); for (int col=0; col= 0) { if (sd.fPositions.contains(endMarker)) { // Any non-zero value for fAccepting means this is an accepting node. // The value is what will be returned to the user as the break status. - // If no other value was specified, force it to -1. + // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1). if (sd.fAccepting==0) { // State hasn't been marked as accepting yet. Do it now. sd.fAccepting = fLookAheadRuleMap[endMarker.fVal]; if (sd.fAccepting == 0) { - sd.fAccepting = -1; + sd.fAccepting = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL; } } - if (sd.fAccepting==-1 && endMarker.fVal != 0) { + if (sd.fAccepting==RBBIDataWrapper.ACCEPTING_UNCONDITIONAL && endMarker.fVal != 0) { // Both lookahead and non-lookahead accepting for this state. // Favor the look-ahead, because a look-ahead match needs to // immediately stop the run-time engine. First match, not longest. sd.fAccepting = fLookAheadRuleMap[endMarker.fVal]; } // implicit else: - // if sd.fAccepting already had a value other than 0 or -1, leave it be. + // if sd.fAccepting already had a value other than 0 or 1, leave it be. } } } @@ -1158,18 +1162,18 @@ class RBBITableBuilder { RBBIStateDescriptor sd = fDStates.get(state); int row = state*rowLen; if (use8Bits) { - Assert.assrt (-128 < sd.fAccepting && sd.fAccepting <= MAX_STATE_FOR_8BITS_TABLE); - Assert.assrt (-128 < sd.fLookAhead && sd.fLookAhead <= MAX_STATE_FOR_8BITS_TABLE); + Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 255); + Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 255); } else { - Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767); - Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767); + Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 0xffff); + Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 0xffff); } table.fTable[row + RBBIDataWrapper.ACCEPTING] = (char)sd.fAccepting; table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (char)sd.fLookAhead; - table.fTable[row + RBBIDataWrapper.TAGIDX] = (char)sd.fTagsIdx; + table.fTable[row + RBBIDataWrapper.TAGSIDX] = (char)sd.fTagsIdx; for (col=0; col= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { @@ -924,15 +925,12 @@ public class RuleBasedBreakIterator extends BreakIterator { } // Remember the break status (tag) values. - fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; - } - - int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; - if (completedRule > 0 && completedRule != 0xffff) { + fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; + } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { // Lookahead match is completed - int lookaheadResult = fLookAheadMatches.getPosition(completedRule); + int lookaheadResult = fLookAheadMatches.getPosition(accepting); if (lookaheadResult >= 0) { - fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; + fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; fPosition = lookaheadResult; return lookaheadResult; } diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index e126ae2a1d2..0d7cbef46a5 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ed7db50765b06c8a35f48048543c5c9a2c2e19993f752bd71a15e6ac89aa3b3 -size 13141781 +oid sha256:bdf00a19b05bc52e17c2aea74e87cc1872a824d5a9cced226078c46a194a8799 +size 13141762 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index f790db886ae..493098ad605 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -606,7 +606,7 @@ public class RBBITest extends TestFmwk { int row = dw.getRowIndex(r); s.append(fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING]); s.append(fwtbl.fTable[row + RBBIDataWrapper.LOOKAHEAD]); - s.append(fwtbl.fTable[row + RBBIDataWrapper.TAGIDX]); + s.append(fwtbl.fTable[row + RBBIDataWrapper.TAGSIDX]); for (int column=0; column