}
// Adjust offset to be on a code point boundary and not beyond the end of the text.
- // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
+ // Note that isBoundary() is always false for offsets that are not on code point boundaries.
// But we still need the side effect of leaving iteration at the following boundary.
utext_setNativeIndex(&fText, offset);
UChar32 c;
int32_t result = 0;
- const RBBIStateTable *stateTable = fData->fSafeRevTable;
+ const RBBIStateTable *stateTable = fData->fReverseTable;
UTEXT_SETNATIVEINDEX(&fText, fromPosition);
#ifdef RBBI_DEBUG
if (gTrace) {
* BreakCache implementation
*/
-RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
+RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
fBI(bi), fSideBuffer(status) {
reset();
}
fBI->fPosition = fTextIdx;
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
return;
-}
+}
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
fTextIdx = fBoundaries[fBufIdx];
return TRUE;
}
-
+
int32_t min = fStartBufIdx;
int32_t max = fEndBufIdx;
while (min != max) {
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
int32_t aBoundary = 0;
int32_t ruleStatusIndex = 0;
- // TODO: check for position == length of text. Although may still need to back up to get rule status.
if (position > 20) {
int32_t backupPos = fBI->handleSafePrevious(position);
- fBI->fPosition = backupPos;
- aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
- if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals.
- // Safe rules work on pairs. +1 from start pos may be a false match.
+
+ if (backupPos > 0) {
+ // Advance to the boundary following the backup position.
+ // There is a complication: the safe reverse rules identify pairs of code points
+ // that are safe. If advancing from the safe point moves forwards by less than
+ // two code points, we need to advance one more time to ensure that the boundary
+ // is good, including a correct rule status value.
+ //
+ fBI->fPosition = backupPos;
aBoundary = fBI->handleNext();
+ if (aBoundary <= backupPos + 4) {
+ // +4 is a quick test for possibly having advanced by only one code point.
+ // Four is the length of the longest possible code point: a supplementary character in UTF-8.
+ utext_setNativeIndex(&fBI->fText, aBoundary);
+ if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
+ // The initial handleNext() only advanced by a single code point. Go again.
+ aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
+ }
+ }
+ ruleStatusIndex = fBI->fRuleStatusIndex;
}
- ruleStatusIndex = fBI->fRuleStatusIndex;
}
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
-
+
// Fill in boundaries between existing cache content and the new requested position.
if (fBoundaries[fEndBufIdx] < position) {
position = 0;
positionStatusIdx = 0;
} else {
- fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
- position = fBI->handleNext(); // TODO: supplementals don't work with the +1.
- if (position == backupPosition + 1) {
- position = fBI->handleNext(); // Safe rules identify safe pairs.
+ // Advance to the boundary following the backup position.
+ // There is a complication: the safe reverse rules identify pairs of code points
+ // that are safe. If advancing from the safe point moves forwards by less than
+ // two code points, we need to advance one more time to ensure that the boundary
+ // is good, including a correct rule status value.
+ //
+ fBI->fPosition = backupPosition;
+ position = fBI->handleNext();
+ if (position <= backupPosition + 4) {
+ // +4 is a quick test for possibly having advanced by only one code point.
+ // Four is the length of the longest possible code point: a supplementary character in UTF-8.
+ utext_setNativeIndex(&fBI->fText, position);
+ if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
+ // The initial handleNext() only advanced by a single code point. Go again.
+ position = fBI->handleNext(); // Safe rules identify safe pairs.
+ }
};
positionStatusIdx = fBI->fRuleStatusIndex;
-
}
} while (position >= fromPosition);
}
U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
}
-
+
if (!segmentHandledByDictionary && position < fromPosition) {
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
break;
}
}
-
+
return success;
}
if (data->fFTableLen != 0) {
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
}
- if (data->fSRTableLen != 0) {
+ if (data->fRTableLen != 0) {
+ fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
+ }
+ if (data->fSRTableLen != 0) { // TODO: obsolete. Remove.
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
}
/* */
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
- const RBBIStateTable *fSafeRevTable;
+ const RBBIStateTable *fReverseTable; // auto-generated safe reverse.
+ const RBBIStateTable *fSafeRevTable; // hand-written safe reverse. TODO: delete this.
const UChar *fRuleSource;
const int32_t *fRuleStatusTable;
fForwardTables->buildSafe(status);
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
#ifdef RBBI_DEBUG
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
- fForwardTables
+ fForwardTables->printStates();
fForwardTables->printRuleStatusTable();
fForwardTables->printSafeTable();
}
if (wantedEndState == endState) {
int32_t pair = c1 << 16 | c2;
safePairs.addElement(pair, status);
- printf("(%d, %d) ", c1, c2);
+ // printf("(%d, %d) ", c1, c2);
}
}
- printf("\n");
+ // printf("\n");
}
// Populate the initial safe table.
rowState.setCharAt(c1, 0);
}
- // Merge similar states.
+ // TODO: Merge similar states.
}
//-----------------------------------------------------------------------------
//
-// exportTable() export the state transition table in the format required
-// by the runtime engine. getTableSize() bytes of memory
-// must be available at the output address "where".
+// exportSafeTable() exports the state transition table in the format required
+// by the runtime engine. getTableSize() bytes of memory
+// must be available at the output address "where".
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::exportSafeTable(void *where) {
log_verbose("\nTesting the functions for sentence\n");
- ubrk_first(sentence);
+ pos = ubrk_first(sentence);
pos = ubrk_current(sentence);
log_verbose("Current(sentence) = %d\n", (int32_t)pos);
pos = ubrk_last(sentence);
if(pos!=49)
log_err("error ubrk_last for sentence did not return 49\n");
log_verbose("Last (sentence) = %d\n", (int32_t)pos);
- ubrk_first(sentence);
+ pos = ubrk_first(sentence);
to = ubrk_following( sentence, 0 );
if (to == 0) log_err("ubrk_following returned 0\n");
to = ubrk_preceding( sentence, to );
UParseError pe;
LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
assertSuccess(WHERE, status);
-
-#if 0
- bi->dumpTables();
-
- RBBIDataWrapper *dw = bi->fData;
- const RBBIStateTable *fwtbl = dw->fForwardTable;
- int32_t numCharClasses = dw->fHeader->fCatCount;
- printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
-
- for (int32_t c1=0; c1<numCharClasses; ++c1) {
- for (int32_t c2=0; c2 < numCharClasses; ++c2) {
- int32_t wantedEndState = -1;
- int32_t endState = 0;
- for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
- RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
- int32_t s2 = row->fNextState[c1];
- row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
- endState = row->fNextState[c2];
- if (wantedEndState < 0) {
- wantedEndState = endState;
- } else {
- if (wantedEndState != endState) {
- break;
- }
- }
- }
- if (wantedEndState == endState) {
- printf("(%d, %d) ", c1, c2);
- }
- }
- printf("\n");
- }
- printf("\n");
-#endif
}
void RBBITest::TestProperties() {
# Temp debugging tests
#
-
+<word>
+<data>•
+•</data>
## FILTERED BREAK TESTS