}
-
-//-----------------------------------------------------------------------------------
-//
-// handlePrevious()
-//
-// Iterate backwards using the safe reverse rules.
-// The logic of this function is very similar to handleNext(), above.
-//
-//-----------------------------------------------------------------------------------
-int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
- int32_t state;
- uint16_t category = 0;
- RBBIRunMode mode;
- RBBIStateTableRow *row;
- UChar32 c;
- LookAheadResults lookAheadMatches;
- int32_t result = 0;
- int32_t initialPosition = 0;
-
- const RBBIStateTable *stateTable = fData->fSafeRevTable;
- UTEXT_SETNATIVEINDEX(&fText, fromPosition);
- #ifdef RBBI_DEBUG
- if (gTrace) {
- RBBIDebugPuts("Handle Previous pos char state category");
- }
- #endif
-
- // if we're already at the start of the text, return DONE.
- if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
- return BreakIterator::DONE;
- }
-
- // Set up the starting char.
- initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- result = initialPosition;
- c = UTEXT_PREVIOUS32(&fText);
-
- // Set the initial state for the state machine
- state = START_STATE;
- row = (RBBIStateTableRow *)
- (stateTable->fTableData + (stateTable->fRowLen * state));
- category = 3;
- mode = RBBI_RUN;
- if (stateTable->fFlags & RBBI_BOF_REQUIRED) {
- category = 2;
- mode = RBBI_START;
- }
-
-
- // loop until we reach the start of the text or transition to state 0
- //
- for (;;) {
- if (c == U_SENTINEL) {
- // Reached end of input string.
- if (mode == RBBI_END) {
- // We have already run the loop one last time with the
- // character set to the psueudo {eof} value. Now it is time
- // to unconditionally bail out.
- break;
- }
- // Run the loop one last time with the fake end-of-input character category.
- mode = RBBI_END;
- category = 1;
- }
-
- //
- // Get the char category. An incoming category of 1 or 2 means that
- // we are preset for doing the beginning or end of input, and
- // that we shouldn't get a category from an actual text input character.
- //
- if (mode == RBBI_RUN) {
- // look up the current character's character category, which tells us
- // which column in the state table to look at.
- // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
- // not the size of the character going in, which is a UChar32.
- //
- // And off the dictionary flag bit. For reverse iteration it is not used.
- category = UTRIE2_GET16(fData->fTrie, c);
- category &= ~0x4000;
- }
-
- #ifdef RBBI_DEBUG
- if (gTrace) {
- RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
- if (0x20<=c && c<0x7f) {
- RBBIDebugPrintf("\"%c\" ", c);
- } else {
- RBBIDebugPrintf("%5x ", c);
- }
- RBBIDebugPrintf("%3d %3d\n", state, category);
- }
- #endif
-
- // State Transition - move machine to its next state
- //
-
- // fNextState is a variable-length array.
- U_ASSERT(category<fData->fHeader->fCatCount);
- state = row->fNextState[category]; /*Not accessing beyond memory*/
- row = (RBBIStateTableRow *)
- (stateTable->fTableData + (stateTable->fRowLen * state));
-
- if (row->fAccepting == -1) {
- // Match found, common case.
- result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- }
-
- int16_t completedRule = row->fAccepting;
- if (completedRule > 0) {
- // Lookahead match is completed.
- int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
- if (lookaheadResult >= 0) {
- UTEXT_SETNATIVEINDEX(&fText, lookaheadResult);
- return lookaheadResult;
- }
- }
- int16_t rule = row->fLookAhead;
- if (rule != 0) {
- // At the position of a '/' in a look-ahead match. Record it.
- int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- lookAheadMatches.setPosition(rule, pos);
- }
-
- if (state == STOP_STATE) {
- // This is the normal exit from the lookup state machine.
- // We have advanced through the string until it is certain that no
- // longer match is possible, no matter what characters follow.
- break;
- }
-
- // Move (backwards) to the next character to process.
- // If this is a beginning-of-input loop iteration, don't advance
- // the input position. The next iteration will be processing the
- // first real input character.
- if (mode == RBBI_RUN) {
- c = UTEXT_PREVIOUS32(&fText);
- } else {
- if (mode == RBBI_START) {
- mode = RBBI_RUN;
- }
- }
- }
-
- // The state machine is done. Check whether it found a match...
-
- // If the iterator failed to advance in the match engine, force it ahead by one.
- // (This really indicates a defect in the break rules. They should always match
- // at least one character.)
- if (result == initialPosition) {
- UTEXT_SETNATIVEINDEX(&fText, initialPosition);
- UTEXT_PREVIOUS32(&fText);
- result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- }
-
- #ifdef RBBI_DEBUG
- if (gTrace) {
- RBBIDebugPrintf("result = %d\n\n", result);
- }
- #endif
- return result;
-}
-
-
//-----------------------------------------------------------------------------------
//
// handleSafePrevious()
void RBBIDataWrapper::init0() {
fHeader = NULL;
fForwardTable = NULL;
- fSafeRevTable = NULL;
+ fReverseTable = NULL;
fRuleSource = NULL;
fRuleStatusTable = NULL;
fTrie = NULL;
if (data->fRTableLen != 0) {
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
}
- if (data->fSRTableLen != 0) { // TODO: obsolete. Remove.
- fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
- }
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(uint8_t *)data + fHeader->fTrie,
RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
printTable("Forward State Transition Table", fForwardTable);
- printTable("Safe Reverse State Transition Table", fSafeRevTable);
+ printTable("Reverse State Transition Table", fReverseTable);
RBBIDebugPrintf("\nOrignal Rules source:\n");
for (int32_t c=0; fRuleSource[c] != 0; c++) {
outBytes+tableStartOffset+topSize, status);
}
- // Safe Forward state table. Same layout as forward table, above.
- tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
- tableLength = ds->readUInt32(rbbiDH->fSFTableLen);
-
- if (tableLength > 0) {
- ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
- outBytes+tableStartOffset, status);
- ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
- outBytes+tableStartOffset+topSize, status);
- }
-
- // Safe Reverse state table. Same layout as forward table, above.
- tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
- tableLength = ds->readUInt32(rbbiDH->fSRTableLen);
-
- if (tableLength > 0) {
- ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
- outBytes+tableStartOffset, status);
- ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
- outBytes+tableStartOffset+topSize, status);
- }
-
// Trie table for character categories
utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
uint32_t fFTableLen;
uint32_t fRTable; /* Offset to the reverse state transition table. */
uint32_t fRTableLen;
- uint32_t fSFTable; /* safe point forward transition table */
- uint32_t fSFTableLen;
- uint32_t fSRTable; /* safe point reverse transition table */
- uint32_t fSRTableLen;
uint32_t fTrie; /* Offset to Trie data for character categories */
uint32_t fTrieLen;
uint32_t fRuleSource; /* Offset to the source for for the break */
/* */
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
- const RBBIStateTable *fReverseTable; // auto-generated safe reverse.
- const RBBIStateTable *fSafeRevTable; // hand-written safe reverse. TODO: delete this.
+ const RBBIStateTable *fReverseTable;
const UChar *fRuleSource;
const int32_t *fRuleStatusTable;
fSafeFwdTree = NULL;
fSafeRevTree = NULL;
fDefaultTree = &fForwardTree;
- fForwardTables = NULL;
- fSafeRevTables = NULL;
+ fForwardTable = NULL;
fRuleStatusVals = NULL;
fChainRules = FALSE;
fLBCMNoChain = FALSE;
delete fUSetNodes;
delete fSetBuilder;
- delete fForwardTables;
- delete fSafeRevTables;
-
+ delete fForwardTable;
delete fForwardTree;
delete fReverseTree;
delete fSafeFwdTree;
// without the padding.
//
int32_t headerSize = align8(sizeof(RBBIDataHeader));
- int32_t forwardTableSize = align8(fForwardTables->getTableSize());
- int32_t reverseTableSize = align8(fForwardTables->getSafeTableSize());
- int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); // TODO: remove hand-written rules.
+ int32_t forwardTableSize = align8(fForwardTable->getTableSize());
+ int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
int32_t totalSize = headerSize
+ forwardTableSize
+ reverseTableSize
- + safeRevTableSize
+ statusTableSize + trieSize + rulesSize;
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
data->fRTable = data->fFTable + data->fFTableLen;
data->fRTableLen = reverseTableSize;
- // Do not save the Safe Forward table.
- data->fSFTable = data->fRTable + data->fRTableLen;
- data->fSFTableLen = 0;
-
- // Hand written reverse rules. TODO: remove, once synthesized ones are working.
- data->fSRTable = data->fSFTable + data->fSFTableLen;
- data->fSRTableLen = safeRevTableSize;
- U_ASSERT(safeRevTableSize > 0);
-
- data->fTrie = data->fSRTable + data->fSRTableLen;
+ data->fTrie = data->fRTable + data->fRTableLen;
data->fTrieLen = fSetBuilder->getTrieSize();
data->fStatusTable = data->fTrie + trieSize;
data->fStatusTableLen= statusTableSize;
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
- fForwardTables->exportTable((uint8_t *)data + data->fFTable);
- fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable);
- fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
-
+ fForwardTable->exportTable((uint8_t *)data + data->fFTable);
+ fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
//
// Generate the DFA state transition table.
//
- fForwardTables = new RBBITableBuilder(this, &fForwardTree, status);
- fSafeRevTables = new RBBITableBuilder(this, &fSafeRevTree, status);
- if (fForwardTables == nullptr || fSafeRevTables == nullptr)
- {
+ fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
+ if (fForwardTable == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
- fForwardTables->build();
- fSafeRevTables->build();
+ fForwardTable->buildForwardTable();
optimizeTables();
- fForwardTables->buildSafe(status);
+ fForwardTable->buildSafeReverseTable(status);
#ifdef RBBI_DEBUG
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
- fForwardTables->printStates();
- fForwardTables->printRuleStatusTable();
- fForwardTables->printSafeTable();
+ fForwardTable->printStates();
+ fForwardTable->printRuleStatusTable();
+ fForwardTable->printReverseTable();
}
#endif
leftClass = 3;
rightClass = 0;
- while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
+ while (fForwardTable->findDuplCharClassFrom(leftClass, rightClass)) {
fSetBuilder->mergeCategories(leftClass, rightClass);
- fForwardTables->removeColumn(rightClass);
- fSafeRevTables->removeColumn(rightClass);
+ fForwardTable->removeColumn(rightClass);
}
-
- fForwardTables->removeDuplicateStates();
- fSafeRevTables->removeDuplicateStates();
+ fForwardTable->removeDuplicateStates();
}
U_NAMESPACE_END
RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
UVector *fUSetNodes; // Vector of all uset nodes.
- RBBITableBuilder *fForwardTables; // State transition tables
- RBBITableBuilder *fSafeRevTables;
+ RBBITableBuilder *fForwardTable; // State transition table, build time form.
UVector *fRuleStatusVals; // The values that can be returned
// from getRuleStatus().
// table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
-void RBBITableBuilder::build() {
+void RBBITableBuilder::buildForwardTable() {
if (U_FAILURE(*fStatus)) {
return;
return false;
}
+/**
+ * Scan the safe reverse table (fSafeTable) for the next pair of rows that can be merged.
+ *
+ * Two rows are treated as duplicates when, in every column, their entries are either
+ * equal, or both entries name one of the two states being compared. The second case
+ * lets rows match when they differ only in transitions among the pair itself.
+ *
+ * This is an iterator-style function: the caller owns the loop variables and calls
+ * repeatedly, removing the reported duplicate between calls.
+ *
+ * @param firstState in/out. The scan resumes at the state number stored here; when a
+ *                   pair is found it holds the lower-numbered state of that pair.
+ * @param duplState  output. Overwritten before it is read; when a pair is found it
+ *                   holds the higher-numbered state of that pair.
+ * @return true if a duplicate pair was found, false when no further pair exists.
+ */
+bool RBBITableBuilder::findDuplicateSafeState(int32_t *firstState, int32_t *duplState) {
+ int32_t numStates = fSafeTable->size();
+
+ for (; *firstState<numStates-1; ++(*firstState)) {   // no init: continue from the caller's position
+ UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(*firstState));
+ for (*duplState=*firstState+1; *duplState<numStates; ++(*duplState)) {   // only look at later rows
+ UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(*duplState));
+ bool rowsMatch = true;
+ int32_t numCols = firstRow->length();   // NOTE(review): assumes duplRow has the same length - confirm
+ for (int32_t col=0; col < numCols; ++col) {
+ int32_t firstVal = firstRow->charAt(col);
+ int32_t duplVal = duplRow->charAt(col);
+ if (!((firstVal == duplVal) ||   // identical entries, or ...
+ ((firstVal == *firstState || firstVal == *duplState) &&   // ... both entries refer to
+ (duplVal == *firstState || duplVal == *duplState)))) {   //     one of the two candidates.
+ rowsMatch = false;
+ break;
+ }
+ }
+ if (rowsMatch) {
+ return true;   // *firstState and *duplState identify the pair
+ }
+ }
+ }
+ return false;
+}
+
+
void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
U_ASSERT(keepState < duplState);
U_ASSERT(duplState < fDStates->size());
}
}
+/**
+ * Remove a duplicate row from the safe reverse table (fSafeTable) and fix up
+ * every remaining table entry so it still refers to the intended row.
+ *
+ * @param keepState  The state to keep. Must be less than duplState.
+ * @param duplState  The state to delete. Entries equal to duplState are redirected
+ *                   to keepState; entries greater than duplState are decremented
+ *                   by one to account for the removed row.
+ */
+void RBBITableBuilder::removeSafeState(int32_t keepState, int32_t duplState) {
+    U_ASSERT(keepState < duplState);
+    U_ASSERT(duplState < fSafeTable->size());
+
+    fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
+                                              // and will auto-delete the removed element.
+    int32_t numStates = fSafeTable->size();
+    for (int32_t state=0; state<numStates; ++state) {
+        UnicodeString *sd = static_cast<UnicodeString *>(fSafeTable->elementAt(state));
+        // Take the column count from the row itself, as findDuplicateSafeState() does,
+        // rather than from the set builder. This keeps the loop within the row's
+        // actual contents even if the two counts were ever to disagree.
+        int32_t numCols = sd->length();
+        for (int32_t col=0; col<numCols; col++) {
+            int32_t existingVal = sd->charAt(col);
+            int32_t newVal = existingVal;
+            if (existingVal == duplState) {
+                newVal = keepState;
+            } else if (existingVal > duplState) {
+                newVal = existingVal - 1;
+            }
+            sd->setCharAt(col, newVal);
+        }
+    }
+}
+
/*
* RemoveDuplicateStates
}
}
+
//-----------------------------------------------------------------------------
//
// getTableSize() Calculate the size of the runtime form of this
/**
* Synthesize a safe state table from the main state table.
*/
-void RBBITableBuilder::buildSafe(UErrorCode &status) {
+void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
// Find safe char class pairs.
// make a state table row for each trailing class, and map from class to row.
rowState.setCharAt(c1, 0);
}
- // TODO: Merge similar states.
-
+ // Remove duplicate or redundant rows from the table.
+ int32_t firstState = 1;
+ int32_t duplicateState = 0; // initial value is not used; set by findDuplicateSafeState().
+ while (findDuplicateSafeState(&firstState, &duplicateState)) {
+ // printf("Removing duplicate safe states (%d, %d)\n", firstState, duplicateState);
+ removeSafeState(firstState, duplicateState);
+ }
}
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
-void RBBITableBuilder::printSafeTable() {
+void RBBITableBuilder::printReverseTable() {
int c; // input "character"
int n; // state number
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
~RBBITableBuilder();
- void build();
+ void buildForwardTable();
/** Return the runtime size in bytes of the built state table. */
int32_t getTableSize() const;
/** Check for, and remove dupicate states (table rows). */
void removeDuplicateStates();
- void buildSafe(UErrorCode &status);
+ /** Build the safe reverse table from the already-constructed forward table. */
+ void buildSafeReverseTable(UErrorCode &status);
/** Return the runtime size in bytes of the built safe reverse state table. */
int32_t getSafeTableSize() const;
*/
void removeState(int32_t keepState, int32_t duplState);
+ /** Find the next duplicate state in the safe reverse table. An iterator function.
+ * @param firstState ptr to state variable. Begin looking at this state, set to the first of the
+ * pair of duplicates on return.
+ * @param duplicateState ptr to where to return the duplicate state of firstState. Output only.
+ * @return true if a duplicate pair of states was found.
+ */
+ bool findDuplicateSafeState(int32_t *firstState, int32_t *duplicateState);
+
+ /** Remove a duplicate state from the safe table.
+ * @param keepState First of the duplicate pair. Keep it.
+ * @param duplState Duplicate state. Remove it. Redirect all table references to the duplicate state
+ * to refer to keepState instead.
+ */
+ void removeSafeState(int32_t keepState, int32_t duplState);
+
// Set functions for UVector.
// TODO: make a USet subclass of UVector
void printPosSets(RBBINode *n /* = NULL*/);
void printStates();
void printRuleStatusTable();
- void printSafeTable();
+ void printReverseTable();
#else
#define printSet(s)
#define printPosSets(n)
## -------------------------------------------------
!!chain;
!!lookAheadHardBreak;
-!!forward;
$CR $LF;
# GB 999 Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-$Regional_Indicator $Regional_Indicator;
-($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
-
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $HH) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $BAX) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $HH) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $BAX) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
-!!forward;
-
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# LB 9
-^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-^$CM+ $SP / .;
-
-# LB 14
-$SP+ $CM* $OP;
-
-# LB 15
-$SP+ $CM* $QU;
-
-# LB 16
-$SP+ $CM* ($CL | $CP);
-
-# LB 17
-$SP+ $CM* $B2;
-
-# LB 21
-$CM* ($HY | $BA | $HH) $CM* $HL;
-
-# LB 25
-($CM* ($IS | $SY))+ $CM* $NU;
-($CL | $CP) $CM* ($NU | $IS | $SY);
-
-# LB 30
-($CM* $RI)+;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
!!chain;
-!!forward;
# Rule 3 - break after separators. Keep CR/LF together.
#
#Rule 12
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-$SpEx_R = ($Extend | $Format)* $Sp;
-$ATermEx_R = ($Extend | $Format)* $ATerm;
-$STermEx_R = ($Extend | $Format)* $STerm;
-$CloseEx_R = ($Extend | $Format)* $Close;
-
-[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
-#.*;
-
-# Explanation for this rule:
-#
-# It needs to back over
-# The $Sep at which we probably begin
-# All of the non $Sep chars leading to the preceding $Sep
-# The preceding $Sep, which will be the second one that the rule matches.
-# Any immediately preceding STerm or ATerm sequences. We need to see these
-# to get the correct rule status when moving forwards again.
-#
-# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
-# the entire string. TODO: can bof be replaced with ^
-#
-# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be
-# at the beginning of the string at this point, and we don't want to fail.
-# Can only use {eof} once, and it is used later.
-#
## -------------------------------------------------
!!chain;
-!!forward;
# Rule 3 - break after separators. Keep CR/LF together.
#
#Rule 12
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-$SpEx_R = ($Extend | $Format)* $Sp;
-$ATermEx_R = ($Extend | $Format)* $ATerm;
-$STermEx_R = ($Extend | $Format)* $STerm;
-$CloseEx_R = ($Extend | $Format)* $Close;
-
-#
-# Reverse rules.
-# For now, use the old style inexact reverse rules, which are easier
-# to write, but less efficient.
-# TODO: exact reverse rules. It appears that exact reverse rules
-# may require improving support for look-ahead breaks in the
-# builder. Needs more investigation.
-#
-
-[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
-
-# Explanation for this rule:
-#
-# It needs to back over
-# The $Sep at which we probably begin
-# All of the non $Sep chars leading to the preceding $Sep
-# The preceding $Sep, which will be the second one that the rule matches.
-# Any immediately preceding STerm or ATerm sequences. We need to see these
-# to get the correct rule status when moving forwards again.
-#
-# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
-# the entire string.
-#
-# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be
-# at the beginning of the string at this point, and we don't want to fail.
-# Can only use {eof} once, and it is used later.
-#
$Cased = [[:Upper_Case:][:Lower_Case:][:Lt:] - $CaseIgnorable];
$NotCased = [[^ $Cased] - $CaseIgnorable];
-!!forward;
-
# If the iterator begins on a CaseIgnorable, advance it past it/them.
# This can occur at the start-of-text, or after application of the
# safe-reverse rule.
# the uncased characters following the word.
$Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
-
-
-!!safe_reverse;
-
-# Safe Reverse: the exact forward rule must not start in the middle
-# of a word, so the safe reverse skips over any Cased characters,
-# leaving it just before the start of a word.
-
-($Cased | $CaseIgnorable)*;
## -------------------------------------------------
-!!forward;
-
-
# Rule 3 - CR x LF
#
$CR $LF;
# Rule 999
# Match a single code point if no other rule applies.
.;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
-
-# For dictionary-based break
-$dictionary $dictionary;
## -------------------------------------------------
-!!forward;
-
-
# Rule 3 - CR x LF
#
$CR $LF;
# Rule 999
# Match a single code point if no other rule applies.
.;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
-
-# For dictionary-based break
-$dictionary $dictionary;