};
+// Map from look-ahead break states (corresponds to rules) to boundary positions.
+// Allows multiple lookahead break rules to be in flight at the same time.
+//
+// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
+// in the state table be sequential, then we can just index an array. And the
+// table could also tell us in advance how big that array needs to be.
+//
+// Before ICU 57 there was just a single simple variable for a look-ahead match that
+// was in progress. Two rules at once did not work.
+
+static const int32_t kMaxLookaheads = 8;
+
+// Fixed-capacity map from a look-ahead rule key to the text position recorded
+// for that key. Holds at most kMaxLookaheads entries; lookups are a linear scan.
+// The arrays are sized from kMaxLookaheads so that the capacity check in
+// setPosition() can never disagree with the real array bounds.
+struct LookAheadResults {
+    int32_t    fUsedSlotLimit;               // Number of slots currently in use.
+    int32_t    fPositions[kMaxLookaheads];   // fPositions[i] is the position stored for fKeys[i].
+    int16_t    fKeys[kMaxLookaheads];        // Keys of the occupied slots, in insertion order.
+
+    LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
+
+    // Return the position stored for key.
+    // If no entry exists for key, trips U_ASSERT and returns -1.
+    int32_t getPosition(int16_t key) {
+        for (int32_t i=0; i<fUsedSlotLimit; ++i) {
+            if (fKeys[i] == key) {
+                return fPositions[i];
+            }
+        }
+        U_ASSERT(FALSE);
+        return -1;
+    }
+
+    // Store position for key, replacing any position already stored for it.
+    // A new key takes the next free slot.
+    void setPosition(int16_t key, int32_t position) {
+        int32_t i;
+        for (i=0; i<fUsedSlotLimit; ++i) {
+            if (fKeys[i] == key) {
+                fPositions[i] = position;
+                return;
+            }
+        }
+        if (i >= kMaxLookaheads) {
+            // Table is full. Trip the assert, then reuse the last slot rather
+            // than write past the end of the arrays.
+            U_ASSERT(FALSE);
+            i = kMaxLookaheads - 1;
+        }
+        fKeys[i] = key;
+        fPositions[i] = position;
+        U_ASSERT(fUsedSlotLimit == i);
+        fUsedSlotLimit = i + 1;
+    }
+};
+
+
//-----------------------------------------------------------------------------------
//
// handleNext(stateTable)
RBBIStateTableRow *row;
UChar32 c;
- int32_t lookaheadStatus = 0;
- int32_t lookaheadTagIdx = 0;
- int32_t result = 0;
- int32_t initialPosition = 0;
- int32_t lookaheadResult = 0;
- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
- const char *tableData = statetable->fTableData;
- uint32_t tableRowLen = statetable->fRowLen;
+ LookAheadResults lookAheadMatches;
+ int32_t result = 0;
+ int32_t initialPosition = 0;
+ const char *tableData = statetable->fTableData;
+ uint32_t tableRowLen = statetable->fRowLen;
#ifdef RBBI_DEBUG
if (fTrace) {
// We have already run the loop one last time with the
// character set to the psueudo {eof} value. Now it is time
// to unconditionally bail out.
- if (lookaheadResult > result) {
- // We ran off the end of the string with a pending look-ahead match.
- // Treat this as if the look-ahead condition had been met, and return
- // the match at the / position from the look-ahead rule.
- result = lookaheadResult;
- fLastRuleStatusIndex = lookaheadTagIdx;
- lookaheadStatus = 0;
- }
break;
}
// Run the loop one last time with the fake end-of-input character category.
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
}
- if (row->fLookAhead != 0) {
- if (lookaheadStatus != 0
- && row->fAccepting == lookaheadStatus) {
- // Lookahead match is completed.
- result = lookaheadResult;
- fLastRuleStatusIndex = lookaheadTagIdx;
- lookaheadStatus = 0;
- // TODO: make a standalone hard break in a rule work.
- if (lookAheadHardBreak) {
- UTEXT_SETNATIVEINDEX(fText, result);
- return result;
- }
- // Look-ahead completed, but other rules may match further. Continue on
- // TODO: junk this feature? I don't think it's used anywhwere.
- goto continueOn;
+ int16_t completedRule = row->fAccepting;
+ if (completedRule > 0) {
+ // Lookahead match is completed.
+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
+ if (lookaheadResult >= 0) {
+ fLastRuleStatusIndex = row->fTagIdx;
+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
+ return lookaheadResult;
}
-
- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
- lookaheadResult = r;
- lookaheadStatus = row->fLookAhead;
- lookaheadTagIdx = row->fTagIdx;
- goto continueOn;
}
-
-
- if (row->fAccepting != 0) {
- // Because this is an accepting state, any in-progress look-ahead match
- // is no longer relavant. Clear out the pending lookahead status.
- lookaheadStatus = 0; // clear out any pending look-ahead match.
+ int16_t rule = row->fLookAhead;
+ if (rule != 0) {
+ // At the position of a '/' in a look-ahead match. Record it.
+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+ lookAheadMatches.setPosition(rule, pos);
}
-continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
// We have advanced through the string until it is certain that no
RBBIRunMode mode;
RBBIStateTableRow *row;
UChar32 c;
- int32_t lookaheadStatus = 0;
+ LookAheadResults lookAheadMatches;
int32_t result = 0;
int32_t initialPosition = 0;
- int32_t lookaheadResult = 0;
- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
#ifdef RBBI_DEBUG
if (fTrace) {
// We have already run the loop one last time with the
// character set to the psueudo {eof} value. Now it is time
// to unconditionally bail out.
- if (lookaheadResult < result) {
- // We ran off the end of the string with a pending look-ahead match.
- // Treat this as if the look-ahead condition had been met, and return
- // the match at the / position from the look-ahead rule.
- result = lookaheadResult;
- lookaheadStatus = 0;
- } else if (result == initialPosition) {
+ if (result == initialPosition) {
// Ran off start, no match found.
// move one index one (towards the start, since we are doing a previous())
UTEXT_SETNATIVEINDEX(fText, initialPosition);
result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
}
- if (row->fLookAhead != 0) {
- if (lookaheadStatus != 0
- && row->fAccepting == lookaheadStatus) {
- // Lookahead match is completed.
- result = lookaheadResult;
- lookaheadStatus = 0;
- // TODO: make a standalone hard break in a rule work.
- if (lookAheadHardBreak) {
- UTEXT_SETNATIVEINDEX(fText, result);
- return result;
- }
- // Look-ahead completed, but other rules may match further. Continue on
- // TODO: junk this feature? I don't think it's used anywhwere.
- goto continueOn;
+ int16_t completedRule = row->fAccepting;
+ if (completedRule > 0) {
+ // Lookahead match is completed.
+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
+ if (lookaheadResult >= 0) {
+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
+ return lookaheadResult;
}
-
- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
- lookaheadResult = r;
- lookaheadStatus = row->fLookAhead;
- goto continueOn;
}
-
-
- if (row->fAccepting != 0) {
- // Because this is an accepting state, any in-progress look-ahead match
- // is no longer relavant. Clear out the pending lookahead status.
- lookaheadStatus = 0;
+ int16_t rule = row->fLookAhead;
+ if (rule != 0) {
+ // At the position of a '/' in a look-ahead match. Record it.
+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+ lookAheadMatches.setPosition(rule, pos);
}
-continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
// We have advanced through the string until it is certain that no
/*
***************************************************************************
-* Copyright (C) 2002-2008 International Business Machines Corporation *
+* Copyright (C) 2002-2016 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
fLastPos = 0;
fNullable = FALSE;
fLookAheadEnd = FALSE;
+ fRuleRoot = FALSE;
+ fChainIn = FALSE;
fVal = 0;
fPrecedence = precZero;
fLastPos = other.fLastPos;
fNullable = other.fNullable;
fVal = other.fVal;
+ fRuleRoot = FALSE;
+ fChainIn = other.fChainIn;
UErrorCode status = U_ZERO_ERROR;
fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
fLastPosSet = new UVector(status);
}
}
}
+ n->fRuleRoot = this->fRuleRoot;
+ n->fChainIn = this->fChainIn;
return n;
}
//
//-------------------------------------------------------------------------
#ifdef RBBI_DEBUG
+
+// Debug helper: serial number of a node, or -1 when the node pointer is NULL.
+static int32_t serial(const RBBINode *node) {
+    if (node == NULL) {
+        return -1;
+    }
+    return node->fSerialNum;
+}
+
+
void RBBINode::printNode() {
static const char * const nodeTypeNames[] = {
"setRef",
if (this==NULL) {
RBBIDebugPrintf("%10p", (void *)this);
} else {
- RBBIDebugPrintf("%10p %12s %10p %10p %10p %4d %6d %d ",
- (void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild,
- fSerialNum, fFirstPos, fVal);
+ RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ",
+ (void *)this, fSerialNum, nodeTypeNames[fType], fRuleRoot?'R':' ', fChainIn?'C':' ',
+ serial(fLeftChild), serial(fRightChild), serial(fParent),
+ fFirstPos, fVal);
if (fType == varRef) {
RBBI_DEBUG_printUnicodeString(fText);
}
//
//-------------------------------------------------------------------------
#ifdef RBBI_DEBUG
+void RBBINode::printNodeHeader() {   // Debug only: print the column-heading line for node dumps.
+ RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n");
+}
+
void RBBINode::printTree(UBool printHeading) {
if (printHeading) {
- RBBIDebugPrintf( "-------------------------------------------------------------------\n"
- " Address type Parent LeftChild RightChild serial position value\n"
- );
+ printNodeHeader();
}
this->printNode();
if (this != NULL) {
UBool fLookAheadEnd; // For endMark nodes, set TRUE if
// marking the end of a look-ahead rule.
+ UBool fRuleRoot; // True if this node is the root of a rule.
+ UBool fChainIn; // True if chaining into this rule is allowed
+ // (no '^' present).
+
UVector *fFirstPosSet;
UVector *fLastPosSet; // TODO: rename fFirstPos & fLastPos to avoid confusion.
UVector *fFollowPos;
void findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status);
#ifdef RBBI_DEBUG
+ static void printNodeHeader();
void printNode();
void printTree(UBool withHeading);
#endif
UBool operator == (const RBBINode &other); // Private, so these functions won't accidently be used.
#ifdef RBBI_DEBUG
+ public:
int fSerialNum; // Debugging aids.
#endif
};
doExprStart,
doLParen,
doNOP,
+ doNoChain,
doOptionEnd,
doOptionStart,
doReverseDir,
static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
- , {doExprStart, 254, 21, 8, FALSE} // 1 start
+ , {doExprStart, 254, 29, 9, FALSE} // 1 start
, {doNOP, 132, 1,0, TRUE} // 2
- , {doExprStart, 36 /* $ */, 80, 90, FALSE} // 3
- , {doNOP, 33 /* ! */, 11,0, TRUE} // 4
- , {doNOP, 59 /* ; */, 1,0, TRUE} // 5
- , {doNOP, 252, 0,0, FALSE} // 6
- , {doExprStart, 255, 21, 8, FALSE} // 7
- , {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
- , {doNOP, 132, 8,0, TRUE} // 9
- , {doRuleError, 255, 95,0, FALSE} // 10
- , {doNOP, 33 /* ! */, 13,0, TRUE} // 11 rev-option
- , {doReverseDir, 255, 20, 8, FALSE} // 12
- , {doOptionStart, 130, 15,0, TRUE} // 13 option-scan1
- , {doRuleError, 255, 95,0, FALSE} // 14
- , {doNOP, 129, 15,0, TRUE} // 15 option-scan2
- , {doOptionEnd, 255, 17,0, FALSE} // 16
- , {doNOP, 59 /* ; */, 1,0, TRUE} // 17 option-scan3
- , {doNOP, 132, 17,0, TRUE} // 18
- , {doRuleError, 255, 95,0, FALSE} // 19
- , {doExprStart, 255, 21, 8, FALSE} // 20 reverse-rule
- , {doRuleChar, 254, 30,0, TRUE} // 21 term
- , {doNOP, 132, 21,0, TRUE} // 22
- , {doRuleChar, 131, 30,0, TRUE} // 23
- , {doNOP, 91 /* [ */, 86, 30, FALSE} // 24
- , {doLParen, 40 /* ( */, 21, 30, TRUE} // 25
- , {doNOP, 36 /* $ */, 80, 29, FALSE} // 26
- , {doDotAny, 46 /* . */, 30,0, TRUE} // 27
- , {doRuleError, 255, 95,0, FALSE} // 28
- , {doCheckVarDef, 255, 30,0, FALSE} // 29 term-var-ref
- , {doNOP, 132, 30,0, TRUE} // 30 expr-mod
- , {doUnaryOpStar, 42 /* * */, 35,0, TRUE} // 31
- , {doUnaryOpPlus, 43 /* + */, 35,0, TRUE} // 32
- , {doUnaryOpQuestion, 63 /* ? */, 35,0, TRUE} // 33
- , {doNOP, 255, 35,0, FALSE} // 34
- , {doExprCatOperator, 254, 21,0, FALSE} // 35 expr-cont
- , {doNOP, 132, 35,0, TRUE} // 36
- , {doExprCatOperator, 131, 21,0, FALSE} // 37
- , {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 38
- , {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 39
- , {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 40
- , {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 41
- , {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 42
- , {doExprCatOperator, 123 /* { */, 59,0, TRUE} // 43
- , {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 44
- , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 45
- , {doExprFinished, 255, 255,0, FALSE} // 46
- , {doSlash, 47 /* / */, 49,0, TRUE} // 47 look-ahead
- , {doNOP, 255, 95,0, FALSE} // 48
- , {doExprCatOperator, 254, 21,0, FALSE} // 49 expr-cont-no-slash
- , {doNOP, 132, 35,0, TRUE} // 50
- , {doExprCatOperator, 131, 21,0, FALSE} // 51
- , {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 52
- , {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 53
- , {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 54
- , {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 55
- , {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 56
- , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
- , {doExprFinished, 255, 255,0, FALSE} // 58
- , {doNOP, 132, 59,0, TRUE} // 59 tag-open
- , {doStartTagValue, 128, 62,0, FALSE} // 60
- , {doTagExpectedError, 255, 95,0, FALSE} // 61
- , {doNOP, 132, 66,0, TRUE} // 62 tag-value
- , {doNOP, 125 /* } */, 66,0, FALSE} // 63
- , {doTagDigit, 128, 62,0, TRUE} // 64
- , {doTagExpectedError, 255, 95,0, FALSE} // 65
- , {doNOP, 132, 66,0, TRUE} // 66 tag-close
- , {doTagValue, 125 /* } */, 69,0, TRUE} // 67
- , {doTagExpectedError, 255, 95,0, FALSE} // 68
- , {doExprCatOperator, 254, 21,0, FALSE} // 69 expr-cont-no-tag
- , {doNOP, 132, 69,0, TRUE} // 70
- , {doExprCatOperator, 131, 21,0, FALSE} // 71
- , {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 72
- , {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 73
- , {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 74
- , {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 75
- , {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 76
- , {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 77
- , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 78
- , {doExprFinished, 255, 255,0, FALSE} // 79
- , {doStartVariableName, 36 /* $ */, 82,0, TRUE} // 80 scan-var-name
- , {doNOP, 255, 95,0, FALSE} // 81
- , {doNOP, 130, 84,0, TRUE} // 82 scan-var-start
- , {doVariableNameExpectedErr, 255, 95,0, FALSE} // 83
- , {doNOP, 129, 84,0, TRUE} // 84 scan-var-body
- , {doEndVariableName, 255, 255,0, FALSE} // 85
- , {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 86 scan-unicode-set
- , {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 87
- , {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 88
- , {doNOP, 255, 95,0, FALSE} // 89
- , {doNOP, 132, 90,0, TRUE} // 90 assign-or-rule
- , {doStartAssign, 61 /* = */, 21, 93, TRUE} // 91
- , {doNOP, 255, 29, 8, FALSE} // 92
- , {doEndAssign, 59 /* ; */, 1,0, TRUE} // 93 assign-end
- , {doRuleErrorAssignExpr, 255, 95,0, FALSE} // 94
- , {doExit, 255, 95,0, TRUE} // 95 errorDeath
+ , {doNoChain, 94 /* ^ */, 12, 9, TRUE} // 3
+ , {doExprStart, 36 /* $ */, 88, 98, FALSE} // 4
+ , {doNOP, 33 /* ! */, 19,0, TRUE} // 5
+ , {doNOP, 59 /* ; */, 1,0, TRUE} // 6
+ , {doNOP, 252, 0,0, FALSE} // 7
+ , {doExprStart, 255, 29, 9, FALSE} // 8
+ , {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 9 break-rule-end
+ , {doNOP, 132, 9,0, TRUE} // 10
+ , {doRuleError, 255, 103,0, FALSE} // 11
+ , {doExprStart, 254, 29,0, FALSE} // 12 start-after-caret
+ , {doNOP, 132, 12,0, TRUE} // 13
+ , {doRuleError, 94 /* ^ */, 103,0, FALSE} // 14
+ , {doExprStart, 36 /* $ */, 88, 37, FALSE} // 15
+ , {doRuleError, 59 /* ; */, 103,0, FALSE} // 16
+ , {doRuleError, 252, 103,0, FALSE} // 17
+ , {doExprStart, 255, 29,0, FALSE} // 18
+ , {doNOP, 33 /* ! */, 21,0, TRUE} // 19 rev-option
+ , {doReverseDir, 255, 28, 9, FALSE} // 20
+ , {doOptionStart, 130, 23,0, TRUE} // 21 option-scan1
+ , {doRuleError, 255, 103,0, FALSE} // 22
+ , {doNOP, 129, 23,0, TRUE} // 23 option-scan2
+ , {doOptionEnd, 255, 25,0, FALSE} // 24
+ , {doNOP, 59 /* ; */, 1,0, TRUE} // 25 option-scan3
+ , {doNOP, 132, 25,0, TRUE} // 26
+ , {doRuleError, 255, 103,0, FALSE} // 27
+ , {doExprStart, 255, 29, 9, FALSE} // 28 reverse-rule
+ , {doRuleChar, 254, 38,0, TRUE} // 29 term
+ , {doNOP, 132, 29,0, TRUE} // 30
+ , {doRuleChar, 131, 38,0, TRUE} // 31
+ , {doNOP, 91 /* [ */, 94, 38, FALSE} // 32
+ , {doLParen, 40 /* ( */, 29, 38, TRUE} // 33
+ , {doNOP, 36 /* $ */, 88, 37, FALSE} // 34
+ , {doDotAny, 46 /* . */, 38,0, TRUE} // 35
+ , {doRuleError, 255, 103,0, FALSE} // 36
+ , {doCheckVarDef, 255, 38,0, FALSE} // 37 term-var-ref
+ , {doNOP, 132, 38,0, TRUE} // 38 expr-mod
+ , {doUnaryOpStar, 42 /* * */, 43,0, TRUE} // 39
+ , {doUnaryOpPlus, 43 /* + */, 43,0, TRUE} // 40
+ , {doUnaryOpQuestion, 63 /* ? */, 43,0, TRUE} // 41
+ , {doNOP, 255, 43,0, FALSE} // 42
+ , {doExprCatOperator, 254, 29,0, FALSE} // 43 expr-cont
+ , {doNOP, 132, 43,0, TRUE} // 44
+ , {doExprCatOperator, 131, 29,0, FALSE} // 45
+ , {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 46
+ , {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 47
+ , {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 48
+ , {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 49
+ , {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 50
+ , {doExprCatOperator, 123 /* { */, 67,0, TRUE} // 51
+ , {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 52
+ , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 53
+ , {doExprFinished, 255, 255,0, FALSE} // 54
+ , {doSlash, 47 /* / */, 57,0, TRUE} // 55 look-ahead
+ , {doNOP, 255, 103,0, FALSE} // 56
+ , {doExprCatOperator, 254, 29,0, FALSE} // 57 expr-cont-no-slash
+ , {doNOP, 132, 43,0, TRUE} // 58
+ , {doExprCatOperator, 131, 29,0, FALSE} // 59
+ , {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 60
+ , {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 61
+ , {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 62
+ , {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 63
+ , {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 64
+ , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 65
+ , {doExprFinished, 255, 255,0, FALSE} // 66
+ , {doNOP, 132, 67,0, TRUE} // 67 tag-open
+ , {doStartTagValue, 128, 70,0, FALSE} // 68
+ , {doTagExpectedError, 255, 103,0, FALSE} // 69
+ , {doNOP, 132, 74,0, TRUE} // 70 tag-value
+ , {doNOP, 125 /* } */, 74,0, FALSE} // 71
+ , {doTagDigit, 128, 70,0, TRUE} // 72
+ , {doTagExpectedError, 255, 103,0, FALSE} // 73
+ , {doNOP, 132, 74,0, TRUE} // 74 tag-close
+ , {doTagValue, 125 /* } */, 77,0, TRUE} // 75
+ , {doTagExpectedError, 255, 103,0, FALSE} // 76
+ , {doExprCatOperator, 254, 29,0, FALSE} // 77 expr-cont-no-tag
+ , {doNOP, 132, 77,0, TRUE} // 78
+ , {doExprCatOperator, 131, 29,0, FALSE} // 79
+ , {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 80
+ , {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 81
+ , {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 82
+ , {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 83
+ , {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 84
+ , {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 85
+ , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 86
+ , {doExprFinished, 255, 255,0, FALSE} // 87
+ , {doStartVariableName, 36 /* $ */, 90,0, TRUE} // 88 scan-var-name
+ , {doNOP, 255, 103,0, FALSE} // 89
+ , {doNOP, 130, 92,0, TRUE} // 90 scan-var-start
+ , {doVariableNameExpectedErr, 255, 103,0, FALSE} // 91
+ , {doNOP, 129, 92,0, TRUE} // 92 scan-var-body
+ , {doEndVariableName, 255, 255,0, FALSE} // 93
+ , {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 94 scan-unicode-set
+ , {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 95
+ , {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 96
+ , {doNOP, 255, 103,0, FALSE} // 97
+ , {doNOP, 132, 98,0, TRUE} // 98 assign-or-rule
+ , {doStartAssign, 61 /* = */, 29, 101, TRUE} // 99
+ , {doNOP, 255, 37, 9, FALSE} // 100
+ , {doEndAssign, 59 /* ; */, 1,0, TRUE} // 101 assign-end
+ , {doRuleErrorAssignExpr, 255, 103,0, FALSE} // 102
+ , {doExit, 255, 103,0, TRUE} // 103 errorDeath
};
#ifdef RBBI_DEBUG
static const char * const RBBIRuleStateNames[] = { 0,
0,
0,
0,
+ 0,
0,
"break-rule-end",
0,
+ 0,
+ "start-after-caret",
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
0,
"rev-option",
0,
#*****************************************************************************
#
-# Copyright (C) 2002-2003, International Business Machines Corporation and others.
+# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
# that are then built with the rule parser.
#
+# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
#
# Here is the syntax of the state definitions in this file:
start:
escaped term ^break-rule-end doExprStart
white_space n start
+ '^' n start-after-caret ^break-rule-end doNoChain
'$' scan-var-name ^assign-or-rule doExprStart
'!' n rev-option
';' n start # ignore empty rules.
white_space n break-rule-end
default errorDeath doRuleError
-
+#
+# start of a rule, after having seen a '^' (inhibits rule chain in).
+# Similar to the main 'start' state in most respects, except
+# - empty rule is an error.
+# - A second '^' is an error.
+#
+start-after-caret:
+ escaped term doExprStart
+ white_space n start-after-caret
+ '^' errorDeath doRuleError # two '^'s
+ '$' scan-var-name ^term-var-ref doExprStart
+ ';' errorDeath doRuleError # ^ ;
+ eof errorDeath doRuleError
+ default term doExprStart
+
#
# ! We've just scanned a '!', indicating either a !!key word flag or a
# !Reverse rule.
//
// file: rbbiscan.cpp
//
-// Copyright (C) 2002-2015, International Business Machines Corporation and others.
+// Copyright (C) 2002-2016, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the Rule Based Break Iterator Rule Builder functions for
RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
{
fRB = rb;
+ fScanIndex = 0;
+ fNextIndex = 0;
+ fQuoteMode = FALSE;
+ fLineNum = 1;
+ fCharNum = 0;
+ fLastChar = 0;
+
+ fStateTable = NULL;
+ fStack[0] = 0;
fStackPtr = 0;
- fStack[fStackPtr] = 0;
- fNodeStackPtr = 0;
- fRuleNum = 0;
fNodeStack[0] = NULL;
-
- fSymbolTable = NULL;
- fSetTable = NULL;
-
- fScanIndex = 0;
- fNextIndex = 0;
+ fNodeStackPtr = 0;
fReverseRule = FALSE;
fLookAheadRule = FALSE;
+ fNoChainInRule = FALSE;
- fLineNum = 1;
- fCharNum = 0;
- fQuoteMode = FALSE;
+ fSymbolTable = NULL;
+ fSetTable = NULL;
+ fRuleNum = 0;
+ fOptionStart = 0;
// Do not check status until after all critical fields are sufficiently initialized
// that the destructor can run cleanly.
break;
+ case doNoChain:
+ // Scanned a '^' while on the rule start state.
+ fNoChainInRule = TRUE;
+ break;
+
+
case doExprOrOperator:
{
fixOpStack(RBBINode::precOpCat);
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
#endif
U_ASSERT(fNodeStackPtr == 1);
+ RBBINode *thisRule = fNodeStack[fNodeStackPtr];
// If this rule includes a look-ahead '/', add a endMark node to the
// expression tree.
if (fLookAheadRule) {
- RBBINode *thisRule = fNodeStack[fNodeStackPtr];
RBBINode *endNode = pushNewNode(RBBINode::endMark);
RBBINode *catNode = pushNewNode(RBBINode::opCat);
if (U_FAILURE(*fRB->fStatus)) {
fNodeStack[fNodeStackPtr] = catNode;
endNode->fVal = fRuleNum;
endNode->fLookAheadEnd = TRUE;
+ thisRule = catNode;
+
+ // TODO: Disable chaining out of look-ahead (hard break) rules.
+ // The break on rule match is forced, so there is no point in building up
+ // the state table to chain into another rule for a longer match.
}
+ // Mark this node as being the root of a rule.
+ thisRule->fRuleRoot = TRUE;
+
+ // Flag if chaining into this rule is wanted.
+ //
+ if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
+ !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
+ thisRule->fChainIn = TRUE;
+ }
+
+
// All rule expressions are ORed together.
// The ';' that terminates an expression really just functions as a '|' with
// a low operator prededence.
}
fReverseRule = FALSE; // in preparation for the next rule.
fLookAheadRule = FALSE;
+ fNoChainInRule = FALSE;
fNodeStackPtr = 0;
}
break;
for (;;) {
#ifdef RBBI_DEBUG
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
#endif
if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
// Table row specified an individual character, not a set, and
struct RBBIRuleChar {
UChar32 fChar;
UBool fEscaped;
+ RBBIRuleChar() : fChar(0), fEscaped(FALSE) {}; // Default state: character value 0, not escaped.
};
RBBIRuleScanner(RBBIRuleBuilder *rb);
UBool fLookAheadRule; // True if the rule includes a '/'
// somewhere within it.
+ UBool fNoChainInRule; // True if the current rule starts with a '^'.
+
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
// $variable symbols.
/*
**********************************************************************
-* Copyright (c) 2002-2009, International Business Machines
+* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
fTree = fTree->flattenVariables();
#ifdef RBBI_DEBUG
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
- RBBIDebugPuts("Parse tree after flattening variable references.");
+ RBBIDebugPuts("\nParse tree after flattening variable references.");
fTree->printTree(TRUE);
}
#endif
fTree->flattenSets();
#ifdef RBBI_DEBUG
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
- RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
+ RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
fTree->printTree(TRUE);
}
#endif
}
+//-----------------------------------------------------------------------------
+//
+//   addRuleRootNodes    Depth-first walk of a parse tree that appends every
+//                       node flagged as the root of a rule (fRuleRoot) to
+//                       the destination vector. Returns without doing
+//                       anything for a NULL node or once *fStatus is a failure.
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
+    if (node == NULL || U_FAILURE(*fStatus)) {
+        return;
+    }
+    if (!node->fRuleRoot) {
+        // Not a rule root: keep looking in both subtrees, left first.
+        addRuleRootNodes(dest, node->fLeftChild);
+        addRuleRootNodes(dest, node->fRightChild);
+        return;
+    }
+    // Rules cannot nest, so the walk does not descend below a rule root.
+    dest->addElement(node, *fStatus);
+}
//-----------------------------------------------------------------------------
//
return;
}
- // Get all nodes that can be the start a match, which is FirstPosition()
- // of the portion of the tree corresponding to user-written rules.
- // See the tree description in bofFixup().
- RBBINode *userRuleRoot = tree;
- if (fRB->fSetBuilder->sawBOF()) {
- userRuleRoot = tree->fLeftChild->fRightChild;
- }
- U_ASSERT(userRuleRoot != NULL);
- UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
+ // Collect all leaf nodes that can start matches for rules
+ // with inbound chaining enabled, which is the union of the
+ // firstPosition sets from each of the rule root nodes.
+
+ UVector ruleRootNodes(*fStatus);
+ addRuleRootNodes(&ruleRootNodes, tree);
+ UVector matchStartNodes(*fStatus);
+ for (int i=0; i<ruleRootNodes.size(); ++i) {
+ RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
+ if (node->fChainIn) {
+ setAdd(&matchStartNodes, node->fFirstPosSet);
+ }
+ }
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
- // Iteratate over all leaf nodes,
- //
int32_t endNodeIx;
int32_t startNodeIx;
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
RBBINode *startNode;
- for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
- startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
+ for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
+ startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
if (startNode->fType != RBBINode::leafChar) {
continue;
}
if (n==NULL) {
return;
}
+ printf("\n");
+ RBBINode::printNodeHeader();
n->printNode();
RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
void RBBITableBuilder::printSet(UVector *s) {
int32_t i;
for (i=0; i<s->size(); i++) {
- void *v = s->elementAt(i);
- RBBIDebugPrintf("%10p", v);
+ const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i)); // Each element is read as an RBBINode pointer.
+ RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum); // Print the node's serial number, or -1 for a NULL element.
}
RBBIDebugPrintf("\n");
}
/*
**********************************************************************
-* Copyright (c) 2002-2005, International Business Machines
+* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
void flagTaggedStates();
void mergeRuleStatusVals();
+ void addRuleRootNodes(UVector *dest, RBBINode *node);
+
// Set functions for UVector.
// TODO: make a USet subclass of UVector
#
-# Copyright (C) 2002-2015, International Business Machines Corporation and others.
+# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: char.txt
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
+# These rules are based on UAX #29 Revision 28 (Draft 3) for Unicode Version 9.0
#
#
#
$CR = [\p{Grapheme_Cluster_Break = CR}];
$LF = [\p{Grapheme_Cluster_Break = LF}];
-$Control = [\p{Grapheme_Cluster_Break = Control}];
+$Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]];
# TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
-$Extend = [\p{Grapheme_Cluster_Break = Extend}];
+$Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$LV = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];
+# Emoji definitions scraped from http://www.unicode.org/Public/emoji/2.0//emoji-data.txt
+
+$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+
+$E_Modifier = [\U0001F3FB-\U0001F3FF];
+
+$ZWJ = [\u200D];
+$GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764];
## -------------------------------------------------
!!chain;
-
+!!lookAheadHardBreak;
!!forward;
$CR $LF;
($LV | $V) ($V | $T);
($LVT | $T) $T;
-$Regional_Indicator $Regional_Indicator;
+# GB 8. Keep pairs of regional indicators together
+# Note that the hard break '/' rule triggers only if there are three or more initial RIs.
+
+^$Regional_Indicator $Regional_Indicator / $Regional_Indicator;
+^$Regional_Indicator $Regional_Indicator;
-[^$Control $CR $LF] $Extend;
+# GB 9
+[^$Control $CR $LF] ($Extend | $ZWJ);
+# GB 9a (only for extended grapheme clusters)
[^$Control $CR $LF] $SpacingMark;
-# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
+# GB 9b Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
+# GB9c Emoji proposal
+($E_Base | $GAZ) $E_Modifier;
+
+# GB 9d Don't break between ZWJ and Glue_After_Zwj
+$ZWJ $GAZ;
## -------------------------------------------------
($V | $T) ($LV | $V);
$T ($LVT | $T);
-$Regional_Indicator $Regional_Indicator;
+# GB 8. Going backwards, we must scan through any number of regional indicators as pairs.
+#
+$Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)* [{eof}[^$Regional_Indicator]];
+
+# GB 9
+($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed.
-$Extend [^$Control $CR $LF];
+# GB 9a
$SpacingMark [^$Control $CR $LF];
-# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
+# GB 9b Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
+# GB 9c
+$E_Modifier ($E_Base | $GAZ);
+
+# GB 9d Don't break between ZWJ and Glue_After_Zwj
+$GAZ $ZWJ;
## -------------------------------------------------
-# We don't logically need safe char break rules, but if we don't provide any at all
-# the engine for preceding() and following() will fall back to the
-# old style inefficient algorithm.
!!safe_reverse;
-$LF $CR;
+$Regional_Indicator $Regional_Indicator;
## -------------------------------------------------
!!safe_forward;
-$CR $LF;
-
+$Regional_Indicator $Regional_Indicator;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line.txt
#
# Line Breaking Rules
-# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
#
+# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
#
!!chain;
-!!LBCMNoChain;
-
!!lookAheadHardBreak;
#
# See rule LB 19 for an example.
#
+# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
+
+$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$EM = [\U0001F3FB-\U0001F3FF];
+
$AI = [:LineBreak = Ambiguous:];
-$AL = [:LineBreak = Alphabetic:];
+$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
-$CM = [:LineBreak = Combining_Mark:];
+$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
-$ID = [:LineBreak = Ideographic:];
+$ID = [[:LineBreak = Ideographic:][\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
+$EB $CM+;
+$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
-$CM+ $LB4Breaks {100};
+^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
-$CM+ [$SP $ZW];
+^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# LB 8a ZWJ x ID Emoji proposal.
+#
+$ZWJ ($ID | $EB | $EM);
-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-# $CM not covered by the above needs to behave like $AL
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
-$CM+;
+^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
-$CM+ $WJcm;
+^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
-
+
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
-$CM+ GLcm;
+^$CM+ $GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
-$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
-$CM+ $QUcm;
+^$CM+ $QUcm;
# QU x
$QUcm .?;
-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
- # TODO: I don't think this rule is needed.
# LB 20
# LB 21 x (BA | HY | NS)
# BB x
#
-$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
-#
+#
$HLcm ($HYcm | $BAcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# LB 22
($ALcm | $HLcm) $INcm;
-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
-$IDcm $INcm;
+($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
-$IDcm $POcm;
+($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
-$PRcm $IDcm;
+$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
-$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
-# LB 30a Do not break between regional indicators.
-$RIcm $RIcm;
+# LB 30a Do not break between regional indicators. Break after pairs of them.
+# Tricky interaction with LB8a: ZWJ x ID
+$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+
+$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
+$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EB $CM* $EM;
#
# Reverse Rules.
!!reverse;
-$CM+ $ALPlus;
-$CM+ $BA;
-$CM+ $BB;
-$CM+ $B2;
-$CM+ $CL;
-$CM+ $CP;
-$CM+ $EX;
-$CM+ $GL;
-$CM+ $HL;
-$CM+ $HY;
-$CM+ $H2;
-$CM+ $H3;
-$CM+ $ID;
-$CM+ $IN;
-$CM+ $IS;
-$CM+ $JL;
-$CM+ $JV;
-$CM+ $JT;
-$CM+ $NS;
-$CM+ $NU;
-$CM+ $OP;
-$CM+ $PO;
-$CM+ $PR;
-$CM+ $QU;
-$CM+ $RI;
-$CM+ $SY;
-$CM+ $WJ;
-$CM+;
+^$CM+ $ALPlus;
+^$CM+ $BA;
+^$CM+ $BB;
+^$CM+ $B2;
+^$CM+ $CL;
+^$CM+ $CP;
+^$CM+ $EB;
+^$CM+ $EM;
+^$CM+ $EX;
+^$CM+ $GL;
+^$CM+ $HL;
+^$CM+ $HY;
+^$CM+ $H2;
+^$CM+ $H3;
+^$CM+ $ID;
+^$CM+ $IN;
+^$CM+ $IS;
+^$CM+ $JL;
+^$CM+ $JV;
+^$CM+ $JT;
+^$CM+ $NS;
+^$CM+ $NU;
+^$CM+ $OP;
+^$CM+ $PO;
+^$CM+ $PR;
+^$CM+ $QU;
+^$CM+ $RI;
+^$CM+ $SY;
+^$CM+ $WJ;
+^$CM+;
#
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
- # LB14 says OP SP* x .
+ # LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
-
+
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
-# LB 4, 5, 5
+# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
# Requires an engine enhancement.
# / $SP* $ZW
+# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
+# The ZWJ will look like a CM to whatever precedes it.
+#
+($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
-$CM+ $CAN_CM;
+^$CM+ $CAN_CM;
# LB 11
-$CM* $WJ $CM* $CAN_CM;
-$CM* $WJ [$LB8NonBreaks-$CM];
+#
+$WJ $CM* $CAN_CM;
+$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
-$CM* $CAN_CM $CM* $WJ;
+$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
-$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
+$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
-$CM* $CAN_CM $CM* $GL;
+$CAN_CM $CM* $GL;
# LB 13
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
-[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
-$CM* $CAN_CM $SP* $CM* $OP;
+$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
-
- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
-$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
-$CM* $OP $SP* $CM* $QU;
+$OP $SP* $CM* $QU;
# LB 16
-$CM* $NS $SP* $CM* ($CL | $CP);
+$NS $SP* $CM* ($CL | $CP);
# LB 17
-$CM* $B2 $SP* $CM* $B2;
+$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
-$CM* $QU $CM* $CAN_CM; # . x QU
-$CM* $QU $LB18NonBreaks;
+$QU $CM* $CAN_CM; # . x QU
+$QU $LB18NonBreaks;
-$CM* $CAN_CM $CM* $QU; # QU x .
+$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
-
+
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
-$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
-[^$CB] $CM* $BB; #
+[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
-$CM* $HL $CM* $SY;
+$HL $CM* $SY;
# LB 22
-$CM* $IN $CM* ($ALPlus | $HL);
-$CM* $IN $CM* $EX;
-$CM* $IN $CM* $ID;
-$CM* $IN $CM* $IN;
-$CM* $IN $CM* $NU;
+$IN $CM* ($ALPlus | $HL);
+$IN $CM* $EX;
+$IN $CM* ($ID | $EB | $EM);
+$IN $CM* $IN;
+$IN $CM* $NU;
# LB 23
-$CM* $PO $CM* $ID;
-$CM* $NU $CM* ($ALPlus | $HL);
-$CM* ($ALPlus | $HL) $CM* $NU;
+$PO $CM* ($ID | $EB | $EM);
+$NU $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* $NU;
# LB 24
-$CM* $ID $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PO;
+($ID | $EB | $EM) $CM* $PR;
+($ALPlus | $HL) $CM* $PR;
+($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
-$CM* ($JT | $JV) $CM* ($H2 | $JV);
-$CM* $JT $CM* ($H3 | $JT);
+($H3 | $H2 | $JV | $JL) $CM* $JL;
+($JT | $JV) $CM* ($H2 | $JV);
+$JT $CM* ($H3 | $JT);
# LB 27
-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+ ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
-$CM* ($ALPlus | $HL) $CM* $IS;
+($ALPlus | $HL) $CM* $IS;
# LB 30
-$CM* $OP $CM* ($ALPlus | $HL | $NU);
-$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+$OP $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
-$CM* $RI $CM* $RI;
+# Pairs of Regional Indicators.
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
+# the second with an even number. Stripping away the cruft they look like
+# [^RI] RI / (RI RI)+ ^RI;
+# [^RI] RI RI / (RI RI)+ ^RI;
+#
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+
+# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
+$RI $CM* $RI;
+
+# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
+
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EM $CM* $EB;
+
## -------------------------------------------------
!!safe_reverse;
# LB 9
-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-$CM+ $SP / .;
+^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
+# LB 30
+($CM* $RI)+;
+
# For dictionary-based break
$dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_fi.txt
#
# Line Breaking Rules
-# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below..
#
+# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
#
!!chain;
-!!LBCMNoChain;
-
!!lookAheadHardBreak;
#
# See rule LB 19 for an example.
#
+# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
+
+$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$EM = [\U0001F3FB-\U0001F3FF];
+
$AI = [:LineBreak = Ambiguous:];
-$AL = [:LineBreak = Alphabetic:];
-$BA = [[:LineBreak = Break_After:] - [\u2010]];
+$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
+$BA = [:LineBreak = Break_After:];
$HH = [\u2010];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
-$CM = [:LineBreak = Combining_Mark:];
+$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
-$ID = [:LineBreak = Ideographic:];
+$ID = [[:LineBreak = Ideographic:][\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
+$EB $CM+;
+$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
-$CM+ $LB4Breaks {100};
+^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
-$CM+ [$SP $ZW];
+^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# LB 8a ZWJ x ID Emoji proposal.
+#
+$ZWJ ($ID | $EB | $EM);
-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-# $CM not covered by the above needs to behave like $AL
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
-$CM+;
+^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
-$CM+ $WJcm;
+^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
-
+
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
-$CM+ GLcm;
+^$CM+ $GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
-$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
-$CM+ $QUcm;
+^$CM+ $QUcm;
# QU x
$QUcm .?;
-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
- # TODO: I don't think this rule is needed.
# LB 20
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
($HY | $HH) $AL;
+^$CM+ ($BAcm | $HYcm | $HHcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 22
($ALcm | $HLcm) $INcm;
-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
-$IDcm $INcm;
+($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
-$IDcm $POcm;
+($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
-$PRcm $IDcm;
+$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
-$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
-# LB 30a Do not break between regional indicators.
-$RIcm $RIcm;
+# LB 30a Do not break between regional indicators. Break after pairs of them.
+# Tricky interaction with LB8a: ZWJ x ID
+$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+
+$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
+$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EB $CM* $EM;
#
# Reverse Rules.
!!reverse;
-$CM+ $ALPlus;
-$CM+ $BA;
-$CM+ $HH;
-$CM+ $BB;
-$CM+ $B2;
-$CM+ $CL;
-$CM+ $CP;
-$CM+ $EX;
-$CM+ $GL;
-$CM+ $HL;
-$CM+ $HY;
-$CM+ $H2;
-$CM+ $H3;
-$CM+ $ID;
-$CM+ $IN;
-$CM+ $IS;
-$CM+ $JL;
-$CM+ $JV;
-$CM+ $JT;
-$CM+ $NS;
-$CM+ $NU;
-$CM+ $OP;
-$CM+ $PO;
-$CM+ $PR;
-$CM+ $QU;
-$CM+ $RI;
-$CM+ $SY;
-$CM+ $WJ;
-$CM+;
+^$CM+ $ALPlus;
+^$CM+ $BA;
+^$CM+ $HH;
+^$CM+ $BB;
+^$CM+ $B2;
+^$CM+ $CL;
+^$CM+ $CP;
+^$CM+ $EB;
+^$CM+ $EM;
+^$CM+ $EX;
+^$CM+ $GL;
+^$CM+ $HL;
+^$CM+ $HY;
+^$CM+ $H2;
+^$CM+ $H3;
+^$CM+ $ID;
+^$CM+ $IN;
+^$CM+ $IS;
+^$CM+ $JL;
+^$CM+ $JV;
+^$CM+ $JT;
+^$CM+ $NS;
+^$CM+ $NU;
+^$CM+ $OP;
+^$CM+ $PO;
+^$CM+ $PR;
+^$CM+ $QU;
+^$CM+ $RI;
+^$CM+ $SY;
+^$CM+ $WJ;
+^$CM+;
#
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
- # LB14 says OP SP* x .
+ # LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
-
+
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
-# LB 4, 5, 5
+# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
# Requires an engine enhancement.
# / $SP* $ZW
+# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
+# The ZWJ will look like a CM to whatever precedes it.
+#
+($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
-$CM+ $CAN_CM;
+^$CM+ $CAN_CM;
# LB 11
-$CM* $WJ $CM* $CAN_CM;
-$CM* $WJ [$LB8NonBreaks-$CM];
+#
+$WJ $CM* $CAN_CM;
+$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
-$CM* $CAN_CM $CM* $WJ;
+$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
-$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
+$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
-$CM* $CAN_CM $CM* $GL;
+$CAN_CM $CM* $GL;
# LB 13
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
-[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
-$CM* $CAN_CM $SP* $CM* $OP;
+$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
-
- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
-$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
-$CM* $OP $SP* $CM* $QU;
+$OP $SP* $CM* $QU;
# LB 16
-$CM* $NS $SP* $CM* ($CL | $CP);
+$NS $SP* $CM* ($CL | $CP);
# LB 17
-$CM* $B2 $SP* $CM* $B2;
+$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
-$CM* $QU $CM* $CAN_CM; # . x QU
-$CM* $QU $LB18NonBreaks;
+$QU $CM* $CAN_CM; # . x QU
+$QU $LB18NonBreaks;
-$CM* $CAN_CM $CM* $QU; # QU x .
+$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
-
+
#
# LB 20 Break before and after CB.
# nothing needed here.
$AL ($HY | $HH) / $SP;
# LB 21
-$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
-[^$CB] $CM* $BB; #
+[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
# LB21b (reverse)
-$CM* $HL $CM* $SY;
+$HL $CM* $SY;
# LB 22
-$CM* $IN $CM* ($ALPlus | $HL);
-$CM* $IN $CM* $EX;
-$CM* $IN $CM* $ID;
-$CM* $IN $CM* $IN;
-$CM* $IN $CM* $NU;
+$IN $CM* ($ALPlus | $HL);
+$IN $CM* $EX;
+$IN $CM* ($ID | $EB | $EM);
+$IN $CM* $IN;
+$IN $CM* $NU;
# LB 23
-$CM* $PO $CM* $ID;
-$CM* $NU $CM* ($ALPlus | $HL);
-$CM* ($ALPlus | $HL) $CM* $NU;
+$PO $CM* ($ID | $EB | $EM);
+$NU $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* $NU;
# LB 24
-$CM* $ID $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PO;
+($ID | $EB | $EM) $CM* $PR;
+($ALPlus | $HL) $CM* $PR;
+($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
-$CM* ($JT | $JV) $CM* ($H2 | $JV);
-$CM* $JT $CM* ($H3 | $JT);
+($H3 | $H2 | $JV | $JL) $CM* $JL;
+($JT | $JV) $CM* ($H2 | $JV);
+$JT $CM* ($H3 | $JT);
# LB 27
-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+ ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
-$CM* ($ALPlus | $HL) $CM* $IS;
+($ALPlus | $HL) $CM* $IS;
# LB 30
-$CM* $OP $CM* ($ALPlus | $HL | $NU);
-$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+$OP $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
-$CM* $RI $CM* $RI;
+# Pairs of Regional Indicators.
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
+# the second with an even number. Stripping away the cruft they look like
+# [^RI] RI / (RI RI)+ ^RI;
+# [^RI] RI RI / (RI RI)+ ^RI;
+#
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+
+# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
+$RI $CM* $RI;
+
+# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
+
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EM $CM* $EB;
+
## -------------------------------------------------
!!safe_reverse;
# LB 9
-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-$CM+ $SP / .;
+^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
+# LB 30
+($CM* $RI)+;
+
# For dictionary-based break
$dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_loose.txt
#
# Line Breaking Rules
-# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
-# tailored as noted in 2nd paragraph below..
+#
+# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+#
+# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
#
!!chain;
-!!LBCMNoChain;
-
!!lookAheadHardBreak;
#
# See rule LB 19 for an example.
#
+# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
+
+$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$EM = [\U0001F3FB-\U0001F3FF];
+
$AI = [:LineBreak = Ambiguous:];
-$AL = [:LineBreak = Alphabetic:];
+$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
-$CM = [:LineBreak = Combining_Mark:];
+$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
-$ID = [[:LineBreak = Ideographic:] $CJ];
+$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
+$EB $CM+;
+$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
-$CM+ $LB4Breaks {100};
+^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
-$CM+ [$SP $ZW];
+^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# LB 8a ZWJ x ID Emoji proposal.
+#
+$ZWJ ($ID | $EB | $EM);
-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-# $CM not covered by the above needs to behave like $AL
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
-$CM+;
+^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
-$CM+ $WJcm;
+^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
-
+
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
-$CM+ GLcm;
+^$CM+ $GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
-$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
-$CM+ $QUcm;
+^$CM+ $QUcm;
# QU x
$QUcm .?;
-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
- # TODO: I don't think this rule is needed.
# LB 20
# BB x
#
# DO allow breaks here before NSXcm, so don't include it
-$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
-#
+#
$HLcm ($HYcm | $BAcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# LB 22
($ALcm | $HLcm) $INcm;
-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
-$IDcm $INcm;
+($ID | $EB | $EM) $CM* $INcm;
# $INcm $INcm; # delete this rule for CSS loose
$NUcm $INcm;
# LB 23
-$IDcm $POcm;
+($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
-$PRcm $IDcm;
+$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
-$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
-# LB 30a Do not break between regional indicators.
-$RIcm $RIcm;
+# LB 30a Do not break between regional indicators. Break after pairs of them.
+# Tricky interaction with LB8a: ZWJ x ID
+$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+
+$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
+$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EB $CM* $EM;
#
# Reverse Rules.
!!reverse;
-$CM+ $ALPlus;
-$CM+ $BA;
-$CM+ $BB;
-$CM+ $B2;
-$CM+ $CL;
-$CM+ $CP;
-$CM+ $EX;
-$CM+ $GL;
-$CM+ $HL;
-$CM+ $HY;
-$CM+ $H2;
-$CM+ $H3;
-$CM+ $ID;
-$CM+ $IN;
-$CM+ $IS;
-$CM+ $JL;
-$CM+ $JV;
-$CM+ $JT;
-$CM+ $NS;
-$CM+ $NSX;
-$CM+ $NU;
-$CM+ $OP;
-$CM+ $PO;
-$CM+ $PR;
-$CM+ $QU;
-$CM+ $RI;
-$CM+ $SY;
-$CM+ $WJ;
-$CM+;
+^$CM+ $ALPlus;
+^$CM+ $BA;
+^$CM+ $BB;
+^$CM+ $B2;
+^$CM+ $CL;
+^$CM+ $CP;
+^$CM+ $EB;
+^$CM+ $EM;
+^$CM+ $EX;
+^$CM+ $GL;
+^$CM+ $HL;
+^$CM+ $HY;
+^$CM+ $H2;
+^$CM+ $H3;
+^$CM+ $ID;
+^$CM+ $IN;
+^$CM+ $IS;
+^$CM+ $JL;
+^$CM+ $JV;
+^$CM+ $JT;
+^$CM+ $NS;
+^$CM+ $NSX;
+^$CM+ $NU;
+^$CM+ $OP;
+^$CM+ $PO;
+^$CM+ $PR;
+^$CM+ $QU;
+^$CM+ $RI;
+^$CM+ $SY;
+^$CM+ $WJ;
+^$CM+;
#
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
- # LB14 says OP SP* x .
+ # LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
-
+
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
-# LB 4, 5, 5
+# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
# Requires an engine enhancement.
# / $SP* $ZW
+# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
+# The ZWJ will look like a CM to whatever precedes it.
+#
+($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
-$CM+ $CAN_CM;
+^$CM+ $CAN_CM;
# LB 11
-$CM* $WJ $CM* $CAN_CM;
-$CM* $WJ [$LB8NonBreaks-$CM];
+#
+$WJ $CM* $CAN_CM;
+$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
-$CM* $CAN_CM $CM* $WJ;
+$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
-$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
+$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
-$CM* $CAN_CM $CM* $GL;
+$CAN_CM $CM* $GL;
# LB 13
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
-[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
-$CM* $CAN_CM $SP* $CM* $OP;
+$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
-
- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
-$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
-$CM* $OP $SP* $CM* $QU;
+$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
-$CM* $NS $SP* $CM* ($CL | $CP);
+$NS $SP* $CM* ($CL | $CP);
# LB 17
-$CM* $B2 $SP* $CM* $B2;
+$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
-$CM* $QU $CM* $CAN_CM; # . x QU
-$CM* $QU $LB18NonBreaks;
+$QU $CM* $CAN_CM; # . x QU
+$QU $LB18NonBreaks;
-$CM* $CAN_CM $CM* $QU; # QU x .
+$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
-
+
#
# LB 20 Break before and after CB.
# nothing needed here.
# LB 21
# Don't include $NSX here
-$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
-[^$CB] $CM* $BB; #
+[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
-$CM* $HL $CM* $SY;
+$HL $CM* $SY;
# LB 22
-$CM* $IN $CM* ($ALPlus | $HL);
-$CM* $IN $CM* $EX;
-$CM* $IN $CM* $ID;
-# $CM* $IN $CM* $IN; # delete this rule for CSS loose
-$CM* $IN $CM* $NU;
+$IN $CM* ($ALPlus | $HL);
+$IN $CM* $EX;
+$IN $CM* ($ID | $EB | $EM);
+# $IN $CM* $IN; # delete this rule for CSS loose
+$IN $CM* $NU;
# LB 23
-$CM* $PO $CM* $ID;
-$CM* $NU $CM* ($ALPlus | $HL);
-$CM* ($ALPlus | $HL) $CM* $NU;
+$PO $CM* ($ID | $EB | $EM);
+$NU $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* $NU;
# LB 24
-$CM* $ID $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PO;
+($ID | $EB | $EM) $CM* $PR;
+($ALPlus | $HL) $CM* $PR;
+($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
-$CM* ($JT | $JV) $CM* ($H2 | $JV);
-$CM* $JT $CM* ($H3 | $JT);
+($H3 | $H2 | $JV | $JL) $CM* $JL;
+($JT | $JV) $CM* ($H2 | $JV);
+$JT $CM* ($H3 | $JT);
# LB 27
-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+ ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
-$CM* ($ALPlus | $HL) $CM* $IS;
+($ALPlus | $HL) $CM* $IS;
# LB 30
-$CM* $OP $CM* ($ALPlus | $HL | $NU);
-$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+$OP $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
-$CM* $RI $CM* $RI;
+# Pairs of Regional Indicators.
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
+# the second with an even number. Stripping away the cruft they look like
+# [^RI] RI / (RI RI)+ ^RI;
+# [^RI] RI RI / (RI RI)+ ^RI;
+#
+# Line Loose tailoring: Don't include NSX here.
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+
+# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
+$RI $CM* $RI;
+
+# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
+
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EM $CM* $EB;
+
## -------------------------------------------------
!!safe_reverse;
# LB 9
-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-$CM+ $SP / .;
+^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
+# LB 30
+($CM* $RI)+;
+
# For dictionary-based break
$dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_loose_cj.txt
#
# Line Breaking Rules
-# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
+#
+# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
#
!!chain;
-!!LBCMNoChain;
-
!!lookAheadHardBreak;
#
# See rule LB 19 for an example.
#
+# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
+
+$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$EM = [\U0001F3FB-\U0001F3FF];
+
$AI = [:LineBreak = Ambiguous:];
-$AL = [:LineBreak = Alphabetic:];
+$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BB = [:LineBreak = Break_Before:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
-$CM = [:LineBreak = Combining_Mark:];
+$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EXX = [\uFF01 \uFF1F];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
-$ID = [[:LineBreak = Ideographic:] $CJ];
+$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
+$EB $CM+;
+$EM $CM+;
$EX $CM+;
$EXX $CM+;
$GL $CM+;
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
-$CM+ $LB4Breaks {100};
+^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
-$CM+ [$SP $ZW];
+^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# LB 8a ZWJ x ID Emoji proposal.
+#
+$ZWJ ($ID | $EB | $EM);
-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-# $CM not covered by the above needs to behave like $AL
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
-$CM+;
+^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
-$CM+ $WJcm;
+^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
-
+
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
-$CM+ GLcm;
-
+^$CM+ $GLcm;
#
# Do not include $EXX here
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
-$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
-$CM+ $QUcm;
+^$CM+ $QUcm;
# QU x
$QUcm .?;
-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
- # TODO: I don't think this rule is needed.
# LB 20
#
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
-#
+#
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# LB 22
($ALcm | $HLcm) $INcm;
-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
-$IDcm $INcm;
+($ID | $EB | $EM) $CM* $INcm;
# $INcm $INcm; # delete this rule for CSS loose
$NUcm $INcm;
-# LB 23
+# LB 23
# Do not include $POX here
-$IDcm $POcm;
+($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
# LB 24
#
# Do not include $PRX here
-$PRcm $IDcm;
+$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
($POcm | $POXcm) ($ALcm | $HLcm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
-$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
-# LB 30a Do not break between regional indicators.
-$RIcm $RIcm;
+# LB 30a Do not break between regional indicators. Break after pairs of them.
+# Tricky interaction with LB8a: ZWJ x ID
+$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+
+$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
+$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EB $CM* $EM;
#
# Reverse Rules.
!!reverse;
-$CM+ $ALPlus;
-$CM+ $BA;
-$CM+ $BAX;
-$CM+ $BB;
-$CM+ $B2;
-$CM+ $CL;
-$CM+ $CP;
-$CM+ $EX;
-$CM+ $EXX;
-$CM+ $GL;
-$CM+ $HL;
-$CM+ $HY;
-$CM+ $H2;
-$CM+ $H3;
-$CM+ $ID;
-$CM+ $IN;
-$CM+ $IS;
-$CM+ $JL;
-$CM+ $JV;
-$CM+ $JT;
-$CM+ $NS;
-$CM+ $NSX;
-$CM+ $NU;
-$CM+ $OP;
-$CM+ $PO;
-$CM+ $POX;
-$CM+ $PR;
-$CM+ $PRX;
-$CM+ $QU;
-$CM+ $RI;
-$CM+ $SY;
-$CM+ $WJ;
-$CM+;
+^$CM+ $ALPlus;
+^$CM+ $BA;
+^$CM+ $BAX;
+^$CM+ $BB;
+^$CM+ $B2;
+^$CM+ $CL;
+^$CM+ $CP;
+^$CM+ $EB;
+^$CM+ $EM;
+^$CM+ $EX;
+^$CM+ $EXX;
+^$CM+ $GL;
+^$CM+ $HL;
+^$CM+ $HY;
+^$CM+ $H2;
+^$CM+ $H3;
+^$CM+ $ID;
+^$CM+ $IN;
+^$CM+ $IS;
+^$CM+ $JL;
+^$CM+ $JV;
+^$CM+ $JT;
+^$CM+ $NS;
+^$CM+ $NSX;
+^$CM+ $NU;
+^$CM+ $OP;
+^$CM+ $PO;
+^$CM+ $POX;
+^$CM+ $PR;
+^$CM+ $PRX;
+^$CM+ $QU;
+^$CM+ $RI;
+^$CM+ $SY;
+^$CM+ $WJ;
+^$CM+;
#
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
- # LB14 says OP SP* x .
+ # LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
-
+
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
-# LB 4, 5, 5
+# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
# Requires an engine enhancement.
# / $SP* $ZW
+# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
+# The ZWJ will look like a CM to whatever precedes it.
+#
+($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
-$CM+ $CAN_CM;
+^$CM+ $CAN_CM;
# LB 11
-$CM* $WJ $CM* $CAN_CM;
-$CM* $WJ [$LB8NonBreaks-$CM];
+#
+$WJ $CM* $CAN_CM;
+$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
-$CM* $CAN_CM $CM* $WJ;
+$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
-$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
+$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
-$CM* $CAN_CM $CM* $GL;
+$CAN_CM $CM* $GL;
# LB 13
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
-[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
-$CM* $CAN_CM $SP* $CM* $OP;
+$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
-
- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
-$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
-$CM* $OP $SP* $CM* $QU;
+$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
-$CM* $NS $SP* $CM* ($CL | $CP);
+$NS $SP* $CM* ($CL | $CP);
# LB 17
-$CM* $B2 $SP* $CM* $B2;
+$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
-$CM* $QU $CM* $CAN_CM; # . x QU
-$CM* $QU $LB18NonBreaks;
+$QU $CM* $CAN_CM; # . x QU
+$QU $LB18NonBreaks;
-$CM* $CAN_CM $CM* $QU; # QU x .
+$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
-
+
#
# LB 20 Break before and after CB.
# nothing needed here.
# LB 21
# Don't include $BAX or $NSX here
-$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
-[^$CB] $CM* $BB; #
+[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
# LB21a
[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
# LB21b (reverse)
-$CM* $HL $CM* $SY;
+$HL $CM* $SY;
# LB 22
-$CM* $IN $CM* ($ALPlus | $HL);
-$CM* $IN $CM* $EX;
-$CM* $IN $CM* $ID;
-# $CM* $IN $CM* $IN; # delete this rule for CSS loose
-$CM* $IN $CM* $NU;
+$IN $CM* ($ALPlus | $HL);
+$IN $CM* $EX;
+$IN $CM* ($ID | $EB | $EM);
+# $IN $CM* $IN; # delete this rule for CSS loose
+$IN $CM* $NU;
# LB 23
# Do not include $POX here
-$CM* $PO $CM* $ID;
-$CM* $NU $CM* ($ALPlus | $HL);
-$CM* ($ALPlus | $HL) $CM* $NU;
+$PO $CM* ($ID | $EB | $EM);
+$NU $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* $NU;
# LB 24
# Do not include $PRX here
-$CM* $ID $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* ($PO | $POX);
-
+($ID | $EB | $EM) $CM* $PR;
+($ALPlus | $HL) $CM* $PR;
+($ALPlus | $HL) $CM* ($PO | $POX);
# LB 25
# Here do not include $POX at the beginning or $PRX at the end
($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
# LB 26
-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
-$CM* ($JT | $JV) $CM* ($H2 | $JV);
-$CM* $JT $CM* ($H3 | $JT);
+($H3 | $H2 | $JV | $JL) $CM* $JL;
+($JT | $JV) $CM* ($H2 | $JV);
+$JT $CM* ($H3 | $JT);
# LB 27
# Do not include $POX or $PRX here
-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
-$CM* ($ALPlus | $HL) $CM* $IS;
+($ALPlus | $HL) $CM* $IS;
# LB 30
-$CM* $OP $CM* ($ALPlus | $HL | $NU);
-$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+$OP $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
-$CM* $RI $CM* $RI;
+# Pairs of Regional Indicators.
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
+# the second with an even number. Stripping away the cruft they look like
+# [^RI] RI / (RI RI)+ ^RI;
+# [^RI] RI RI / (RI RI)+ ^RI;
+#
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+
+# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
+$RI $CM* $RI;
+
+# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
+
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EM $CM* $EB;
+
## -------------------------------------------------
!!safe_reverse;
# LB 9
-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-$CM+ $SP / .;
+^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
+# LB 30
+($CM* $RI)+;
+
# For dictionary-based break
$dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $dictionary];
+^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary];
$dictionary $dictionary;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_loose_fi.txt
#
# Line Breaking Rules
-# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
-# tailored as noted in 2nd paragraph below..
+#
+# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+#
+# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
#
!!chain;
-!!LBCMNoChain;
-
!!lookAheadHardBreak;
#
# See rule LB 19 for an example.
#
+# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
+
+$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$EM = [\U0001F3FB-\U0001F3FF];
+
$AI = [:LineBreak = Ambiguous:];
-$AL = [:LineBreak = Alphabetic:];
+$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [[:LineBreak = Break_After:] - [\u2010]];
$HH = [\u2010];
$BB = [:LineBreak = Break_Before:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
-$CM = [:LineBreak = Combining_Mark:];
+$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
-$ID = [[:LineBreak = Ideographic:] $CJ];
+$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
+$EB $CM+;
+$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
-$CM+ $LB4Breaks {100};
+^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
-$CM+ [$SP $ZW];
+^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# LB 8a ZWJ x ID Emoji proposal.
+#
+$ZWJ ($ID | $EB | $EM);
-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-# $CM not covered by the above needs to behave like $AL
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
-$CM+;
+^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
-$CM+ $WJcm;
+^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
-
+
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
-$CM+ GLcm;
+^$CM+ $GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
-$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
-$CM+ $QUcm;
+^$CM+ $QUcm;
# QU x
$QUcm .?;
-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
- # TODO: I don't think this rule is needed.
-
# LB 20
# <break> $CB
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
($HY | $HH) $AL;
+^$CM+ ($BAcm | $HHcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
-#
+#
$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# LB 22
($ALcm | $HLcm) $INcm;
-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
-$IDcm $INcm;
+($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
-$IDcm $POcm;
+($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
-$PRcm $IDcm;
+$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
-$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
-# LB 30a Do not break between regional indicators.
-$RIcm $RIcm;
+# LB 30a Do not break between regional indicators. Break after pairs of them.
+# Tricky interaction with LB8a: ZWJ x ID
+$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
+$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM $ID $EB $EM] {eof}];
+$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
+
+$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX {eof}];
+$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EB $CM* $EM;
#
# Reverse Rules.
!!reverse;
-$CM+ $ALPlus;
-$CM+ $BA;
-$CM+ $HH;
-$CM+ $BB;
-$CM+ $B2;
-$CM+ $CL;
-$CM+ $CP;
-$CM+ $EX;
-$CM+ $GL;
-$CM+ $HL;
-$CM+ $HY;
-$CM+ $H2;
-$CM+ $H3;
-$CM+ $ID;
-$CM+ $IN;
-$CM+ $IS;
-$CM+ $JL;
-$CM+ $JV;
-$CM+ $JT;
-$CM+ $NS;
-$CM+ $NSX;
-$CM+ $NU;
-$CM+ $OP;
-$CM+ $PO;
-$CM+ $PR;
-$CM+ $QU;
-$CM+ $RI;
-$CM+ $SY;
-$CM+ $WJ;
-$CM+;
+^$CM+ $ALPlus;
+^$CM+ $BA;
+^$CM+ $BB;
+^$CM+ $B2;
+^$CM+ $CL;
+^$CM+ $CP;
+^$CM+ $EB;
+^$CM+ $EM;
+^$CM+ $EX;
+^$CM+ $GL;
+^$CM+ $HH;
+^$CM+ $HL;
+^$CM+ $HY;
+^$CM+ $H2;
+^$CM+ $H3;
+^$CM+ $ID;
+^$CM+ $IN;
+^$CM+ $IS;
+^$CM+ $JL;
+^$CM+ $JV;
+^$CM+ $JT;
+^$CM+ $NS;
+^$CM+ $NSX;
+^$CM+ $NU;
+^$CM+ $OP;
+^$CM+ $PO;
+^$CM+ $PR;
+^$CM+ $QU;
+^$CM+ $RI;
+^$CM+ $SY;
+^$CM+ $WJ;
+^$CM+;
#
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break.
- # LB14 says OP SP* x .
+ # LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
-
+
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
-# LB 4, 5, 5
+# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
# Requires an engine enhancement.
# / $SP* $ZW
+# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
+# The ZWJ will look like a CM to whatever precedes it.
+#
+($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
-$CM+ $CAN_CM;
+^$CM+ $CAN_CM;
# LB 11
-$CM* $WJ $CM* $CAN_CM;
-$CM* $WJ [$LB8NonBreaks-$CM];
+#
+$WJ $CM* $CAN_CM;
+$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
-$CM* $CAN_CM $CM* $WJ;
+$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
-$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
+$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
-$CM* $CAN_CM $CM* $GL;
+$CAN_CM $CM* $GL;
# LB 13
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
-[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
-$CM* $CAN_CM $SP* $CM* $OP;
+$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
-
- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
-$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
-$CM* $OP $SP* $CM* $QU;
+$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
-$CM* $NS $SP* $CM* ($CL | $CP);
+$NS $SP* $CM* ($CL | $CP);
# LB 17
-$CM* $B2 $SP* $CM* $B2;
+$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
-$CM* $QU $CM* $CAN_CM; # . x QU
-$CM* $QU $LB18NonBreaks;
+$QU $CM* $CAN_CM; # . x QU
+$QU $LB18NonBreaks;
-$CM* $CAN_CM $CM* $QU; # QU x .
+$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
-
+
#
# LB 20 Break before and after CB.
# nothing needed here.
# LB 21
# Don't include $NSX here
-$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
-[^$CB] $CM* $BB; #
+[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
# LB21b (reverse)
-$CM* $HL $CM* $SY;
+$HL $CM* $SY;
# LB 22
-$CM* $IN $CM* ($ALPlus | $HL);
-$CM* $IN $CM* $EX;
-$CM* $IN $CM* $ID;
-$CM* $IN $CM* $IN;
-$CM* $IN $CM* $NU;
+$IN $CM* ($ALPlus | $HL);
+$IN $CM* $EX;
+$IN $CM* ($ID | $EB | $EM);
+$IN $CM* $IN;
+$IN $CM* $NU;
# LB 23
-$CM* $PO $CM* $ID;
-$CM* $NU $CM* ($ALPlus | $HL);
-$CM* ($ALPlus | $HL) $CM* $NU;
+$PO $CM* ($ID | $EB | $EM);
+$NU $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* $NU;
# LB 24
-$CM* $ID $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PO;
+($ID | $EB | $EM) $CM* $PR;
+($ALPlus | $HL) $CM* $PR;
+($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
-$CM* ($JT | $JV) $CM* ($H2 | $JV);
-$CM* $JT $CM* ($H3 | $JT);
+($H3 | $H2 | $JV | $JL) $CM* $JL;
+($JT | $JV) $CM* ($H2 | $JV);
+$JT $CM* ($H3 | $JT);
# LB 27
-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+ ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
-$CM* ($ALPlus | $HL) $CM* $IS;
+($ALPlus | $HL) $CM* $IS;
# LB 30
-$CM* $OP $CM* ($ALPlus | $HL | $NU);
-$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+$OP $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
-$CM* $RI $CM* $RI;
+# Pairs of Regional Indicators.
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
+# the second with an even number. Stripping away the cruft they look like
+# [^RI] RI / (RI RI)+ ^RI;
+# [^RI] RI RI / (RI RI)+ ^RI;
+#
+[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+
+# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
+$RI $CM* $RI;
+
+# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
+
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EM $CM* $EB;
+
## -------------------------------------------------
!!safe_reverse;
# LB 9
-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-$CM+ $SP / .;
+^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
$SP+ $CM* $B2;
# LB 21
-$CM* ($HY | $BA | $HH) $CM* $HL;
+($HY | $BA | $HH) $CM* $HL;
# LB 25
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
+# LB 30a
+($CM* $RI)+;
+
# For dictionary-based break
$dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $RI $ZWJ $dictionary];
$dictionary $dictionary;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_normal.txt
#
# Line Breaking Rules
-# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
-# tailored as noted in 2nd paragraph below..
+#
+# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+#
+# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
#
!!chain;
-!!LBCMNoChain;
-
!!lookAheadHardBreak;
#
# See rule LB 19 for an example.
#
+# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
+
+$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$EM = [\U0001F3FB-\U0001F3FF];
+
$AI = [:LineBreak = Ambiguous:];
-$AL = [:LineBreak = Alphabetic:];
+$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
-$CM = [:LineBreak = Combining_Mark:];
+$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
-$ID = [[:LineBreak = Ideographic:] $CJ];
+$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
+$EB $CM+;
+$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
-$CM+ $LB4Breaks {100};
+^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
-$CM+ [$SP $ZW];
+^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# LB 8a ZWJ x ID Emoji proposal.
+#
+$ZWJ ($ID | $EB | $EM);
-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-# $CM not covered by the above needs to behave like $AL
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
-$CM+;
+^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
-$CM+ $WJcm;
+^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
-
+
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
-$CM+ GLcm;
+^$CM+ $GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
-$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
-$CM+ $QUcm;
+^$CM+ $QUcm;
# QU x
$QUcm .?;
-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
- # TODO: I don't think this rule is needed.
# LB 20
# LB 21 x (BA | HY | NS)
# BB x
#
-$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
-#
+#
$HLcm ($HYcm | $BAcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# LB 22
($ALcm | $HLcm) $INcm;
-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
-$IDcm $INcm;
+($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
-$IDcm $POcm;
+($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
-$PRcm $IDcm;
+$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
-$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
-# LB 30a Do not break between regional indicators.
-$RIcm $RIcm;
+# LB 30a Do not break between regional indicators. Break after pairs of them.
+# Tricky interaction with LB8a: ZWJ x ID
+$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+
+$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
+$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EB $CM* $EM;
#
# Reverse Rules.
!!reverse;
-$CM+ $ALPlus;
-$CM+ $BA;
-$CM+ $BB;
-$CM+ $B2;
-$CM+ $CL;
-$CM+ $CP;
-$CM+ $EX;
-$CM+ $GL;
-$CM+ $HL;
-$CM+ $HY;
-$CM+ $H2;
-$CM+ $H3;
-$CM+ $ID;
-$CM+ $IN;
-$CM+ $IS;
-$CM+ $JL;
-$CM+ $JV;
-$CM+ $JT;
-$CM+ $NS;
-$CM+ $NU;
-$CM+ $OP;
-$CM+ $PO;
-$CM+ $PR;
-$CM+ $QU;
-$CM+ $RI;
-$CM+ $SY;
-$CM+ $WJ;
-$CM+;
+^$CM+ $ALPlus;
+^$CM+ $BA;
+^$CM+ $BB;
+^$CM+ $B2;
+^$CM+ $CL;
+^$CM+ $CP;
+^$CM+ $EB;
+^$CM+ $EM;
+^$CM+ $EX;
+^$CM+ $GL;
+^$CM+ $HL;
+^$CM+ $HY;
+^$CM+ $H2;
+^$CM+ $H3;
+^$CM+ $ID;
+^$CM+ $IN;
+^$CM+ $IS;
+^$CM+ $JL;
+^$CM+ $JV;
+^$CM+ $JT;
+^$CM+ $NS;
+^$CM+ $NU;
+^$CM+ $OP;
+^$CM+ $PO;
+^$CM+ $PR;
+^$CM+ $QU;
+^$CM+ $RI;
+^$CM+ $SY;
+^$CM+ $WJ;
+^$CM+;
#
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break.
- # LB14 says OP SP* x .
+ # LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
-
+
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
-# LB 4, 5, 5
+# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
# Requires an engine enhancement.
# / $SP* $ZW
+# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
+# The ZWJ will look like a CM to whatever precedes it.
+#
+($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
-$CM+ $CAN_CM;
+^$CM+ $CAN_CM;
# LB 11
-$CM* $WJ $CM* $CAN_CM;
-$CM* $WJ [$LB8NonBreaks-$CM];
+#
+$WJ $CM* $CAN_CM;
+$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
-$CM* $CAN_CM $CM* $WJ;
+$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
-$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
+$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
-$CM* $CAN_CM $CM* $GL;
+$CAN_CM $CM* $GL;
# LB 13
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
-[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
-$CM* $CAN_CM $SP* $CM* $OP;
+$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
-
- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
-$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
-$CM* $OP $SP* $CM* $QU;
+$OP $SP* $CM* $QU;
# LB 16
-$CM* $NS $SP* $CM* ($CL | $CP);
+$NS $SP* $CM* ($CL | $CP);
# LB 17
-$CM* $B2 $SP* $CM* $B2;
+$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
-$CM* $QU $CM* $CAN_CM; # . x QU
-$CM* $QU $LB18NonBreaks;
+$QU $CM* $CAN_CM; # . x QU
+$QU $LB18NonBreaks;
-$CM* $CAN_CM $CM* $QU; # QU x .
+$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
-
+
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
-$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
-[^$CB] $CM* $BB; #
+[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
-$CM* $HL $CM* $SY;
+$HL $CM* $SY;
# LB 22
-$CM* $IN $CM* ($ALPlus | $HL);
-$CM* $IN $CM* $EX;
-$CM* $IN $CM* $ID;
-$CM* $IN $CM* $IN;
-$CM* $IN $CM* $NU;
+$IN $CM* ($ALPlus | $HL);
+$IN $CM* $EX;
+$IN $CM* ($ID | $EB | $EM);
+$IN $CM* $IN;
+$IN $CM* $NU;
# LB 23
-$CM* $PO $CM* $ID;
-$CM* $NU $CM* ($ALPlus | $HL);
-$CM* ($ALPlus | $HL) $CM* $NU;
+$PO $CM* ($ID | $EB | $EM);
+$NU $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* $NU;
# LB 24
-$CM* $ID $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PO;
+($ID | $EB | $EM) $CM* $PR;
+($ALPlus | $HL) $CM* $PR;
+($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
-$CM* ($JT | $JV) $CM* ($H2 | $JV);
-$CM* $JT $CM* ($H3 | $JT);
+($H3 | $H2 | $JV | $JL) $CM* $JL;
+($JT | $JV) $CM* ($H2 | $JV);
+$JT $CM* ($H3 | $JT);
# LB 27
-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+ ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
-$CM* ($ALPlus | $HL) $CM* $IS;
+($ALPlus | $HL) $CM* $IS;
# LB 30
-$CM* $OP $CM* ($ALPlus | $HL | $NU);
-$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+$OP $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
-$CM* $RI $CM* $RI;
+# Pairs of Regional Indicators.
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
+# the second with an even number. Stripping away the cruft they look like
+# [^RI] RI / (RI RI)+ ^RI;
+# [^RI] RI RI / (RI RI)+ ^RI;
+#
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+
+# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
+$RI $CM* $RI;
+
+# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
+
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EM $CM* $EB;
+
## -------------------------------------------------
!!safe_reverse;
# LB 9
-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-$CM+ $SP / .;
+^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
+# LB 30a
+($CM* $RI)+;
+
# For dictionary-based break
$dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_normal_cj.txt
#
# Line Breaking Rules
-# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
-# tailored as noted in 2nd paragraph below..
+#
+# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+#
+# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
#
!!chain;
-!!LBCMNoChain;
-
!!lookAheadHardBreak;
#
# See rule LB 19 for an example.
#
+# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
+
+$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$EM = [\U0001F3FB-\U0001F3FF];
+
$AI = [:LineBreak = Ambiguous:];
-$AL = [:LineBreak = Alphabetic:];
+$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BB = [:LineBreak = Break_Before:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
-$CM = [:LineBreak = Combining_Mark:];
+$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
-$ID = [[:LineBreak = Ideographic:] $CJ];
+$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
+$EB $CM+;
+$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
-$CM+ $LB4Breaks {100};
+^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
-$CM+ [$SP $ZW];
+^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# LB 8a ZWJ x ID Emoji proposal.
+#
+$ZWJ ($ID | $EB | $EM);
-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-# $CM not covered by the above needs to behave like $AL
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
-$CM+;
+^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
-$CM+ $WJcm;
+^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
-
+
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
-$CM+ GLcm;
+^$CM+ $GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
-$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
-$CM+ $QUcm;
+^$CM+ $QUcm;
# QU x
$QUcm .?;
-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
- # TODO: I don't think this rule is needed.
# LB 20
# BB x
#
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
-$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
-#
+#
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# LB 22
($ALcm | $HLcm) $INcm;
-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
-$IDcm $INcm;
+($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
-$IDcm $POcm;
+($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
-$PRcm $IDcm;
+$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
-$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
-# LB 30a Do not break between regional indicators.
-$RIcm $RIcm;
+# LB 30a Do not break between regional indicators. Break after pairs of them.
+# Tricky interaction with LB8a: ZWJ x ID
+$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
+
+$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
+$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EB $CM* $EM;
#
# Reverse Rules.
!!reverse;
-$CM+ $ALPlus;
-$CM+ $BA;
-$CM+ $BAX;
-$CM+ $BB;
-$CM+ $B2;
-$CM+ $CL;
-$CM+ $CP;
-$CM+ $EX;
-$CM+ $GL;
-$CM+ $HL;
-$CM+ $HY;
-$CM+ $H2;
-$CM+ $H3;
-$CM+ $ID;
-$CM+ $IN;
-$CM+ $IS;
-$CM+ $JL;
-$CM+ $JV;
-$CM+ $JT;
-$CM+ $NS;
-$CM+ $NSX;
-$CM+ $NU;
-$CM+ $OP;
-$CM+ $PO;
-$CM+ $PR;
-$CM+ $QU;
-$CM+ $RI;
-$CM+ $SY;
-$CM+ $WJ;
-$CM+;
+^$CM+ $ALPlus;
+^$CM+ $BA;
+^$CM+ $BAX;
+^$CM+ $BB;
+^$CM+ $B2;
+^$CM+ $CL;
+^$CM+ $CP;
+^$CM+ $EB;
+^$CM+ $EM;
+^$CM+ $EX;
+^$CM+ $GL;
+^$CM+ $HL;
+^$CM+ $HY;
+^$CM+ $H2;
+^$CM+ $H3;
+^$CM+ $ID;
+^$CM+ $IN;
+^$CM+ $IS;
+^$CM+ $JL;
+^$CM+ $JV;
+^$CM+ $JT;
+^$CM+ $NS;
+^$CM+ $NSX;
+^$CM+ $NU;
+^$CM+ $OP;
+^$CM+ $PO;
+^$CM+ $PR;
+^$CM+ $QU;
+^$CM+ $RI;
+^$CM+ $SY;
+^$CM+ $WJ;
+^$CM+;
#
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
- # LB14 says OP SP* x .
+ # LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
-
+
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
-# LB 4, 5, 5
+# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
# Requires an engine enhancement.
# / $SP* $ZW
+# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
+# The ZWJ will look like a CM to whatever precedes it.
+#
+($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
-$CM+ $CAN_CM;
+^$CM+ $CAN_CM;
# LB 11
-$CM* $WJ $CM* $CAN_CM;
-$CM* $WJ [$LB8NonBreaks-$CM];
+#
+$WJ $CM* $CAN_CM;
+$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
-$CM* $CAN_CM $CM* $WJ;
+$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
-$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
+$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
-$CM* $CAN_CM $CM* $GL;
+$CAN_CM $CM* $GL;
# LB 13
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
-[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
-$CM* $CAN_CM $SP* $CM* $OP;
+$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
-
- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
-$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
-$CM* $OP $SP* $CM* $QU;
+$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
-$CM* $NS $SP* $CM* ($CL | $CP);
+$NS $SP* $CM* ($CL | $CP);
# LB 17
-$CM* $B2 $SP* $CM* $B2;
+$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
-$CM* $QU $CM* $CAN_CM; # . x QU
-$CM* $QU $LB18NonBreaks;
+$QU $CM* $CAN_CM; # . x QU
+$QU $LB18NonBreaks;
-$CM* $CAN_CM $CM* $QU; # QU x .
+$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
-
+
#
# LB 20 Break before and after CB.
# nothing needed here.
# LB 21
# Don't include $BAX or $NSX here
-$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
-[^$CB] $CM* $BB; #
+[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
-# LB21a
-[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
+# LB21a Don't break after Hebrew + Hyphen.
+([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
# LB21b (reverse)
-$CM* $HL $CM* $SY;
+$HL $CM* $SY;
# LB 22
-$CM* $IN $CM* ($ALPlus | $HL);
-$CM* $IN $CM* $EX;
-$CM* $IN $CM* $ID;
-$CM* $IN $CM* $IN;
-$CM* $IN $CM* $NU;
+$IN $CM* ($ALPlus | $HL);
+$IN $CM* $EX;
+$IN $CM* ($ID | $EB | $EM);
+$IN $CM* $IN;
+$IN $CM* $NU;
# LB 23
-$CM* $PO $CM* $ID;
-$CM* $NU $CM* ($ALPlus | $HL);
-$CM* ($ALPlus | $HL) $CM* $NU;
+$PO $CM* ($ID | $EB | $EM);
+$NU $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* $NU;
# LB 24
-$CM* $ID $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PR;
-$CM* ($ALPlus | $HL) $CM* $PO;
+($ID | $EB | $EM) $CM* $PR;
+($ALPlus | $HL) $CM* $PR;
+($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
-$CM* ($JT | $JV) $CM* ($H2 | $JV);
-$CM* $JT $CM* ($H3 | $JT);
+($H3 | $H2 | $JV | $JL) $CM* $JL;
+($JT | $JV) $CM* ($H2 | $JV);
+$JT $CM* ($H3 | $JT);
# LB 27
-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+ ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
-$CM* ($ALPlus | $HL) $CM* $IS;
+($ALPlus | $HL) $CM* $IS;
# LB 30
-$CM* $OP $CM* ($ALPlus | $HL | $NU);
-$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+$OP $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
-$CM* $RI $CM* $RI;
+# Pairs of Regional Indicators.
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
+# the second with an even number. Stripping away the cruft they look like
+# [^RI] RI / (RI RI)+ ^RI;
+# [^RI] RI RI / (RI RI)+ ^RI;
+#
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
+
+# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
+$RI $CM* $RI;
+
+# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
+
+
+# LB 30b Do not break between an Emoji Base and an Emoji Modifier
+$EM $CM* $EB;
+
## -------------------------------------------------
!!safe_reverse;
# LB 9
-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
-$CM+ $SP / .;
+^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
+# LB 30a
+($CM* $RI)+;
+
# For dictionary-based break
$dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $dictionary];
+^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $RI $ZWJ $dictionary];
$dictionary $dictionary;
-# Copyright (c) 2002-2015 International Business Machines Corporation and
+# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_normal_fi.txt
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
-$CM+ GLcm;
+$CM+ $GLcm;
#
-# Copyright (C) 2002-2015, International Business Machines Corporation
+# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
+# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
+# with additions from L2/16-011R3 for Emoji sequences.
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
# Character Class Definitions.
#
+$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$E_Modifier = [\U0001F3FB-\U0001F3FF];
+$ZWJ = [\u200D];
+$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
+
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
-$Newline = [\p{Word_Break = Newline}];
-$Extend = [\p{Word_Break = Extend}];
+$Newline = [\p{Word_Break = Newline} ];
+$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
-$Format = [\p{Word_Break = Format}];
+$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
-$KatakanaEx = $Katakana ($Extend | $Format)*;
-$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
-$ALetterEx = $ALetterPlus ($Extend | $Format)*;
-$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
-$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
-$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
-$MidLetterEx = $MidLetter ($Extend | $Format)*;
-$MidNumEx = $MidNum ($Extend | $Format)*;
-$NumericEx = $Numeric ($Extend | $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
-$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
+$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
+$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
+$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
+$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
+$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
+$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
+$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
+$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
+$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
$Ideographic = [\p{Ideographic}];
-$HiraganaEx = $Hiragana ($Extend | $Format)*;
-$IdeographicEx = $Ideographic ($Extend | $Format)*;
+$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
+$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
#
$CR $LF;
+# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
+#
+$ZWJ $GAZ;
+
+
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s), or is not a CJK dictionary character.
-[^$CR $LF $Newline]? ($Extend | $Format)+;
+[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
$NumericEx {100};
$ALetterEx {200};
$HiraganaEx {400}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
+$E_Base ($Extend | $Format | $ZWJ)*;
+$E_Modifier ($Extend | $Format | $ZWJ)*;
+$GAZ ($Extend | $Format | $ZWJ)*;
+
#
# rule 5
# Do not break between most letters.
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
# rule 13c
-
-$Regional_IndicatorEx $Regional_IndicatorEx;
+# Pairs of Regional Indicators stay together.
+# With rule chaining disabled by ^, this rule will match exactly two of them.
+# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+#
+^$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+# rule 13d
+# E_Base x E_Modifier
+#
+($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
+
## -------------------------------------------------
!!reverse;
-$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter;
-$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
-$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote;
-$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote;
-$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
-$BackNumericEx = ($Format | $Extend)* $Numeric;
-$BackMidNumEx = ($Format | $Extend)* $MidNum;
-$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
-$BackKatakanaEx = ($Format | $Extend)* $Katakana;
-$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
-$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
-$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
+$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
+$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
+$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
+$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
+$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
+$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
+$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
+$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
+$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
+$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
+$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
+$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# rule 3
$LF $CR;
+# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
+#
+$GAZ $ZWJ;
+
# rule 4
-($Format | $Extend)* [^$CR $LF $Newline]?;
+($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
# rule 5
# rule 13c
-$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
+^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
+^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
+
+$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
+$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
+# rule 13d
+
+$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
+
+
+
## -------------------------------------------------
!!safe_reverse;
# rule 3
-($Extend | $Format)+ .?;
+($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
# rule 11
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
+# rule 13c
+$BackRegional_IndicatorEx*;
+
# For dictionary-based break
$dictionary $dictionary;
!!safe_forward;
# rule 4
-($Extend | $Format)+ .?;
+($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
# rule 11
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
+# rule 13c
+$Regional_IndicatorEx*;
+
# For dictionary-based break
$dictionary $dictionary;
#
-# Copyright (C) 2002-2015, International Business Machines Corporation
+# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word_POSIX.txt
#
# ICU Word Break Rules, POSIX locale.
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
+# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
+# with additions from L2/16-011R3 for Emoji sequences.
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
# Character Class Definitions.
#
+$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+$E_Modifier = [\U0001F3FB-\U0001F3FF];
+$ZWJ = [\u200D];
+$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
+
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
-$Newline = [\p{Word_Break = Newline}];
-$Extend = [\p{Word_Break = Extend}];
+$Newline = [\p{Word_Break = Newline} ];
+$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
-$Format = [\p{Word_Break = Format}];
+$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
-$Control = [\p{Grapheme_Cluster_Break = Control}];
+$Control = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
#
-# Rules 4 Ignore Format and Extend characters,
+# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
-$KatakanaEx = $Katakana ($Extend | $Format)*;
-$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
-$ALetterEx = $ALetterPlus ($Extend | $Format)*;
-$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
-$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
-$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
-$MidLetterEx = $MidLetter ($Extend | $Format)*;
-$MidNumEx = $MidNum ($Extend | $Format)*;
-$NumericEx = $Numeric ($Extend | $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
-$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
+$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
+$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
+$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
+$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
+$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
+$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
+$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
+$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
+$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
$Ideographic = [\p{Ideographic}];
-$HiraganaEx = $Hiragana ($Extend | $Format)*;
-$IdeographicEx = $Ideographic ($Extend | $Format)*;
+$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
+$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
#
$CR $LF;
+# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
+#
+$ZWJ $GAZ;
+
+
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s), or is not a CJK dictionary character.
-[^$CR $LF $Newline]? ($Extend | $Format)+;
+[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
$NumericEx {100};
$ALetterEx {200};
$HiraganaEx {400}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
+$E_Base ($Extend | $Format | $ZWJ)*;
+$E_Modifier ($Extend | $Format | $ZWJ)*;
+$GAZ ($Extend | $Format | $ZWJ)*;
+
#
# rule 5
# Do not break between most letters.
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
-# rule 11 and 12
+# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
# rule 13c
-
-$Regional_IndicatorEx $Regional_IndicatorEx;
+# Pairs of Regional Indicators stay together.
+# With rule chaining disabled by ^, this rule will match exactly two of them.
+# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+#
+^$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
-$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+
+# rule 13d
+# E_Base x E_Modifier
+#
+($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
## -------------------------------------------------
!!reverse;
-$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter;
-$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
-$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote;
-$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote;
-$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
-$BackNumericEx = ($Format | $Extend)* $Numeric;
-$BackMidNumEx = ($Format | $Extend)* $MidNum;
-$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
-$BackKatakanaEx = ($Format | $Extend)* $Katakana;
-$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
-$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
-$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
+$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
+$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
+$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
+$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
+$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
+$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
+$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
+$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
+$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
+$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
+$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
+$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# rule 3
$LF $CR;
+# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
+#
+$GAZ $ZWJ;
+
# rule 4
-($Format | $Extend)* [^$CR $LF $Newline]?;
+($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
# rule 5
# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
-($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
+($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
# rule 13c
-$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
+^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
+^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
+
+$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
+$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+ ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
+# rule 13d
+
+$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
+
+
+
## -------------------------------------------------
!!safe_reverse;
# rule 3
-($Extend | $Format)+ .?;
+($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
# rule 11
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
+# rule 13c
+$BackRegional_IndicatorEx*;
+
# For dictionary-based break
$dictionary $dictionary;
!!safe_forward;
# rule 4
-($Extend | $Format)+ .?;
+($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
# rule 11
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
+# rule 13c
+$Regional_IndicatorEx*;
+
# For dictionary-based break
$dictionary $dictionary;
* keep together 'abc', but only when followed by 'def', OTHERWISE
* just return one char at a time.
*/
- char rules[] = "abc{666}/def;\n [\\p{L} - [a]]* {2}; . {1};";
+ char rules[] = "abc/def{666};\n [\\p{L} - [a]]* {2}; . {1};";
/* 0123456789012345678 */
char data[] = "abcdex abcdefgh-def"; /* the test data string */
char breaks[] = "** ** * ** *"; /* * the expected break positions */
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
bytestrietest.o ucharstrietest.o \
-itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
+itrbbi.o rbbiapts.o rbbitst.o rbbimonkeytest.o ittrans.o transapi.o cpdtrtst.o \
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
jamotest.o srchtest.o reptest.o regextst.o \
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2015, International Business Machines Corporation and
+ * Copyright (c) 1997-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
return random(&RAND_SEED);
}
+
+/*
+ * Integer random number class implementation.
+ * Similar to C++ std::minstd_rand, with the same algorithm & constants.
+ */
+// Construct a generator from a seed. The seed is first reduced modulo
+// 2147483647; if the reduced value is zero, 1 is used instead.
+IntlTest::icu_rand::icu_rand(uint32_t seed) {
+    uint32_t reduced = seed % 2147483647UL;
+    fLast = (reduced == 0) ? 1 : reduced;
+}
+
+// Empty destructor: no explicit cleanup is performed.
+IntlTest::icu_rand::~icu_rand() {
+}
+
+// Re-seed the generator.
+// The seed is normalized the same way the constructor does it: it is reduced
+// modulo 2147483647, and a result of zero is replaced by 1. Without the
+// reduction, a seed that is a non-zero multiple of 2147483647 would make
+// operator() return zero on every subsequent call. For all other seeds the
+// generated sequence is the same as without the reduction.
+void IntlTest::icu_rand::seed(uint32_t seed) {
+    seed = seed % 2147483647UL;
+    if (seed == 0) {
+        seed = 1;
+    }
+    fLast = seed;
+}
+
+// Advance the generator one step and return the new state.
+// The multiplication is carried out in 64 bits so the product cannot wrap
+// before it is reduced modulo 2147483647.
+uint32_t IntlTest::icu_rand::operator() () {
+    const uint64_t product = (uint64_t)fLast * 48271UL;
+    fLast = (uint32_t)(product % 2147483647UL);
+    return fLast;
+}
+
+// Return the generator's current state, usable as a seed to resume the sequence.
+uint32_t IntlTest::icu_rand::getSeed() {
+    const uint32_t state = fLast;
+    return state;
+}
+
+
+
// Convert a value to an upper-case hexadecimal digit: values below 10 are offset
// from '0' (0x30), all others continue from 'A' (0x41). The argument is not range-checked.
static inline UChar toHex(int32_t i) {
    const int32_t base = (i < 10) ? 0x30 : (0x41 - 10);
    return (UChar)(i + base);
}
/********************************************************************
- * COPYRIGHT:
- * Copyright (c) 1997-2015, International Business Machines Corporation and
+ * COPYRIGHT:
+ * Copyright (c) 1997-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
#if U_NO_DEFAULT_INCLUDE_UTF_HEADERS
/* deprecated - make tests pass with U_NO_DEFAULT_INCLUDE_UTF_HEADERS */
-#include "unicode/utf_old.h"
+#include "unicode/utf_old.h"
#endif
U_NAMESPACE_USE
/**
* Replaces isICUVersionAtLeast and isICUVersionBefore
* log that an issue is known.
- * Usually used this way:
+ * Usually used this way:
* <code>if( ... && logKnownIssue("12345", "some bug")) continue; </code>
* @param ticket ticket string, "12345" or "cldrbug:1234"
* @param message optional message string
void errcheckln(UErrorCode status, const char *fmt, ...);
// Print ALL named errors encountered so far
- void printErrors();
+ void printErrors();
// print known issues. return TRUE if there were any.
UBool printKnownIssues();
-
+
virtual void usage( void ) ;
/**
*/
static float random();
+
+ /**
+ * Integer random numbers, similar to C++ std::minstd_rand, with the same algorithm
+ * and constants. Allow additional access to internal state, for use by monkey tests,
+ * which need to recreate previous random sequences beginning near a failure point.
+ */
+ class icu_rand {
+ public:
+ icu_rand(uint32_t seed = 1); // The seed is reduced modulo 2147483647; a zero result is replaced by 1.
+ ~icu_rand();
+ void seed(uint32_t seed); // Restart the sequence from a new seed; a seed of 0 is replaced by 1.
+ uint32_t operator()(); // Advance the generator one step and return the new value.
+ /**
+ * Get a seed corresponding to the current state of the generator.
+ * Seeding any generator with this value will cause it to produce the
+ * same sequence as this one will from this point forward.
+ */
+ uint32_t getSeed();
+ private:
+ uint32_t fLast; // Current state: the seed as stored, or the value most recently returned by operator().
+ };
+
+
+
enum { kMaxProps = 16 };
virtual void setProperty(const char* propline);
int32_t dataErrorCount;
IntlTest* caller;
char* testPath; // specifies subtests
-
+
char basePath[1024];
char currName[1024]; // current test name
<DisableLanguageExtensions>false</DisableLanguageExtensions>
</ClCompile>
<ClCompile Include="rbbitst.cpp" />
+ <ClCompile Include="rbbimonkeytest.cpp" />
<ClCompile Include="itspoof.cpp" />
<ClCompile Include="allcoll.cpp" />
<ClCompile Include="alphaindextst.cpp" />
<ClInclude Include="itrbbi.h" />
<ClInclude Include="rbbiapts.h" />
<ClInclude Include="rbbitst.h" />
+ <ClInclude Include="rbbimonkeytest.h" />
<ClInclude Include="itspoof.h" />
<ClInclude Include="allcoll.h" />
<ClInclude Include="alphaindextst.h" />
<ClCompile Include="rbbitst.cpp">
<Filter>break iteration</Filter>
</ClCompile>
+ <ClCompile Include="rbbimonkeytest.cpp">
+ <Filter>break iteration</Filter>
+ </ClCompile>
<ClCompile Include="itspoof.cpp">
<Filter>spoof detection</Filter>
</ClCompile>
<ClInclude Include="rbbitst.h">
<Filter>break iteration</Filter>
</ClInclude>
+ <ClInclude Include="rbbimonkeytest.h">
+ <Filter>break iteration</Filter>
+ </ClInclude>
<ClInclude Include="itspoof.h">
<Filter>spoof detection</Filter>
</ClInclude>
/*
**********************************************************************
-* Copyright (C) 1998-2012, International Business Machines Corporation
+* Copyright (C) 1998-2016, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
#if !UCONFIG_NO_BREAK_ITERATION
+#include "intltest.h"
#include "itrbbi.h"
#include "rbbiapts.h"
#include "rbbitst.h"
-
-#define TESTCLASS(n,classname) \
- case n: \
- name = #classname; \
- if (exec) { \
- logln(#classname "---"); \
- logln(""); \
- classname t; \
- callTest(t, par); \
- } \
- break
+#include "rbbimonkeytest.h"
void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{ // Dispatcher for the RuleBasedBreakIterator test suite; logs a banner line when exec is set.
- if (exec) logln("TestSuite RuleBasedBreakIterator: ");
- switch (index) {
- TESTCLASS(0, RBBIAPITest);
- TESTCLASS(1, RBBITest);
- default: name=""; break;
+ if (exec) {
+ logln("TestSuite RuleBasedBreakIterator: ");
 }
+ TESTCASE_AUTO_BEGIN; // NOTE(review): TESTCASE_AUTO_* are not defined in this file (presumably intltest.h); confirm they select by index like the removed switch.
+ TESTCASE_AUTO_CLASS(RBBIAPITest);
+ TESTCASE_AUTO_CLASS(RBBITest);
+ TESTCASE_AUTO_CLASS(RBBIMonkeyTest);
+ TESTCASE_AUTO_END;
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- /dev/null
+/********************************************************************
+ * Copyright (c) 2016, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
+
+
+#include "unicode/utypes.h"
+
+#include "rbbimonkeytest.h"
+#include "unicode/utypes.h"
+#include "unicode/brkiter.h"
+#include "unicode/utf16.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+
+#include "charstr.h"
+#include "cmemory.h"
+#include "cstr.h"
+#include "uelement.h"
+#include "uhash.h"
+
+#include "iostream"
+#include "string"
+
+using namespace icu;
+
+
+// Test dispatch entry point. Registers testMonkey through the TESTCASE_AUTO macros.
+// The raw parameter string is stashed in fParams first; testMonkey() reads it from there.
+void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
+ fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
+
+ TESTCASE_AUTO_BEGIN;
+ TESTCASE_AUTO(testMonkey);
+ TESTCASE_AUTO_END;
+}
+
+//---------------------------------------------------------------------------------------
+//
+// class BreakRule implementation.
+//
+//---------------------------------------------------------------------------------------
+
+// Construct an empty rule; every field is default initialized.
+BreakRule::BreakRule() {
+}
+
+// No explicit cleanup code is needed here.
+BreakRule::~BreakRule() {
+}
+
+
+//---------------------------------------------------------------------------------------
+//
+// class BreakRules implementation.
+//
+//---------------------------------------------------------------------------------------
+// Set up an empty rule set: the hash table of character classes (name -> CharClass),
+// the list of character classes, and the regular expression matchers that are used
+// for parsing the lines of a rule file.
+//   monkeyImpl  the owning test object; only the pointer is stored.
+//   status      ICU error code. If opening the hash table fails the constructor returns early.
+BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
+ fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
+ fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
+ uhash_compareUnicodeString,
+ NULL, // value comparator.
+ &status));
+ if (U_FAILURE(status)) {
+ return;
+ }
+ // The hash table owns its keys and values, and the vector of rules owns its elements.
+ uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
+ uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
+ fBreakRules.setDeleter(uprv_deleteUObject);
+
+ fCharClassList.adoptInstead(new UVector(status));
+
+ // Match identifiers (candidate references to character classes) within a set or rule definition.
+ // NOTE(review): the first group is described as a negative lookbehind, but "(?!" is
+ // look-ahead syntax. Confirm the intent before changing the pattern.
+ fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
+ "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:'
+ // (the identifier is a unicode property name or value)
+ "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
+ 0, status));
+
+ // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
+ fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
+ "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
+ "[ \\t]*+" // Match white space.
+ "(#.*)?+" // Optional # plus whatever follows
+ "\\R$" // new-line at end of line.
+ ), 0, status));
+
+ // Match (initial parse) of a character class definition line.
+ fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
+ "[ \\t]*" // leading white space
+ "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
+ "[ \\t]*=[ \\t]*" // =
+ "(?<ClassDef>.*?)" // The char class UnicodeSet expression
+ "[ \\t]*;$"), // ; <end of line>
+ 0, status));
+
+ // Match (initial parse) of a break rule line.
+ fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
+ "[ \\t]*" // leading white space
+ "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
+ "[ \\t]*:[ \\t]*" // :
+ "(?<RuleDef>.*?)" // The rule definition
+ "[ \\t]*;$"), // ; <end of line>
+ 0, status));
+
+}
+
+
+// No explicit cleanup code is needed here.
+BreakRules::~BreakRules() {
+}
+
+
+// Define (or redefine) a named character class.
+//
+//   name        the class name, as scanned from the rule file.
+//   definition  the UnicodeSet expression for the class. It may refer to other,
+//               previously defined, classes by name.
+//   status      ICU error code, set on failure.
+//
+// Returns the new CharClass, which is owned by the fCharClasses hash table,
+// or NULL if the class could not be created.
+CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
+
+    // Create the expanded definition for this char class,
+    // replacing any set references with the corresponding definition.
+
+    UnicodeString expandedDef;
+    UnicodeString emptyString;
+    fSetRefsMatcher->reset(definition);
+    while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
+        const UnicodeString refName =
+                fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
+        CharClass *refClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &refName));
+        // An identifier that is not the name of a known class is copied through unchanged.
+        const UnicodeString &expansionForName = refClass ? refClass->fExpandedDef : refName;
+
+        fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
+        expandedDef.append(expansionForName);
+    }
+    fSetRefsMatcher->appendTail(expandedDef);
+
+    // Verify that the expanded set definition is valid.
+
+    if (fMonkeyImpl->fDumpExpansions) {
+        printf("expandedDef: %s\n", CStr(expandedDef)());
+    }
+
+    UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
+    if (U_FAILURE(status)) {
+        IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
+                               u_errorName(status), CStr(name)());
+        delete s;       // Not yet handed to a CharClass; don't leak it.
+        return NULL;
+    }
+    CharClass *cclass = new CharClass(name, definition, expandedDef, s);
+    CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
+                                                        new UnicodeString(name),   // Key, owned by hash table.
+                                                        cclass,                    // Value, owned by hash table.
+                                                        &status));
+    if (U_FAILURE(status)) {
+        // uhash_put() takes ownership of the key and value even when it fails, and
+        // disposes of them with the table's deleters. cclass must not be used from here on.
+        IntlTest::gTest->errln("%s:%d: error %s adding character class %s", __FILE__, __LINE__,
+                               u_errorName(status), CStr(name)());
+        return NULL;
+    }
+
+    if (previousClass != NULL) {
+        // Duplicate class def.
+        // These are legitimate, they are adjustments of an existing class.
+        // TODO: will need to keep the old around when we handle tailorings.
+        IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
+        delete previousClass;
+    }
+    return cclass;
+}
+
+
+// Add one break rule to this rule set.
+//
+//   name        the rule name, as scanned from the rule file.
+//   definition  the rule body. It may refer to character classes by name, and may
+//               contain at most one divide sign (\u00f7) marking a break position.
+//   status      ICU error code, set on failure.
+//
+// The class references are expanded, the divide sign is replaced by an empty named
+// capture group, empty sets are rewritten into a form that regular expressions accept,
+// and the result is compiled into a RegexMatcher. On success the new rule is appended
+// to fBreakRules, which owns it.
+void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
+    LocalPointer<BreakRule> thisRule(new BreakRule);
+    if (thisRule.isNull()) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    thisRule->fName = name;
+    thisRule->fRule = definition;
+
+    UnicodeString emptyString;
+
+    // Expand the char class definitions within the rule.
+    fSetRefsMatcher->reset(definition);
+    while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
+        const UnicodeString className =
+                fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
+        CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &className));
+        if (!nameClass) {
+            IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
+                __FILE__, __LINE__, CStr(className)(), CStr(definition)());
+        }
+        // An unrecognized name is reported above, then copied through unchanged.
+        const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : className;
+
+        fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
+        thisRule->fExpandedRule.append(expansionForName);
+    }
+    fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
+
+    // Replace the divide sign (\u00f7) with a regular expression named capture.
+    // When running the rules, a match that includes this group means we found a break position.
+
+    int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
+    if (dividePos >= 0) {
+        thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
+    }
+    if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
+        // A second divide sign remains. Report it here, rather than letting it surface
+        // below as a misleading regular expression compilation error.
+        IntlTest::gTest->errln("%s:%d Rule \"%s\" has more than one break position: \"%s\"",
+                               __FILE__, __LINE__, CStr(name)(), CStr(definition)());
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    // UAX break rule set definitions can be empty, just [].
+    // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
+    // also matches nothing.
+
+    static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
+    int32_t where = 0;
+    while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
+        thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
+    }
+    if (fMonkeyImpl->fDumpExpansions) {
+        printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
+    }
+
+    // Compile a regular expression for this rule.
+    thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
+    if (U_FAILURE(status)) {
+        IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
+                __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
+        return;
+    }
+
+    // Put this new rule into the vector of all Rules.
+    fBreakRules.addElement(thisRule.orphan(), status);
+}
+
+
+// Handle a "keyword = value" line that sets a parameter of the rule set
+// instead of defining a character class.
+//
+//   keyword  the text to the left of the '='.
+//   value    the text to the right of the '='.
+//   status   ICU error code; only used while converting the locale name.
+//
+// Returns true if the keyword was "locale" or "type", false for anything else.
+bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
+    if (keyword == UnicodeString("locale")) {
+        CharString localeID;
+        localeID.append(CStr(value)(), -1, status);
+        fLocale = Locale::createFromName(localeID.data());
+        return true;
+    }
+
+    if (keyword != UnicodeString("type")) {
+        // TODO: add tailoring base setting here.
+        return false;
+    }
+
+    // keyword is "type". An unrecognized value is reported, but the keyword still counts as handled.
+    if (value == UnicodeString("grapheme")) {
+        fType = UBRK_CHARACTER;
+        return true;
+    }
+    if (value == UnicodeString("word")) {
+        fType = UBRK_WORD;
+        return true;
+    }
+    if (value == UnicodeString("line")) {
+        fType = UBRK_LINE;
+        return true;
+    }
+    if (value == UnicodeString("sentence")) {
+        fType = UBRK_SENTENCE;
+        return true;
+    }
+    IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
+    return true;
+}
+
+// Create the ICU break iterator that is to be tested, of the type and for the
+// locale that were specified by the reference rules (fType, fLocale).
+//
+//   status  ICU error code. Nothing is done if it already indicates a failure.
+//
+// Returns a break iterator owned by the caller, or NULL on failure. A failure
+// always leaves status set to an error code.
+RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+    // Held in a LocalPointer so that the iterator is not leaked on any failure path.
+    LocalPointer<BreakIterator> bi;
+    switch(fType) {
+        case UBRK_CHARACTER:
+            bi.adoptInstead(BreakIterator::createCharacterInstance(fLocale, status));
+            break;
+        case UBRK_WORD:
+            bi.adoptInstead(BreakIterator::createWordInstance(fLocale, status));
+            break;
+        case UBRK_LINE:
+            bi.adoptInstead(BreakIterator::createLineInstance(fLocale, status));
+            break;
+        case UBRK_SENTENCE:
+            bi.adoptInstead(BreakIterator::createSentenceInstance(fLocale, status));
+            break;
+        default:
+            IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+            return NULL;
+    }
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+
+    RuleBasedBreakIterator *rbbi = dynamic_cast<RuleBasedBreakIterator *>(bi.getAlias());
+    if (rbbi == NULL) {
+        // Either nothing was created, or it is not a rule based iterator.
+        IntlTest::gTest->errln("%s:%d Could not get a RuleBasedBreakIterator for type %d",
+                               __FILE__, __LINE__, fType);
+        status = U_INVALID_STATE_ERROR;
+        return NULL;
+    }
+    bi.orphan();        // Ownership passes to the caller, through rbbi.
+    return rbbi;
+}
+
+
+// Read and process a complete rule file, one line at a time.
+//
+//   rules   the open rule file.
+//   status  ICU error code. Nothing is done if it already indicates a failure,
+//           and line processing stops as soon as it becomes a failure.
+//
+// Each line is stripped of comments, then handled as one of
+//   - a keyword parameter ("type = ...", "locale = ..."),
+//   - a character class definition ("name = set expression;"),
+//   - a break rule ("name : rule;").
+// Anything else is reported as an error, without setting status.
+// Once all lines are processed, fCharClassList and fDictionarySet are filled in.
+void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return;
+ }
+
+ UnicodeString emptyString;
+ for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line.
+ if (U_FAILURE(status)) {
+ return;
+ }
+ int32_t lineLength = 0;
+ const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
+ if (lineBuf == NULL) {
+ // No more lines.
+ break;
+ }
+ UnicodeString line(lineBuf, lineLength);
+
+ // Strip comment lines.
+ fCommentsMatcher->reset(line);
+ line = fCommentsMatcher->replaceFirst(emptyString, status);
+ if (line.isEmpty()) {
+ continue;
+ }
+
+ // Recognize character class definition and keyword lines
+ fClassDefMatcher->reset(line);
+ if (fClassDefMatcher->matches(status)) {
+ UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
+ UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
+ if (fMonkeyImpl->fDumpExpansions) {
+ printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
+ }
+ if (setKeywordParameter(className, classDef, status)) {
+ // The scanned item was "type = ..." or "locale = ...", etc.
+ // which are not actual character classes.
+ continue;
+ }
+ addCharClass(className, classDef, status);
+ continue;
+ }
+
+ // Recognize rule lines.
+ fRuleDefMatcher->reset(line);
+ if (fRuleDefMatcher->matches(status)) {
+ UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
+ UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
+ if (fMonkeyImpl->fDumpExpansions) {
+ printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
+ }
+ addRule(ruleName, ruleDef, status);
+ continue;
+ }
+
+ // The line is neither a class definition, a keyword nor a rule.
+ IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
+ __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
+ }
+
+ // Build the vector of char classes, omitting the dictionary class if there is one.
+ // This will be used when constructing the random text to be tested.
+
+ // Also compute the "other" set, consisting of any characters not included in
+ // one or more of the user defined sets.
+
+ UnicodeSet otherSet((UChar32)0, 0x10ffff);
+ int32_t pos = UHASH_FIRST;
+ const UHashElement *el = NULL;
+ while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
+ const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
+ CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
+ // printf(" Adding %s\n", CStr(*ccName)());
+ if (*ccName != cclass->fName) {
+ IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
+ __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
+ }
+ const UnicodeSet *set = cclass->fSet.getAlias();
+ otherSet.removeAll(*set);
+ if (*ccName == UnicodeString("dictionary")) {
+ // The class named "dictionary" is kept apart, and is not added to the list.
+ fDictionarySet = *set;
+ } else {
+ fCharClassList->addElement(cclass, status);
+ }
+ }
+
+ // Any characters left over go into an additional class, named "__Others".
+ if (!otherSet.isEmpty()) {
+ // fprintf(stderr, "have an other set.\n");
+ UnicodeString pattern;
+ CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
+ fCharClassList->addElement(cclass, status);
+ }
+}
+
+
+// Find a character class that contains a character.
+//
+//   c     the character to look for.
+//   iter  optional cursor into fCharClassList. When supplied, the search starts
+//         at *iter, and *iter is left just past the class that was returned, so
+//         that repeated calls step through all of the classes containing c.
+//         When NULL, the search starts from the first class.
+//
+// Returns the class found, or NULL if none of the remaining classes contains c.
+const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
+    int32_t scratchCursor = 0;
+    int32_t *cursor = (iter != NULL) ? iter : &scratchCursor;
+
+    for (;;) {
+        if (*cursor >= fCharClassList->size()) {
+            return NULL;
+        }
+        const CharClass *candidate = static_cast<const CharClass *>(fCharClassList->elementAt(*cursor));
+        ++(*cursor);        // Step past the candidate before testing it.
+        if (candidate->fSet->contains(c)) {
+            return candidate;
+        }
+    }
+}
+
+//---------------------------------------------------------------------------------------
+//
+// class MonkeyTestData implementation.
+//
+//---------------------------------------------------------------------------------------
+
+// Generate a new string of random test data, and compute the expected break
+// positions for it by applying the reference rules.
+//
+//   rules   the compiled reference rules; the pointer is retained in fBkRules.
+//   rand    the random number generator. Its seed on entry is saved in fRandomSeed.
+//   status  ICU error code, set on failure.
+//
+// On return fString holds the test data, fExpectedBreaks the expected boundaries,
+// and fRuleForPosition / f2ndRuleForPos the numbers of the rules that matched at each position.
+void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
+ const int32_t dataLength = 1000;
+
+ // Fill the test string with random characters.
+ // First randomly pick a char class, then randomly pick a character from that class.
+ // Exclude any characters from the dictionary set.
+
+ // std::cout << "Populating Test Data" << std::endl;
+ fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
+ // allowing recreation of failing data.
+ fBkRules = rules;
+ fString.remove();
+ // n counts the characters appended so far; it only advances when one is accepted.
+ for (int32_t n=0; n<dataLength;) {
+ int charClassIndex = rand() % rules->fCharClassList->size();
+ const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
+ if (cclass->fSet->size() == 0) {
+ // Some rules or tailorings do end up with empty char classes.
+ continue;
+ }
+ int32_t charIndex = rand() % cclass->fSet->size();
+ UChar32 c = cclass->fSet->charAt(charIndex);
+ if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
+ // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
+ // Don't let random unpaired surrogates combine in the test data because they might
+ // produce an unwanted dictionary character.
+ continue;
+ }
+
+ if (!rules->fDictionarySet.contains(c)) {
+ fString.append(c);
+ ++n;
+ }
+ }
+
+ // Reset each rule matcher regex with this new string.
+ // (Although we are always using the same string object, ICU regular expressions
+ // don't like the underlying string data changing without doing a reset).
+
+ for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
+ BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
+ rule->fRuleMatcher->reset(fString);
+ }
+
+ // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
+ // Expected and Actual breaks are one longer than the input string; a non-zero value
+ // will indicate a boundary preceding that position.
+
+ clearActualBreaks();
+ fExpectedBreaks = fActualBreaks;
+ fRuleForPosition = fActualBreaks;
+ f2ndRuleForPos = fActualBreaks;
+
+ // Apply reference rules to find the expected breaks.
+
+ fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
+ // ICU always reports a break there.
+ // The reference rules do not have a means to do so.
+ int32_t strIdx = 0;
+ while (strIdx < fString.length()) {
+ BreakRule *matchingRule = NULL;
+ UBool hasBreak = FALSE;
+ int32_t ruleNum = 0;
+ int32_t matchStart = 0;
+ int32_t matchEnd = 0;
+ int32_t breakGroup = 0;
+ // Try the rules in order; the first acceptable match wins.
+ for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
+ BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
+ rule->fRuleMatcher->reset();
+ if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
+ // A candidate rule match, check further to see if we take it or continue to check other rules.
+ // Matches of zero or one codepoint count only if they also specify a break.
+ matchStart = rule->fRuleMatcher->start(status);
+ matchEnd = rule->fRuleMatcher->end(status);
+ // The rule has a break position if its pattern has a group named "BreakPosition".
+ // The lookup fails for rules without one; that particular error is cleared again.
+ breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
+ hasBreak = U_SUCCESS(status);
+ if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
+ status = U_ZERO_ERROR;
+ }
+ if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
+ matchingRule = rule;
+ break;
+ }
+ }
+ }
+ if (matchingRule == NULL) {
+ // No reference rule matched. This is an error in the rules that should never happen.
+ IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
+ __FILE__, __LINE__, strIdx);
+ dump(strIdx);
+ status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ if (matchingRule->fRuleMatcher->group(status).length() == 0) {
+ // Zero length rule match. This is also an error in the rule expressions.
+ IntlTest::gTest->errln("%s:%d Zero length rule match.",
+ __FILE__, __LINE__);
+ status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+
+ // Record which rule matched over the length of the match.
+ // NOTE(review): a stored value of 0 is taken to mean "no rule recorded yet", so a match
+ // by rule number 0 can not be told apart from an unset position.
+ for (int i = matchStart; i < matchEnd; i++) {
+ if (fRuleForPosition.charAt(i) == 0) {
+ fRuleForPosition.setCharAt(i, (UChar)ruleNum);
+ } else {
+ f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
+ }
+ }
+
+ // Break positions appear in rules as a matching named capture of zero length at the break position,
+ // the adjusted pattern contains (?<BreakPosition>)
+ if (hasBreak) {
+ int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
+ if (U_FAILURE(status) || breakPos < 0) {
+ // Rule specified a break, but that break wasn't part of the match, even
+ // though the rule as a whole matched.
+ // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
+ // Shouldn't get here.
+ IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
+ status = U_INVALID_FORMAT_ERROR;
+ break;
+ }
+ fExpectedBreaks.setCharAt(breakPos, (UChar)1);
+ // printf("recording break at %d\n", breakPos);
+ // For the next iteration, pick up applying rules immediately after the break,
+ // which may differ from end of the match. The matching rule may have included
+ // context following the boundary that needs to be looked at again.
+ strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
+ } else {
+ // Original rule didn't specify a break.
+ // Continue applying rules starting on the last code point of this match.
+ strIdx = fString.moveIndex32(matchEnd, -1);
+ if (strIdx == matchStart) {
+ // Match was only one code point, no progress if we continue.
+ // Shouldn't get here, case is filtered out at top of loop.
+ CharString ruleName;
+ ruleName.appendInvariantChars(matchingRule->fName, status);
+ IntlTest::gTest->errln("%s:%d Rule %s internal error",
+ __FILE__, __LINE__, ruleName.data());
+ status = U_INVALID_FORMAT_ERROR;
+ break;
+ }
+ }
+ if (U_FAILURE(status)) {
+ IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
+ __FILE__, __LINE__, u_errorName(status));
+ break;
+ }
+ }
+}
+
+// Reset fActualBreaks to all zeroes, that is, to no boundaries found.
+void MonkeyTestData::clearActualBreaks() {
+    // One slot more than the length of the data string, allowing for a break
+    // before the first and after the last character in the data.
+    const int32_t slotCount = fString.length() + 1;
+
+    fActualBreaks.remove();
+    for (int32_t slot = 0; slot < slotCount; ++slot) {
+        fActualBreaks.append((UChar)0);
+    }
+}
+
+// Print a table describing the test data to stdout, one line per code point:
+// its index, code point value, character class, expected (R) and actual (I) break
+// flags, the name(s) of the reference rule(s) recorded for it, and the character name.
+//
+//   around  -1 to display all of the data, otherwise an index into the data; the
+//           display is then limited to 30 code points either side of that index.
+//
+// This function is called while reporting errors, including errors in the reference
+// rules themselves, so a missing character class or rule is displayed as an empty name.
+void MonkeyTestData::dump(int32_t around) const {
+    printf("\n"
+           " char break Rule Character\n"
+           " pos code class R I name name\n"
+           "---------------------------------------------------------------------------------------------\n");
+
+    int32_t start;
+    int32_t end;
+
+    if (around == -1) {
+        start = 0;
+        end = fString.length();
+    } else {
+        // Display context around a failure.
+        start = fString.moveIndex32(around, -30);
+        end = fString.moveIndex32(around, +30);
+    }
+
+    for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
+        UErrorCode status = U_ZERO_ERROR;
+        UChar32 c = fString.char32At(charIdx);
+        const CharClass *cc = fBkRules->getClassForChar(c);
+        CharString ccName;
+        if (cc != NULL) {
+            ccName.appendInvariantChars(cc->fName, status);
+        }
+        CharString ruleName, secondRuleName;
+        const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
+        if (rule != NULL) {
+            ruleName.appendInvariantChars(rule->fName, status);
+        }
+        if (f2ndRuleForPos.charAt(charIdx) > 0) {
+            const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
+            if (secondRule != NULL) {
+                secondRuleName.appendInvariantChars(secondRule->fName, status);
+            }
+        }
+        char cName[200];
+        u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
+
+        printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
+            charIdx, c, ccName.data(),
+            fExpectedBreaks.charAt(charIdx) ? '*' : '.',
+            fActualBreaks.charAt(charIdx) ? '*' : '.',
+            ruleName.data(), secondRuleName.data(), cName
+        );
+    }
+}
+
+
+//---------------------------------------------------------------------------------------
+//
+// class RBBIMonkeyImpl
+//
+//---------------------------------------------------------------------------------------
+
+// Construct a test object for one set of rules. The thread object is tied to this
+// test object, but is not started here.
+//   status  not used.
+RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status)
+        : fDumpExpansions(FALSE),
+          fThread(this) {
+    (void)status;    // suppress unused parameter compiler warning.
+}
+
+
+// RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
+// reference rules and creating the icu breakiterator to test,
+// with its type and locale coming from the reference rules.
+
+// Prepare for testing with one rule file: open the file, compile the reference
+// rules from it, create the ICU break iterator that the rules call for, and
+// create the (still empty) test data object.
+//
+//   ruleFile  name of the rule file. The pointer is retained for use in messages.
+//   status    ICU error code. Every failure is reported, and leaves status set.
+void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
+    fRuleFileName = ruleFile;
+    openBreakRules(ruleFile, status);
+    if (U_FAILURE(status)) {
+        IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
+        return;
+    }
+    fRuleSet.adoptInstead(new BreakRules(this, status));
+    fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
+    if (U_FAILURE(status)) {
+        IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
+        return;
+    }
+    fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
+    fTestData.adoptInstead(new MonkeyTestData());
+
+    // Without a break iterator there is nothing to test. Make sure that the
+    // caller gets to know, instead of finding out from a NULL pointer later on.
+    if (U_SUCCESS(status) && fBI.isNull()) {
+        status = U_INVALID_STATE_ERROR;
+    }
+    if (U_FAILURE(status)) {
+        IntlTest::gTest->errln("%s:%d Error %s creating break iterator for %s.",
+                               __FILE__, __LINE__, u_errorName(status), ruleFile);
+    }
+}
+
+
+// No explicit cleanup code is needed here.
+RBBIMonkeyImpl::~RBBIMonkeyImpl()
+{
+}
+
+
+// Open a rule file from the "break_rules" directory of the source test data,
+// for reading as UTF-8. The open file is kept in fRuleCharBuffer.
+//
+//   fileName  name of the file, without any directory.
+//   status    ICU error code, set if the path can not be built or the file can not be opened.
+void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
+    const char *encoding = "UTF-8";
+
+    CharString rulesPath;
+    rulesPath.append(IntlTest::getSourceTestData(status), status);
+    rulesPath.append("break_rules" U_FILE_SEP_STRING, status);
+    rulesPath.appendPathPart(fileName, status);
+
+    UCHARBUF *ruleFile = ucbuf_open(rulesPath.data(), &encoding, TRUE, FALSE, &status);
+    fRuleCharBuffer.adoptInstead(ruleFile);
+}
+
+
+// Begin testing, in a thread of its own; the thread invokes runTest().
+void RBBIMonkeyImpl::startTest()
+{
+    fThread.start();
+}
+
+// Wait for the thread that was started by startTest().
+void RBBIMonkeyImpl::join()
+{
+    fThread.join();
+}
+
+
+#define MONKEY_ERROR(msg, index) { \
+ IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
+ __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
+ if (fVerbose) { fTestData->dump(index); } \
+ status = U_INVALID_STATE_ERROR; \
+}
+
+// The body of the test thread. Each pass of the loop generates new random test data,
+// then checks the break iterator against it with each of the iteration functions.
+// The loop runs fLoopCount times, or without end if fLoopCount is negative,
+// but gives up once more than 10 passes have failed.
+void RBBIMonkeyImpl::runTest() {
+    const UBool runForever = fLoopCount < 0;
+    int32_t failedPasses = 0;
+
+    int64_t pass = 0;
+    while (runForever || pass < fLoopCount) {
+        // Each pass starts out with a clean status; a failing test function
+        // causes the remaining test functions of the same pass to do nothing.
+        UErrorCode status = U_ZERO_ERROR;
+        fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
+        // fTestData->dump();
+        testForwards(status);
+        testPrevious(status);
+        testFollowing(status);
+        testPreceding(status);
+        testIsBoundary(status);
+
+        // Show some sign of life when running without end.
+        if (runForever && pass % 100 == 0) {
+            fprintf(stderr, ".");
+        }
+        if (U_FAILURE(status) && ++failedPasses > 10) {
+            return;
+        }
+        ++pass;
+    }
+}
+
+// Check forwards iteration: collect the boundaries found with first() / next(),
+// then compare them with the expected boundaries.
+//   status  ICU error code. Nothing is done if it already indicates a failure;
+//           it is set if the test fails.
+void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    int32_t previousBreak = -2;
+    for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
+        if (bk <= previousBreak) {
+            MONKEY_ERROR("Break Iterator Stall", bk);
+            return;
+        }
+        if (bk < 0 || bk > fTestData->fString.length()) {
+            MONKEY_ERROR("Boundary out of bounds", bk);
+            return;
+        }
+        fTestData->fActualBreaks.setCharAt(bk, 1);
+        // Remember this boundary, for the stall check above to compare the next one with.
+        // Without this an iterator that fails to advance would keep this loop running for ever.
+        previousBreak = bk;
+    }
+    checkResults("testForwards", FORWARD, status);
+}
+
+// Check following(): call it for each index i from -1 up to, but not including, the
+// length of the test data, collect the boundaries found, then compare them with
+// the expected boundaries.
+//   status  ICU error code. Nothing is done if it already indicates a failure;
+//           it is set if the test fails.
+void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return;
+ }
+ fTestData->clearActualBreaks();
+ fBI->setText(fTestData->fString);
+ // nextBreak is the most recent boundary that was returned by following().
+ int32_t nextBreak = -1;
+ for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
+ int32_t bk = fBI->following(i);
+ // NOTE(review): the loop condition keeps i below the string length, so the
+ // second half of this test looks like it can never be true.
+ if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
+ continue;
+ }
+ if (bk == nextBreak && bk > i) {
+ // i is in the gap between two breaks.
+ continue;
+ }
+ if (i == nextBreak && bk > nextBreak) {
+ // i is at the previous boundary, and following() moved on to a new one.
+ fTestData->fActualBreaks.setCharAt(bk, 1);
+ nextBreak = bk;
+ continue;
+ }
+ MONKEY_ERROR("following(i)", i);
+ return;
+ }
+ checkResults("testFollowing", FORWARD, status);
+}
+
+
+
+// Check backwards iteration: collect the boundaries found with last() / previous(),
+// then compare them with the expected boundaries.
+//   status  ICU error code. Nothing is done if it already indicates a failure;
+//           it is set if the test fails.
+void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
+    if (U_FAILURE(status)) {return;}
+
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    int32_t previousBreak = INT32_MAX;
+    for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
+        if (bk >= previousBreak) {
+            MONKEY_ERROR("Break Iterator Stall", bk);
+            return;
+        }
+        if (bk < 0 || bk > fTestData->fString.length()) {
+            MONKEY_ERROR("Boundary out of bounds", bk);
+            return;
+        }
+        fTestData->fActualBreaks.setCharAt(bk, 1);
+        // Remember this boundary, for the stall check above to compare the next one with.
+        // Without this an iterator that fails to move back would keep this loop running for ever.
+        previousBreak = bk;
+    }
+    checkResults("testPrevious", REVERSE, status);
+}
+
+
+// Check preceding(): call it for each index i from one past the length of the test
+// data down to 0, collect the boundaries found, then compare them with the
+// expected boundaries.
+//   status  ICU error code. Nothing is done if it already indicates a failure;
+//           it is set if the test fails.
+void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return;
+ }
+ fTestData->clearActualBreaks();
+ fBI->setText(fTestData->fString);
+ // nextBreak is the most recent boundary that was returned by preceding().
+ int32_t nextBreak = fTestData->fString.length()+1;
+ for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
+ int32_t bk = fBI->preceding(i);
+ // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
+ if (bk == BreakIterator::DONE && i == 0) {
+ continue;
+ }
+ if (bk == nextBreak && bk < i) {
+ // i is in the gap between two breaks.
+ continue;
+ }
+ if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
+ // i indexes to a trailing surrogate.
+ // Break Iterators treat an index to either half as referring to the supplemental code point,
+ // with preceding going to some preceding code point.
+ if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
+ // Unlike the other failures in this function, this one does not return right away.
+ MONKEY_ERROR("preceding of trailing surrogate error", i);
+ }
+ continue;
+ }
+ if (i == nextBreak && bk < nextBreak) {
+ // i is at the previous boundary, and preceding() moved back to a new one.
+ fTestData->fActualBreaks.setCharAt(bk, 1);
+ nextBreak = bk;
+ continue;
+ }
+ MONKEY_ERROR("preceding(i)", i);
+ return;
+ }
+ checkResults("testPreceding", REVERSE, status);
+}
+
+
+// Check isBoundary(): call it for every index of the test data, from the end down
+// to 0, collect the boundaries found, then compare them with the expected boundaries.
+//   status  ICU error code. Nothing is done if it already indicates a failure;
+//           it is set if the test fails.
+void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    for (int i=fTestData->fString.length(); i>=0; --i) {
+        if (fBI->isBoundary(i)) {
+            fTestData->fActualBreaks.setCharAt(i, 1);
+        }
+    }
+    // Failures are reported with the name of this function, so that they can be
+    // told apart from the failures of testForwards().
+    checkResults("testIsBoundary", FORWARD, status);
+}
+
+// Compare the actual boundaries, as collected by one of the test functions,
+// with the expected boundaries, and report the first difference found.
+//
+//   msg        name of the test function, for the failure message.
+//   direction  FORWARD to compare from index 0 upwards, otherwise from the end of
+//              the data downwards. This only affects which difference is found first.
+//   status     ICU error code. Nothing is done if it already indicates a failure;
+//              it is set to U_INVALID_STATE_ERROR if a difference is found.
+void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    const int lastIndex = fTestData->fString.length();
+    const int step = (direction == FORWARD) ? 1 : -1;
+    int i = (direction == FORWARD) ? 0 : lastIndex;
+
+    while (i >= 0 && i <= lastIndex) {
+        if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
+            IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
+                    __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
+            if (fVerbose) {
+                fTestData->dump(i);
+            }
+            // Prevent the test from continuing, which would likely
+            // produce many redundant errors.
+            status = U_INVALID_STATE_ERROR;
+            return;
+        }
+        i += step;
+    }
+}
+
+
+
+//---------------------------------------------------------------------------------------
+//
+// class RBBIMonkeyTest implementation.
+//
+//---------------------------------------------------------------------------------------
+// Nothing to set up at construction.
+RBBIMonkeyTest::RBBIMonkeyTest()
+{
+}
+
+// No explicit cleanup code is needed here.
+RBBIMonkeyTest::~RBBIMonkeyTest()
+{
+}
+
+
+// params, taken from this->fParams.
+// rules=file_name Name of file containing the reference rules.
+// seed=nnnnn Random number starting seed.
+// Setting the seed allows errors to be reproduced.
+// loop=nnn Looping count. Controls running time.
+// -1: run forever.
+// 0 or greater: run length.
+// expansions debug option, show expansions of rules and sets.
+// verbose Display details of the failure.
+//
+// Parameters on the intltest command line follow the test name, and are preceded by '@'.
+// For example,
+// intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
+//
+// The monkey test proper. Parses the test parameters from fParams, then runs
+// one RBBIMonkeyImpl, in a thread of its own, for each rule file to be tested,
+// and waits for all of them to complete.
+void RBBIMonkeyTest::testMonkey() {
+    // printf("Test parameters: %s\n", fParams);
+    UnicodeString params(fParams);
+    UErrorCode status = U_ZERO_ERROR;
+
+    // The rule files to test, terminated by NULL. The complete list is replaced
+    // if a single file is named in the parameters.
+    const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
+                           "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
+                           NULL };
+    CharString testNameFromParams;
+    if (getStringParam("rules", params, testNameFromParams, status)) {
+        tests[0] = testNameFromParams.data();
+        tests[1] = NULL;
+    }
+
+    int64_t loopCount = quick? 100 : 5000;
+    getIntParam("loop", params, loopCount, status);
+
+    UBool dumpExpansions = FALSE;
+    getBoolParam("expansions", params, dumpExpansions, status);
+
+    UBool verbose = FALSE;
+    getBoolParam("verbose", params, verbose, status);
+
+    int64_t seed = 0;
+    getIntParam("seed", params, seed, status);
+
+    if (params.length() != 0) {
+        // Options processing did not consume all of the parameters. Something unrecognized was present.
+        CharString unrecognizedParameters;
+        unrecognizedParameters.append(CStr(params)(), -1, status);
+        errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
+        return;
+    }
+
+    UVector startedTests(status);
+    if (U_FAILURE(status)) {
+        errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+
+    // Monkey testing is multi-threaded.
+    // Each set of break rules to be tested is run in a separate thread.
+    // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
+    int32_t i;
+    for (i=0; tests[i] != NULL; ++i) {
+        logln("beginning testing of %s", tests[i]);
+        RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
+        if (test == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            break;
+        }
+        test->fDumpExpansions = dumpExpansions;
+        test->fVerbose = verbose;
+        test->fRandomGenerator.seed((uint32_t)seed);
+        test->fLoopCount = loopCount;
+        test->setup(tests[i], status);
+        if (U_FAILURE(status)) {
+            // A test that failed to set up is incomplete (it may be without test data
+            // or break iterator) and must not be started.
+            delete test;
+            break;
+        }
+        test->startTest();
+        startedTests.addElement(test, status);
+        if (U_FAILURE(status)) {
+            // The test is running, but could not be recorded in the list.
+            // Wait for it here; the clean up loop below will not know about it.
+            test->join();
+            delete test;
+            break;
+        }
+    }
+
+    if (U_FAILURE(status)) {
+        errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
+    }
+
+    // Wait for all of the tests that were started, and clean up.
+    for (i=0; i<startedTests.size(); ++i) {
+        RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
+        test->join();
+        delete test;
+    }
+}
+
+
+// getIntParam   Extract an integer valued parameter, "name=123", from a parameter string.
+//
+//    name     The parameter name. It is used as the start of a regular expression.
+//    params   The parameter string. When the parameter is found its text, together with
+//             any trailing spaces and one optional comma, is removed from this string.
+//    val      Receives the parameter value; left unchanged when the parameter is absent.
+//    status   ICU error code, passed through to the regular expression functions.
+//    returns  TRUE if the parameter was present.
+//
+//    The digits are converted with strtol(), so the usable range is that of a long.
+//
+UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    name.append(" *= *(-?\\d+) *,? *");
+    RegexMatcher m(name, params, 0, status);
+    if (!m.find()) {
+        return FALSE;
+    }
+
+    // The param exists. Convert the string to an int.
+    CharString str;
+    str.append(CStr(m.group(1, status))(), -1, status);
+    val = strtol(str.data(), NULL, 10);
+
+    // Delete this parameter from the params string.
+    m.reset();
+    params = m.replaceFirst(UnicodeString(), status);
+    return TRUE;
+}
+
+// getStringParam   Extract a string valued parameter, "name=value", from a parameter string.
+//
+//    name     The parameter name. It is used as the start of a regular expression.
+//    params   The parameter string. When the parameter is found its text, together with
+//             any trailing spaces and one optional comma, is removed from this string.
+//    dest     The value, a possibly empty run of characters other than space and comma,
+//             is appended to this string. Untouched when the parameter is absent.
+//    status   ICU error code, passed through to the regular expression functions.
+//    returns  TRUE if the parameter was present.
+//
+UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    name.append(" *= *([^ ,]*) *,? *");
+    RegexMatcher m(name, params, 0, status);
+    if (!m.find()) {
+        return FALSE;
+    }
+
+    // The param exists.
+    dest.append(CStr(m.group(1, status))(), -1, status);
+
+    // Delete this parameter from the params string.
+    m.reset();
+    params = m.replaceFirst(UnicodeString(), status);
+    return TRUE;
+}
+
+// getBoolParam   Extract a boolean parameter from a parameter string.
+//
+//    The parameter may be written as just "name", which means true, or with an explicit
+//    value, "name=true" or "name=false". Matching is case insensitive.
+//
+//    name     The parameter name. It is used as the start of a regular expression.
+//    params   The parameter string. When the parameter is found its text, together with
+//             any trailing spaces and one optional comma, is removed from this string.
+//    dest     Receives the parameter value; left unchanged when the parameter is absent.
+//    status   ICU error code, passed through to the regular expression functions.
+//    returns  TRUE if the parameter was present.
+//
+UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    name.append("(?: *= *(true|false))? *,? *");
+    RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
+    if (!m.find()) {
+        return FALSE;
+    }
+
+    // start() reports -1 for a capture group that did not take part in the match.
+    if (m.start(1, status) >= 0) {
+        // user option included a value.
+        dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
+    } else {
+        // No explicit user value, implies true.
+        dest = TRUE;
+    }
+
+    // Delete this parameter from the params string.
+    m.reset();
+    params = m.replaceFirst(UnicodeString(), status);
+    return TRUE;
+}
+
--- /dev/null
+/*************************************************************************
+ * Copyright (c) 2016, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *************************************************************************
+*/
+#ifndef RBBIMONKEYTEST_H
+#define RBBIMONKEYTEST_H
+
+#include "unicode/utypes.h"
+
+#include "intltest.h"
+
+#include "unicode/rbbi.h"
+#include "unicode/regex.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/uobject.h"
+
+#include "simplethread.h"
+#include "ucbuf.h"
+#include "uhash.h"
+#include "uvector.h"
+
+//
+// TODO:
+// Develop a tailoring format.
+// Hook to old tests that use monkey impl to get expected data.
+// Remove old tests.
+
+class BreakRules; // Forward declaration
+class RBBIMonkeyImpl;
+
+/**
+ * Test the RuleBasedBreakIterator class giving different rules
+ */
+class RBBIMonkeyTest: public IntlTest {
+  public:
+    RBBIMonkeyTest();
+    virtual ~RBBIMonkeyTest();
+
+    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
+
+    // Run the monkey test over the rule files selected by the user parameters.
+    // One RBBIMonkeyImpl, running on its own thread, is used for each rule file.
+    void testMonkey();
+
+
+  private:
+    const char *fParams;                     // Copy of user parameters passed in from IntlTest.
+
+
+    void testRules(const char *ruleFile);
+
+    // Parameter parsing helpers.
+    // Each looks for the parameter "name" in the string "params". If it is present, the
+    // value is stored in the output argument, the text of the parameter is removed
+    // from "params", and the function returns TRUE.
+    static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
+    static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
+    static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
+
+};
+
+// The following classes are internal to the RBBI Monkey Test implementation.
+
+
+
+// class CharClass Represents a single character class from the source break rules.
+// Inherits from UObject because instances are adopted by UHashtable, which ultimately
+// deletes them using hash's object deleter function.
+
+class CharClass: public UObject {
+  public:
+    UnicodeString                   fName;          // Name of this character class.
+    UnicodeString                   fOriginalDef;   // set definition as it appeared in user supplied rules.
+    UnicodeString                   fExpandedDef;   // set definition with any embedded named sets replaced by their defs, recursively.
+    LocalPointer<const UnicodeSet>  fSet;           // The set itself. Held by a LocalPointer, which deletes it.
+
+    // Construct from the name, the two forms of the definition, and the set.
+    // The strings are copied; the set pointer is handed to fSet.
+    CharClass(const UnicodeString &name,
+              const UnicodeString &originalDef,
+              const UnicodeString &expandedDef,
+              const UnicodeSet *set)
+        : fName(name),
+          fOriginalDef(originalDef),
+          fExpandedDef(expandedDef),
+          fSet(set)
+    {}
+};
+
+
+// class BreakRule represents a single rule from a set of break rules.
+// Each rule has the set definitions expanded, and
+// is compiled to a regular expression.
+
+class BreakRule: public UObject {
+ public:
+ // Constructor and destructor are defined out of line; their bodies are not part of this header.
+ BreakRule();
+ ~BreakRule();
+ UnicodeString fName; // Name of the rule.
+ UnicodeString fRule; // Rule expression, excluding the name, as written in user source.
+ UnicodeString fExpandedRule; // Rule expression after expanding the set definitions.
+ LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule.
+ // Held by a LocalPointer, so it is deleted with this object.
+};
+
+
+// class BreakRules represents a complete set of break rules, possibly tailored,
+// compiled from testdata break rules.
+
+class BreakRules: public UObject {
+ public:
+ // monkeyImpl is stored as the back pointer fMonkeyImpl (see below).
+ // NOTE(review): that is what the parameter name and the field comment suggest;
+ // the constructor body is not visible from this header.
+ BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
+ ~BreakRules();
+
+ // Compile the break rules supplied in a UCHARBUF.
+ // NOTE(review): presumably this fills in the rule and character class
+ // collections declared below - confirm against the implementation.
+ void compileRules(UCHARBUF *rules, UErrorCode &status);
+
+ // Look up a character class for the code point c.
+ // NOTE(review): the meaning of the optional iter argument is not visible here;
+ // check the implementation before relying on it.
+ const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
+
+
+ RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance.
+ icu::UVector fBreakRules; // Contents are of type (BreakRule *).
+
+ LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString).
+ // Value is (CharClass *)
+ LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values,
+ // but in a vector so they can be accessed by index.
+ UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined.
+ Locale fLocale;
+ UBreakIteratorType fType;
+
+ // Helpers for building up the rule set. Each takes text from the rule source
+ // (a name and a definition, or a keyword and a value).
+ CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
+ void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
+ bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
+
+ // Create an ICU break iterator.
+ // NOTE(review): presumably chosen from fLocale and fType - confirm.
+ RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
+
+ // Regular expression matchers.
+ // NOTE(review): judging by the names, these are used while parsing the rule source.
+ LocalPointer<RegexMatcher> fSetRefsMatcher;
+ LocalPointer<RegexMatcher> fCommentsMatcher;
+ LocalPointer<RegexMatcher> fClassDefMatcher;
+ LocalPointer<RegexMatcher> fRuleDefMatcher;
+};
+
+
+// class MonkeyTestData represents a randomly synthesized test data string together
+// with the expected break positions obtained by applying
+// the test break rules.
+
+class MonkeyTestData: public UObject {
+  public:
+    // Start with a zero seed and no rules, so that the plain data members
+    // never hold indeterminate values before set() has been called.
+    MonkeyTestData() : fRandomSeed(0), fBkRules(NULL) {}
+    ~MonkeyTestData() {}
+
+    // Populate this object from a set of break rules and a random number generator.
+    void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
+    void clearActualBreaks();
+    void dump(int32_t around = -1) const;
+
+    uint32_t               fRandomSeed;        // The initial seed value from the random number generator.
+    const BreakRules      *fBkRules;           // The break rules used to generate this data.
+    UnicodeString          fString;            // The text.
+    UnicodeString          fExpectedBreaks;    // Breaks as found by the reference rules.
+                                               //     Parallel to fString. Non-zero if break preceding.
+    UnicodeString          fActualBreaks;      // Breaks as found by ICU break iterator.
+    UnicodeString          fRuleForPosition;   // Index into BreakRules.fBreakRules of rule that applied at each position.
+                                               // Also parallel to fString.
+    UnicodeString          f2ndRuleForPos;     // As above. A 2nd rule applies when the preceding rule
+                                               //   didn't cause a break, and a subsequent rule match starts
+                                               //   on the last code point of the preceding match.
+
+};
+
+
+
+
+// class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey
+// test for one set of break rules.
+//
+// When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
+// between instances of RBBIMonkeyImpl and threads.
+//
+class RBBIMonkeyImpl: public UObject {
+ public:
+ RBBIMonkeyImpl(UErrorCode &status);
+ ~RBBIMonkeyImpl();
+
+ // Prepare this object for testing the named rule file. Errors are reported through status.
+ void setup(const char *ruleFileName, UErrorCode &status);
+
+ // RBBIMonkeyTest::testMonkey() calls setup(), then startTest(), and later join()
+ // before deleting the object. runTest() is what the worker thread executes
+ // (see RBBIMonkeyThread::run() below).
+ // NOTE(review): startTest() and join() presumably start and wait for fThread - confirm.
+ void startTest();
+ void runTest();
+ void join();
+
+ LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules.
+ LocalPointer<BreakRules> fRuleSet;
+ LocalPointer<RuleBasedBreakIterator> fBI;
+ LocalPointer<MonkeyTestData> fTestData;
+ IntlTest::icu_rand fRandomGenerator;
+ const char *fRuleFileName;
+ UBool fVerbose; // True to do long dump of failing data.
+ int32_t fLoopCount; // Set by the caller from the "loop" test parameter.
+
+ UBool fDumpExpansions; // Debug flag to output expanded form of rules and sets.
+
+ // Direction argument for checkResults().
+ enum CheckDirection {
+ FORWARD = 1,
+ REVERSE = 2
+ };
+ void clearActualBreaks();
+ // One test function for each way of moving through the text with the break iterator.
+ void testForwards(UErrorCode &status);
+ void testPrevious(UErrorCode &status);
+ void testFollowing(UErrorCode &status);
+ void testPreceding(UErrorCode &status);
+ void testIsBoundary(UErrorCode &status);
+ void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
+
+ // Thread wrapper. run() forwards to runTest() of the RBBIMonkeyImpl given to the constructor.
+ class RBBIMonkeyThread: public SimpleThread {
+ private:
+ RBBIMonkeyImpl *fMonkeyImpl;
+ public:
+ RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
+ void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
+ };
+ private:
+ void openBreakRules(const char *fileName, UErrorCode &status);
+ RBBIMonkeyThread fThread;
+
+};
+
+#endif // RBBIMONKEYTEST_H
* 01/12/2000 Madhu Updated for changed API and added new tests
************************************************************************/
-#include "utypeinfo.h" // for 'typeid' to work
-
#include "unicode/utypes.h"
-
#if !UCONFIG_NO_BREAK_ITERATION
-#include "unicode/utypes.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
#include "unicode/brkiter.h"
+#include "unicode/localpointer.h"
+#include "unicode/numfmt.h"
#include "unicode/rbbi.h"
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+#include "unicode/regex.h"
+#endif
+#include "unicode/schriter.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unicode/ucnv.h"
-#include "unicode/schriter.h"
#include "unicode/uniset.h"
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-#include "unicode/regex.h"
-#endif
+#include "unicode/uscript.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
+
+#include "charstr.h"
+#include "cmemory.h"
#include "intltest.h"
#include "rbbitst.h"
-#include <string.h>
-#include "charstr.h"
+#include "utypeinfo.h" // for 'typeid' to work
#include "uvector.h"
#include "uvectr32.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include "unicode/numfmt.h"
-#include "unicode/uscript.h"
-#include "cmemory.h"
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
//---------------------------------------------
-// Note: Before adding new tests to this file, check whether the desired test data can
+// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitest.txt. In most cases it can,
// it's much less work than writing a new test, diagnostic output in the event of failures
// is good, and the test data file will is shared with ICU4J, so eventually the test
break;
case 2: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
-
+
#if !UCONFIG_NO_FILE_IO
case 3: name = "TestUnicodeFiles";
if(exec) TestUnicodeFiles(); break;
#endif
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
- case 16:
+ case 16:
name = "TestMonkey"; if(exec) TestMonkey(params); break;
#else
case 16:
"$Numbers = [:N:];\n"
"$Letters+{1};\n"
"$Numbers+{2};\n"
- "Help\\ {4}/me\\!;\n"
+ "Help\\ /me\\!{4};\n"
"[^$Letters $Numbers];\n"
"!.*;\n", -1, US_INV);
UnicodeString testString1 = "abc123..abc Help me Help me!";
UErrorCode status=U_ZERO_ERROR;
UParseError parseError;
- BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
+ LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
if(U_FAILURE(status)) {
- dataerrln("FAIL : in construction - %s", u_errorName(status));
- } else {
- int32_t pos;
- int32_t i = 0;
- bi->setText(testString1);
- for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
- if (pos != bounds1[i]) {
- errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
- break;
- }
+ dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
+ return;
+ }
+ int32_t pos;
+ int32_t i = 0;
+ bi->setText(testString1);
+ for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
+ if (pos != bounds1[i]) {
+ errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
+ break;
+ }
- int tag = bi->getRuleStatus();
- if (tag != brkStatus[i]) {
- errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
- break;
- }
- i++;
+ int tag = bi->getRuleStatus();
+ if (tag != brkStatus[i]) {
+ errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
+ break;
}
+ i++;
}
- delete bi;
}
if (bi == NULL) {
return;
}
-
+
UnicodeString s("One.\\u00ad Two.", -1, US_INV);
// 01234 56789
s = s.unescape();
utext_close(textToBreak);
delete textMap;
}
-
+
int32_t getSrcLine(int32_t bp);
int32_t getExpectedBreak(int32_t bp);
int32_t getSrcCol(int32_t bp);
0xfffd, NULL, &status);
dest.append(buffer, utf8Length, status);
}
-
+
void TestParams::setUTF16(UErrorCode &status) {
textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
//-------------------------------------------------------------------------------
//
// ReadAndConvertFile Read a text data file, convert it to UChars, and
-// return the datain one big UChar * buffer, which the caller must delete.
+// return the data in one big UChar * buffer, which the caller must delete.
//
// parameters:
// fileName: the name of the file, with no directory part. The test data directory
}
strcpy(testFileName, testDataDirectory);
strcat(testFileName, fileName);
-
+
logln("Opening data file %s\n", fileName);
int len;
else if (tokenMatcher.start(4, status) >= 0) {
// Scanned to end of a line, possibly skipping over a comment in the process.
// If the line from the file contained test data, run the test now.
- if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
+ if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
}
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
UnicodeSet *fAnySet;
+ UnicodeSet *fEmojiModifierSet;
+ UnicodeSet *fEmojiBaseSet;
+ UnicodeSet *fZWJSet;
+ UnicodeSet *fGAZSet;
const UnicodeString *fText;
};
fText = NULL;
fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
- fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
- fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
+ fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
+ fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
fHangulSet->addAll(*fLVTSet);
fAnySet = new UnicodeSet(0, 0x10ffff);
+
+
+ fEmojiBaseSet = new UnicodeSet(UnicodeString(
+ "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
+
+ fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
+ fZWJSet = new UnicodeSet(0x200D, 0x200D);
+ fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
+
fSets = new UVector(status);
fSets->addElement(fCRLFSet, status);
fSets->addElement(fControlSet, status);
fSets->addElement(fSpacingSet, status);
fSets->addElement(fHangulSet, status);
fSets->addElement(fAnySet, status);
+ fSets->addElement(fEmojiBaseSet, status);
+ fSets->addElement(fEmojiModifierSet, status);
+ fSets->addElement(fZWJSet, status);
+ fSets->addElement(fGAZSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
int breakPos = -1;
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
-
+
if (U_FAILURE(deferredStatus)) {
return -1;
}
}
// Rule (GB8a) Regional_Indicator x Regional_Indicator
+ // Note: The first if condition is a little tricky. We only need to force
+ // a break if there are three or more contiguous RIs. If there are
+ // only two, a break following will occur via other rules, and will include
+ // any trailing extend characters, which is needed behavior.
+ if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
+ && fRegionalIndicatorSet->contains(c2)) {
+ break;
+ }
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
continue;
}
- // Rule (GB9) Numeric x ALetter
- if (fExtendSet->contains(c2)) {
+ // Rule (GB9) x Extend
+ if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
continue;
}
continue;
}
+ // Rule (GB9c) Emoji_Base x Emoji_Modifier
+ if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
+ continue;
+ }
+
+ // Rule (GB9d) ZWJ x Glue_After_Zwj
+ if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
+ continue;
+ }
+
// Rule (GB10) Any <break> Any
break;
}
delete fLVTSet;
delete fHangulSet;
delete fAnySet;
+ delete fEmojiBaseSet;
+ delete fEmojiModifierSet;
+ delete fZWJSet;
+ delete fGAZSet;
}
//------------------------------------------------------------------------------------------
UnicodeSet *fKatakanaSet;
UnicodeSet *fHebrew_LetterSet;
UnicodeSet *fALetterSet;
- // TODO(jungshik): Do we still need this change?
+ // TODO(jungshik): Do we still need this change?
// UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
UnicodeSet *fSingle_QuoteSet;
UnicodeSet *fDouble_QuoteSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
UnicodeSet *fDictionaryCjkSet;
+ UnicodeSet *fEBaseSet;
+ UnicodeSet *fEModifierSet;
+ UnicodeSet *fZWSSet;
+ UnicodeSet *fGAZSet;
const UnicodeString *fText;
};
fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
// Exclude Hangul syllables from ALetterSet during testing.
// Leave CJK dictionary characters out from the monkey tests!
-#if 0
+#if 0
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
"[\\p{Line_Break = Complex_Context}"
"-\\p{Grapheme_Cluster_Break = Extend}"
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
+ fEBaseSet = new UnicodeSet(UnicodeString(
+ "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
+
+ fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
+ fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
+ fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
+ fExtendSet->removeAll(*fZWSSet);
+
+
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {
deferredStatus = status;
fOtherSet->removeAll(*fFormatSet);
fOtherSet->removeAll(*fExtendSet);
fOtherSet->removeAll(*fRegionalIndicatorSet);
+ fOtherSet->removeAll(*fEBaseSet);
+ fOtherSet->removeAll(*fEModifierSet);
+ fOtherSet->removeAll(*fZWSSet);
+ fOtherSet->removeAll(*fGAZSet);
+
// Inhibit dictionary characters from being tested at all.
fOtherSet->removeAll(*fDictionaryCjkSet);
fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
fSets->addElement(fOtherSet, status);
fSets->addElement(fExtendNumLetSet, status);
+ fSets->addElement(fEBaseSet, status);
+ fSets->addElement(fEModifierSet, status);
+ fSets->addElement(fZWSSet, status);
+ fSets->addElement(fGAZSet, status);
+
if (U_FAILURE(status)) {
deferredStatus = status;
}
int breakPos = -1;
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
-
+
if (U_FAILURE(deferredStatus)) {
return -1;
}
break;
};
}
- while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
+ while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
if (p1 == p2) {
if (c1==0x0D && c2==0x0A) {
continue;
}
-
+
// Rule (3a) Break before and after newlines (including CR and LF)
//
if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
break;
};
+ // Rule (3c) ZWJ x GAZ (Glue after ZWJ).
+ // Not ignoring extend chars, so peek into input text to
+ // get the potential ZWJ, the character immediately preceding c2.
+ // Sloppy UChar32 indexing: p2-1 may reference trail half
+ // but char32At will get the full code point.
+ if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
+ continue;
+ }
+
// Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
}
// Rule 13c
+ if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
+ break;
+ }
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
continue;
}
+ // Rule 13d
+ if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
+ continue;
+ }
+
// Rule 14. Break found here.
break;
}
delete fRegionalIndicatorSet;
delete fDictionaryCjkSet;
delete fOtherSet;
+ delete fEBaseSet;
+ delete fEModifierSet;
+ delete fZWSSet;
+ delete fGAZSet;
}
UnicodeSet *fHL;
UnicodeSet *fID;
UnicodeSet *fRI;
- UnicodeSet *fSA;
UnicodeSet *fXX;
+ UnicodeSet *fEB;
+ UnicodeSet *fEM;
+ UnicodeSet *fZJ;
BreakIterator *fCharBI;
const UnicodeString *fText;
RegexMatcher *fNumberMatcher;
};
+RBBILineMonkey::RBBILineMonkey() :
+ RBBIMonkeyKind(),
+ fSets(NULL),
+
+ fCharBI(NULL),
+ fText(NULL),
+ fNumberMatcher(NULL)
-RBBILineMonkey::RBBILineMonkey()
{
+ if (U_FAILURE(deferredStatus)) {
+ return;
+ }
+
UErrorCode status = U_ZERO_ERROR;
fSets = new UVector(status);
fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
- fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
+ fEB = new UnicodeSet(UnicodeString(
+ "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
+ fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
+ fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
if (U_FAILURE(status)) {
deferredStatus = status;
- fCharBI = NULL;
- fNumberMatcher = NULL;
return;
}
fAL->addAll(*fXX); // Default behavior for XX is identical to AL
fAL->addAll(*fAI); // Default behavior for AI is identical to AL
- fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
+ fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
+ fID->addAll(*fEM);
+ fAL->removeAll(*fEM);
+
+
+ fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id
+ fID->add((UChar32)0x2764);
+
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
fSets->addElement(fLF, status);
fSets->addElement(fID, status);
fSets->addElement(fWJ, status);
fSets->addElement(fRI, status);
- fSets->addElement(fSA, status);
fSets->addElement(fSG, status);
+ fSets->addElement(fEB, status);
+ fSets->addElement(fEM, status);
+ fSets->addElement(fZJ, status);
- const char *rules =
+ const char *rules =
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
"((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
"\\p{Line_Break=NU}\\p{Line_Break=CM}*"
break;
}
+ // LB 8a ZJ x ID
+ // The monkey test's way of ignoring combining characters doesn't work
+ // for this rule. ZJ is also a CM. Need to get the actual character
+ // preceding "thisChar", not ignoring combining marks, possibly ZJ.
+ {
+ int32_t prevIdx = fText->moveIndex32(pos, -1);
+ UChar32 prevC = fText->char32At(prevIdx);
+ if (fZJ->contains(prevC) && fID->contains(thisChar)) {
+ continue;
+ }
+ }
+
// LB 9, 10 Already done, at top of loop.
//
if (fGL->contains(prevChar)) {
continue;
}
-
+
// LB 12a
// [^SP BA HY] x GL
if (!(fSP->contains(prevChar) ||
// LB 21a
// HL (HY | BA) x
- if (fHL->contains(prevCharX2) &&
+ if (fHL->contains(prevCharX2) &&
(fHY->contains(prevChar) || fBA->contains(prevChar))) {
continue;
}
continue;
}
- // LB30a Do not break between regional indicators.
- // RI x RI
+ // LB30a RI RI <break> RI
+ // RI x RI
+ if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
+ break;
+ }
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
continue;
}
+ // LB30b Emoji Base x Emoji Modifier
+ if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
+ continue;
+ }
+
// LB 31 Break everywhere else
break;
delete fHL;
delete fID;
delete fRI;
- delete fSA;
delete fSG;
- delete fXX;
+ delete fEB;
+ delete fEM;
+ delete fZJ;
delete fCharBI;
delete fNumberMatcher;
//
// type = char | word | line | sent | title
//
+// Example:
+// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
+//
//-------------------------------------------------------------------------------------------
static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
"\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
"\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
- "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
"\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
"\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
- "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
- "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
- "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
"\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
"\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
"\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
"\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
"\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
- "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
- "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
- "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
"\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
"\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
};
int32_t charIdx = m_rand() % classSet->size();
UChar32 c = classSet->charAt(charIdx);
if (c < 0) { // TODO: deal with sets containing strings.
- errln("c < 0");
+ errln("%s:%d c < 0", __FILE__, __LINE__);
break;
}
+ // Do not assemble a supplementary character from randomly generated separate surrogates.
+ // (It could be a dictionary character)
+ if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
+ continue;
+ }
+
testText.append(c);
}
} else {
if (breakPos >= 0) {
precedingBreaks[breakPos] = 1;
- }
+ }
lastBreakPos = breakPos;
}
}
errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
charErrorTxt[sizeof(charErrorTxt)-1] = 0;
const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
-
+
errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
errorType, seed, i, charErrorTxt);
// Text includes a mixture of Thai and Latin.
const unsigned char utf8Data[] = {
0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
- 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
+ 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
- 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
- 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
- 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
- 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
- 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
+ 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
+ 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
+ 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
+ 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
+ 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
UErrorCode status = U_ZERO_ERROR;
rstatus = brkiter->getRuleStatus();
(void)rstatus; // Suppress set but not used warning.
if (iterationCount >= 10) {
- break;
+ break;
}
}
TEST_ASSERT(iterationCount == 6);
rstatus = brkiterPOSIX->getRuleStatus();
(void)rstatus; // Suppress set but not used warning.
if (iterationCount >= 10) {
- break;
+ break;
}
}
TEST_ASSERT(iterationCount == 6);
# GraphemeBreakTest-8.0.0.txt
# Date: 2015-02-13, 13:47:15 GMT [MD]
+# Hand patched for Emoji breaking proposal L2/16-011R3.
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
# Default Grapheme Break Test
#
# Format:
-# <string> (# <comment>)?
-# <string> contains hex Unicode code points, with
-# ÷ wherever there is a break opportunity, and
+# <string> (# <comment>)?
+# <string> contains hex Unicode code points, with
+# ÷ wherever there is a break opportunity, and
# × wherever there is not.
# <comment> the format can change, but currently it shows:
# - the sample character name
÷ D800 ÷ 0308 ÷ D800 ÷ # ÷ [0.2] <surrogate-D800> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <surrogate-D800> (Control) ÷ [0.3]
÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (Other) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3]
-÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
-÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
+÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
+÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [5.0] ZERO WIDTH SPACE (Control) ÷ [4.0] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
-÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
+÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 200D ÷ 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] ARABIC LETTER NOON (Other) ÷ [0.3]
# LineBreakTest-8.0.0.txt
# Date: 2015-04-30, 09:40:15 GMT [MD]
+# Hand patched for Emoji break proposal L2/16-011R3
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
× 3057 × 3001 ÷ 0061 × 0062 ÷ 3068 ÷ # × [0.3] HIRAGANA LETTER SI (ID) × [13.02] IDEOGRAPHIC COMMA (CL) ÷ [999.0] LATIN SMALL LETTER A (AL) × [28.0] LATIN SMALL LETTER B (AL) ÷ [999.0] HIRAGANA LETTER TO (ID) ÷ [0.3]
× 0061 ÷ 1F1E6 ÷ 0062 ÷ # × [0.3] LATIN SMALL LETTER A (AL) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [999.0] LATIN SMALL LETTER B (AL) ÷ [0.3]
× 1F1F7 × 1F1FA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [0.3]
-× 1F1F7 × 1F1FA × 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3]
-× 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
+× 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3]
+× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
× 1F1F7 × 1F1FA × 200B ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [8.0] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
× 05D0 × 002D × 05D0 ÷ # × [0.3] HEBREW LETTER ALEF (HL) × [21.02] HYPHEN-MINUS (HY) × [21.1] HEBREW LETTER ALEF (HL) ÷ [0.3]
-× 1F1E6 × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
-× 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
-× 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
+× 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
+# Patched the following two lines for RI pairing. Note ZWJ behaves as CM and logically disappears.
+× 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
+× 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
× 0020 ÷ 200D × 0646 ÷ # × [0.3] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (CM) × [28.0] ARABIC LETTER NOON (AL) ÷ [0.3]
× 0646 × 200D × 0020 ÷ # × [0.3] ARABIC LETTER NOON (AL) × [9.0] ZERO WIDTH JOINER (CM) × [7.01] SPACE (SP) ÷ [0.3]
#
# WordBreakTest-8.0.0.txt
# Date: 2015-05-02, 14:48:55 GMT [MD]
+
+# Hand Patched for Emoji breaking proposal L2/16-011R3
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
÷ 2060 ÷ 0043 × 2060 × 002E × 2060 × 0044 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) ÷ [999.0] LATIN CAPITAL LETTER C (ALetter) × [4.0] WORD JOINER (Format_FE) × [6.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [7.0] LATIN CAPITAL LETTER D (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (ALetter) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3]
-÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
-÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
+÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
+÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [999.0] ZERO WIDTH SPACE (Other) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 05D0 × 0022 × 05D0 ÷ # ÷ [0.2] HEBREW LETTER ALEF (Hebrew_Letter) × [7.2] QUOTATION MARK (Double_Quote) × [7.3] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
-÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
-÷ 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
-÷ 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
+÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
+÷ 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
+÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] ARABIC LETTER NOON (ALetter) ÷ [0.3]
÷ 0646 × 200D ÷ 0020 ÷ # ÷ [0.2] ARABIC LETTER NOON (ALetter) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] SPACE (Other) ÷ [0.3]
÷ 0031 ÷ 003A ÷ 003A ÷ 0031 ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [999.0] COLON (MidLetter) ÷ [999.0] DIGIT ONE (Numeric) ÷ [0.3]
--- /dev/null
+#
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: grapheme.txt
+#
+# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
+#
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+
+type = grapheme; # one of grapheme | word | line | sentence
+locale = en;
+
+CR = [\u000d];
+LF = [\u000a];
+
+Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]];
+Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]];
+Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
+Prepend = [];
+SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
+E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+E_Modifier = [\U0001F3FB-\U0001F3FF];
+GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764];
+ZWJ = [\u200D];
+
+#
+# Korean Syllable Definitions
+#
+L = [\p{Grapheme_Cluster_Break = L}];
+V = [\p{Grapheme_Cluster_Break = V}];
+T = [\p{Grapheme_Cluster_Break = T}];
+
+LV = [\p{Grapheme_Cluster_Break = LV}];
+LVT = [\p{Grapheme_Cluster_Break = LVT}];
+
+GB3: CR LF;
+GB4: (Control | CR | LF) ÷;
+GB5: . ÷ (Control | CR | LF);
+
+GB6: L (L | V | LV | LVT);
+GB7: (LV | V) (V | T);
+GB8: (LVT | T) T;
+
+# Regional Indicators, split into pairs.
+# Note that a pair of RIs that is not followed by a third RI will fall into
+# the normal rules for Extend, etc.
+#
+GB8a.1: Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
+GB8a.2: Regional_Indicator Regional_Indicator;
+
+GB9: . Extend;
+
+GB9a: . SpacingMark;
+GB9b: Prepend .;
+GB9c: (E_Base | GAZ) E_Modifier;
+GB9d: ZWJ GAZ;
+
+GB10: . ÷;
--- /dev/null
+#
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: line.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+
+
+type = line;
+locale = en;
+
+
+AI = [:LineBreak = Ambiguous:];
+AL = [[:LineBreak = Alphabetic:]-[\u2764]];
+BA = [:LineBreak = Break_After:];
+BB = [:LineBreak = Break_Before:];
+BK = [:LineBreak = Mandatory_Break:];
+B2 = [:LineBreak = Break_Both:];
+CB = [:LineBreak = Contingent_Break:];
+CJ = [:LineBreak = Conditional_Japanese_Starter:];
+CL = [:LineBreak = Close_Punctuation:];
+CM = [:LineBreak = Combining_Mark:];
+CP = [:LineBreak = Close_Parenthesis:];
+CR = [:LineBreak = Carriage_Return:];
+
+EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+EM = [\U0001F3FB-\U0001F3FF];
+
+EX = [:LineBreak = Exclamation:];
+GL = [:LineBreak = Glue:];
+HL = [:LineBreak = Hebrew_Letter:];
+HY = [:LineBreak = Hyphen:];
+H2 = [:LineBreak = H2:];
+H3 = [:LineBreak = H3:];
+ID = [[:LineBreak = Ideographic:][\u2764]];
+IN = [:LineBreak = Inseperable:];
+IS = [:LineBreak = Infix_Numeric:];
+JL = [:LineBreak = JL:];
+JV = [:LineBreak = JV:];
+JT = [:LineBreak = JT:];
+LF = [:LineBreak = Line_Feed:];
+NL = [:LineBreak = Next_Line:];
+NS = [[:LineBreak = Nonstarter:] CJ];
+NU = [:LineBreak = Numeric:];
+OP = [:LineBreak = Open_Punctuation:];
+PO = [:LineBreak = Postfix_Numeric:];
+PR = [:LineBreak = Prefix_Numeric:];
+QU = [:LineBreak = Quotation:];
+RI = [:LineBreak = Regional_Indicator:];
+SA = [:LineBreak = Complex_Context:];
+SG = [:LineBreak = Surrogate:];
+SP = [:LineBreak = Space:];
+SY = [:LineBreak = Break_Symbols:];
+WJ = [:LineBreak = Word_Joiner:];
+XX = [:LineBreak = Unknown:];
+ZW = [:LineBreak = ZWSpace:];
+ZJ = [\u200D];
+
+# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
+ID = [ID - EB];
+AL = [AL - EM];
+
+dictionary = [:LineBreak = Complex_Context:];
+
+# Redefine AL. LB1. TODO: refine according to latest UAX.
+AL = [ AL AI SA SG XX ];
+
+LB4: BK ÷;
+LB5: CR LF;
+LB5.1: CR ÷;
+LB5.2: LF ÷;
+LB5.3: NL ÷;
+
+LB6: . (BK | CR | LF | NL);
+LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence whose prefix would also match LB7.
+# For example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+# "OP CM SP", matches LB7.1.
+LB14: OP CM* SP* .;
+LB15: QU CM* SP* OP;
+LB16: (CL | CP)CM* SP* NS;
+LB17: B2 CM* SP* B2;
+
+LB7.1: [^ZW SP] CM* [SP ZW];
+LB7.2: [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+# ICU: ZW ÷;
+# UAX 14: ZW SP* ÷;
+LB8: ZW ÷;
+
+# LB8a, from Emoji proposal L2/16-011R3
+# ZWJ x ID
+LB8a: ZJ (ID | EB | EM);
+
+
+# LB9: X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11: × WJ;
+# WJ ×
+
+LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2: SP WJ;
+LB11.3: WJ CM* [^CM];
+
+LB12: GL CM* [^CM];
+
+LB12a: [^SP BA HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
+# LB13.2 SP CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18: SP ÷;
+
+LB19: . CM* QU;
+LB19.1: QU CM* [^CM];
+
+# LB 20 Break before and after CB.
+# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
+# ZJ acts like a CM to the left, combining with CB.
+# ZJ acts independently to the right, no break from ID by LB8a.
+LB20: . CM* ÷ CB;
+LB20.1a: CB CM* ZJ (ID | EB | EM);
+LB20.1b: CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+# not picking up the continuing match after the BA from 21a.
+LB21a: HL CM* (HY | BA) CM* [^CM CB];
+
+LB21.1: . CM* [BA HY NS];
+LB21.2: BB CM* [^CM CB];
+
+LB21b: SY CM* HL;
+
+LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
+LB22.2: EX CM* IN;
+LB22.3: (ID | EB | EM) CM* IN;
+LB22.4: IN CM* IN;
+LB22.5: NU CM* IN;
+
+LB23.1: (ID | EB | EM) CM* PO;
+LB23.2: (AL | HL | CM) CM* NU;
+LB23.3: NU CM* (AL | HL);
+
+LB24.1: PR CM* (ID | EB | EM);
+LB24.2: PR CM* (AL | HL);
+LB24.3: PO CM* (AL | HL);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1: JL CM* (JL | JV | H2 | H3);
+LB26.2: (JV | H2) CM* (JV | JT);
+LB26.3: (JT | H3) CM* JT;
+
+LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3: PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+# Unattached (leading) CM treated as AL.
+LB28: (AL | HL | CM)CM* (AL | HL);
+
+LB29: IS CM* (AL | HL);
+
+# LB30 is adjusted for unattached leading CM being treated as AL.
+LB30.1: (AL | CM | HL | NU) CM* OP;
+LB30.2: CP CM* (AL | HL | NU);
+
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
+LB30a.3: RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b: EB CM* EM;
+
+# LB31 Break Everywhere Else.
+# Include combining marks
+LB31.1: . CM* ZJ (ID | EB | EM);
+LB31.2: . CM* ÷;
--- /dev/null
+#
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+#
+# file: line_loose.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
+# Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+# In addition, it allows breaks:
+# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+# * between characters of LineBreak class IN
+
+type = line;
+locale = en@lb=loose;
+
+
+AI = [:LineBreak = Ambiguous:];
+AL = [[:LineBreak = Alphabetic:]-[\u2764]];
+BA = [:LineBreak = Break_After:];
+BB = [:LineBreak = Break_Before:];
+BK = [:LineBreak = Mandatory_Break:];
+B2 = [:LineBreak = Break_Both:];
+CB = [:LineBreak = Contingent_Break:];
+CJ = [:LineBreak = Conditional_Japanese_Starter:];
+CL = [:LineBreak = Close_Punctuation:];
+CM = [:LineBreak = Combining_Mark:];
+CP = [:LineBreak = Close_Parenthesis:];
+CR = [:LineBreak = Carriage_Return:];
+
+EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+EM = [\U0001F3FB-\U0001F3FF];
+
+EX = [:LineBreak = Exclamation:];
+GL = [:LineBreak = Glue:];
+HL = [:LineBreak = Hebrew_Letter:];
+HY = [:LineBreak = Hyphen:];
+H2 = [:LineBreak = H2:];
+H3 = [:LineBreak = H3:];
+ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
+IN = [:LineBreak = Inseperable:];
+IS = [:LineBreak = Infix_Numeric:];
+JL = [:LineBreak = JL:];
+JV = [:LineBreak = JV:];
+JT = [:LineBreak = JT:];
+LF = [:LineBreak = Line_Feed:];
+NL = [:LineBreak = Next_Line:];
+NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
+NS = [[:LineBreak = Nonstarter:] - NSX];
+NU = [:LineBreak = Numeric:];
+OP = [:LineBreak = Open_Punctuation:];
+PO = [:LineBreak = Postfix_Numeric:];
+PR = [:LineBreak = Prefix_Numeric:];
+QU = [:LineBreak = Quotation:];
+RI = [:LineBreak = Regional_Indicator:];
+SA = [:LineBreak = Complex_Context:];
+SG = [:LineBreak = Surrogate:];
+SP = [:LineBreak = Space:];
+SY = [:LineBreak = Break_Symbols:];
+WJ = [:LineBreak = Word_Joiner:];
+XX = [:LineBreak = Unknown:];
+ZW = [:LineBreak = ZWSpace:];
+ZJ = [\u200D];
+
+# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
+ID = [ID - EB];
+AL = [AL - EM];
+
+dictionary = [:LineBreak = Complex_Context:];
+
+# Redefine AL. LB1. TODO: refine according to latest UAX.
+AL = [ AL AI SA SG XX ];
+
+LB4: BK ÷;
+LB5: CR LF;
+LB5.1: CR ÷;
+LB5.2: LF ÷;
+LB5.3: NL ÷;
+
+LB6: . (BK | CR | LF | NL);
+LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence whose prefix would also match LB7.
+# For example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+# "OP CM SP", matches LB7.1.
+LB14: OP CM* SP* .;
+LB15: QU CM* SP* OP;
+LB16: (CL | CP)CM* SP* NS;
+LB17: B2 CM* SP* B2;
+
+LB7.1: [^ZW SP] CM* [SP ZW];
+LB7.2: [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+# ICU: ZW ÷;
+# UAX 14: ZW SP* ÷;
+LB8: ZW ÷;
+
+# LB8a, from Emoji proposal L2/16-011R3
+# ZWJ x ID
+LB8a: ZJ (ID | EB | EM);
+
+
+# LB9: X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11: × WJ;
+# WJ ×
+
+LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2: SP WJ;
+LB11.3: WJ CM* [^CM];
+
+LB12: GL CM* [^CM];
+
+LB12a: [^SP BA HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
+# LB13.2 SP CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18: SP ÷;
+
+LB19: . CM* QU;
+LB19.1: QU CM* [^CM];
+
+# LB 20 Break before and after CB.
+# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
+# ZJ acts like a CM to the left, combining with CB.
+# ZJ acts independently to the right, no break from ID by LB8a.
+LB20: . CM* ÷ CB;
+LB20.1a: CB CM* ZJ (ID | EB | EM);
+LB20.1b: CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+# not picking up the continuing match after the BA from 21a.
+LB21a: HL CM* (HY | BA) CM* [^CM CB];
+
+LB21.1: . CM* [BA HY NS];
+LB21.2: BB CM* [^CM CB];
+
+LB21b: SY CM* HL;
+
+LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
+LB22.2: EX CM* IN;
+LB22.3: (ID | EB | EM) CM* IN;
+# LB22.4: IN CM* IN; # delete this rule for CSS loose.
+LB22.5: NU CM* IN;
+
+LB23.1: (ID | EB | EM) CM* PO;
+LB23.2: (AL | HL | CM) CM* NU;
+LB23.3: NU CM* (AL | HL);
+
+LB24.1: PR CM* (ID | EB | EM);
+LB24.2: PR CM* (AL | HL);
+LB24.3: PO CM* (AL | HL);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1: JL CM* (JL | JV | H2 | H3);
+LB26.2: (JV | H2) CM* (JV | JT);
+LB26.3: (JT | H3) CM* JT;
+
+LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3: PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+# Unattached (leading) CM treated as AL.
+LB28: (AL | HL | CM)CM* (AL | HL);
+
+LB29: IS CM* (AL | HL);
+
+# LB30 is adjusted for unattached leading CM being treated as AL.
+LB30.1: (AL | CM | HL | NU) CM* OP;
+LB30.2: CP CM* (AL | HL | NU);
+
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
+LB30a.3: RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b: EB CM* EM;
+
+# LB31 Break Everywhere Else.
+# Include combining marks
+LB31.1: . CM* ZJ (ID | EB | EM);
+LB31.2: . CM* ÷;
--- /dev/null
+#
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+#
+# file: line_loose_cj.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in the 2nd paragraph below.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+# In addition, it allows breaks:
+# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+# * between characters of LineBreak class IN such as 2026
+# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
+# FF65 (all NS) and FF01, FF1F (both EX).
+# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
+# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
+# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
+# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
+
+
+type = line;
+locale = ja@lb=loose;
+
+
+AI = [:LineBreak = Ambiguous:];
+AL = [[:LineBreak = Alphabetic:]-[\u2764]];
+BAX = [\u2010 \u2013];
+BA = [[:LineBreak = Break_After:] - BAX];
+BB = [:LineBreak = Break_Before:];
+BK = [:LineBreak = Mandatory_Break:];
+B2 = [:LineBreak = Break_Both:];
+CB = [:LineBreak = Contingent_Break:];
+CJ = [:LineBreak = Conditional_Japanese_Starter:];
+CL = [:LineBreak = Close_Punctuation:];
+CM = [:LineBreak = Combining_Mark:];
+CP = [:LineBreak = Close_Parenthesis:];
+CR = [:LineBreak = Carriage_Return:];
+
+EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+EM = [\U0001F3FB-\U0001F3FF];
+
+EXX = [\uFF01 \uFF1F];
+EX = [[:LineBreak = Exclamation:] - EXX];
+GL = [:LineBreak = Glue:];
+HL = [:LineBreak = Hebrew_Letter:];
+HY = [:LineBreak = Hyphen:];
+H2 = [:LineBreak = H2:];
+H3 = [:LineBreak = H3:];
+ID = [[:LineBreak = Ideographic:][\u2764]CJ];
+IN = [:LineBreak = Inseperable:];
+IS = [:LineBreak = Infix_Numeric:];
+JL = [:LineBreak = JL:];
+JV = [:LineBreak = JV:];
+JT = [:LineBreak = JT:];
+LF = [:LineBreak = Line_Feed:];
+NL = [:LineBreak = Next_Line:];
+NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
+NS = [[:LineBreak = Nonstarter:] - NSX];
+NU = [:LineBreak = Numeric:];
+OP = [:LineBreak = Open_Punctuation:];
+POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
+PO = [[:LineBreak = Postfix_Numeric:] - POX];
+PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
+PR = [[:LineBreak = Prefix_Numeric:] - PRX];
+QU = [:LineBreak = Quotation:];
+RI = [:LineBreak = Regional_Indicator:];
+SA = [:LineBreak = Complex_Context:];
+SG = [:LineBreak = Surrogate:];
+SP = [:LineBreak = Space:];
+SY = [:LineBreak = Break_Symbols:];
+WJ = [:LineBreak = Word_Joiner:];
+XX = [:LineBreak = Unknown:];
+ZW = [:LineBreak = ZWSpace:];
+ZJ = [\u200D];
+
+# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
+ID = [ID - EB];
+AL = [AL - EM];
+
+dictionary = [:LineBreak = Complex_Context:];
+
+# Redefine AL. LB1. TODO: refine according to latest UAX.
+AL = [ AL AI SA SG XX ];
+
+LB4: BK ÷;
+LB5: CR LF;
+LB5.1: CR ÷;
+LB5.2: LF ÷;
+LB5.3: NL ÷;
+
+LB6: . (BK | CR | LF | NL);
+LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence whose prefix would also match LB7.
+# For example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+# "OP CM SP", matches LB7.1.
+LB14: OP CM* SP* .;
+LB15: QU CM* SP* OP;
+LB16: (CL | CP)CM* SP* NS;
+LB17: B2 CM* SP* B2;
+
+LB7.1: [^ZW SP] CM* [SP ZW];
+LB7.2: [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+# ICU: ZW ÷;
+# UAX 14: ZW SP* ÷;
+LB8: ZW ÷;
+
+# LB8a, from Emoji proposal L2/16-011R3
+# ZWJ x ID
+LB8a: ZJ (ID | EB | EM);
+
+
+# LB9: X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11: × WJ;
+# WJ ×
+
+LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2: SP WJ;
+LB11.3: WJ CM* [^CM];
+
+LB12: GL CM* [^CM];
+
+LB12a: [^SP BA BAX HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
+# LB13.2 SP CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18: SP ÷;
+
+LB19: . CM* QU;
+LB19.1: QU CM* [^CM];
+
+# LB 20 Break before and after CB.
+# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
+# ZJ acts like a CM to the left, combining with CB.
+# ZJ acts independently to the right, no break from ID by LB8a.
+LB20: . CM* ÷ CB;
+LB20.1a: CB CM* ZJ (ID | EB | EM);
+LB20.1b: CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+# not picking up the continuing match after the BA from 21a.
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+
+LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+
+LB21.1: . CM* [BA HY NS];
+LB21.2: BB CM* [^CM CB];
+
+LB21b: SY CM* HL;
+
+LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
+LB22.2: EX CM* IN;
+LB22.3: (ID | EB | EM) CM* IN;
+# LB22.4: IN CM* IN; # delete this rule for CSS loose.
+LB22.5: NU CM* IN;
+
+LB23.1: (ID | EB | EM) CM* PO;
+LB23.2: (AL | HL | CM) CM* NU;
+LB23.3: NU CM* (AL | HL);
+
+LB24.1: PR CM* (ID | EB | EM);
+LB24.2: PR CM* (AL | HL);
+LB24.3: (PO | POX) CM* (AL | HL);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+# Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end.
+LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?;
+
+LB26.1: JL CM* (JL | JV | H2 | H3);
+LB26.2: (JV | H2) CM* (JV | JT);
+LB26.3: (JT | H3) CM* JT;
+
+LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3: PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+# Unattached (leading) CM treated as AL.
+LB28: (AL | HL | CM)CM* (AL | HL);
+
+LB29: IS CM* (AL | HL);
+
+# LB30 is adjusted for unattached leading CM being treated as AL.
+LB30.1: (AL | CM | HL | NU) CM* OP;
+LB30.2: CP CM* (AL | HL | NU);
+
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
+LB30a.3: RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b: EB CM* EM;
+
+# LB31 Break Everywhere Else.
+# Include combining marks
+LB31.1: . CM* ZJ (ID | EB | EM);
+LB31.2: . CM* ÷;
--- /dev/null
+#
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+#
+# file: line_normal.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in 2nd paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
+# Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+
+
+type = line;
+locale = en@lb=normal;
+
+AI = [:LineBreak = Ambiguous:];
+AL = [[:LineBreak = Alphabetic:]-[\u2764]];
+BA = [:LineBreak = Break_After:];
+BB = [:LineBreak = Break_Before:];
+BK = [:LineBreak = Mandatory_Break:];
+B2 = [:LineBreak = Break_Both:];
+CB = [:LineBreak = Contingent_Break:];
+CJ = [:LineBreak = Conditional_Japanese_Starter:];
+CL = [:LineBreak = Close_Punctuation:];
+CM = [:LineBreak = Combining_Mark:];
+CP = [:LineBreak = Close_Parenthesis:];
+CR = [:LineBreak = Carriage_Return:];
+
+EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+EM = [\U0001F3FB-\U0001F3FF];
+
+EX = [:LineBreak = Exclamation:];
+GL = [:LineBreak = Glue:];
+HL = [:LineBreak = Hebrew_Letter:];
+HY = [:LineBreak = Hyphen:];
+H2 = [:LineBreak = H2:];
+H3 = [:LineBreak = H3:];
+ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
+IN = [:LineBreak = Inseperable:];
+IS = [:LineBreak = Infix_Numeric:];
+JL = [:LineBreak = JL:];
+JV = [:LineBreak = JV:];
+JT = [:LineBreak = JT:];
+LF = [:LineBreak = Line_Feed:];
+NL = [:LineBreak = Next_Line:];
+NS = [:LineBreak = Nonstarter:];
+NU = [:LineBreak = Numeric:];
+OP = [:LineBreak = Open_Punctuation:];
+PO = [:LineBreak = Postfix_Numeric:];
+PR = [:LineBreak = Prefix_Numeric:];
+QU = [:LineBreak = Quotation:];
+RI = [:LineBreak = Regional_Indicator:];
+SA = [:LineBreak = Complex_Context:];
+SG = [:LineBreak = Surrogate:];
+SP = [:LineBreak = Space:];
+SY = [:LineBreak = Break_Symbols:];
+WJ = [:LineBreak = Word_Joiner:];
+XX = [:LineBreak = Unknown:];
+ZW = [:LineBreak = ZWSpace:];
+ZJ = [\u200D];
+
+# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
+ID = [ID - EB];
+AL = [AL - EM];
+
+dictionary = [:LineBreak = Complex_Context:];
+
+# Redefine AL. LB1. TODO: refine according to latest UAX.
+AL = [ AL AI SA SG XX ];
+
+LB4: BK ÷;
+LB5: CR LF;
+LB5.1: CR ÷;
+LB5.2: LF ÷;
+LB5.3: NL ÷;
+
+LB6: . (BK | CR | LF | NL);
+LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence that would also match LB7,
+# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+# "OP CM SP", matches LB7.1
+LB14: OP CM* SP* .;
+LB15: QU CM* SP* OP;
+LB16: (CL | CP)CM* SP* NS;
+LB17: B2 CM* SP* B2;
+
+LB7.1: [^ZW SP] CM* [SP ZW];
+LB7.2: [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+# ICU: ZW ÷;
+# UAX 14: ZW SP* ÷;
+LB8: ZW ÷;
+
+# LB8a, from Emoji proposal L2/16-011R3
+# ZWJ x ID
+LB8a: ZJ (ID | EB | EM);
+
+
+# LB9: X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11: × WJ;
+# WJ ×
+
+LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2: SP WJ;
+LB11.3: WJ CM* [^CM];
+
+LB12: GL CM* [^CM];
+
+LB12a: [^SP BA HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
+# LB13.2 SP CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18: SP ÷;
+
+LB19: . CM* QU;
+LB19.1: QU CM* [^CM];
+
+# LB 20 Break before and after CB.
+# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
+# ZJ acts like a CM to the left, combining with CB.
+# ZJ acts independently to the right, no break from ID by LB8a.
+LB20: . CM* ÷ CB;
+LB20.1a: CB CM* ZJ (ID | EB | EM);
+LB20.1b: CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+# not picking up the continuing match after the BA from 21a.
+LB21a: HL CM* (HY | BA) CM* [^CM CB];
+
+LB21.1: . CM* [BA HY NS];
+LB21.2: BB CM* [^CM CB];
+
+LB21b: SY CM* HL;
+
+LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
+LB22.2: EX CM* IN;
+LB22.3: (ID | EB | EM) CM* IN;
+LB22.4: IN CM* IN;
+LB22.5: NU CM* IN;
+
+LB23.1: (ID | EB | EM) CM* PO;
+LB23.2: (AL | HL | CM) CM* NU;
+LB23.3: NU CM* (AL | HL);
+
+LB24.1: PR CM* (ID | EB | EM);
+LB24.2: PR CM* (AL | HL);
+LB24.3: PO CM* (AL | HL);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1: JL CM* (JL | JV | H2 | H3);
+LB26.2: (JV | H2) CM* (JV | JT);
+LB26.3: (JT | H3) CM* JT;
+
+LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3: PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+# Unattached (leading) CM treated as AL.
+LB28: (AL | HL | CM)CM* (AL | HL);
+
+LB29: IS CM* (AL | HL);
+
+# LB30 is adjusted for unattached leading CM being treated as AL.
+LB30.1: (AL | CM | HL | NU) CM* OP;
+LB30.2: CP CM* (AL | HL | NU);
+
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
+LB30a.3: RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b: EB CM* EM;
+
+# LB31 Break Everywhere Else.
+# Include combining marks
+LB31.1: . CM* ZJ (ID | EB | EM);
+LB31.2: . CM* ÷;
--- /dev/null
+# Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
+#
+# file: line_normal_cj.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in 2nd paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+# In addition, it allows breaks:
+# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+
+type = line;
+locale = ja@lb=normal;
+
+AI = [:LineBreak = Ambiguous:];
+AL = [[:LineBreak = Alphabetic:]-[\u2764]];
+BAX = [\u2010 \u2013];
+BA = [[:LineBreak = Break_After:] - BAX];
+BB = [:LineBreak = Break_Before:];
+BK = [:LineBreak = Mandatory_Break:];
+B2 = [:LineBreak = Break_Both:];
+CB = [:LineBreak = Contingent_Break:];
+CJ = [:LineBreak = Conditional_Japanese_Starter:];
+CL = [:LineBreak = Close_Punctuation:];
+CM = [:LineBreak = Combining_Mark:];
+CP = [:LineBreak = Close_Parenthesis:];
+CR = [:LineBreak = Carriage_Return:];
+
+EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+EM = [\U0001F3FB-\U0001F3FF];
+
+EX = [:LineBreak = Exclamation:];
+GL = [:LineBreak = Glue:];
+HL = [:LineBreak = Hebrew_Letter:];
+HY = [:LineBreak = Hyphen:];
+H2 = [:LineBreak = H2:];
+H3 = [:LineBreak = H3:];
+ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
+IN = [:LineBreak = Inseperable:];
+IS = [:LineBreak = Infix_Numeric:];
+JL = [:LineBreak = JL:];
+JV = [:LineBreak = JV:];
+JT = [:LineBreak = JT:];
+LF = [:LineBreak = Line_Feed:];
+NL = [:LineBreak = Next_Line:];
+NSX = [\u301C \u30A0];
+NS = [[:LineBreak = Nonstarter:] - NSX];
+NU = [:LineBreak = Numeric:];
+OP = [:LineBreak = Open_Punctuation:];
+PO = [:LineBreak = Postfix_Numeric:];
+PR = [:LineBreak = Prefix_Numeric:];
+QU = [:LineBreak = Quotation:];
+RI = [:LineBreak = Regional_Indicator:];
+SA = [:LineBreak = Complex_Context:];
+SG = [:LineBreak = Surrogate:];
+SP = [:LineBreak = Space:];
+SY = [:LineBreak = Break_Symbols:];
+WJ = [:LineBreak = Word_Joiner:];
+XX = [:LineBreak = Unknown:];
+ZW = [:LineBreak = ZWSpace:];
+ZJ = [\u200D];
+
+# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
+ID = [ID - EB];
+AL = [AL - EM];
+
+dictionary = [:LineBreak = Complex_Context:];
+
+# Redefine AL. LB1. TODO: refine according to latest UAX.
+AL = [ AL AI SA SG XX ];
+
+LB4: BK ÷;
+LB5: CR LF;
+LB5.1: CR ÷;
+LB5.2: LF ÷;
+LB5.3: NL ÷;
+
+LB6: . (BK | CR | LF | NL);
+LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence that would also match LB7,
+# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+# "OP CM SP", matches LB7.1
+LB14: OP CM* SP* .;
+LB15: QU CM* SP* OP;
+
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+LB16: (CL | CP)CM* SP* NS;
+LB17: B2 CM* SP* B2;
+
+LB7.1: [^ZW SP] CM* [SP ZW];
+LB7.2: [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+# ICU: ZW ÷;
+# UAX 14: ZW SP* ÷;
+LB8: ZW ÷;
+
+# LB8a, from Emoji proposal L2/16-011R3
+# ZWJ x ID
+LB8a: ZJ (ID | EB | EM);
+
+
+# LB9: X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11: × WJ;
+# WJ ×
+
+LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2: SP WJ;
+LB11.3: WJ CM* [^CM];
+
+LB12: GL CM* [^CM];
+
+LB12a: [^SP BA BAX HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
+# LB13.2 SP CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18: SP ÷;
+
+LB19: . CM* QU;
+LB19.1: QU CM* [^CM];
+
+# LB 20 Break before and after CB.
+# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
+# ZJ acts like a CM to the left, combining with CB.
+# ZJ acts independently to the right, no break from ID by LB8a.
+LB20: . CM* ÷ CB;
+LB20.1a: CB CM* ZJ (ID | EB | EM);
+LB20.1b: CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+# not picking up the continuing match after the BA from 21a.
+# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
+# should "HL BAX" not break when followed by a CB? That's what the current
+# rules do, which is why "[^CM CB]?" includes the ?.
+LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+
+# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+LB21.1: . CM* [BA HY NS];
+LB21.2: BB CM* [^CM CB];
+
+LB21b: SY CM* HL;
+
+LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
+LB22.2: EX CM* IN;
+LB22.3: (ID | EB | EM) CM* IN;
+LB22.4: IN CM* IN;
+LB22.5: NU CM* IN;
+
+LB23.1: (ID | EB | EM) CM* PO;
+LB23.2: (AL | HL | CM) CM* NU;
+LB23.3: NU CM* (AL | HL);
+
+LB24.1: PR CM* (ID | EB | EM);
+LB24.2: PR CM* (AL | HL);
+LB24.3: PO CM* (AL | HL);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1: JL CM* (JL | JV | H2 | H3);
+LB26.2: (JV | H2) CM* (JV | JT);
+LB26.3: (JT | H3) CM* JT;
+
+LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3: PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+# Unattached (leading) CM treated as AL.
+LB28: (AL | HL | CM)CM* (AL | HL);
+
+LB29: IS CM* (AL | HL);
+
+# LB30 is adjusted for unattached leading CM being treated as AL.
+LB30.1: (AL | CM | HL | NU) CM* OP;
+LB30.2: CP CM* (AL | HL | NU);
+
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
+LB30a.3: RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b: EB CM* EM;
+
+# LB31 Break Everywhere Else.
+# Include combining marks
+LB31.1: . CM* ZJ (ID | EB | EM);
+LB31.2: . CM* ÷;
--- /dev/null
+file: testdata/break_rules/readme.txt
+Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved.
+
+This directory contains the break iterator reference rule files used by intltest rbbi/RBBIMonkeyTest/testMonkey.
+The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpreted
+to provide an expected set of boundary positions to compare with the results from ICU break iteration.
+
+Each set of reference break rules lives in a separate file.
+The list of rule files to run by default is hardcoded into the test code, in rbbimonkeytest.cpp.
+
+Each test file includes
+ - The type of ICU break iterator to create (word, line, sentence, etc.)
+ - The locale to use
+ - Character Class definitions
+ - Rule definitions
+
+To Do
+ - Syntax for tailoring.
+
+
+Character Class Definition:
+ name = set_regular_expression;
+
+Rule Definition:
+ rule_regular_expression;
+
+name:
+ [A-Za-z_][A-Za-z0-9_]*
+
+set_regular_expression:
+ The intersection of an ICU regular expression [set] expression and a UnicodeSet pattern.
+ (They are mostly the same)
+ May include previously defined set names, which are logically expanded in-place.
+
+rule_regular_expression:
+ An ICU Regular Expression.
+ May include set names, which are logically expanded in-place.
+ May include a '÷', which defines a boundary position.
+
+Application of the rules:
+ Matching begins at the start of text, or after a previously identified boundary.
+ The pseudo-code below finds the next boundary.
+
+ while position < end of text
+ for each rule
+ if the text at position matches this rule
+ if the rule has a '÷'
+ Boundary is found.
+ return the position of the '÷' within the match.
+ else
+ position = last character of the rule match.
+ break from the rule loop, continue the outer loop.
+
+ This differs from the Unicode UAX algorithm in that each position in the text is
+ not tested separately. Instead, when a rule match is found, rule application restarts with the last
+ character of the preceding rule match. ICU's break rules also operate this way.
+
+ Expressing rules this way simplifies UAX rules that have leading or trailing context; it
+ is no longer necessary to write expressions that match the context starting from
+ any position within it.
+
+ This rule form differs from ICU rules in that the rules are applied sequentially, as they
+ are with the Unicode UAX rules. With the main ICU break rules, all are applied in parallel.
+
+Word Dictionaries
+ The monkey test does not test dictionary based breaking. The set named 'dictionary' is special,
+ as it is in the main ICU rules. For the monkey test, no characters from the dictionary set are
+ included in the randomly-generated test data.
+
--- /dev/null
+type = sentence; # one of grapheme | word | line | sentence
+locale = en;
+
+CR = [\p{Sentence_Break = CR}];
+LF = [\p{Sentence_Break = LF}];
+Extend = [\p{Sentence_Break = Extend}];
+Sep = [\p{Sentence_Break = Sep}];
+Format = [\p{Sentence_Break = Format}];
+Sp = [\p{Sentence_Break = Sp}];
+Lower = [\p{Sentence_Break = Lower}];
+Upper = [\p{Sentence_Break = Upper}];
+OLetter = [\p{Sentence_Break = OLetter}];
+Numeric = [\p{Sentence_Break = Numeric}];
+ATerm = [\p{Sentence_Break = ATerm}];
+SContinue = [\p{Sentence_Break = SContinue}];
+STerm = [\p{Sentence_Break = STerm}];
+Close = [\p{Sentence_Break = Close}];
+
+ParaSep = [Sep CR LF];
+SATerm = [STerm ATerm];
+ExtFmt = [Extend Format];
+
+# SB2: ÷ eot
+# Conventional regular expression matching for '$' as end-of-text also matches
+# at a line separator just preceding the physical end of text.
+# Instead, use a look-ahead assertion that there is no following character.
+SB2: . ÷ (?!.);
+
+SB3: CR LF;
+SB4: ParaSep ÷;
+
+# SB5: ignore Format and Extend characters.
+
+SB6: ATerm ExtFmt* Numeric;
+SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
+SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
+SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
+
+SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
+ # Also covers SB10, SB11.
+
+SB12: . ExtFmt* [^ExtFmt]?;
+
--- /dev/null
+#
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: word.txt
+#
+# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+
+
+type = word; # one of grapheme | word | line | sentence
+locale = en;
+
+E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+E_Modifier = [\U0001F3FB-\U0001F3FF];
+ZWJ = [\u200D];
+GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
+
+CR = [\p{Word_Break = CR}];
+LF = [\p{Word_Break = LF}];
+Newline = [\p{Word_Break = Newline}];
+Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ];
+Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+Format = [[\p{Word_Break = Format}]-[:Block=Tags:]];
+Katakana = [\p{Word_Break = Katakana}];
+Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
+ALetter = [\p{Word_Break = ALetter}];
+Single_Quote = [\p{Word_Break = Single_Quote}];
+Double_Quote = [\p{Word_Break = Double_Quote}];
+MidNumLet = [\p{Word_Break = MidNumLet}];
+MidLetter = [\p{Word_Break = MidLetter}];
+MidNum = [\p{Word_Break = MidNum}];
+Numeric = [\p{Word_Break = Numeric}];
+ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+# Define dictionary, with the effect being that those characters don't appear in test data.
+
+Han = [:Han:];
+Hiragana = [:Hiragana:];
+
+Control = [\p{Grapheme_Cluster_Break = Control}];
+HangulSyllable = [\uac00-\ud7a3];
+ComplexContext = [:LineBreak = Complex_Context:];
+KanaKanji = [Han Hiragana Katakana];
+dictionaryCJK = [KanaKanji HangulSyllable];
+dictionary = [ComplexContext dictionaryCJK];
+
+# leave CJK scripts out of ALetterPlus
+# Tricky. Redefine a set.
+# For tailorings, if it modifies itself, do at end of sets ????
+# Tweak redefine to mean replace existing definition at its original location.
+# Insert defs without redefine just after last pre-existing def of that name.
+# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
+
+ALetter = [ALetter - dictionary];
+
+AHLetter = [ALetter Hebrew_Letter];
+MidNumLetQ = [MidNumLet Single_Quote];
+ExtFmt = [Extend Format ZWJ];
+
+WB3: CR LF;
+WB3a: (Newline | CR | LF) ÷;
+WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
+ # (but needed with UAX treat-as scheme.)
+WB3c: ZWJ GAZ;
+
+WB5: AHLetter ExtFmt* AHLetter;
+
+# includes both WB6 and WB7
+WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
+
+WB7a: Hebrew_Letter ExtFmt* Single_Quote;
+WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
+
+WB8: Numeric ExtFmt* Numeric;
+WB9: AHLetter ExtFmt* Numeric;
+WB10: Numeric ExtFmt* AHLetter;
+
+WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
+WB13: Katakana ExtFmt* Katakana;
+
+WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
+WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
+
+# WB rule 13c, pairs of Regional Indicators stay unbroken.
+# Interacts with WB3c.
+WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ;
+WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
+
+WB13d: (E_Base | GAZ) ExtFmt* E_Modifier;
+
+# Rule WB 14 Any ÷ Any
+# Interacts with WB3c, do not break between ZWJ and GAZ.
+WB14.1: . ExtFmt* ZWJ GAZ;
+WB14.2: . ExtFmt* ÷;
+
--- /dev/null
+#
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: word_POSIX.txt
+#
+# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+# They are expected to change with review and the addition of support for rule tailoring.
+
+type = word; # one of grapheme | word | line | sentence
+locale = en_US_POSIX;
+
+E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
+E_Modifier = [\U0001F3FB-\U0001F3FF];
+ZWJ = [\u200D];
+GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
+
+CR = [\p{Word_Break = CR}];
+LF = [\p{Word_Break = LF}];
+Newline = [\p{Word_Break = Newline}];
+Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ];
+Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+Format = [[\p{Word_Break = Format}]-[:Block=Tags:]];
+Katakana = [\p{Word_Break = Katakana}];
+Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
+ALetter = [\p{Word_Break = ALetter}];
+Single_Quote = [\p{Word_Break = Single_Quote}];
+Double_Quote = [\p{Word_Break = Double_Quote}];
+MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
+MidLetter = [\p{Word_Break = MidLetter} - [\:]];
+MidNum = [\p{Word_Break = MidNum} [.]];
+Numeric = [\p{Word_Break = Numeric}];
+ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+# Define dictionary, with the effect being that those characters don't appear in test data.
+
+Han = [:Han:];
+Hiragana = [:Hiragana:];
+
+Control = [\p{Grapheme_Cluster_Break = Control}];
+HangulSyllable = [\uac00-\ud7a3];
+ComplexContext = [:LineBreak = Complex_Context:];
+KanaKanji = [Han Hiragana Katakana];
+dictionaryCJK = [KanaKanji HangulSyllable];
+dictionary = [ComplexContext dictionaryCJK];
+
+# leave CJK scripts out of ALetterPlus
+# Tricky. Redefine a set.
+# For tailorings, if it modifies itself, do at end of sets ????
+# Tweak redefine to mean replace existing definition at its original location.
+# Insert defs without redefine just after last pre-existing def of that name.
+# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
+
+ALetter = [ALetter - dictionary];
+
+AHLetter = [ALetter Hebrew_Letter];
+MidNumLetQ = [MidNumLet Single_Quote];
+ExtFmt = [Extend Format ZWJ];
+
+WB3: CR LF;
+WB3a: (Newline | CR | LF) ÷;
+WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
+ # (but needed with UAX treat-as scheme.)
+WB3c: ZWJ GAZ;
+
+WB5: AHLetter ExtFmt* AHLetter;
+
+# includes both WB6 and WB7
+WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
+
+WB7a: Hebrew_Letter ExtFmt* Single_Quote;
+WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
+
+WB8: Numeric ExtFmt* Numeric;
+WB9: AHLetter ExtFmt* Numeric;
+WB10: Numeric ExtFmt* AHLetter;
+
+WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
+WB13: Katakana ExtFmt* Katakana;
+
+WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
+WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
+
+# WB rule 13c, pairs of Regional Indicators stay unbroken.
+# Interacts with WB3c.
+WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ;
+WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
+
+WB13d: (E_Base | GAZ) ExtFmt* E_Modifier;
+
+# Rule WB 14 Any ÷ Any
+# Interacts with WB3c, do not break between ZWJ and GAZ.
+WB14.1: . ExtFmt* ZWJ GAZ;
+WB14.2: . ExtFmt* ÷;
+
-# Copyright (c) 2001-2015 International Business Machines
+# Copyright (c) 2001-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
<data>• •\uF8FF\u2028<100>\uF8FF•</data>
<data>• \u200B\u2028<100>\u200B•</data>
+# Regional Indicator sequences. They group in pairs. The reverse rules are tricky.
+# Sequences are long enough that the non-exhaustive monkey test won't reliably pick up problems.
+
+<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
+<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
+
+<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
+<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
+<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
+<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
+
+
# User Guide example
<data>•Parlez-•vous •français ?•</data>
<ClCompile Include="toolutil.cpp">
<DisableLanguageExtensions>false</DisableLanguageExtensions>
</ClCompile>
- <ClCompile Include="ucbuf.c" />
+ <ClCompile Include="ucbuf.cpp" />
<ClCompile Include="ucm.c" />
<ClCompile Include="ucmstate.c" />
<ClCompile Include="unewdata.c" />
/*
*******************************************************************************
*
-* Copyright (C) 1998-2014, International Business Machines
+* Copyright (C) 1998-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
-* File ucbuf.c
+* File ucbuf.cpp
*
* Modification History:
*
/* check if u_unescapeAt unescaped and converted
* to c32 or not
*/
- if(c32==0xFFFFFFFF){
+ if(c32==(UChar32)0xFFFFFFFF){
if(buf->showWarning) {
char context[CONTEXT_LEN+1];
int32_t len = CONTEXT_LEN;
/*
*******************************************************************************
*
-* Copyright (C) 1998-2015, International Business Machines
+* Copyright (C) 1998-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
-* File ucbuf.c
+* File ucbuf.h
*
* Modification History:
*
*******************************************************************************
*/
+#include "unicode/localpointer.h"
#include "unicode/ucnv.h"
#include "filestrm.h"
/**
* Opens the UCHARBUF with the given file stream and code page for conversion
* @param fileName Name of the file to open.
- * @param codepage The encoding of the file stream to convert to Unicode.
+ * @param codepage The encoding of the file stream to convert to Unicode.
 * If *codepage is NULL on input the API will try to autodetect
* popular Unicode encodings
* @param showWarning Flag to print out warnings to STDOUT
- * @param buffered If TRUE performs a buffered read of the input file. If FALSE reads
+ * @param buffered If TRUE performs a buffered read of the input file. If FALSE reads
* the whole file into memory and converts it.
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
* indicates a failure on entry, the function will immediately return.
ucbuf_getc32(UCHARBUF* buf,UErrorCode* err);
/**
- * Gets a UTF-16 code unit at the current position from the converted buffer after
+ * Gets a UTF-16 code unit at the current position from the converted buffer after
* unescaping and increments the current position. If the escape sequence is for UTF-32
* code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned
* @param buf Pointer to UCHARBUF structure
/**
* Gets a pointer to the current position in the internal buffer and length of the line.
- * It imperative to make a copy of the returned buffere before performing operations on it.
+ * It is imperative to make a copy of the returned buffer before performing operations on it.
* @param buf Pointer to UCHARBUF structure
* @param len Output param to receive the len of the buffer returned till end of the line
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
U_NAMESPACE_BEGIN
+/**
+ * \class LocalUCHARBUFPointer
+ * "Smart pointer" class, closes a UCHARBUF via ucbuf_close().
+ * For most methods see the LocalPointerBase base class.
+ *
+ * @see LocalPointerBase
+ * @see LocalPointer
+ */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
U_NAMESPACE_END
/**
- * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected.
+ * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
* is necessary.
int32_t* signatureLength, UErrorCode* status);
/**
- * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected.
+ * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
* is necessary.